/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>

#include "zfs_comutil.h"
enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_tune,			/* fill from zio_taskq_tune_* */
	zti_nmodes
};

#define	ZTI_THREAD_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_THREAD_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_THREAD_TUNE		{ zti_mode_tune, 0 }

#define	ZTI_THREAD_ONE		ZTI_THREAD_FIX(1)
typedef struct zio_taskq_info {
	const char *zti_name;
	struct {
		enum zti_modes zti_mode;
		uint_t zti_value;
	} zti_nthreads[ZIO_TASKQ_TYPES];
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "intr"
};
const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = {
	{ "spa_zio_null",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
	{ "spa_zio_read",	{ ZTI_THREAD_FIX(8),	ZTI_THREAD_TUNE } },
	{ "spa_zio_write",	{ ZTI_THREAD_TUNE,	ZTI_THREAD_FIX(8) } },
	{ "spa_zio_free",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
	{ "spa_zio_claim",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
	{ "spa_zio_ioctl",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
};
enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
uint_t zio_taskq_tune_value = 80;	/* #threads = 80% of # online CPUs */
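
/*
 * Illustrative note (added; not from the original source): with the default
 * tunables above, an entry of ZTI_THREAD_TUNE in the zio_taskqs table
 * resolves at spa_activate() time to zio_taskq_tune_mode/value, i.e. a
 * taskq sized to 80% of the online CPUs:
 *
 *	ZTI_THREAD_TUNE   -> { zti_mode_tune, 0 }
 *	                  -> { zti_mode_online_percent, 80 }  (at activate)
 *	ZTI_THREAD_FIX(8) -> { zti_mode_fixed, 8 }            (8 threads)
 */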
static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */
/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}
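
/*
 * Illustrative sketch (added; not from the original source): for a pool
 * named "tank", spa_prop_add_list(nvl, ZPOOL_PROP_NAME, "tank", 0, src)
 * leaves the following nested nvlist in 'nvl':
 *
 *	"name" -> {
 *		ZPROP_SOURCE -> src     (uint64)
 *		ZPROP_VALUE  -> "tank"  (string; uint64 intval when strval
 *		                         is NULL)
 *	}
 */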
/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size, used;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (spa->spa_root_vdev != NULL) {
		size = spa_get_space(spa);
		used = spa_get_alloc(spa);
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
		    size - used, src);

		cap = (size == 0) ? 0 : (used * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    spa->spa_root_vdev->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}
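
/*
 * Worked example (added; illustrative only): with size = 1000 and
 * used = 250, the capacity computed above is 250 * 100 / 1000 = 25, i.e.
 * the pool reports as 25% full; a size of 0 is special-cased to 0 to
 * avoid dividing by zero on a pool with no space accounted yet.
 */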
/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}
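
/*
 * Usage sketch (added; illustrative, not part of the original source): a
 * caller that wants one property can fetch the whole list and look it up
 * by name, e.g.:
 *
 *	nvlist_t *nvp, *propval;
 *	uint64_t cap;
 *
 *	if (spa_prop_get(spa, &nvp) == 0) {
 *		if (nvlist_lookup_nvlist(nvp,
 *		    zpool_prop_to_name(ZPOOL_PROP_CAPACITY), &propval) == 0)
 *			VERIFY(nvlist_lookup_uint64(propval,
 *			    ZPROP_VALUE, &cap) == 0);
 *		nvlist_free(nvp);
 *	}
 */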
/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		objset_t *os;
		char *slash;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) || intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
				    DS_MODE_USER | DS_MODE_READONLY, &os))
					break;

				/* We don't support gzip bootable datasets */
				if ((error = dsl_prop_get_integer(strval,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress, NULL)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = ENOTSUP;
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_close(os);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = EIO;
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = EINVAL;
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = EINVAL;
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}
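
/*
 * Examples (added; illustrative only) of the cachefile checks above: ""
 * and "none" are accepted as-is, "/etc/zfs/alt.cache" passes, while
 * "relative.cache", "/dir/" and "/dir/.." are all rejected with EINVAL.
 */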
void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem;
	boolean_t need_sync = B_FALSE;
	zpool_prop_t prop;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		if ((prop = zpool_name_to_prop(
		    nvpair_name(elem))) == ZPROP_INVAL)
			return (EINVAL);

		if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT)
			continue;

		need_sync = B_TRUE;
		break;
	}

	if (need_sync)
		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
		    spa, nvp, 3));
	else
		return (0);
}
/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}
/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}
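
/*
 * Note (added for clarity): avl_create() expects a comparator that returns
 * exactly -1, 0, or 1, which is why the bcmp() result is normalized above
 * rather than returned directly.
 */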
/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}
/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);

	for (int t = 0; t < ZIO_TYPES; t++) {
		const zio_taskq_info_t *ztip = &zio_taskqs[t];
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			enum zti_modes mode = ztip->zti_nthreads[q].zti_mode;
			uint_t value = ztip->zti_nthreads[q].zti_value;
			char name[32];

			(void) snprintf(name, sizeof (name),
			    "%s_%s", ztip->zti_name, zio_taskq_types[q]);

			if (mode == zti_mode_tune) {
				mode = zio_taskq_tune_mode;
				value = zio_taskq_tune_value;
				if (mode == zti_mode_tune)
					mode = zti_mode_online_percent;
			}

			switch (mode) {
			case zti_mode_fixed:
				ASSERT3U(value, >=, 1);
				value = MAX(value, 1);

				spa->spa_zio_taskq[t][q] = taskq_create(name,
				    value, maxclsyspri, 50, INT_MAX,
				    TASKQ_PREPOPULATE);
				break;

			case zti_mode_online_percent:
				spa->spa_zio_taskq[t][q] = taskq_create(name,
				    value, maxclsyspri, 50, INT_MAX,
				    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
				break;

			case zti_mode_tune:
			default:
				panic("unrecognized mode for "
				    "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) "
				    "in spa_activate()",
				    t, q, mode, value);
				break;
			}
		}
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}
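
/*
 * Example (added; illustrative only): for the zio_taskqs table above, the
 * ZIO "read" type gets two taskqs. The ISSUE queue is created as
 * "spa_zio_read_issue" with a fixed 8 threads, and the INTR queue as
 * "spa_zio_read_intr" via the tune path, i.e. 80% of online CPUs with the
 * default tunables.
 */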
/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues. Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}
/*
 * Verify a pool configuration, and construct the vdev tree appropriately.
 * This will create all the necessary vdevs in the appropriate layout, with
 * each vdev in the CLOSED state. This will prep the pool before
 * open/creation/import. All vdev validation is done by the vdev_alloc()
 * routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}
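
/*
 * Illustrative walk-through (added; not from the original source): for a
 * config whose vdev tree is
 *
 *	root
 *	  mirror
 *	    disk c0t0d0
 *	    disk c0t1d0
 *
 * spa_config_parse() allocates the root vdev, recurses into the mirror's
 * ZPOOL_CONFIG_CHILDREN array, and stops at each disk because leaf vdevs
 * have vdev_op_leaf set.
 */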
/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	spa_config_exit(spa, SCL_ALL, FTAG);
}
/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process. For each spare, there is potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in). During this phase we open and
	 * validate each vdev on the spare list. If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev. Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pool would think the spare
			 * is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise). Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}
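
/*
 * Note (added for clarity): a shared spare can therefore be represented by
 * two vdev_t's at once -- one listed under ZPOOL_CONFIG_SPARES and one
 * spliced into the active tree when it is spared in -- and only the copy
 * found via spa_lookup_by_guid() is marked with spa_spare_activate().
 */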
/*
 * Load (or re-load) the current list of vdevs describing the active l2cache
 * for this pool. When this is called, we have some form of basic information
 * in 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them,
 * and then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			(void) vdev_close(vd);
			spa_l2cache_remove(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	kmem_free(l2cache, sav->sav_count * sizeof (void *));
}
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}
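
/*
 * Layout sketch (added; illustrative only): a packed-nvlist object stores
 * the packed stream in its data blocks and the stream length in its bonus
 * buffer, so the read above is, conceptually:
 *
 *	bonus:  uint64_t nvsize     (length of the packed stream)
 *	data:   byte[nvsize]        (nvlist_pack()'d buffer)
 */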
/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}
/*
 * Load the slog device state from the config object since it's possible
 * that the label does not contain the most up-to-date information.
 */
void
spa_load_log_state(spa_t *spa)
{
	nvlist_t *nv, *nvroot, **child;
	uint64_t is_log;
	uint_t children;
	vdev_t *rvd = spa->spa_root_vdev;

	VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0);
	VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) == 0);

	for (int c = 0; c < children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];

		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
		    &is_log) == 0 && is_log)
			vdev_load_log_state(tvd, child[c]);
	}
	nvlist_free(nv);
}
/*
 * Check for missing log devices
 */
int
spa_check_logs(spa_t *spa)
{
	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
		    DS_FIND_CHILDREN)) {
			spa->spa_log_state = SPA_LOG_MISSING;
			return (1);
		}
		break;
	}
	return (0);
}
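
/*
 * Usage note (added): spa_load() below calls spa_check_logs() after
 * spa_load_log_state(); a nonzero return marks the root vdev
 * VDEV_STATE_CANT_OPEN and fails the load with ENXIO, tagging the ereport
 * as FM_EREPORT_ZFS_LOG_REPLAY.
 */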
/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	uint64_t autoreplace = 0;
	int orig_mode = spa->spa_mode;
	char *ereport = FM_EREPORT_ZFS_POOL;

	/*
	 * If this is an untrusted config, access the pool in read-only mode.
	 * This prevents things like resilvering recently removed devices.
	 */
	if (!mosconfig)
		spa->spa_mode = FREAD;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);

	/*
	 * Parse the configuration into a vdev tree. We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_open(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error != 0)
		goto out;

	/*
	 * We need to validate the vdev labels against the configuration that
	 * we have in hand, which is dependent on the setting of mosconfig. If
	 * mosconfig is true then we're validating the vdev labels based on
	 * that config. Otherwise, we're validating against the cached config
	 * (zpool.cache) that was read when we loaded the zfs module, and then
	 * later we will recursively call spa_load() and validate against
	 * the vdev config.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_validate(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error != 0)
		goto out;

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	vdev_uberblock_load(NULL, rvd, ub);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > SPA_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		nvlist_t *newconfig;
		uint64_t hostid;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig,
		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(newconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

#ifdef	_KERNEL
			myhostid = zone_get_hostid(NULL);
#else	/* _KERNEL */
			/*
			 * We're emulating the system's hostid in userland, so
			 * we can't use zone_get_hostid().
			 */
			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
#endif	/* _KERNEL */
			if (hostid != 0 && myhostid != 0 &&
			    hostid != myhostid) {
				cmn_err(CE_WARN, "pool '%s' could not be "
				    "loaded as it was last accessed by "
				    "another system (host: %s hostid: 0x%lx). "
				    "See: http://www.sun.com/msg/ZFS-8000-EY",
				    spa_name(spa), hostname,
				    (unsigned long)hostid);
				error = EBADF;
				goto out;
			}
		}

		spa_config_set(spa, newconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa, orig_mode);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation). If we have an older pool, this will not
	 * be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log. If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the history object. If we have an older pool, this
	 * will not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares.sav_object,
		    &spa->spa_spares.sav_config) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	 * Load any level 2 ARC devices for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
	    &spa->spa_l2cache.sav_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
		    &spa->spa_l2cache.sav_config) != 0) {
			vdev_set_state(rvd, B_TRUE,
			    VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	spa_load_log_state(spa);

	if (spa_check_logs(spa)) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LOG);
		error = ENXIO;
		ereport = FM_EREPORT_ZFS_LOG_REPLAY;
		goto out;
	}

	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
		    sizeof (uint64_t), 1, &autoreplace);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
		    sizeof (uint64_t), 1, &spa->spa_delegation);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
		    sizeof (uint64_t), 1, &spa->spa_failmode);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND),
		    sizeof (uint64_t), 1, &spa->spa_autoexpand);
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource notifying
	 * the ZFS DE that it should not issue any faults for unopenable
	 * devices. We also iterate over the vdevs, and post a sysevent for any
	 * unopenable vdevs so that the normal autoreplace handler can take
	 * over.
	 */
	if (autoreplace && state != SPA_LOAD_TRYIMPORT)
		spa_check_removed(spa->spa_root_vdev);

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Check the state of the root vdev. If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if (spa_writeable(spa)) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;

		ASSERT(state != SPA_LOAD_TRYIMPORT);

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa_name(spa),
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_log_state = SPA_LOG_GOOD;
		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (int c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asychronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		/*
		 * Check all DTLs to see if anything needs resilvering.
		 */
		if (vdev_resilver_needed(rvd, NULL, NULL))
			spa_async_request(spa, SPA_ASYNC_RESILVER);
	}

	error = 0;
out:
	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error && error != EBADF)
		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;

	return (error);
}
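
/*
 * Recap (added for clarity; not in the original): when called with an
 * untrusted config (mosconfig == B_FALSE), spa_load() opens the pool
 * read-only, digs the trusted config out of the MOS, and re-enters itself
 * exactly once:
 *
 *	spa_load(spa, cached_config, state, B_FALSE)
 *	    -> reads DMU_POOL_CONFIG from the MOS
 *	    -> spa_unload()/spa_deactivate()/spa_activate(orig_mode)
 *	    -> return spa_load(spa, newconfig, state, B_TRUE)
 */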
/*
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again. The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa, spa_mode_global);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it indicates that one of the vdevs indicates
			 * that the pool has been exported or destroyed. If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_config_sync(spa, B_TRUE, B_TRUE);
			spa_remove(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open(). Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL)
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		}

		spa->spa_last_open_failed = B_FALSE;
	}

	spa_open_ref(spa, tag);

	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL)
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	return (0);
}
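
/*
 * Usage sketch (added; illustrative only): the common open/close pairing
 * used throughout this file looks like
 *
 *	spa_t *spa;
 *	int error;
 *
 *	if ((error = spa_open("tank", &spa, FTAG)) != 0)
 *		return (error);
 *	... use spa ...
 *	spa_close(spa, FTAG);
 */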
int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}
/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}
/*
 * Add spares device information to the nvlist.
 */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_spares.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare. If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool, NULL) &&
			    pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}
/*
 * Add l2cache device information to the nvlist, including vdev stats.
 */
static void
spa_add_l2cache(spa_t *spa, nvlist_t *config)
{
	nvlist_t **l2cache;
	uint_t i, j, nl2cache;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_t *vd;
	vdev_stat_t *vs;
	uint_t vsc;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_l2cache.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
	if (nl2cache != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);

		/*
		 * Update level 2 cache device stats.
		 */
		for (i = 0; i < nl2cache; i++) {
			VERIFY(nvlist_lookup_uint64(l2cache[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);

			vd = NULL;
			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
				if (guid ==
				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
					vd = spa->spa_l2cache.sav_vdevs[j];
					break;
				}
			}
			ASSERT(vd != NULL);

			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
			vdev_get_stats(vd, vs);
		}
	}
}
int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa != NULL) {
		/*
		 * This still leaves a window of inconsistency where the spares
		 * or l2cache devices could change and the config would be
		 * self-inconsistent.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		if (*config != NULL) {
			VERIFY(nvlist_add_uint64(*config,
			    ZPOOL_CONFIG_ERRCOUNT,
			    spa_get_errlog_size(spa)) == 0);

			if (spa_suspended(spa))
				VERIFY(nvlist_add_uint64(*config,
				    ZPOOL_CONFIG_SUSPENDED,
				    spa->spa_failmode) == 0);

			spa_add_spares(spa, *config);
			spa_add_l2cache(spa, *config);
		}
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we
	 * cheat and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL) {
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_close(spa, FTAG);
	}

	return (error);
}
/*
 * Validate that the auxiliary device array is well formed. We must have an
 * array of nvlists, each which describes a valid leaf vdev. If this is an
 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
 * specified, as long as they are well-formed.
 */
static int
spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
    spa_aux_vdev_t *sav, const char *config, uint64_t version,
    vdev_labeltype_t label)
{
	nvlist_t **dev;
	uint_t i, ndev;
	vdev_t *vd;
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * It's acceptable to have no devs specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
		return (0);

	if (ndev == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports this
	 * device type.
	 */
	if (spa_version(spa) < version)
		return (ENOTSUP);

	/*
	 * Set the pending device list so we correctly handle device in-use
	 * checking.
	 */
	sav->sav_pending = dev;
	sav->sav_npending = ndev;

	for (i = 0; i < ndev; i++) {
		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		/*
		 * The L2ARC currently only supports disk devices in
		 * kernel context. For user-level testing, we allow it.
		 */
#ifdef _KERNEL
		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
			error = ENOTBLK;
			goto out;
		}
#endif
		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error &&
		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
			goto out;
		else
			error = 0;
	}

out:
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
}
int
spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
	    VDEV_LABEL_SPARE)) != 0) {
		return (error);
	}

	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
	    VDEV_LABEL_L2CACHE));
}
static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
    const char *config)
{
	int i;

	if (sav->sav_config != NULL) {
		nvlist_t **olddevs;
		uint_t oldndevs;
		nvlist_t **newdevs;

		/*
		 * Generate new dev list by concatentating with the
		 * current dev list.
		 */
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs) == 0);

		newdevs = kmem_alloc(sizeof (void *) *
		    (ndevs + oldndevs), KM_SLEEP);
		for (i = 0; i < oldndevs; i++)
			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
			    KM_SLEEP) == 0);
		for (i = 0; i < ndevs; i++)
			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
			    KM_SLEEP) == 0);

		VERIFY(nvlist_remove(sav->sav_config, config,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    config, newdevs, ndevs + oldndevs) == 0);
		for (i = 0; i < oldndevs + ndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
	} else {
		/*
		 * Generate a new dev list.
		 */
		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
		    devs, ndevs) == 0);
	}
}
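
/*
 * Worked example (added; illustrative only): merging ndevs = 1 new spare
 * into an existing sav_config holding oldndevs = 2 produces a 3-element
 * array -- newdevs[0..1] duplicate the old entries and newdevs[2] is the
 * new device -- which then replaces the old nvlist array under 'config'.
 */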
/*
 * Stop and drop level 2 ARC devices
 */
void
spa_l2cache_drop(spa_t *spa)
{
	vdev_t *vd;
	int i;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	for (i = 0; i < sav->sav_count; i++) {
		uint64_t pool;

		vd = sav->sav_vdevs[i];
		ASSERT(vd != NULL);

		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
		    pool != 0ULL && l2arc_vdev_present(vd))
			l2arc_remove_vdev(vd);
		if (vd->vdev_isl2cache)
			spa_l2cache_remove(vd);
		vdev_clear_stats(vd);
		(void) vdev_close(vd);
	}
}
/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
    const char *history_str, nvlist_t *zplprops)
{
	spa_t *spa;
	char *altroot = NULL;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;
	uint64_t version;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, altroot);
	spa_activate(spa, spa_mode_global);

	spa->spa_uberblock.ub_txg = txg - 1;

	if (props && (error = spa_prop_validate(spa, props))) {
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
	    &version) != 0)
		version = SPA_VERSION;
	ASSERT(version <= SPA_VERSION);
	spa->spa_uberblock.ub_version = version;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && !zfs_allocatable_devs(nvroot))
		error = EINVAL;

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_aux(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_metaslab_set_size(rvd->vdev_child[c]);
			vdev_expand(rvd->vdev_child[c], txg);
		}
	}

	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	 * Get the list of level 2 cache devices, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/* Newly created pools with the right version are always deflated. */
	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
		spa->spa_deflate = TRUE;
		if (zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
			cmn_err(CE_PANIC, "failed to add deflate");
		}
	}

	/*
	 * Create the deferred-free bplist object. Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	/*
	 * Create the pool's history object.
	 */
	if (version >= SPA_VERSION_ZPOOL_HISTORY)
		spa_history_create_obj(spa, tx);

	/*
	 * Set pool properties.
	 */
	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
	if (props != NULL) {
		spa_configfile_set(spa, props, B_FALSE);
		spa_sync_props(spa, props, CRED(), tx);
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync(spa, B_FALSE, B_TRUE);

	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);

	spa->spa_minref = refcount_count(&spa->spa_refcount);

	mutex_exit(&spa_namespace_lock);

	return (0);
}
#ifdef _KERNEL
/*
 * Get the root pool information from the root disk, then import the root pool
 * during the system boot up time.
 */
extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);

static nvlist_t *
spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
{
	nvlist_t *config;
	nvlist_t *nvtop, *nvroot;
	uint64_t pgid;

	if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
		return (NULL);

	/*
	 * Add this top-level vdev to the child array.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    &pgid) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);

	/*
	 * Put this pool's top-level vdevs into a root vdev.
	 */
	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &nvtop, 1) == 0);

	/*
	 * Replace the existing vdev_tree with the new root vdev in
	 * this pool's configuration (remove the old, add the new).
	 */
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
	nvlist_free(nvroot);
	return (config);
}
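
/*
 * Shape of the transformation above (added; illustrative only): the boot
 * label's config describes a single top-level vdev, and it is re-rooted as
 *
 *	vdev_tree: disk    ==>    vdev_tree: root (guid = pool guid)
 *	                              children[0]: disk
 *
 * so that spa_config_parse() sees the usual root-vdev layout.
 */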
/*
 * Walk the vdev tree and see if we can find a device with "better"
 * configuration. A configuration is "better" if the label on that
 * device has a more recent txg.
 */
static void
spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);

	if (vd->vdev_ops->vdev_op_leaf) {
		nvlist_t *label;
		uint64_t label_txg;

		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
		    &label) != 0)
			return;

		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
		    &label_txg) == 0);

		/*
		 * Do we have a better boot device?
		 */
		if (label_txg > *txg) {
			*txg = label_txg;
			*avd = vd;
		}
		nvlist_free(label);
	}
}
/*
 * Import a root pool.
 *
 * For x86. devpath_list will consist of devid and/or physpath name of
 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
 * The GRUB "findroot" command will return the vdev we should boot.
 *
 * For Sparc, devpath_list consists the physpath name of the booting device
 * no matter the rootpool is a single device pool or a mirrored pool.
 * e.g.
 *	"/pci@1f,0/ide@d/disk@0,0:a"
 */
int
spa_import_rootpool(char *devpath, char *devid)
{
	spa_t *spa;
	vdev_t *rvd, *bvd, *avd = NULL;
	nvlist_t *config, *nvtop;
	uint64_t guid, txg;
	char *pname;
	int error;

	/*
	 * Read the label from the boot device and generate a configuration.
	 */
	if ((config = spa_generate_rootconf(devpath, devid, &guid)) == NULL) {
		cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
		    devpath);
		return (EIO);
	}

	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
	    &pname) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pname)) != NULL) {
		/*
		 * Remove the existing root pool from the namespace so that we
		 * can replace it with the correct config we just read in.
		 */
		spa_remove(spa);
	}

	spa = spa_add(pname, NULL);
	spa->spa_is_root = B_TRUE;

	/*
	 * Build up a vdev tree based on the boot device's label config.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
	    VDEV_ALLOC_ROOTPOOL);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error) {
		mutex_exit(&spa_namespace_lock);
		nvlist_free(config);
		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
		    pname);
		return (error);
	}

	/*
	 * Get the boot vdev.
	 */
	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
		    (u_longlong_t)guid);
		error = ENOENT;
		goto out;
	}

	/*
	 * Determine if there is a better boot device.
	 */
	avd = bvd;
	spa_alt_rootvdev(rvd, &avd, &txg);
	if (avd != bvd) {
		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
		    "try booting from '%s'", avd->vdev_path);
		error = EINVAL;
		goto out;
	}

	/*
	 * If the boot device is part of a spare vdev then ensure that
	 * we're booting off the active spare.
	 */
	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    !bvd->vdev_isspare) {
		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
		    "try booting from '%s'",
		    bvd->vdev_parent->vdev_child[1]->vdev_path);
		error = EINVAL;
		goto out;
	}

	VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
	error = 0;
out:
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_free(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	mutex_exit(&spa_namespace_lock);

	nvlist_free(config);
	return (error);
}

#endif	/* _KERNEL */
/*
 * Take a pool and insert it into the namespace as if it had been loaded at
 * boot.
 */
int
spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
{
	spa_t *spa;
	char *altroot = NULL;

	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, altroot);

	VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);

	if (props != NULL)
		spa_configfile_set(spa, props, B_FALSE);

	spa_config_sync(spa, B_FALSE, B_TRUE);

	mutex_exit(&spa_namespace_lock);

	return (0);
}
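
/*
 * Illustrative sketch (not part of the original file): a minimal caller of
 * spa_import_verbatim(). The config would normally come straight from a
 * cachefile; the helper name is hypothetical.
 */
#if 0	/* example only */
static int
example_verbatim_import(const char *poolname, nvlist_t *cached_config)
{
	/* No props: no altroot and no cachefile override. */
	return (spa_import_verbatim(poolname, cached_config, NULL));
}
#endif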
/*
 * Import a non-root pool into the system.
 */
int
spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
{
	spa_t *spa;
	char *altroot = NULL;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, altroot);
	spa_activate(spa, spa_mode_global);

	/*
	 * Don't start async tasks until we know everything is healthy.
	 */
	spa_async_suspend(spa);

	/*
	 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig
	 * because the user-supplied config is actually the one to trust when
	 * doing an import.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity
	 * anymore, and conflicts with spa_has_spare().
	 */
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
		spa_load_spares(spa);
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
		spa_load_l2cache(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0)
		error = spa_validate_aux(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	if (error == 0)
		error = spa_validate_aux(spa, nvroot, -1ULL,
		    VDEV_ALLOC_L2CACHE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (props != NULL)
		spa_configfile_set(spa, props, B_FALSE);

	if (error != 0 || (props && spa_writeable(spa) &&
	    (error = spa_prop_set(spa, props)))) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	spa_async_resume(spa);

	/*
	 * Override any spares and level 2 cache devices as specified by
	 * the user, as these may have correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_spares.sav_config)
			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		if (spa->spa_l2cache.sav_config)
			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	if (spa_writeable(spa)) {
		/*
		 * Update the config cache to include the newly-imported pool.
		 */
		spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, B_FALSE);
	}

	/*
	 * It's possible that the pool was expanded while it was exported.
	 * We kick off an async task to handle this for us.
	 */
	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);

	mutex_exit(&spa_namespace_lock);

	return (0);
}
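
/*
 * Illustrative sketch (not part of the original file): importing a pool
 * under an alternate root, the way "zpool import -R /a <pool>" is plumbed
 * down to this layer. The helper name is hypothetical.
 */
#if 0	/* example only */
static int
example_import_with_altroot(const char *pool, nvlist_t *config)
{
	nvlist_t *props;
	int error;

	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), "/a") == 0);

	error = spa_import(pool, config, props);

	nvlist_free(props);
	return (error);
}
#endif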
/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;
	int error;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, NULL);
	spa_activate(spa, FREAD);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp) == 0);

		/*
		 * If the bootfs property exists on this pool then we
		 * copy it out so that external consumers can tell which
		 * pools are bootable.
		 */
		if ((!error || error == EEXIST) && spa->spa_bootfs) {
			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

			/*
			 * We have to play games with the name since the
			 * pool was opened as TRYIMPORT_NAME.
			 */
			if (dsl_dsobj_to_dsname(spa_name(spa),
			    spa->spa_bootfs, tmpname) == 0) {
				char *cp;
				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

				cp = strchr(tmpname, '/');
				if (cp == NULL) {
					(void) strlcpy(dsname, tmpname,
					    MAXPATHLEN);
				} else {
					(void) snprintf(dsname, MAXPATHLEN,
					    "%s/%s", poolname, ++cp);
				}
				VERIFY(nvlist_add_string(config,
				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
				kmem_free(dsname, MAXPATHLEN);
			}
			kmem_free(tmpname, MAXPATHLEN);
		}

		/*
		 * Add the list of hot spares and level 2 cache devices.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_add_spares(spa, config);
		spa_add_l2cache(spa, config);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}
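
/*
 * Illustrative sketch (not part of the original file): what a consumer of
 * spa_tryimport() might read back out of the returned config. The function
 * name is hypothetical.
 */
#if 0	/* example only */
static void
example_inspect_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config;
	char *bootfs;

	if ((config = spa_tryimport(tryconfig)) == NULL)
		return;

	/* ZPOOL_CONFIG_BOOTFS is only present if the pool is bootable. */
	if (nvlist_lookup_string(config, ZPOOL_CONFIG_BOOTFS, &bootfs) == 0)
		cmn_err(CE_NOTE, "bootable pool, bootfs=%s", bootfs);

	nvlist_free(config);
}
#endif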
/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure there
 * is no more pending I/O and any references to the pool are gone.  Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
 * we don't sync the labels or remove the configuration cache.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
    boolean_t force, boolean_t hardforce)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode_global & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		/*
		 * A pool cannot be exported if it has an active shared spare.
		 * This is to prevent other pools stealing the active spare
		 * from an exported pool.  At the user's explicit request,
		 * such a pool can still be forcibly exported.
		 */
		if (!force && new_state == POOL_STATE_EXPORTED &&
		    spa_has_active_shared_spare(spa)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EXDEV);
		}

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, SCL_ALL, FTAG);
		}
	}

	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		if (!hardforce)
			spa_config_sync(spa, B_TRUE, B_TRUE);
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}
/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
	    B_FALSE, B_FALSE));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
    boolean_t hardforce)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
	    force, hardforce));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
	    B_FALSE, B_FALSE));
}
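
/*
 * Illustrative sketch (not part of the original file): how the wrappers
 * above map onto spa_export_common(). The helper name below is
 * hypothetical; force=B_TRUE skips the active-shared-spare check, and
 * hardforce=B_TRUE additionally skips the final label/cache sync.
 */
#if 0	/* example only */
static int
example_force_export(char *pool)
{
	nvlist_t *oldconfig = NULL;
	int error;

	error = spa_export(pool, &oldconfig, B_TRUE, B_FALSE);
	if (error == 0 && oldconfig != NULL)
		nvlist_free(oldconfig);
	return (error);
}
#endif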
/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add a device to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
	    &nspares) != 0)
		nspares = 0;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
	    &nl2cache) != 0)
		nl2cache = 0;

	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (vd->vdev_children != 0 &&
	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * We must validate the spares and l2cache devices after checking the
	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
	 */
	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (int c = 0; c < vd->vdev_children; c++) {
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = rvd->vdev_children;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
		    ZPOOL_CONFIG_SPARES);
		spa_load_spares(spa);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	if (nl2cache != 0) {
		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
		    ZPOOL_CONFIG_L2CACHE);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}
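
/*
 * Illustrative sketch (not part of the original file): the shape of the
 * nvroot that spa_vdev_add() accepts when only aux devices are added --
 * a root nvlist with no children and an L2CACHE (or SPARES) array.
 * Construction of the leaf vdev nvlist is elided; names here are
 * hypothetical.
 */
#if 0	/* example only */
static int
example_add_cache_device(spa_t *spa, nvlist_t *cache_leaf)
{
	nvlist_t *nvroot;
	int error;

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &cache_leaf, 1) == 0);

	error = spa_vdev_add(spa, nvroot);

	nvlist_free(nvroot);
	return (error);
}
#endif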
/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;
	dmu_tx_t *tx;
	char *oldvdpath, *newvdpath;
	int newvd_isspare;
	int error;

	txg = spa_vdev_enter(spa);

	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	if (newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Spares can't replace logs
	 */
	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the
		 * root vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    pvd->vdev_child[1] == oldvd &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare.  Otherwise, we
		 * want to create a replacing vdev.  The user is not allowed to
		 * attach to a spared vdev child unless the 'isspare' state is
		 * the same (spare replaces spare, non-spare replaces
		 * non-spare).
		 */
		if (pvd->vdev_ops == &vdev_replacing_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		else if (pvd->vdev_ops == &vdev_spare_ops &&
		    newvd->vdev_isspare != oldvd->vdev_isspare)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		else if (pvd->vdev_ops != &vdev_spare_ops &&
		    newvd->vdev_isspare)
			pvops = &vdev_spare_ops;
		else
			pvops = &vdev_replacing_ops;
	}

	/*
	 * Make sure the new device is big enough.
	 */
	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing, insert the new
	 * mirror/replacing/spare vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	vdev_dtl_dirty(newvd, DTL_MISSING,
	    TXG_INITIAL, open_txg - TXG_INITIAL + 1);

	if (newvd->vdev_isspare) {
		spa_spare_activate(newvd);
		spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
	}

	oldvdpath = spa_strdup(oldvd->vdev_path);
	newvdpath = spa_strdup(newvd->vdev_path);
	newvd_isspare = newvd->vdev_isspare;

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
		spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx,
		    CRED(), "%s vdev=%s %s vdev=%s",
		    replacing && newvd_isspare ? "spare in" :
		    replacing ? "replace" : "attach", newvdpath,
		    replacing ? "for" : "to", oldvdpath);
		dmu_tx_commit(tx);
	} else {
		dmu_tx_abort(tx);
	}

	spa_strfree(oldvdpath);
	spa_strfree(newvdpath);

	/*
	 * Kick off a resilver to update newvd.
	 */
	VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);

	return (0);
}
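
/*
 * Illustrative note (not part of the original file): the DTL window dirtied
 * above covers [TXG_INITIAL, open_txg], i.e. open_txg - TXG_INITIAL + 1
 * txgs. With hypothetical values TXG_INITIAL == 4, TXG_CONCURRENT_STATES
 * == 3 (see sys/txg.h for the real definitions) and txg == 100:
 * open_txg = 100 + 3 - 1 = 102, so vdev_dtl_dirty() marks 99 txgs
 * starting at txg 4 as missing on the new device.
 */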
/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
{
	uint64_t txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;
	boolean_t unspare = B_FALSE;
	uint64_t unspare_guid;
	size_t len;

	txg = spa_vdev_enter(spa);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If the parent/child relationship is not as expected, don't do it.
	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
	 * vdev that's replacing B with C.  The user's intent in replacing
	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
	 * the replace by detaching C, the expected behavior is to end up
	 * M(A,B).  But suppose that right after deciding to detach C,
	 * the replacement of B completes.  We would have M(A,C), and then
	 * ask to detach C, which would leave us with just A -- not what
	 * the user wanted.  To prevent this, we make sure that the
	 * parent/child relationship hasn't changed -- in this example,
	 * that C's parent is still the replacing vdev R.
	 */
	if (pvd->vdev_guid != pguid && pguid != 0)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev.  For the 'spare' vdev, either
	 * disk can be removed.
	 */
	if (replace_done) {
		if (pvd->vdev_ops == &vdev_replacing_ops) {
			if (vd->vdev_id != 0)
				return (spa_vdev_exit(spa, NULL, txg,
				    ENOTSUP));
		} else if (pvd->vdev_ops != &vdev_spare_ops) {
			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
		}
	}

	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
	    spa_version(spa) >= SPA_VERSION_SPARES);

	/*
	 * Only mirror, replacing, and spare vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If this device has the only valid copy of some data,
	 * we cannot safely detach it.
	 */
	if (vdev_dtl_required(vd))
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	ASSERT(pvd->vdev_children >= 2);

	/*
	 * If we are detaching the second disk from a replacing vdev, then
	 * check to see if we changed the original vdev's path to have "/old"
	 * at the end in spa_vdev_attach().  If so, undo that change now.
	 */
	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 &&
	    pvd->vdev_child[0]->vdev_path != NULL &&
	    pvd->vdev_child[1]->vdev_path != NULL) {
		ASSERT(pvd->vdev_child[1] == vd);
		cvd = pvd->vdev_child[0];
		len = strlen(vd->vdev_path);
		if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
		    strcmp(cvd->vdev_path + len, "/old") == 0) {
			spa_strfree(cvd->vdev_path);
			cvd->vdev_path = spa_strdup(vd->vdev_path);
		}
	}

	/*
	 * If we are detaching the original disk from a spare, then it implies
	 * that the spare should become a real disk, and be removed from the
	 * active spare list for the pool.
	 */
	if (pvd->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare)
		unspare = B_TRUE;

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[0];

	/*
	 * If we need to remove the remaining child from the list of hot spares,
	 * do it now, marking the vdev as no longer a spare in the process.
	 * We must do this before vdev_remove_parent(), because that can
	 * change the GUID if it creates a new toplevel GUID.  For a similar
	 * reason, we must remove the spare now, in the same txg as the detach;
	 * otherwise someone could attach a new sibling, change the GUID, and
	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
	 */
	if (unspare) {
		ASSERT(cvd->vdev_isspare);
		spa_spare_remove(cvd);
		unspare_guid = cvd->vdev_guid;
		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
	}

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1)
		vdev_remove_parent(cvd);

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reevaluate the parent vdev state.
	 */
	vdev_propagate_state(cvd);

	/*
	 * If the 'autoexpand' property is set on the pool then automatically
	 * try to expand the size of the pool. For example if the device we
	 * just detached was smaller than the others, it may be possible to
	 * add metaslabs (i.e. grow the pool).  We need to reopen the vdev
	 * first so that we can obtain the updated sizes of the leaf vdevs.
	 */
	if (spa->spa_autoexpand) {
		vdev_reopen(tvd);
		vdev_expand(tvd, txg);
	}

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list, to
	 * prevent vd from being accessed after it's freed.
	 */
	for (int t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);

	error = spa_vdev_exit(spa, vd, txg, 0);

	/*
	 * If this was the removal of the original device in a hot spare vdev,
	 * then we want to go through and remove the device from the hot spare
	 * list of every other pool.
	 */
	if (unspare) {
		spa_t *myspa = spa;
		spa = NULL;
		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (spa->spa_state != POOL_STATE_ACTIVE)
				continue;
			if (spa == myspa)
				continue;
			spa_open_ref(spa, FTAG);
			mutex_exit(&spa_namespace_lock);
			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
			mutex_enter(&spa_namespace_lock);
			spa_close(spa, FTAG);
		}
		mutex_exit(&spa_namespace_lock);
	}

	return (error);
}
static nvlist_t *
spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
{
	for (int i = 0; i < count; i++) {
		uint64_t guid;

		VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		if (guid == target_guid)
			return (nvpp[i]);
	}

	return (NULL);
}
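
/*
 * Illustrative note (not part of the original file): spa_vdev_remove()
 * below uses this helper to map a caller-supplied guid onto the matching
 * entry of the sav_config nvlist array, so the exact nvlist (not a copy)
 * can be handed to spa_vdev_remove_aux() for removal.
 */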
static void
spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
    nvlist_t *dev_to_remove)
{
	nvlist_t **newdev = NULL;

	if (count > 1)
		newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);

	for (int i = 0, j = 0; i < count; i++) {
		if (dev[i] == dev_to_remove)
			continue;
		VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
	}

	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
	VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);

	for (int i = 0; i < count - 1; i++)
		nvlist_free(newdev[i]);

	if (count > 1)
		kmem_free(newdev, (count - 1) * sizeof (void *));
}
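
/*
 * Illustrative note (not part of the original file): the helper above
 * implements remove-from-nvlist-array as copy-and-compact -- duplicate
 * every entry except the victim into a (count - 1)-element array, then
 * replace the whole array under 'name'. nvlist arrays have no in-place
 * element removal, so rebuild-and-swap is the idiomatic pattern.
 */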
/*
 * Remove a device from the pool.  Currently, this supports removing only hot
 * spares and level 2 ARC devices.
 */
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
	vdev_t *vd;
	nvlist_t **spares, **l2cache, *nv;
	uint_t nspares, nl2cache;
	uint64_t txg = 0;
	int error = 0;
	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);

	if (!locked)
		txg = spa_vdev_enter(spa);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (spa->spa_spares.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
		/*
		 * Only remove the hot spare if it's not currently in use
		 * in this pool.
		 */
		if (vd == NULL || unspare) {
			spa_vdev_remove_aux(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
			spa_load_spares(spa);
			spa->spa_spares.sav_sync = B_TRUE;
		} else {
			error = EBUSY;
		}
	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
		/*
		 * Cache devices can always be removed.
		 */
		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	} else if (vd != NULL) {
		/*
		 * Normal vdevs cannot be removed (yet).
		 */
		error = ENOTSUP;
	} else {
		/*
		 * There is no vdev of any kind with the specified guid.
		 */
		error = ENOENT;
	}

	if (!locked)
		return (spa_vdev_exit(spa, NULL, txg, error));

	return (error);
}
/*
 * Find any device that's done replacing, or a vdev marked 'unspare' that's
 * currently spared, so we can detach it.
 */
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;

	for (int c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	/*
	 * Check for a completed replacement.
	 */
	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
		    !vdev_dtl_required(oldvd))
			return (oldvd);
	}

	/*
	 * Check for a completed resilver with the 'unspare' flag set.
	 */
	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
		newvd = vd->vdev_child[0];
		oldvd = vd->vdev_child[1];

		if (newvd->vdev_unspare &&
		    vdev_dtl_empty(newvd, DTL_MISSING) &&
		    !vdev_dtl_required(oldvd)) {
			newvd->vdev_unspare = 0;
			return (oldvd);
		}
	}

	return (NULL);
}
static void
spa_vdev_resilver_done(spa_t *spa)
{
	vdev_t *vd, *pvd, *ppvd;
	uint64_t guid, sguid, pguid, ppguid;

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
		pvd = vd->vdev_parent;
		ppvd = pvd->vdev_parent;
		guid = vd->vdev_guid;
		pguid = pvd->vdev_guid;
		ppguid = ppvd->vdev_guid;
		sguid = 0;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
		 */
		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
			ASSERT(ppvd->vdev_children == 2);
			sguid = ppvd->vdev_child[1]->vdev_guid;
		}
		spa_config_exit(spa, SCL_ALL, FTAG);
		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
			return;
		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}
/*
 * Update the stored path or FRU for this vdev.  Dirty the vdev configuration,
 * relying on spa_vdev_enter/exit() to synchronize the labels and cache.
 */
static int
spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
    boolean_t ispath)
{
	vdev_t *vd;
	uint64_t txg;

	txg = spa_vdev_enter(spa);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENOENT));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	if (ispath) {
		spa_strfree(vd->vdev_path);
		vd->vdev_path = spa_strdup(value);
	} else {
		if (vd->vdev_fru != NULL)
			spa_strfree(vd->vdev_fru);
		vd->vdev_fru = spa_strdup(value);
	}

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
}

int
spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
{
	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
}
/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

int
spa_scrub(spa_t *spa, pool_scrub_type_t type)
{
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	/*
	 * If a resilver was requested, but there is no DTL on a
	 * writeable leaf device, we have nothing to do.
	 */
	if (type == POOL_SCRUB_RESILVER &&
	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
		return (0);
	}

	if (type == POOL_SCRUB_EVERYTHING &&
	    spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE &&
	    spa->spa_dsl_pool->dp_scrub_isresilver)
		return (EBUSY);

	if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) {
		return (dsl_pool_scrub_clean(spa->spa_dsl_pool));
	} else if (type == POOL_SCRUB_NONE) {
		return (dsl_pool_scrub_cancel(spa->spa_dsl_pool));
	}

	return (EINVAL);
}
/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

static void
spa_async_remove(spa_t *spa, vdev_t *vd)
{
	if (vd->vdev_remove_wanted) {
		vd->vdev_remove_wanted = 0;
		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
		vdev_clear(spa, vd);
		vdev_state_dirty(vd->vdev_top);
	}

	for (int c = 0; c < vd->vdev_children; c++)
		spa_async_remove(spa, vd->vdev_child[c]);
}

static void
spa_async_probe(spa_t *spa, vdev_t *vd)
{
	if (vd->vdev_probe_wanted) {
		vd->vdev_probe_wanted = 0;
		vdev_reopen(vd);	/* vdev_open() does the actual probe */
	}

	for (int c = 0; c < vd->vdev_children; c++)
		spa_async_probe(spa, vd->vdev_child[c]);
}

static void
spa_async_autoexpand(spa_t *spa, vdev_t *vd)
{
	sysevent_id_t eid;
	nvlist_t *attr;
	char *physpath;

	if (!spa->spa_autoexpand)
		return;

	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		spa_async_autoexpand(spa, cvd);
	}

	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
		return;

	physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	(void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);

	VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);

	(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
	    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);

	nvlist_free(attr);
	kmem_free(physpath, MAXPATHLEN);
}
static void
spa_async_thread(spa_t *spa)
{
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		uint64_t oldsz, space_update;

		mutex_enter(&spa_namespace_lock);
		oldsz = spa_get_space(spa);
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		space_update = spa_get_space(spa) - oldsz;
		mutex_exit(&spa_namespace_lock);

		/*
		 * If the pool grew as a result of the config update,
		 * then log an internal history event.
		 */
		if (space_update) {
			dmu_tx_t *tx;

			tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
			if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
				spa_history_internal_log(LOG_POOL_VDEV_ONLINE,
				    spa, tx, CRED(),
				    "pool '%s' size: %llu(+%llu)",
				    spa_name(spa), spa_get_space(spa),
				    space_update);
				dmu_tx_commit(tx);
			} else {
				dmu_tx_abort(tx);
			}
		}
	}

	/*
	 * See if any devices need to be marked REMOVED.
	 */
	if (tasks & SPA_ASYNC_REMOVE) {
		spa_vdev_state_enter(spa);
		spa_async_remove(spa, spa->spa_root_vdev);
		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
		for (int i = 0; i < spa->spa_spares.sav_count; i++)
			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_async_autoexpand(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/*
	 * See if any devices need to be probed.
	 */
	if (tasks & SPA_ASYNC_PROBE) {
		spa_vdev_state_enter(spa);
		spa_async_probe(spa, spa->spa_root_vdev);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_RESILVER_DONE)
		spa_vdev_resilver_done(spa);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0);

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}
void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}
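
/*
 * Illustrative sketch (not part of the original file): the request/dispatch
 * pairing used throughout this file. A caller ORs a task bit into
 * spa_async_tasks; the bit is only acted on once spa_async_dispatch()
 * (called at the end of spa_sync()) spawns the async thread. The helper
 * name is hypothetical.
 */
#if 0	/* example only */
static void
example_request_probe(spa_t *spa)
{
	spa_async_request(spa, SPA_ASYNC_PROBE);	/* set the task bit */
	/* ...later, spa_sync() -> spa_async_dispatch() runs the task */
}
#endif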
/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	while (bplist_iterate(bpl, &itor, &blk) == 0) {
		ASSERT(blk.blk_birth < txg);
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL,
		    ZIO_FLAG_MUSTSUCCEED));
	}

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}
static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t bufsize;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

	/*
	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
	 * information.  This avoids the dbuf_will_dirty() path and
	 * saves us a pre-read to get data we don't actually care about.
	 */
	bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
	packed = kmem_alloc(bufsize, KM_SLEEP);

	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);
	bzero(packed + nvsize, bufsize - nvsize);

	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);

	kmem_free(packed, bufsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}
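
/*
 * Illustrative note (not part of the original file): P2ROUNDUP() rounds a
 * size up to the next multiple of a power-of-two alignment. If
 * SPA_CONFIG_BLOCKSIZE were, say, 1 << 14 (the real value lives in the
 * headers), an nvsize of 20000 would yield bufsize = 32768, and the
 * bzero() above would clear the 12768-byte tail so the on-disk padding
 * is deterministic.
 */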
static void
spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
    const char *config, const char *entry)
{
	nvlist_t *nvroot;
	nvlist_t **list;
	int i;

	if (!sav->sav_sync)
		return;

	/*
	 * Update the MOS nvlist describing the list of available devices.
	 * spa_validate_aux() will have already made sure this nvlist is
	 * valid and the vdevs are labeled appropriately.
	 */
	if (sav->sav_object == 0) {
		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
		    sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
		    &sav->sav_object, tx) == 0);
	}

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (sav->sav_count == 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
	} else {
		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
		for (i = 0; i < sav->sav_count; i++)
			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
			    B_FALSE, B_FALSE, B_TRUE);
		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
		    sav->sav_count) == 0);
		for (i = 0; i < sav->sav_count; i++)
			nvlist_free(list[i]);
		kmem_free(list, sav->sav_count * sizeof (void *));
	}

	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
	nvlist_free(nvroot);

	sav->sav_sync = B_FALSE;
}
static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	if (list_is_empty(&spa->spa_config_dirty_list))
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	config = spa_config_generate(spa, spa->spa_root_vdev,
	    dmu_tx_get_txg(tx), B_FALSE);

	spa_config_exit(spa, SCL_STATE, FTAG);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}
/*
 * Set zpool properties.
 */
static void
spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	objset_t *mos = spa->spa_meta_objset;
	nvlist_t *nvp = arg2;
	nvpair_t *elem;
	uint64_t intval;
	char *strval;
	zpool_prop_t prop;
	const char *propname;
	zprop_type_t proptype;

	mutex_enter(&spa->spa_props_lock);

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
		case ZPOOL_PROP_VERSION:
			/*
			 * Only set version for non-zpool-creation cases
			 * (set/import). spa_create() needs special care
			 * for version setting.
			 */
			if (tx->tx_txg != TXG_INITIAL) {
				VERIFY(nvpair_value_uint64(elem,
				    &intval) == 0);
				ASSERT(intval <= SPA_VERSION);
				ASSERT(intval >= spa_version(spa));
				spa->spa_uberblock.ub_version = intval;
				vdev_config_dirty(spa->spa_root_vdev);
			}
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property. It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'cachefile' is also a non-persistent property.
			 */
			break;
		default:
			/*
			 * Set pool property values in the poolprops mos object.
			 */
			if (spa->spa_pool_props_object == 0) {
				objset_t *mos = spa->spa_meta_objset;

				VERIFY((spa->spa_pool_props_object =
				    zap_create(mos, DMU_OT_POOL_PROPS,
				    DMU_OT_NONE, 0, tx)) > 0);

				VERIFY(zap_update(mos,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    8, 1, &spa->spa_pool_props_object, tx)
				    == 0);
			}

			/* normalize the property name */
			propname = zpool_prop_to_name(prop);
			proptype = zpool_prop_get_type(prop);

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				VERIFY(nvpair_value_string(elem, &strval) == 0);
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    1, strlen(strval) + 1, strval, tx) == 0);

			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				VERIFY(nvpair_value_uint64(elem, &intval) == 0);

				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					VERIFY(zpool_prop_index_to_string(
					    prop, intval, &unused) == 0);
				}
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx) == 0);
			} else {
				ASSERT(0); /* not allowed */
			}

			switch (prop) {
			case ZPOOL_PROP_DELEGATION:
				spa->spa_delegation = intval;
				break;
			case ZPOOL_PROP_BOOTFS:
				spa->spa_bootfs = intval;
				break;
			case ZPOOL_PROP_FAILUREMODE:
				spa->spa_failmode = intval;
				break;
			case ZPOOL_PROP_AUTOEXPAND:
				spa->spa_autoexpand = intval;
				spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
				break;
			default:
				break;
			}
		}

		/* log internal history if this is not a zpool create */
		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
		    tx->tx_txg != TXG_INITIAL) {
			spa_history_internal_log(LOG_POOL_PROPSET,
			    spa, tx, cr, "%s %lld %s",
			    nvpair_name(elem), intval, spa_name(spa));
		}
	}

	mutex_exit(&spa->spa_props_lock);
}
/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;
	int error;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl, tx);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);

			for (int c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(svd, svdcount, txg,
				    B_TRUE);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(rvd->vdev_child,
				    rvd->vdev_children, txg, B_TRUE);
		}

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}
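
/*
 * Illustrative sketch (not part of the original file): callers do not
 * invoke spa_sync() directly; they wait on the sync thread instead, as
 * spa_sync_allpools() below does. The helper name is hypothetical.
 */
#if 0	/* example only */
static void
example_wait_for_sync(spa_t *spa)
{
	/* Block until the currently open txg (0 means "the next one") syncs. */
	txg_wait_synced(spa_get_dsl(spa), 0);
}
#endif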
/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}
/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}
vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (aux) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}

		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}
void
spa_upgrade(spa_t *spa, uint64_t version)
{
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}
boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}
/*
 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2, one as a spare and
 * one as a replacement.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t		*ev;
	sysevent_attr_list_t	*attr = NULL;
	sysevent_value_t	value;
	sysevent_id_t		eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}