/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Toomas Soome <tsoome@me.com>
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright (c) 2019, Datto Inc. All rights reserved.
 */
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/bpobj.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_dir.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/space_reftree.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>
/* default target for number of metaslabs per top-level vdev */
int zfs_vdev_default_ms_count = 200;

/* minimum number of metaslabs per top-level vdev */
int zfs_vdev_min_ms_count = 16;

/* practical upper limit of total metaslabs per top-level vdev */
int zfs_vdev_ms_count_limit = 1ULL << 17;

/* lower limit for metaslab size (512M) */
int zfs_vdev_default_ms_shift = 29;

/* upper limit for metaslab size (16G) */
int zfs_vdev_max_ms_shift = 34;

int vdev_validate_skip = B_FALSE;

/*
 * Since the DTL space map of a vdev is not expected to have a lot of
 * entries, we default its block size to 4K.
 */
int zfs_vdev_dtl_sm_blksz = (1 << 12);

/*
 * Rate limit slow IO (delay) events to this many per second.
 */
unsigned int zfs_slow_io_events_per_second = 20;

/*
 * Rate limit checksum events after this many checksum errors per second.
 */
unsigned int zfs_checksum_events_per_second = 20;

/*
 * Ignore errors during scrub/resilver.  Allows us to work around a
 * resilver triggered upon import when there are pool errors.
 */
int zfs_scan_ignore_errors = 0;

/*
 * vdev-wide space maps that have lots of entries written to them at
 * the end of each transaction can benefit from a higher I/O bandwidth
 * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
 */
int zfs_vdev_standard_sm_blksz = (1 << 17);

/*
 * Tunable parameter for debugging or performance analysis. Setting this
 * will cause pool corruption on power loss if a volatile out-of-order
 * write cache is enabled.
 */
int zfs_nocacheflush = 0;
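/*
 * Illustrative sketch (assumption, based on the standard Linux module
 * packaging of ZFS): the tunables above are exposed as module parameters
 * named after the variables, so one could adjust the metaslab target at
 * runtime with something like:
 *
 *	echo 512 > /sys/module/zfs/parameters/zfs_vdev_default_ms_count
 */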
void
vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
{
	va_list adx;
	char buf[256];

	va_start(adx, fmt);
	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
	va_end(adx);

	if (vd->vdev_path != NULL) {
		zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
		    vd->vdev_path, buf);
	} else {
		zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
		    vd->vdev_ops->vdev_op_type,
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)vd->vdev_guid, buf);
	}
}
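/*
 * Illustrative usage sketch: vdev_dbgmsg() takes printf-style arguments
 * and routes the formatted text through zfs_dbgmsg(), prefixed with the
 * vdev type and path (or the guid when no path is set).  For example,
 *
 *	vdev_dbgmsg(vd, "open failed [error=%d]", error);
 *
 * might log: disk vdev '/dev/sda1': open failed [error=5]
 */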
void
vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
{
	char state[20];

	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
		zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id,
		    vd->vdev_ops->vdev_op_type);
		return;
	}

	switch (vd->vdev_state) {
	case VDEV_STATE_UNKNOWN:
		(void) snprintf(state, sizeof (state), "unknown");
		break;
	case VDEV_STATE_CLOSED:
		(void) snprintf(state, sizeof (state), "closed");
		break;
	case VDEV_STATE_OFFLINE:
		(void) snprintf(state, sizeof (state), "offline");
		break;
	case VDEV_STATE_REMOVED:
		(void) snprintf(state, sizeof (state), "removed");
		break;
	case VDEV_STATE_CANT_OPEN:
		(void) snprintf(state, sizeof (state), "can't open");
		break;
	case VDEV_STATE_FAULTED:
		(void) snprintf(state, sizeof (state), "faulted");
		break;
	case VDEV_STATE_DEGRADED:
		(void) snprintf(state, sizeof (state), "degraded");
		break;
	case VDEV_STATE_HEALTHY:
		(void) snprintf(state, sizeof (state), "healthy");
		break;
	default:
		(void) snprintf(state, sizeof (state), "<state %u>",
		    (uint_t)vd->vdev_state);
	}

	zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
	    "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type,
	    vd->vdev_islog ? " (log)" : "",
	    (u_longlong_t)vd->vdev_guid,
	    vd->vdev_path ? vd->vdev_path : "N/A", state);

	for (uint64_t i = 0; i < vd->vdev_children; i++)
		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
}
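/*
 * For a two-way mirror, vdev_dbgmsg_print_tree(rvd, 0) produces output
 * along these lines (illustrative; guids and paths are made-up examples):
 *
 *	vdev 0: root, guid: 111, path: N/A, healthy
 *	  vdev 0: mirror, guid: 222, path: N/A, healthy
 *	    vdev 0: disk, guid: 333, path: /dev/sda1, healthy
 *	    vdev 1: disk, guid: 444, path: /dev/sdb1, healthy
 */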
/*
 * Virtual device management.
 */

static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_spare_ops,
	&vdev_disk_ops,
	&vdev_file_ops,
	&vdev_missing_ops,
	&vdev_hole_ops,
	&vdev_indirect_ops,
	NULL
};

/*
 * Given a vdev type, return the appropriate ops vector.
 */
static vdev_ops_t *
vdev_getops(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}
void
vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
{
	res->rs_start = in->rs_start;
	res->rs_end = in->rs_end;
}
/*
 * Derive the enumerated allocation bias from string input.
 * String origin is either the per-vdev zap or zpool(1M).
 */
static vdev_alloc_bias_t
vdev_derive_alloc_bias(const char *bias)
{
	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;

	if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
		alloc_bias = VDEV_BIAS_LOG;
	else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
		alloc_bias = VDEV_BIAS_SPECIAL;
	else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
		alloc_bias = VDEV_BIAS_DEDUP;

	return (alloc_bias);
}
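/*
 * The bias strings correspond to the vdev class names used on the
 * zpool(1M) command line; e.g. (illustrative):
 *
 *	zpool add tank special mirror sdc sdd
 *
 * stores ZPOOL_CONFIG_ALLOCATION_BIAS = "special" in the new top-level
 * vdev's config, which this function maps to VDEV_BIAS_SPECIAL.
 */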
/*
 * Default asize function: return the MAX of psize with the asize of
 * all children.  This is what's used by anything other than RAID-Z.
 */
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
	uint64_t csize;

	for (int c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
	}

	return (asize);
}
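/*
 * Worked example: with vdev_top->vdev_ashift == 12 (4K sectors),
 * P2ROUNDUP(5000, 1ULL << 12) == 8192, so a 5000-byte psize consumes
 * two 4K sectors of allocated space on a plain disk vdev.
 */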
/*
 * Get the minimum allocatable size. We define the allocatable size as
 * the vdev's asize rounded to the nearest metaslab. This allows us to
 * replace or attach devices which don't have the same physical size but
 * can still satisfy the same number of allocations.
 */
uint64_t
vdev_get_min_asize(vdev_t *vd)
{
	vdev_t *pvd = vd->vdev_parent;

	/*
	 * If our parent is NULL (inactive spare or cache) or is the root,
	 * just return our own asize.
	 */
	if (pvd == NULL)
		return (vd->vdev_asize);

	/*
	 * The top-level vdev just returns the allocatable size rounded
	 * to the nearest metaslab.
	 */
	if (vd == vd->vdev_top)
		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));

	/*
	 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
	 * so each child must provide at least 1/Nth of its asize.
	 */
	if (pvd->vdev_ops == &vdev_raidz_ops)
		return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
		    pvd->vdev_children);

	return (pvd->vdev_min_asize);
}
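/*
 * Worked example: for a 4-child raidz parent with a vdev_min_asize of
 * 1000 units, each child must supply (1000 + 4 - 1) / 4 == 250 units,
 * i.e. the ceiling of min_asize / children.
 */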
void
vdev_set_min_asize(vdev_t *vd)
{
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_set_min_asize(vd->vdev_child[c]);
}
vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	if (vdev < rvd->vdev_children) {
		ASSERT(rvd->vdev_child[vdev] != NULL);
		return (rvd->vdev_child[vdev]);
	}

	return (NULL);
}
vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	vdev_t *mvd;

	if (vd->vdev_guid == guid)
		return (vd);

	for (int c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
		    NULL)
			return (mvd);

	return (NULL);
}
static int
vdev_count_leaves_impl(vdev_t *vd)
{
	int n = 0;

	if (vd->vdev_ops->vdev_op_leaf)
		return (1);

	for (int c = 0; c < vd->vdev_children; c++)
		n += vdev_count_leaves_impl(vd->vdev_child[c]);

	return (n);
}

int
vdev_count_leaves(spa_t *spa)
{
	int rc;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	rc = vdev_count_leaves_impl(spa->spa_root_vdev);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	return (rc);
}
void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;
	vdev_t **newchild;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	if (pvd == NULL)
		return;

	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_alloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
	}

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;

	if (cvd->vdev_ops->vdev_op_leaf) {
		list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
		cvd->vdev_spa->spa_leaf_list_gen++;
	}
}
void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
	int c;
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	if (pvd == NULL)
		return;

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	/*
	 * If we removed the last child, free the parent's child array
	 * and zero the children count.
	 */
	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])
			break;

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;
	}

	if (cvd->vdev_ops->vdev_op_leaf) {
		spa_t *spa = cvd->vdev_spa;
		list_remove(&spa->spa_leaf_list, cvd);
		spa->spa_leaf_list_gen++;
	}

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
}
/*
 * Remove any holes in the child array.
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc;

	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (oldc == 0)
		return;

	for (int c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	if (newc > 0) {
		newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP);

		for (int c = newc = 0; c < oldc; c++) {
			if ((cvd = pvd->vdev_child[c]) != NULL) {
				newchild[newc] = cvd;
				cvd->vdev_id = newc++;
			}
		}
	} else {
		newchild = NULL;
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}
/*
 * Allocate and minimally initialize a vdev_t.
 */
vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;
	vdev_indirect_config_t *vic;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
	vic = &vd->vdev_indirect_config;

	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
		spa->spa_load_guid = spa_generate_guid(NULL);
	}

	if (guid == 0 && ops != &vdev_hole_ops) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			guid = spa_generate_guid(NULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			guid = spa_generate_guid(spa);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_ishole = (ops == &vdev_hole_ops);
	vic->vic_prev_indirect_vdev = UINT64_MAX;

	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
	vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL,
	    0, 0);

	/*
	 * Initialize rate limit structs for events.  We rate limit ZIO delay
	 * and checksum events so that we don't overwhelm ZED with thousands
	 * of events when a disk is acting up.
	 */
	zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
	    1);
	zfs_ratelimit_init(&vd->vdev_checksum_rl,
	    &zfs_checksum_events_per_second, 1);

	list_link_init(&vd->vdev_config_dirty_node);
	list_link_init(&vd->vdev_state_dirty_node);
	list_link_init(&vd->vdev_initialize_node);
	list_link_init(&vd->vdev_leaf_node);
	list_link_init(&vd->vdev_trim_node);
	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);

	for (int t = 0; t < DTL_TYPES; t++) {
		vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
		    0);
	}

	txg_list_create(&vd->vdev_ms_list, spa,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list, spa,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
	vdev_queue_init(vd);
	vdev_cache_init(vd);

	return (vd);
}
/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
    int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0, islog, nparity;
	vdev_t *vd;
	vdev_indirect_config_t *vic;
	char *tmp = NULL;
	int rc;
	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
	boolean_t top_level = (parent && !parent->vdev_parent);

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (SET_ERROR(EINVAL));

	if ((ops = vdev_getops(type)) == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (SET_ERROR(EINVAL));

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_SPARE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * The first allocated vdev must be of type 'root'.
	 */
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * Determine whether we're a log vdev.
	 */
	islog = 0;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
		return (SET_ERROR(ENOTSUP));

	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
		return (SET_ERROR(ENOTSUP));

	/*
	 * Set the nparity property for RAID-Z vdevs.
	 */
	nparity = -1ULL;
	if (ops == &vdev_raidz_ops) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
		    &nparity) == 0) {
			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
				return (SET_ERROR(EINVAL));
			/*
			 * Previous versions could only support 1 or 2 parity
			 * devices.
			 */
			if (nparity > 1 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ2)
				return (SET_ERROR(ENOTSUP));
			if (nparity > 2 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ3)
				return (SET_ERROR(ENOTSUP));
		} else {
			/*
			 * We require the parity to be specified for SPAs that
			 * support multiple parity levels.
			 */
			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
				return (SET_ERROR(EINVAL));
			/*
			 * Otherwise, we default to 1 parity device for RAID-Z.
			 */
			nparity = 1;
		}
	} else {
		nparity = 0;
	}
	ASSERT(nparity != -1ULL);

	/*
	 * If creating a top-level vdev, check for allocation classes input.
	 */
	if (top_level && alloctype == VDEV_ALLOC_ADD) {
		char *bias;

		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
		    &bias) == 0) {
			alloc_bias = vdev_derive_alloc_bias(bias);

			/* spa_vdev_add() expects feature to be enabled */
			if (spa->spa_load_state != SPA_LOAD_CREATE &&
			    !spa_feature_is_enabled(spa,
			    SPA_FEATURE_ALLOCATION_CLASSES)) {
				return (SET_ERROR(ENOTSUP));
			}
		}
	}

	vd = vdev_alloc_common(spa, id, guid, ops);
	vic = &vd->vdev_indirect_config;

	vd->vdev_islog = islog;
	vd->vdev_nparity = nparity;
	if (top_level && alloc_bias != VDEV_BIAS_NONE)
		vd->vdev_alloc_bias = alloc_bias;

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);

	/*
	 * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a
	 * fault on a vdev and want it to persist across imports (like with
	 * zpool offline -f).
	 */
	rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp);
	if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
		vd->vdev_faulted = 1;
		vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
	}

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
	    &vd->vdev_physpath) == 0)
		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
	    &vd->vdev_enc_sysfs_path) == 0)
		vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
		vd->vdev_fru = spa_strdup(vd->vdev_fru);

	/*
	 * Set the whole_disk property.  If it's not specified, leave the value
	 * as -1.
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	ASSERT0(vic->vic_mapping_object);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
	    &vic->vic_mapping_object);
	ASSERT0(vic->vic_births_object);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
	    &vic->vic_births_object);
	ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
	    &vic->vic_prev_indirect_vdev);

	/*
	 * Look for the 'not present' flag.  This will only be set if the device
	 * was not present at the time of import.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
	    &vd->vdev_not_present);

	/*
	 * Get the alignment requirement.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);

	/*
	 * Retrieve the vdev creation time.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
	    &vd->vdev_crtxg);

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (top_level &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
		    &vd->vdev_removing);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
		    &vd->vdev_top_zap);
	} else {
		ASSERT0(vd->vdev_top_zap);
	}

	if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
		    alloctype == VDEV_ALLOC_ADD ||
		    alloctype == VDEV_ALLOC_SPLIT ||
		    alloctype == VDEV_ALLOC_ROOTPOOL);
		/* Note: metaslab_group_create() is now deferred */
	}

	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv,
		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
	} else {
		ASSERT0(vd->vdev_leaf_zap);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object and other state.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
		if (alloctype == VDEV_ALLOC_LOAD) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
			    &vd->vdev_dtl_object);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
			    &vd->vdev_unspare);
		}

		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
			uint64_t spare = 0;

			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
			    &spare) == 0 && spare)
				spa_spare_add(vd);
		}

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
		    &vd->vdev_offline);

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
		    &vd->vdev_resilver_txg);

		if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
			vdev_defer_resilver(vd);

		/*
		 * In general, when importing a pool we want to ignore the
		 * persistent fault state, as the diagnosis made on another
		 * system may not be valid in the current context.  The only
		 * exception is if we forced a vdev to a persistently faulted
		 * state with 'zpool offline -f'.  The persistent fault will
		 * remain across imports until cleared.
		 *
		 * Local vdevs will remain in the faulted state.
		 */
		if (spa_load_state(spa) == SPA_LOAD_OPEN ||
		    spa_load_state(spa) == SPA_LOAD_IMPORT) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
			    &vd->vdev_faulted);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
			    &vd->vdev_degraded);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
			    &vd->vdev_removed);

			if (vd->vdev_faulted || vd->vdev_degraded) {
				char *aux;

				vd->vdev_label_aux =
				    VDEV_AUX_ERR_EXCEEDED;
				if (nvlist_lookup_string(nv,
				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
				    strcmp(aux, "external") == 0)
					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
				else
					vd->vdev_faulted = 0ULL;
			}
		}
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	*vdp = vd;

	return (0);
}
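/*
 * To summarize the alloctype cases handled above: VDEV_ALLOC_LOAD reads
 * an existing config (id and guid must match the label), VDEV_ALLOC_ADD
 * covers zpool create/add, VDEV_ALLOC_SPARE and VDEV_ALLOC_L2CACHE cover
 * aux devices (guid taken from the nvlist), VDEV_ALLOC_SPLIT covers
 * zpool split, VDEV_ALLOC_ROOTPOOL the root pool, and VDEV_ALLOC_ATTACH
 * covers zpool attach/replace.
 */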
void
vdev_free(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
	ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);

	/*
	 * Scan queues are normally destroyed at the end of a scan. If the
	 * queue exists here, that implies the vdev is being removed while
	 * the scan is still running.
	 */
	if (vd->vdev_scan_io_queue != NULL) {
		mutex_enter(&vd->vdev_scan_io_queue_lock);
		dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
		vd->vdev_scan_io_queue = NULL;
		mutex_exit(&vd->vdev_scan_io_queue_lock);
	}

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));

	/*
	 * Free all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
	 */
	if (vd->vdev_mg != NULL) {
		vdev_metaslab_fini(vd);
		metaslab_group_destroy(vd->vdev_mg);
	}

	ASSERT0(vd->vdev_stat.vs_space);
	ASSERT0(vd->vdev_stat.vs_dspace);
	ASSERT0(vd->vdev_stat.vs_alloc);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);
	ASSERT(!list_link_active(&vd->vdev_leaf_node));

	/*
	 * Clean up vdev structure.
	 */
	vdev_queue_fini(vd);
	vdev_cache_fini(vd);

	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
		spa_strfree(vd->vdev_physpath);

	if (vd->vdev_enc_sysfs_path)
		spa_strfree(vd->vdev_enc_sysfs_path);

	if (vd->vdev_fru)
		spa_strfree(vd->vdev_fru);

	if (vd->vdev_isspare)
		spa_spare_remove(vd);
	if (vd->vdev_isl2cache)
		spa_l2cache_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_close(vd->vdev_dtl_sm);
	for (int t = 0; t < DTL_TYPES; t++) {
		range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
		range_tree_destroy(vd->vdev_dtl[t]);
	}
	mutex_exit(&vd->vdev_dtl_lock);

	EQUIV(vd->vdev_indirect_births != NULL,
	    vd->vdev_indirect_mapping != NULL);
	if (vd->vdev_indirect_births != NULL) {
		vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
		vdev_indirect_births_close(vd->vdev_indirect_births);
	}

	if (vd->vdev_obsolete_sm != NULL) {
		ASSERT(vd->vdev_removing ||
		    vd->vdev_ops == &vdev_indirect_ops);
		space_map_close(vd->vdev_obsolete_sm);
		vd->vdev_obsolete_sm = NULL;
	}
	range_tree_destroy(vd->vdev_obsolete_segments);
	rw_destroy(&vd->vdev_indirect_rwlock);
	mutex_destroy(&vd->vdev_obsolete_lock);

	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);
	mutex_destroy(&vd->vdev_probe_lock);
	mutex_destroy(&vd->vdev_scan_io_queue_lock);
	mutex_destroy(&vd->vdev_initialize_lock);
	mutex_destroy(&vd->vdev_initialize_io_lock);
	cv_destroy(&vd->vdev_initialize_io_cv);
	cv_destroy(&vd->vdev_initialize_cv);
	mutex_destroy(&vd->vdev_trim_lock);
	mutex_destroy(&vd->vdev_autotrim_lock);
	mutex_destroy(&vd->vdev_trim_io_lock);
	cv_destroy(&vd->vdev_trim_cv);
	cv_destroy(&vd->vdev_autotrim_cv);
	cv_destroy(&vd->vdev_trim_io_cv);

	zfs_ratelimit_fini(&vd->vdev_delay_rl);
	zfs_ratelimit_fini(&vd->vdev_checksum_rl);

	if (vd == spa->spa_root_vdev)
		spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
}
/*
 * Transfer top-level vdev state from svd to tvd.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite;
	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;
	tvd->vdev_top_zap = svd->vdev_top_zap;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;
	svd->vdev_top_zap = 0;

	if (tvd->vdev_mg)
		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_ms = svd->vdev_ms;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;

	if (tvd->vdev_mg != NULL)
		tvd->vdev_mg->mg_vd = tvd;

	tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
	svd->vdev_checkpoint_sm = NULL;

	tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
	svd->vdev_alloc_bias = VDEV_BIAS_NONE;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;
	svd->vdev_stat.vs_dspace = 0;

	/*
	 * State which may be set on a top-level vdev that's in the
	 * process of being removed.
	 */
	ASSERT0(tvd->vdev_indirect_config.vic_births_object);
	ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
	ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
	ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
	ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
	ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
	ASSERT0(tvd->vdev_removing);
	tvd->vdev_removing = svd->vdev_removing;
	tvd->vdev_indirect_config = svd->vdev_indirect_config;
	tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
	tvd->vdev_indirect_births = svd->vdev_indirect_births;
	range_tree_swap(&svd->vdev_obsolete_segments,
	    &tvd->vdev_obsolete_segments);
	tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
	svd->vdev_indirect_config.vic_mapping_object = 0;
	svd->vdev_indirect_config.vic_births_object = 0;
	svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
	svd->vdev_indirect_mapping = NULL;
	svd->vdev_indirect_births = NULL;
	svd->vdev_obsolete_sm = NULL;
	svd->vdev_removing = 0;

	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
	}

	if (list_link_active(&svd->vdev_config_dirty_node)) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	if (list_link_active(&svd->vdev_state_dirty_node)) {
		vdev_state_clean(svd);
		vdev_state_dirty(tvd);
	}

	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
	svd->vdev_deflate_ratio = 0;

	tvd->vdev_islog = svd->vdev_islog;
	svd->vdev_islog = 0;

	dsl_scan_io_queue_vdev_xfer(svd, tvd);
}
static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}
/*
 * Add a mirror/replacing vdev above an existing vdev.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_min_asize = cvd->vdev_min_asize;
	mvd->vdev_max_asize = cvd->vdev_max_asize;
	mvd->vdev_psize = cvd->vdev_psize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_state = cvd->vdev_state;
	mvd->vdev_crtxg = cvd->vdev_crtxg;

	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}
/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops ||
	    mvd->vdev_ops == &vdev_spare_ops);
	cvd->vdev_ashift = mvd->vdev_ashift;

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);

	/*
	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
	 * Otherwise, we could have detached an offline device, and when we
	 * go to import the pool we'll think we have two top-level vdevs,
	 * instead of a different version of the same top-level vdev.
	 */
	if (mvd->vdev_top == mvd) {
		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
		cvd->vdev_orig_guid = cvd->vdev_guid;
		cvd->vdev_guid += guid_delta;
		cvd->vdev_guid_sum += guid_delta;

		/*
		 * If pool not set for autoexpand, we need to also preserve
		 * mvd's asize to prevent automatic expansion of cvd.
		 * Otherwise if we are adjusting the mirror by attaching and
		 * detaching children of non-uniform sizes, the mirror could
		 * autoexpand, unexpectedly requiring larger devices to
		 * re-establish the mirror.
		 */
		if (!cvd->vdev_spa->spa_autoexpand)
			cvd->vdev_asize = mvd->vdev_asize;
	}
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}
void
vdev_metaslab_group_create(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	/*
	 * metaslab_group_create was delayed until allocation bias was available
	 */
	if (vd->vdev_mg == NULL) {
		metaslab_class_t *mc;

		if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
			vd->vdev_alloc_bias = VDEV_BIAS_LOG;

		ASSERT3U(vd->vdev_islog, ==,
		    (vd->vdev_alloc_bias == VDEV_BIAS_LOG));

		switch (vd->vdev_alloc_bias) {
		case VDEV_BIAS_LOG:
			mc = spa_log_class(spa);
			break;
		case VDEV_BIAS_SPECIAL:
			mc = spa_special_class(spa);
			break;
		case VDEV_BIAS_DEDUP:
			mc = spa_dedup_class(spa);
			break;
		default:
			mc = spa_normal_class(spa);
		}

		vd->vdev_mg = metaslab_group_create(mc, vd,
		    spa->spa_alloc_count);

		/*
		 * The spa ashift values currently only reflect the
		 * general vdev classes. Class destination is late
		 * binding so ashift checking had to wait until now.
		 */
		if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
		    mc == spa_normal_class(spa) && vd->vdev_aux == NULL) {
			if (vd->vdev_ashift > spa->spa_max_ashift)
				spa->spa_max_ashift = vd->vdev_ashift;
			if (vd->vdev_ashift < spa->spa_min_ashift)
				spa->spa_min_ashift = vd->vdev_ashift;
		}
	}
}
int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t m;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	metaslab_t **mspp;
	int error;
	boolean_t expanding = (oldc != 0);

	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	/*
	 * This vdev is not being allocated from yet or is a hole.
	 */
	if (vd->vdev_ms_shift == 0)
		return (0);

	ASSERT(!vd->vdev_ishole);

	ASSERT(oldc <= newc);

	mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

	if (expanding) {
		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
		vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
	}

	vd->vdev_ms = mspp;
	vd->vdev_ms_count = newc;
	for (m = oldc; m < newc; m++) {
		uint64_t object = 0;

		/*
		 * vdev_ms_array may be 0 if we are creating the "fake"
		 * metaslabs for an indirect vdev for zdb's leak detection.
		 * See zdb_leak_init().
		 */
		if (txg == 0 && vd->vdev_ms_array != 0) {
			error = dmu_read(mos, vd->vdev_ms_array,
			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
			    DMU_READ_PREFETCH);
			if (error != 0) {
				vdev_dbgmsg(vd, "unable to read the metaslab "
				    "array [error=%d]", error);
				return (error);
			}
		}

		/*
		 * To accommodate zdb_leak_init() fake indirect
		 * metaslabs, we allocate a metaslab group for
		 * indirect vdevs which normally don't have one.
		 */
		if (vd->vdev_mg == NULL) {
			ASSERT0(vdev_is_concrete(vd));
			vdev_metaslab_group_create(vd);
		}

		error = metaslab_init(vd->vdev_mg, m, object, txg,
		    &(vd->vdev_ms[m]));
		if (error != 0) {
			vdev_dbgmsg(vd, "metaslab_init failed [error=%d]",
			    error);
			return (error);
		}
	}

	if (txg == 0)
		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);

	/*
	 * If the vdev is being removed we don't activate
	 * the metaslabs since we want to ensure that no new
	 * allocations are performed on this device.
	 */
	if (!expanding && !vd->vdev_removing) {
		metaslab_group_activate(vd->vdev_mg);
	}

	if (txg == 0)
		spa_config_exit(spa, SCL_ALLOC, FTAG);

	/*
	 * Regardless of whether this vdev was just added or is being
	 * expanded, the metaslab count has changed. Recalculate the
	 * block limit.
	 */
	spa_log_sm_set_blocklimit(spa);

	return (0);
}
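/*
 * Sizing example: newc = asize >> ms_shift, so a 1 TiB top-level vdev
 * with vdev_ms_shift == 34 (16 GiB metaslabs) gets 2^40 >> 34 == 64
 * metaslabs; expanding the vdev only appends metaslabs (oldc <= newc),
 * existing ones are never resized or moved.
 */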
void
vdev_metaslab_fini(vdev_t *vd)
{
	if (vd->vdev_checkpoint_sm != NULL) {
		ASSERT(spa_feature_is_active(vd->vdev_spa,
		    SPA_FEATURE_POOL_CHECKPOINT));
		space_map_close(vd->vdev_checkpoint_sm);
		/*
		 * Even though we close the space map, we need to set its
		 * pointer to NULL. The reason is that vdev_metaslab_fini()
		 * may be called multiple times for certain operations
		 * (i.e. when destroying a pool) so we need to ensure that
		 * this clause never executes twice. This logic is similar
		 * to the one used for the vdev_ms clause below.
		 */
		vd->vdev_checkpoint_sm = NULL;
	}

	if (vd->vdev_ms != NULL) {
		metaslab_group_t *mg = vd->vdev_mg;
		metaslab_group_passivate(mg);

		uint64_t count = vd->vdev_ms_count;
		for (uint64_t m = 0; m < count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];

			if (msp != NULL)
				metaslab_fini(msp);
		}
		vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;

		vd->vdev_ms_count = 0;

		for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			ASSERT0(mg->mg_histogram[i]);
	}
	ASSERT0(vd->vdev_ms_count);
	ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
}
typedef struct vdev_probe_stats {
	boolean_t	vps_readable;
	boolean_t	vps_writeable;
	int		vps_flags;
} vdev_probe_stats_t;
static void
vdev_probe_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	vdev_t *vd = zio->io_vd;
	vdev_probe_stats_t *vps = zio->io_private;

	ASSERT(vd->vdev_probe_zio != NULL);

	if (zio->io_type == ZIO_TYPE_READ) {
		if (zio->io_error == 0)
			vps->vps_readable = 1;
		if (zio->io_error == 0 && spa_writeable(spa)) {
			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
			    zio->io_offset, zio->io_size, zio->io_abd,
			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
		} else {
			abd_free(zio->io_abd);
		}
	} else if (zio->io_type == ZIO_TYPE_WRITE) {
		if (zio->io_error == 0)
			vps->vps_writeable = 1;
		abd_free(zio->io_abd);
	} else if (zio->io_type == ZIO_TYPE_NULL) {
		zio_t *pio;
		zio_link_t *zl = NULL;

		vd->vdev_cant_read |= !vps->vps_readable;
		vd->vdev_cant_write |= !vps->vps_writeable;

		if (vdev_readable(vd) &&
		    (vdev_writeable(vd) || !spa_writeable(spa))) {
			zio->io_error = 0;
		} else {
			ASSERT(zio->io_error != 0);
			vdev_dbgmsg(vd, "failed probe");
			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
			    spa, vd, NULL, NULL, 0, 0);
			zio->io_error = SET_ERROR(ENXIO);
		}

		mutex_enter(&vd->vdev_probe_lock);
		ASSERT(vd->vdev_probe_zio == zio);
		vd->vdev_probe_zio = NULL;
		mutex_exit(&vd->vdev_probe_lock);

		while ((pio = zio_walk_parents(zio, &zl)) != NULL)
			if (!vdev_accessible(vd, pio))
				pio->io_error = SET_ERROR(ENXIO);

		kmem_free(vps, sizeof (*vps));
	}
}
/*
 * Determine whether this device is accessible.
 *
 * Read and write to several known locations: the pad regions of each
 * vdev label but the first, which we leave alone in case it contains
 * a VTOC.
 */
zio_t *
vdev_probe(vdev_t *vd, zio_t *zio)
{
	spa_t *spa = vd->vdev_spa;
	vdev_probe_stats_t *vps = NULL;
	zio_t *pio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	/*
	 * Don't probe the probe.
	 */
	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
		return (NULL);

	/*
	 * To prevent 'probe storms' when a device fails, we create
	 * just one probe i/o at a time.  All zios that want to probe
	 * this vdev will become parents of the probe io.
	 */
	mutex_enter(&vd->vdev_probe_lock);

	if ((pio = vd->vdev_probe_zio) == NULL) {
		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);

		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
		    ZIO_FLAG_TRYHARD;

		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
			/*
			 * vdev_cant_read and vdev_cant_write can only
			 * transition from TRUE to FALSE when we have the
			 * SCL_ZIO lock as writer; otherwise they can only
			 * transition from FALSE to TRUE. This ensures that
			 * any zio looking at these values can assume that
			 * failures persist for the life of the I/O. That's
			 * important because when a device has intermittent
			 * connectivity problems, we want to ensure that
			 * they're ascribed to the device (ENXIO) and not
			 * the zio (EIO).
			 *
			 * Since we hold SCL_ZIO as writer here, clear both
			 * values so the probe can reevaluate from first
			 * principles.
			 */
			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
			vd->vdev_cant_read = B_FALSE;
			vd->vdev_cant_write = B_FALSE;
		}

		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
		    vdev_probe_done, vps,
		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);

		/*
		 * We can't change the vdev state in this context, so we
		 * kick off an async task to do it on our behalf.
		 */
		if (zio != NULL) {
			vd->vdev_probe_wanted = B_TRUE;
			spa_async_request(spa, SPA_ASYNC_PROBE);
		}
	}

	if (zio != NULL)
		zio_add_child(zio, pio);

	mutex_exit(&vd->vdev_probe_lock);

	if (vps == NULL) {
		ASSERT(zio != NULL);
		return (NULL);
	}

	for (int l = 1; l < VDEV_LABELS; l++) {
		zio_nowait(zio_read_phys(pio, vd,
		    vdev_label_offset(vd->vdev_psize, l,
		    offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE,
		    abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
	}

	if (zio == NULL)
		return (pio);

	zio_nowait(pio);
	return (NULL);
}
static void
vdev_open_child(void *arg)
{
	vdev_t *vd = arg;

	vd->vdev_open_thread = curthread;
	vd->vdev_open_error = vdev_open(vd);
	vd->vdev_open_thread = NULL;
}

static boolean_t
vdev_uses_zvols(vdev_t *vd)
{
#ifdef _KERNEL
	if (zvol_is_zvol(vd->vdev_path))
		return (B_TRUE);
#endif

	for (int c = 0; c < vd->vdev_children; c++)
		if (vdev_uses_zvols(vd->vdev_child[c]))
			return (B_TRUE);

	return (B_FALSE);
}

void
vdev_open_children(vdev_t *vd)
{
	taskq_t *tq;
	int children = vd->vdev_children;

	/*
	 * in order to handle pools on top of zvols, do the opens
	 * in a single thread so that the same thread holds the
	 * spa_namespace_lock
	 */
	if (vdev_uses_zvols(vd)) {
retry_sync:
		for (int c = 0; c < children; c++)
			vd->vdev_child[c]->vdev_open_error =
			    vdev_open(vd->vdev_child[c]);
	} else {
		tq = taskq_create("vdev_open", children, minclsyspri,
		    children, children, TASKQ_PREPOPULATE);
		if (tq == NULL)
			goto retry_sync;

		for (int c = 0; c < children; c++)
			VERIFY(taskq_dispatch(tq, vdev_open_child,
			    vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);

		taskq_destroy(tq);
	}

	vd->vdev_nonrot = B_TRUE;

	for (int c = 0; c < children; c++)
		vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
}
/*
 * Compute the raidz-deflation ratio.  Note, we hard-code
 * in 128k (1 << 17) because it is the "typical" blocksize.
 * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
 * otherwise it would inconsistently account for existing bp's.
 */
void
vdev_set_deflate_ratio(vdev_t *vd)
{
	if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
		vd->vdev_deflate_ratio = (1 << 17) /
		    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
	}
}
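/*
 * Worked example: on a plain disk vdev a 128K block allocates exactly
 * 128K, so the ratio is (1 << 17) / (131072 >> SPA_MINBLOCKSHIFT) == 512.
 * A raidz vdev allocates parity on top of the 128K, which makes the
 * denominator larger and the ratio smaller, deflating the reported
 * usable space accordingly.
 */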
/*
 * Prepare a virtual device for access.
 */
int
vdev_open(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	int error;
	uint64_t osize = 0;
	uint64_t max_osize = 0;
	uint64_t asize, max_asize, psize;
	uint64_t ashift = 0;

	ASSERT(vd->vdev_open_thread == curthread ||
	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	vd->vdev_cant_read = B_FALSE;
	vd->vdev_cant_write = B_FALSE;
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	/*
	 * If this vdev is not removed, check its fault status.  If it's
	 * faulted, bail out of the open.
	 */
	if (!vd->vdev_removed && vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    vd->vdev_label_aux);
		return (SET_ERROR(ENXIO));
	} else if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
		return (SET_ERROR(ENXIO));
	}

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);

	/*
	 * Physical volume size should never be larger than its max size, unless
	 * the disk has shrunk while we were reading it or the device is buggy
	 * or damaged: either way it's not safe for use, bail out of the open.
	 */
	if (osize > max_osize) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_OPEN_FAILED);
		return (SET_ERROR(ENXIO));
	}

	/*
	 * Reset the vdev_reopening flag so that we actually close
	 * the vdev on error.
	 */
	vd->vdev_reopening = B_FALSE;
	if (zio_injection_enabled && error == 0)
		error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO));

	if (error) {
		if (vd->vdev_removed &&
		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
			vd->vdev_removed = B_FALSE;

		if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
			    vd->vdev_stat.vs_aux);
		} else {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    vd->vdev_stat.vs_aux);
		}
		return (error);
	}

	vd->vdev_removed = B_FALSE;

	/*
	 * Recheck the faulted flag now that we have confirmed that
	 * the vdev is accessible.  If we're faulted, bail.
	 */
	if (vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    vd->vdev_label_aux);
		return (SET_ERROR(ENXIO));
	}

	if (vd->vdev_degraded) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_ERR_EXCEEDED);
	} else {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
	}

	/*
	 * For hole or missing vdevs we just return success.
	 */
	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
		return (0);

	for (int c = 0; c < vd->vdev_children; c++) {
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_NONE);
			break;
		}
	}

	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
	max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
		if (osize < SPA_MINDEVSIZE) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (SET_ERROR(EOVERFLOW));
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
		max_asize = max_osize - (VDEV_LABEL_START_SIZE +
		    VDEV_LABEL_END_SIZE);
	} else {
		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (SET_ERROR(EOVERFLOW));
		}
		psize = 0;
		asize = osize;
		max_asize = max_osize;
	}

	/*
	 * If the vdev was expanded, record this so that we can re-create the
	 * uberblock rings in labels {2,3}, during the next sync.
	 */
	if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0))
		vd->vdev_copy_uberblocks = B_TRUE;

	vd->vdev_psize = psize;

	/*
	 * Make sure the allocatable size hasn't shrunk too much.
	 */
	if (asize < vd->vdev_min_asize) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LABEL);
		return (SET_ERROR(EINVAL));
	}

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 * For compatibility, a different ashift can be requested.
		 */
		vd->vdev_asize = asize;
		vd->vdev_max_asize = max_asize;
		if (vd->vdev_ashift == 0) {
			vd->vdev_ashift = ashift; /* use detected value */
		}
		if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
		    vd->vdev_ashift > ASHIFT_MAX)) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_ASHIFT);
			return (SET_ERROR(EDOM));
		}
	} else {
		/*
		 * Detect if the alignment requirement has increased.
		 * We don't want to make the pool unavailable, just
		 * post an event instead.
		 */
		if (ashift > vd->vdev_top->vdev_ashift &&
		    vd->vdev_ops->vdev_op_leaf) {
			zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
			    spa, vd, NULL, NULL, 0, 0);
		}

		vd->vdev_max_asize = max_asize;
	}

	/*
	 * If all children are healthy we update asize if either:
	 * The asize has increased, due to a device expansion caused by dynamic
	 * LUN growth or vdev replacement, and automatic expansion is enabled;
	 * making the additional space available.
	 *
	 * The asize has decreased, due to a device shrink usually caused by a
	 * vdev replace with a smaller device. This ensures that calculations
	 * based of max_asize and asize e.g. esize are always valid. It's safe
	 * to do this as we've already validated that asize is greater than
	 * vdev_min_asize.
	 */
	if (vd->vdev_state == VDEV_STATE_HEALTHY &&
	    ((asize > vd->vdev_asize &&
	    (vd->vdev_expanding || spa->spa_autoexpand)) ||
	    (asize < vd->vdev_asize)))
		vd->vdev_asize = asize;

	vdev_set_min_asize(vd);

	/*
	 * Ensure we can issue some IO before declaring the
	 * vdev open for business.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    VDEV_AUX_ERR_EXCEEDED);
		return (error);
	}

	/*
	 * Track the min and max ashift values for normal data devices.
	 */
	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
	    vd->vdev_alloc_bias == VDEV_BIAS_NONE &&
	    vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
		if (vd->vdev_ashift > spa->spa_max_ashift)
			spa->spa_max_ashift = vd->vdev_ashift;
		if (vd->vdev_ashift < spa->spa_min_ashift)
			spa->spa_min_ashift = vd->vdev_ashift;
	}

	/*
	 * If this is a leaf vdev, assess whether a resilver is needed.
	 * But don't do this if we are doing a reopen for a scrub, since
	 * this would just restart the scrub we are already doing.
	 */
	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen)
		dsl_scan_assess_vdev(spa->spa_dsl_pool, vd);

	return (0);
}
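/*
 * Layout note (assumption, based on the standard on-disk label format):
 * each vdev_label_t is 256K; VDEV_LABEL_START_SIZE covers labels 0-1
 * plus the boot block region at the front of the device, and
 * VDEV_LABEL_END_SIZE covers labels 2-3 at the end.  So for a leaf vdev
 * the usable asize is the label-aligned osize minus those reservations.
 */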
/*
 * Called once the vdevs are all opened, this routine validates the label
 * contents.  This needs to be done before vdev_load() so that we don't
 * inadvertently do repair I/Os to the wrong device.
 *
 * This function will only return failure if one of the vdevs indicates that it
 * has since been destroyed or exported.  This is only possible if
 * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
 * will be updated but the function will return 0.
 */
int
vdev_validate(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	nvlist_t *label;
	uint64_t guid = 0, aux_guid = 0, top_guid;
	uint64_t state;
	nvlist_t *nvl;
	uint64_t txg;

	if (vdev_validate_skip)
		return (0);

	for (uint64_t c = 0; c < vd->vdev_children; c++)
		if (vdev_validate(vd->vdev_child[c]) != 0)
			return (SET_ERROR(EBADF));

	/*
	 * If the device has already failed, or was marked offline, don't do
	 * any further validation.  Otherwise, label I/O will fail and we will
	 * overwrite the previous state.
	 */
	if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
		return (0);

	/*
	 * If we are performing an extreme rewind, we allow for a label that
	 * was modified at a point after the current txg.
	 * If config lock is not held do not check for the txg. spa_sync could
	 * be updating the vdev's label before updating spa_last_synced_txg.
	 */
	if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 ||
	    spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG)
		txg = UINT64_MAX;
	else
		txg = spa_last_synced_txg(spa);

	if ((label = vdev_label_read_config(vd, txg)) == NULL) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LABEL);
		vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
		    "txg %llu", (u_longlong_t)txg);
		return (0);
	}

	/*
	 * Determine if this vdev has been split off into another
	 * pool.  If so, then refuse to open it.
	 */
	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
	    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_SPLIT_POOL);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
		return (0);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
		    ZPOOL_CONFIG_POOL_GUID);
		return (0);
	}

	/*
	 * If config is not trusted then ignore the spa guid check. This is
	 * necessary because if the machine crashed during a re-guid the new
	 * guid might have been written to all of the vdev labels, but not the
	 * cached config. The check will be performed again once we have the
	 * trusted config from the MOS.
	 */
	if (spa->spa_trust_config && guid != spa_guid(spa)) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
		    "match config (%llu != %llu)", (u_longlong_t)guid,
		    (u_longlong_t)spa_guid(spa));
		return (0);
	}

	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
	    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
	    &aux_guid) != 0)
		aux_guid = 0;

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
		    ZPOOL_CONFIG_GUID);
		return (0);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
	    != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
		    ZPOOL_CONFIG_TOP_GUID);
		return (0);
	}

	/*
	 * If this vdev just became a top-level vdev because its sibling was
	 * detached, it will have adopted the parent's vdev guid -- but the
	 * label may or may not be on disk yet. Fortunately, either version
	 * of the label will have the same top guid, so if we're a top-level
	 * vdev, we can safely compare to that instead.
	 * However, if the config comes from a cachefile that failed to update
	 * after the detach, a top-level vdev will appear as a non top-level
	 * vdev in the config. Also relax the constraints if we perform an
	 * extreme rewind.
	 *
	 * If we split this vdev off instead, then we also check the
	 * original pool's guid. We don't want to consider the vdev
	 * corrupt if it is partway through a split operation.
	 */
	if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
		boolean_t mismatch = B_FALSE;
		if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
			if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
				mismatch = B_TRUE;
		} else {
			if (vd->vdev_guid != top_guid &&
			    vd->vdev_top->vdev_guid != guid)
				mismatch = B_TRUE;
		}

		if (mismatch) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			vdev_dbgmsg(vd, "vdev_validate: config guid "
			    "doesn't match label guid");
			vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
			    (u_longlong_t)vd->vdev_guid,
			    (u_longlong_t)vd->vdev_top->vdev_guid);
			vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
			    "aux_guid %llu", (u_longlong_t)guid,
			    (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
			return (0);
		}
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
	    &state) != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
		    ZPOOL_CONFIG_POOL_STATE);
		return (0);
	}

	nvlist_free(label);

	/*
	 * If this is a verbatim import, no need to check the
	 * state of the pool.
	 */
	if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
	    spa_load_state(spa) == SPA_LOAD_OPEN &&
	    state != POOL_STATE_ACTIVE) {
		vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
		    "for spa %s", (u_longlong_t)state, spa->spa_name);
		return (SET_ERROR(EBADF));
	}

	/*
	 * If we were able to open and validate a vdev that was
	 * previously marked permanently unavailable, clear that state
	 * now.
	 */
	if (vd->vdev_not_present)
		vd->vdev_not_present = 0;

	return (0);
}
static void
vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
{
	if (svd->vdev_path != NULL && dvd->vdev_path != NULL) {
		if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) {
			zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed "
			    "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
			    dvd->vdev_path, svd->vdev_path);
			spa_strfree(dvd->vdev_path);
			dvd->vdev_path = spa_strdup(svd->vdev_path);
		}
	} else if (svd->vdev_path != NULL) {
		dvd->vdev_path = spa_strdup(svd->vdev_path);
		zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
		    (u_longlong_t)dvd->vdev_guid, dvd->vdev_path);
	}
}
/*
 * Recursively copy vdev paths from one vdev to another. Source and destination
 * vdev trees must have same geometry otherwise return error. Intended to copy
 * paths from userland config into MOS config.
 */
int
vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
{
	if ((svd->vdev_ops == &vdev_missing_ops) ||
	    (svd->vdev_ishole && dvd->vdev_ishole) ||
	    (dvd->vdev_ops == &vdev_indirect_ops))
		return (0);

	if (svd->vdev_ops != dvd->vdev_ops) {
		vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
		    svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
		return (SET_ERROR(EINVAL));
	}

	if (svd->vdev_guid != dvd->vdev_guid) {
		vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
		    "%llu)", (u_longlong_t)svd->vdev_guid,
		    (u_longlong_t)dvd->vdev_guid);
		return (SET_ERROR(EINVAL));
	}

	if (svd->vdev_children != dvd->vdev_children) {
		vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
		    "%llu != %llu", (u_longlong_t)svd->vdev_children,
		    (u_longlong_t)dvd->vdev_children);
		return (SET_ERROR(EINVAL));
	}

	for (uint64_t i = 0; i < svd->vdev_children; i++) {
		int error = vdev_copy_path_strict(svd->vdev_child[i],
		    dvd->vdev_child[i]);
		if (error != 0)
			return (error);
	}

	if (svd->vdev_ops->vdev_op_leaf)
		vdev_copy_path_impl(svd, dvd);

	return (0);
}
static void
vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
{
	ASSERT(stvd->vdev_top == stvd);
	ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);

	for (uint64_t i = 0; i < dvd->vdev_children; i++)
		vdev_copy_path_search(stvd, dvd->vdev_child[i]);

	if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd))
		return;

	/*
	 * The idea here is that while a vdev can shift positions within
	 * a top vdev (when replacing, attaching mirror, etc.) it cannot
	 * step outside of it.
	 */
	vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid);

	if (vd == NULL || vd->vdev_ops != dvd->vdev_ops)
		return;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	vdev_copy_path_impl(vd, dvd);
}

/*
 * Recursively copy vdev paths from one root vdev to another. Source and
 * destination vdev trees may differ in geometry. For each destination leaf
 * vdev, search a vdev with the same guid and top vdev id in the source.
 * Intended to copy paths from userland config into MOS config.
 */
void
vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
{
	uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children);
	ASSERT(srvd->vdev_ops == &vdev_root_ops);
	ASSERT(drvd->vdev_ops == &vdev_root_ops);

	for (uint64_t i = 0; i < children; i++) {
		vdev_copy_path_search(srvd->vdev_child[i],
		    drvd->vdev_child[i]);
	}
}

/*
 * Close a virtual device.
 */
void
vdev_close(vdev_t *vd)
{
	vdev_t *pvd = vd->vdev_parent;
	spa_t *spa __maybe_unused = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/*
	 * If our parent is reopening, then we are as well, unless we are
	 * going offline.
	 */
	if (pvd != NULL && pvd->vdev_reopening)
		vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);

	vd->vdev_ops->vdev_op_close(vd);

	vdev_cache_purge(vd);

	/*
	 * We record the previous state before we close it, so that if we are
	 * doing a reopen(), we don't generate FMA ereports if we notice that
	 * it's still faulted.
	 */
	vd->vdev_prevstate = vd->vdev_state;

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}

void
vdev_hold(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_is_root(spa));
	if (spa->spa_state == POOL_STATE_UNINITIALIZED)
		return;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_hold(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_hold(vd);
}

void
vdev_rele(vdev_t *vd)
{
	ASSERT(spa_is_root(vd->vdev_spa));
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_rele(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_rele(vd);
}

/*
 * Reopen all interior vdevs and any unopened leaves. We don't actually
 * reopen leaf vdevs which had previously been opened as they might deadlock
 * on the spa_config_lock. Instead we only obtain the leaf's physical size.
 * If the leaf has never been opened then open it, as usual.
 */
void
vdev_reopen(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/* set the reopening flag unless we're taking the vdev offline */
	vd->vdev_reopening = !vd->vdev_offline;
	vdev_close(vd);
	(void) vdev_open(vd);

	/*
	 * Call vdev_validate() here to make sure we have the same device.
	 * Otherwise, a device with an invalid label could be successfully
	 * opened in response to vdev_reopen().
	 */
	if (vd->vdev_aux) {
		(void) vdev_validate_aux(vd);
		if (vdev_readable(vd) && vdev_writeable(vd) &&
		    vd->vdev_aux == &spa->spa_l2cache) {
			/*
			 * When reopening we can assume the device label has
			 * already the attribute l2cache_persistent, since we've
			 * opened the device in the past and updated the label.
			 * In case the vdev is present we should evict all ARC
			 * buffers and pointers to log blocks and reclaim their
			 * space before restoring its contents to L2ARC.
			 */
			if (l2arc_vdev_present(vd)) {
				l2arc_rebuild_vdev(vd, B_TRUE);
			} else {
				l2arc_add_vdev(spa, vd);
			}
			spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
		}
	} else {
		(void) vdev_validate(vd);
	}

	/*
	 * Reassess parent vdev's health.
	 */
	vdev_propagate_state(vd);
}

int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : SET_ERROR(ENXIO));
	}

	/*
	 * Recursively load DTLs and initialize all labels.
	 */
	if ((error = vdev_dtl_load(vd)) != 0 ||
	    (error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

void
vdev_metaslab_set_size(vdev_t *vd)
{
	uint64_t asize = vd->vdev_asize;
	uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
	uint64_t ms_shift;

	/*
	 * There are two dimensions to the metaslab sizing calculation:
	 * the size of the metaslab and the count of metaslabs per vdev.
	 *
	 * The default values used below are a good balance between memory
	 * usage (larger metaslab size means more memory needed for loaded
	 * metaslabs; more metaslabs means more memory needed for the
	 * metaslab_t structs), metaslab load time (larger metaslabs take
	 * longer to load), and metaslab sync time (more metaslabs means
	 * more time spent syncing all of them).
	 *
	 * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
	 * The range of the dimensions are as follows:
	 *
	 *	2^29 <= ms_size  <= 2^34
	 *	  16 <= ms_count <= 131,072
	 *
	 * On the lower end of vdev sizes, we aim for metaslabs sizes of
	 * at least 512MB (2^29) to minimize fragmentation effects when
	 * testing with smaller devices. However, the count constraint
	 * of at least 16 metaslabs will override this minimum size goal.
	 *
	 * On the upper end of vdev sizes, we aim for a maximum metaslab
	 * size of 16GB. However, we will cap the total count to 2^17
	 * metaslabs to keep our memory footprint in check and let the
	 * metaslab size grow from there if that limit is hit.
	 *
	 * The net effect of applying above constrains is summarized below.
	 *
	 *	vdev size	metaslab count
	 *	--------------|-----------------
	 *	    < 8GB	 <= 16 metaslabs
	 *	8GB   - 100GB	one per 512MB
	 *	100GB - 3TB	~200 metaslabs
	 *	3TB   - 2PB	one per 16GB
	 *	    > 2PB	> 131,072 metaslabs
	 *	--------------------------------
	 *
	 * Finally, note that all of the above calculate the initial
	 * number of metaslabs. Expanding a top-level vdev will result
	 * in additional metaslabs being allocated making it possible
	 * to exceed the zfs_vdev_ms_count_limit.
	 */

	if (ms_count < zfs_vdev_min_ms_count)
		ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
	else if (ms_count > zfs_vdev_default_ms_count)
		ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
	else
		ms_shift = zfs_vdev_default_ms_shift;

	if (ms_shift < SPA_MAXBLOCKSHIFT) {
		ms_shift = SPA_MAXBLOCKSHIFT;
	} else if (ms_shift > zfs_vdev_max_ms_shift) {
		ms_shift = zfs_vdev_max_ms_shift;
		/* cap the total count to constrain memory footprint */
		if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
			ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
	}

	vd->vdev_ms_shift = ms_shift;
	ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
}

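/*
 * Worked example (illustrative numbers, not part of the original comment):
 * for a 1 TiB (2^40 byte) vdev, ms_count = 2^40 >> 29 = 2048, which
 * exceeds the default of 200, so ms_shift = highbit64(2^40 / 200) = 33.
 * That yields 8 GiB (2^33) metaslabs and 2^40 >> 33 = 128 of them;
 * rounding the metaslab size up to a power of two lands the final count
 * at or below the 200 target.
 */
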
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
	ASSERT(vd == vd->vdev_top);
	/* indirect vdevs don't have metaslabs or dtls */
	ASSERT(vdev_is_concrete(vd) || flags == 0);
	ASSERT(ISP2(flags));
	ASSERT(spa_writeable(vd->vdev_spa));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

void
vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
{
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dirty_leaves(vd->vdev_child[c], flags, txg);

	if (vd->vdev_ops->vdev_op_leaf)
		vdev_dirty(vd->vdev_top, flags, vd, txg);
}

/*
 * A vdev's DTL (dirty time log) is the set of transaction groups for which
 * the vdev has less than perfect replication.  There are four kinds of DTL:
 *
 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
 *
 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
 *
 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
 *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
 *	txgs that was scrubbed.
 *
 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
 *	persistent errors or just some device being offline.
 *	Unlike the other three, the DTL_OUTAGE map is not generally
 *	maintained; it's only computed when needed, typically to
 *	determine whether a device can be detached.
 *
 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
 * either has the data or it doesn't.
 *
 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
 * if any child is less than fully replicated, then so is its parent.
 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 * comprising only those txgs which appear in 'maxfaults' or more children;
 * those are the txgs we don't have enough replication to read.  For example,
 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
 * two child DTL_MISSING maps.
 *
 * It should be clear from the above that to compute the DTLs and outage maps
 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
 * Therefore, that is all we keep on disk.  When loading the pool, or after
 * a configuration change, we generate all other DTLs from first principles.
 */
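
/*
 * For example (illustrative): in a raidz2 top-level vdev, maxfaults == 2,
 * so a txg present in the DTL_MISSING of exactly two children is still
 * readable and stays out of the parent's DTL_MISSING; the same txg
 * missing from a third child exceeds maxfaults and enters the parent's
 * DTL_MISSING.  In vdev_dtl_reassess() below this corresponds to
 * minref = vd->vdev_nparity + 1 == 3.
 */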
void
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	range_tree_t *rt = vd->vdev_dtl[t];

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
	ASSERT(spa_writeable(vd->vdev_spa));

	mutex_enter(&vd->vdev_dtl_lock);
	if (!range_tree_contains(rt, txg, size))
		range_tree_add(rt, txg, size);
	mutex_exit(&vd->vdev_dtl_lock);
}

boolean_t
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	range_tree_t *rt = vd->vdev_dtl[t];
	boolean_t dirty = B_FALSE;

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	/*
	 * While we are loading the pool, the DTLs have not been loaded yet.
	 * Ignore the DTLs and try all devices.  This avoids a recursive
	 * mutex enter on the vdev_dtl_lock, and also makes us try hard
	 * when loading the pool (relying on the checksum to ensure that
	 * we get the right data -- note that while loading, we are
	 * only reading the MOS, which is always checksummed).
	 */
	if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE)
		return (B_FALSE);

	mutex_enter(&vd->vdev_dtl_lock);
	if (!range_tree_is_empty(rt))
		dirty = range_tree_contains(rt, txg, size);
	mutex_exit(&vd->vdev_dtl_lock);

	return (dirty);
}

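/*
 * A typical caller pattern (hypothetical sketch; the real child-selection
 * logic lives elsewhere, e.g. in the mirror read path) skips children
 * known to be missing the block's birth txg:
 *
 *	if (!vdev_dtl_contains(cvd, DTL_MISSING, txg, 1))
 *		... prefer cvd for this read ...
 */
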
boolean_t
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
{
	range_tree_t *rt = vd->vdev_dtl[t];
	boolean_t empty;

	mutex_enter(&vd->vdev_dtl_lock);
	empty = range_tree_is_empty(rt);
	mutex_exit(&vd->vdev_dtl_lock);

	return (empty);
}

/*
 * Returns B_TRUE if vdev determines offset needs to be resilvered.
 */
boolean_t
vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
{
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
	    vd->vdev_ops->vdev_op_leaf)
		return (B_TRUE);

	return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
}

/*
 * Returns the lowest txg in the DTL range.
 */
static uint64_t
vdev_dtl_min(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
	ASSERT0(vd->vdev_children);

	return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1);
}

/*
 * Returns the highest txg in the DTL.
 */
static uint64_t
vdev_dtl_max(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
	ASSERT0(vd->vdev_children);

	return (range_tree_max(vd->vdev_dtl[DTL_MISSING]));
}

/*
 * Determine if a resilvering vdev should remove any DTL entries from
 * its range. If the vdev was resilvering for the entire duration of the
 * scan then it should excise that range from its DTLs. Otherwise, this
 * vdev is considered partially resilvered and should leave its DTL
 * entries intact. The comment in vdev_dtl_reassess() describes how we
 * excise the DTLs.
 */
static boolean_t
vdev_dtl_should_excise(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

	ASSERT0(scn->scn_phys.scn_errors);
	ASSERT0(vd->vdev_children);

	if (vd->vdev_state < VDEV_STATE_DEGRADED)
		return (B_FALSE);

	if (vd->vdev_resilver_deferred)
		return (B_FALSE);

	if (vd->vdev_resilver_txg == 0 ||
	    range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
		return (B_TRUE);

	/*
	 * When a resilver is initiated the scan will assign the scn_max_txg
	 * value to the highest txg value that exists in all DTLs. If this
	 * device's max DTL is not part of this scan (i.e. it is not in
	 * the range (scn_min_txg, scn_max_txg]) then it is not eligible
	 * for excision.
	 */
	if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
		ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
		ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
		ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Reassess DTLs after a config change or scrub completion. If txg == 0 no
 * write operations will be issued to the pool.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	spa_t *spa = vd->vdev_spa;
	avl_tree_t reftree;
	int minref;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dtl_reassess(vd->vdev_child[c], txg,
		    scrub_txg, scrub_done);

	if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

		mutex_enter(&vd->vdev_dtl_lock);

		/*
		 * If requested, pretend the scan completed cleanly.
		 */
		if (zfs_scan_ignore_errors && scn)
			scn->scn_phys.scn_errors = 0;

		/*
		 * If we've completed a scan cleanly then determine
		 * if this vdev should remove any DTLs. We only want to
		 * excise regions on vdevs that were available during
		 * the entire duration of this scan.
		 */
		if (scrub_txg != 0 &&
		    (spa->spa_scrub_started ||
		    (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
		    vdev_dtl_should_excise(vd)) {
			/*
			 * We completed a scrub up to scrub_txg.  If we
			 * did it without rebooting, then the scrub dtl
			 * will be valid, so excise the old region and
			 * fold in the scrub dtl.  Otherwise, leave the
			 * dtl as-is if there was an error.
			 *
			 * There's a little trick here: to excise the beginning
			 * of the DTL_MISSING map, we put it into a reference
			 * tree and then add a segment with refcnt -1 that
			 * covers the range [0, scrub_txg).  This means
			 * that each txg in that range has refcnt -1 or 0.
			 * We then add DTL_SCRUB with a refcnt of 2, so that
			 * entries in the range [0, scrub_txg) will have a
			 * positive refcnt -- either 1 or 2.  We then convert
			 * the reference tree into the new DTL_MISSING map.
			 */
			space_reftree_create(&reftree);
			space_reftree_add_map(&reftree,
			    vd->vdev_dtl[DTL_MISSING], 1);
			space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
			space_reftree_add_map(&reftree,
			    vd->vdev_dtl[DTL_SCRUB], 2);
			space_reftree_generate_map(&reftree,
			    vd->vdev_dtl[DTL_MISSING], 1);
			space_reftree_destroy(&reftree);
		}
		range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
		range_tree_walk(vd->vdev_dtl[DTL_MISSING],
		    range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
		if (scrub_done)
			range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
		range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
		if (!vdev_readable(vd))
			range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
		else
			range_tree_walk(vd->vdev_dtl[DTL_MISSING],
			    range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);

		/*
		 * If the vdev was resilvering and no longer has any
		 * DTLs then reset its resilvering flag and dirty
		 * the top level so that we persist the change.
		 */
		if (txg != 0 && vd->vdev_resilver_txg != 0 &&
		    range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
		    range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
			vd->vdev_resilver_txg = 0;
			vdev_config_dirty(vd->vdev_top);
		}

		mutex_exit(&vd->vdev_dtl_lock);

		if (txg != 0)
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
		return;
	}

	mutex_enter(&vd->vdev_dtl_lock);
	for (int t = 0; t < DTL_TYPES; t++) {
		/* account for child's outage in parent's missing map */
		int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
		if (t == DTL_SCRUB)
			continue;			/* leaf vdevs only */
		if (t == DTL_PARTIAL)
			minref = 1;			/* i.e. non-zero */
		else if (vd->vdev_nparity != 0)
			minref = vd->vdev_nparity + 1;	/* RAID-Z */
		else
			minref = vd->vdev_children;	/* any kind of mirror */
		space_reftree_create(&reftree);
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			mutex_enter(&cvd->vdev_dtl_lock);
			space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
			mutex_exit(&cvd->vdev_dtl_lock);
		}
		space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
		space_reftree_destroy(&reftree);
	}
	mutex_exit(&vd->vdev_dtl_lock);
}

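/*
 * Worked example of the excision above (illustrative numbers): with
 * DTL_MISSING = [90, 120), DTL_SCRUB empty, and scrub_txg = 110, the
 * reference tree holds +1 over [90, 120) and -1 over [0, 110).  Txgs in
 * [90, 110) net to 0 and are excised; txgs in [110, 120) keep a refcnt
 * of 1 and survive into the regenerated DTL_MISSING map.
 */
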
int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	int error = 0;

	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
		ASSERT(vdev_is_concrete(vd));

		error = space_map_open(&vd->vdev_dtl_sm, mos,
		    vd->vdev_dtl_object, 0, -1ULL, 0);
		if (error)
			return (error);
		ASSERT(vd->vdev_dtl_sm != NULL);

		mutex_enter(&vd->vdev_dtl_lock);
		error = space_map_load(vd->vdev_dtl_sm,
		    vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
		mutex_exit(&vd->vdev_dtl_lock);

		return (error);
	}

	for (int c = 0; c < vd->vdev_children; c++) {
		error = vdev_dtl_load(vd->vdev_child[c]);
		if (error != 0)
			break;
	}

	return (error);
}

static void
vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
	const char *string;

	ASSERT(alloc_bias != VDEV_BIAS_NONE);

	string =
	    (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG :
	    (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
	    (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL;

	ASSERT(string != NULL);
	VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
	    1, strlen(string) + 1, string, tx));

	if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) {
		spa_activate_allocation_classes(spa, tx);
	}
}

static void
vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;

	VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx));
	VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
	    zapobj, tx));
}

uint64_t
vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA,
	    DMU_OT_NONE, 0, tx);

	ASSERT(zap != 0);
	VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
	    zap, tx));

	return (zap);
}

void
vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
{
	if (vd->vdev_ops != &vdev_hole_ops &&
	    vd->vdev_ops != &vdev_missing_ops &&
	    vd->vdev_ops != &vdev_root_ops &&
	    !vd->vdev_top->vdev_removing) {
		if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
			vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
		}
		if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
			vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
			if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
				vdev_zap_allocation_data(vd, tx);
		}
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_construct_zaps(vd->vdev_child[i], tx);
	}
}

static void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
	objset_t *mos = spa->spa_meta_objset;
	range_tree_t *rtsync;
	dmu_tx_t *tx;
	uint64_t object = space_map_object(vd->vdev_dtl_sm);

	ASSERT(vdev_is_concrete(vd));
	ASSERT(vd->vdev_ops->vdev_op_leaf);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
		mutex_enter(&vd->vdev_dtl_lock);
		space_map_free(vd->vdev_dtl_sm, tx);
		space_map_close(vd->vdev_dtl_sm);
		vd->vdev_dtl_sm = NULL;
		mutex_exit(&vd->vdev_dtl_lock);

		/*
		 * We only destroy the leaf ZAP for detached leaves or for
		 * removed log devices. Removed data devices handle leaf ZAP
		 * cleanup later, once cancellation is no longer possible.
		 */
		if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
		    vd->vdev_top->vdev_islog)) {
			vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
			vd->vdev_leaf_zap = 0;
		}

		dmu_tx_commit(tx);
		return;
	}

	if (vd->vdev_dtl_sm == NULL) {
		uint64_t new_object;

		new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx);
		VERIFY3U(new_object, !=, 0);

		VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
		    0, -1ULL, 0));
		ASSERT(vd->vdev_dtl_sm != NULL);
	}

	rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);

	mutex_enter(&vd->vdev_dtl_lock);
	range_tree_walk(rt, range_tree_add, rtsync);
	mutex_exit(&vd->vdev_dtl_lock);

	space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx);
	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
	range_tree_vacate(rtsync, NULL, NULL);

	range_tree_destroy(rtsync);

	/*
	 * If the object for the space map has changed then dirty
	 * the top level so that we update the config.
	 */
	if (object != space_map_object(vd->vdev_dtl_sm)) {
		vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
		    "new object %llu", (u_longlong_t)txg, spa_name(spa),
		    (u_longlong_t)object,
		    (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
		vdev_config_dirty(vd->vdev_top);
	}

	dmu_tx_commit(tx);
}

/*
 * Determine whether the specified vdev can be offlined/detached/removed
 * without losing data.
 */
boolean_t
vdev_dtl_required(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *tvd = vd->vdev_top;
	uint8_t cant_read = vd->vdev_cant_read;
	boolean_t required;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == spa->spa_root_vdev || vd == tvd)
		return (B_TRUE);

	/*
	 * Temporarily mark the device as unreadable, and then determine
	 * whether this results in any DTL outages in the top-level vdev.
	 * If not, we can safely offline/detach/remove the device.
	 */
	vd->vdev_cant_read = B_TRUE;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
	required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
	vd->vdev_cant_read = cant_read;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);

	if (!required && zio_injection_enabled) {
		required = !!zio_handle_device_injection(vd, NULL,
		    ECHILD);
	}

	return (required);
}

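/*
 * For example, the offline path below (vdev_offline_locked()) and the
 * detach path consult this check and fail the request (typically with
 * EBUSY) rather than open a DTL_OUTAGE window in the top-level vdev.
 */
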
/*
 * Determine if resilver is needed, and if so the txg range.
 */
boolean_t
vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
{
	boolean_t needed = B_FALSE;
	uint64_t thismin = UINT64_MAX;
	uint64_t thismax = 0;

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
		    vdev_writeable(vd)) {
			thismin = vdev_dtl_min(vd);
			thismax = vdev_dtl_max(vd);
			needed = B_TRUE;
		}
		mutex_exit(&vd->vdev_dtl_lock);
	} else {
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			uint64_t cmin, cmax;

			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
				thismin = MIN(thismin, cmin);
				thismax = MAX(thismax, cmax);
				needed = B_TRUE;
			}
		}
	}

	if (needed && minp) {
		*minp = thismin;
		*maxp = thismax;
	}
	return (needed);
}

/*
 * Gets the checkpoint space map object from the vdev's ZAP.  On success sm_obj
 * will contain either the checkpoint spacemap object or zero if none exists.
 * All other errors are returned to the caller.
 */
int
vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

	if (vd->vdev_top_zap == 0) {
		*sm_obj = 0;
		return (0);
	}

	int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap,
	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj);
	if (error == ENOENT) {
		*sm_obj = 0;
		error = 0;
	}

	return (error);
}

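/*
 * Hypothetical caller sketch (vdev_load() below does essentially this):
 *
 *	uint64_t obj;
 *	int err = vdev_checkpoint_sm_object(vd, &obj);
 *	if (err == 0 && obj != 0)
 *		... open the checkpoint space map ...
 *	else if (err != 0)
 *		... propagate the ZAP lookup error ...
 */
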
int
vdev_load(vdev_t *vd)
{
	int error = 0;

	/*
	 * Recursively load all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++) {
		error = vdev_load(vd->vdev_child[c]);
		if (error != 0)
			return (error);
	}

	vdev_set_deflate_ratio(vd);

	/*
	 * On spa_load path, grab the allocation bias from our zap
	 */
	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
		spa_t *spa = vd->vdev_spa;
		char bias_str[64];

		if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
		    bias_str) == 0) {
			ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
			vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
		}
	}

	/*
	 * If this is a top-level vdev, initialize its metaslabs.
	 */
	if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
		vdev_metaslab_group_create(vd);

		if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
			    "asize=%llu", (u_longlong_t)vd->vdev_ashift,
			    (u_longlong_t)vd->vdev_asize);
			return (SET_ERROR(ENXIO));
		}

		error = vdev_metaslab_init(vd, 0);
		if (error != 0) {
			vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
			    "[error=%d]", error);
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			return (error);
		}

		uint64_t checkpoint_sm_obj;
		error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj);
		if (error == 0 && checkpoint_sm_obj != 0) {
			objset_t *mos = spa_meta_objset(vd->vdev_spa);
			ASSERT(vd->vdev_asize != 0);
			ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);

			error = space_map_open(&vd->vdev_checkpoint_sm,
			    mos, checkpoint_sm_obj, 0, vd->vdev_asize,
			    vd->vdev_ashift);
			if (error != 0) {
				vdev_dbgmsg(vd, "vdev_load: space_map_open "
				    "failed for checkpoint spacemap (obj %llu) "
				    "[error=%d]",
				    (u_longlong_t)checkpoint_sm_obj, error);
				return (error);
			}
			ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);

			/*
			 * Since the checkpoint_sm contains free entries
			 * exclusively we can use space_map_allocated() to
			 * indicate the cumulative checkpointed space that
			 * has been freed.
			 */
			vd->vdev_stat.vs_checkpoint_space =
			    -space_map_allocated(vd->vdev_checkpoint_sm);
			vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
			    vd->vdev_stat.vs_checkpoint_space;
		} else if (error != 0) {
			vdev_dbgmsg(vd, "vdev_load: failed to retrieve "
			    "checkpoint space map object from vdev ZAP "
			    "[error=%d]", error);
			return (error);
		}
	}

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
		    "[error=%d]", error);
		return (error);
	}

	uint64_t obsolete_sm_object;
	error = vdev_obsolete_sm_object(vd, &obsolete_sm_object);
	if (error == 0 && obsolete_sm_object != 0) {
		objset_t *mos = vd->vdev_spa->spa_meta_objset;
		ASSERT(vd->vdev_asize != 0);
		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);

		if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
		    obsolete_sm_object, 0, vd->vdev_asize, 0))) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
			    "obsolete spacemap (obj %llu) [error=%d]",
			    (u_longlong_t)obsolete_sm_object, error);
			return (error);
		}
	} else if (error != 0) {
		vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete "
		    "space map object from vdev ZAP [error=%d]", error);
		return (error);
	}

	return (0);
}

/*
 * The special vdev case is used for hot spares and l2cache devices.  Its
 * sole purpose is to set the vdev state for the associated vdev.  To do this,
 * we make sure that we can open the underlying device, then try to read the
 * label, and make sure that the label is sane and that it hasn't been
 * repurposed to another pool.
 */
int
vdev_validate_aux(vdev_t *vd)
{
	nvlist_t *label;
	uint64_t guid, version;
	uint64_t state;

	if (!vdev_readable(vd))
		return (0);

	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		return (-1);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
	    !SPA_VERSION_IS_SUPPORTED(version) ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
	    guid != vd->vdev_guid ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		return (-1);
	}

	/*
	 * We don't actually check the pool state here.  If it's in fact in
	 * use by another pool, we update this fact on the fly when requested.
	 */
	nvlist_free(label);
	return (0);
}

static void
vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx)
{
	objset_t *mos = spa_meta_objset(vd->vdev_spa);

	if (vd->vdev_top_zap == 0)
		return;

	uint64_t object = 0;
	int err = zap_lookup(mos, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object);
	if (err == ENOENT)
		return;
	VERIFY0(err);

	VERIFY0(dmu_object_free(mos, object, tx));
	VERIFY0(zap_remove(mos, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx));
}

/*
 * Free the objects used to store this vdev's spacemaps, and the array
 * that points to them.
 */
void
vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
{
	if (vd->vdev_ms_array == 0)
		return;

	objset_t *mos = vd->vdev_spa->spa_meta_objset;
	uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift;
	size_t array_bytes = array_count * sizeof (uint64_t);
	uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP);
	VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0,
	    array_bytes, smobj_array, 0));

	for (uint64_t i = 0; i < array_count; i++) {
		uint64_t smobj = smobj_array[i];
		if (smobj == 0)
			continue;

		space_map_free_obj(mos, smobj, tx);
	}

	kmem_free(smobj_array, array_bytes);
	VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
	vdev_destroy_ms_flush_data(vd, tx);
	vd->vdev_ms_array = 0;
}

static void
vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(vd->vdev_islog);
	ASSERT(vd == vd->vdev_top);
	ASSERT3U(txg, ==, spa_syncing_txg(spa));

	dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	vdev_destroy_spacemaps(vd, tx);
	if (vd->vdev_top_zap != 0) {
		vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
		vd->vdev_top_zap = 0;
	}

	dmu_tx_commit(tx);
}

void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
	metaslab_t *msp;
	boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));

	ASSERT(vdev_is_concrete(vd));

	while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
	    != NULL)
		metaslab_sync_done(msp, txg);

	if (reassess)
		metaslab_sync_reassess(vd->vdev_mg);
}

void
vdev_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *lvd;
	metaslab_t *msp;

	ASSERT3U(txg, ==, spa->spa_syncing_txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
		ASSERT(vd->vdev_removing ||
		    vd->vdev_ops == &vdev_indirect_ops);

		vdev_indirect_sync_obsolete(vd, tx);

		/*
		 * If the vdev is indirect, it can't have dirty
		 * metaslabs or DTLs.
		 */
		if (vd->vdev_ops == &vdev_indirect_ops) {
			ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
			ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
			dmu_tx_commit(tx);
			return;
		}
	}

	ASSERT(vdev_is_concrete(vd));

	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
	    !vd->vdev_removing) {
		ASSERT(vd == vd->vdev_top);
		ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
		ASSERT(vd->vdev_ms_array != 0);
		vdev_config_dirty(vd);
	}

	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
		metaslab_sync(msp, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
	}

	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
		vdev_dtl_sync(lvd, txg);

	/*
	 * If this is an empty log device being removed, destroy the
	 * metadata associated with it.
	 */
	if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
		vdev_remove_empty_log(vd, txg);

	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
	dmu_tx_commit(tx);
}

uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
	return (vd->vdev_ops->vdev_op_asize(vd, psize));
}

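/*
 * For example (rough sketch of the per-type behavior): a plain disk or
 * file vdev uses the default asize implementation, which rounds psize up
 * to the top-level vdev's ashift, while RAID-Z implements vdev_op_asize
 * to add parity and padding sectors on top of the physical size.
 */
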
/*
 * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
 * not be opened, and no I/O is attempted.
 */
int
vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
	vdev_t *vd, *tvd;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));

	tvd = vd->vdev_top;

	/*
	 * If user did a 'zpool offline -f' then make the fault persist across
	 * reboots.
	 */
	if (aux == VDEV_AUX_EXTERNAL_PERSIST) {
		/*
		 * There are two kinds of forced faults: temporary and
		 * persistent.  Temporary faults go away at pool import, while
		 * persistent faults stay set.  Both types of faults can be
		 * cleared with a zpool clear.
		 *
		 * We tell if a vdev is persistently faulted by looking at the
		 * ZPOOL_CONFIG_AUX_STATE nvpair.  If it's set to "external" at
		 * import then it's a persistent fault.  Otherwise, it's
		 * temporary.  We get ZPOOL_CONFIG_AUX_STATE set to "external"
		 * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL.  This
		 * tells vdev_config_generate() (which gets run later) to set
		 * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist.
		 */
		vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
		vd->vdev_tmpoffline = B_FALSE;
		aux = VDEV_AUX_EXTERNAL;
	} else {
		vd->vdev_tmpoffline = B_TRUE;
	}

	/*
	 * We don't directly use the aux state here, but if we do a
	 * vdev_reopen(), we need this value to be present to remember why we
	 * were faulted.
	 */
	vd->vdev_label_aux = aux;

	/*
	 * Faulted state takes precedence over degraded.
	 */
	vd->vdev_delayed_close = B_FALSE;
	vd->vdev_faulted = 1ULL;
	vd->vdev_degraded = 0ULL;
	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);

	/*
	 * If this device has the only valid copy of the data, then
	 * back off and simply mark the vdev as degraded instead.
	 */
	if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
		vd->vdev_degraded = 1ULL;
		vd->vdev_faulted = 0ULL;

		/*
		 * If we reopen the device and it's not dead, only then do we
		 * mark it degraded.
		 */
		vdev_reopen(tvd);

		if (vdev_readable(vd))
			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
	}

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
 * user that something is wrong.  The vdev continues to operate as normal as far
 * as I/O is concerned.
 */
int
vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
{
	vdev_t *vd;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));

	/*
	 * If the vdev is already faulted, then don't do anything.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded)
		return (spa_vdev_state_exit(spa, NULL, 0));

	vd->vdev_degraded = 1ULL;
	if (!vdev_is_dead(vd))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
		    aux);

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Online the given vdev.
 *
 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
 * spare device should be detached when the device finishes resilvering.
 * Second, the online should be treated like a 'test' online case, so no FMA
 * events are generated if the device fails to open.
 */
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
	boolean_t wasoffline;
	vdev_state_t oldstate;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));

	wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
	oldstate = vd->vdev_state;

	tvd = vd->vdev_top;
	vd->vdev_offline = B_FALSE;
	vd->vdev_tmpoffline = B_FALSE;
	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);

	/* XXX - L2ARC 1.0 does not support expansion */
	if (!vd->vdev_aux) {
		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
			pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) ||
			    spa->spa_autoexpand);
		vd->vdev_expansion_time = gethrestime_sec();
	}

	vdev_reopen(tvd);
	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;

	if (!vd->vdev_aux) {
		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
			pvd->vdev_expanding = B_FALSE;
	}

	if (newstate)
		*newstate = vd->vdev_state;
	if ((flags & ZFS_ONLINE_UNSPARE) &&
	    !vdev_is_dead(vd) && vd->vdev_parent &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;

	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {

		/* XXX - L2ARC 1.0 does not support expansion */
		if (vd->vdev_aux)
			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	/* Restart initializing if necessary */
	mutex_enter(&vd->vdev_initialize_lock);
	if (vdev_writeable(vd) &&
	    vd->vdev_initialize_thread == NULL &&
	    vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
		(void) vdev_initialize(vd);
	}
	mutex_exit(&vd->vdev_initialize_lock);

	/* Restart trimming if necessary */
	mutex_enter(&vd->vdev_trim_lock);
	if (vdev_writeable(vd) &&
	    vd->vdev_trim_thread == NULL &&
	    vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
		(void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
		    vd->vdev_trim_secure);
	}
	mutex_exit(&vd->vdev_trim_lock);

	if (wasoffline ||
	    (oldstate < VDEV_STATE_DEGRADED &&
	    vd->vdev_state >= VDEV_STATE_DEGRADED))
		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);

	return (spa_vdev_state_exit(spa, vd, 0));
}

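/*
 * Usage note: 'zpool online -e' passes ZFS_ONLINE_EXPAND, which drives
 * the expansion handling above; a plain 'zpool online' instead relies on
 * the pool's autoexpand property (spa_autoexpand).
 */
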
static int
vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
{
	vdev_t *vd, *tvd;
	int error = 0;
	uint64_t generation;
	metaslab_group_t *mg;

top:
	spa_vdev_state_enter(spa, SCL_ALLOC);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));

	tvd = vd->vdev_top;
	mg = tvd->vdev_mg;
	generation = spa->spa_config_generation + 1;

	/*
	 * If the device isn't already offline, try to offline it.
	 */
	if (!vd->vdev_offline) {
		/*
		 * If this device has the only valid copy of some data,
		 * don't allow it to be offlined. Log devices are always
		 * expendable.
		 */
		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
		    vdev_dtl_required(vd))
			return (spa_vdev_state_exit(spa, NULL,
			    SET_ERROR(EBUSY)));

		/*
		 * If the top-level is a slog and it has had allocations
		 * then proceed.  We check that the vdev's metaslab group
		 * is not NULL since it's possible that we may have just
		 * added this vdev but not yet initialized its metaslabs.
		 */
		if (tvd->vdev_islog && mg != NULL) {
			/*
			 * Prevent any future allocations.
			 */
			metaslab_group_passivate(mg);
			(void) spa_vdev_state_exit(spa, vd, 0);

			error = spa_reset_logs(spa);

			/*
			 * If the log device was successfully reset but has
			 * checkpointed data, do not offline it.
			 */
			if (error == 0 &&
			    tvd->vdev_checkpoint_sm != NULL) {
				ASSERT3U(space_map_allocated(
				    tvd->vdev_checkpoint_sm), !=, 0);
				error = ZFS_ERR_CHECKPOINT_EXISTS;
			}

			spa_vdev_state_enter(spa, SCL_ALLOC);

			/*
			 * Check to see if the config has changed.
			 */
			if (error || generation != spa->spa_config_generation) {
				metaslab_group_activate(mg);
				if (error)
					return (spa_vdev_state_exit(spa,
					    vd, error));
				(void) spa_vdev_state_exit(spa, vd, 0);
				goto top;
			}
			ASSERT0(tvd->vdev_stat.vs_alloc);
		}

		/*
		 * Offline this device and reopen its top-level vdev.
		 * If the top-level vdev is a log device then just offline
		 * it. Otherwise, if this action results in the top-level
		 * vdev becoming unusable, undo it and fail the request.
		 */
		vd->vdev_offline = B_TRUE;
		vdev_reopen(tvd);

		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
		    vdev_is_dead(tvd)) {
			vd->vdev_offline = B_FALSE;
			vdev_reopen(tvd);
			return (spa_vdev_state_exit(spa, NULL,
			    SET_ERROR(EBUSY)));
		}

		/*
		 * Add the device back into the metaslab rotor so that
		 * once we online the device it's open for business.
		 */
		if (tvd->vdev_islog && mg != NULL)
			metaslab_group_activate(mg);
	}

	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);

	return (spa_vdev_state_exit(spa, vd, 0));
}

int
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{
	int error;

	mutex_enter(&spa->spa_vdev_top_lock);
	error = vdev_offline_locked(spa, guid, flags);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * Clear the error counts associated with this vdev.  Unlike vdev_online() and
 * vdev_offline(), we assume the spa config is locked.  We also clear all
 * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
 */
void
vdev_clear(spa_t *spa, vdev_t *vd)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == NULL)
		vd = rvd;

	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;
	vd->vdev_stat.vs_slow_ios = 0;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_clear(spa, vd->vdev_child[c]);

	/*
	 * It makes no sense to "clear" an indirect vdev.
	 */
	if (!vdev_is_concrete(vd))
		return;

	/*
	 * If we're in the FAULTED state or have experienced failed I/O, then
	 * clear the persistent state and attempt to reopen the device.  We
	 * also mark the vdev config dirty, so that the new faulted state is
	 * written out to disk.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded ||
	    !vdev_readable(vd) || !vdev_writeable(vd)) {
		/*
		 * When reopening in response to a clear event, it may be due to
		 * a fmadm repair request.  In this case, if the device is
		 * still broken, we want to still post the ereport again.
		 */
		vd->vdev_forcefault = B_TRUE;

		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
		vd->vdev_cant_read = B_FALSE;
		vd->vdev_cant_write = B_FALSE;
		vd->vdev_stat.vs_aux = 0;

		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);

		vd->vdev_forcefault = B_FALSE;

		if (vd != rvd && vdev_writeable(vd->vdev_top))
			vdev_state_dirty(vd->vdev_top);

		/* If a resilver isn't required, check if vdevs can be culled */
		if (vd->vdev_aux == NULL && !vdev_is_dead(vd) &&
		    !dsl_scan_resilvering(spa->spa_dsl_pool) &&
		    !dsl_scan_resilver_scheduled(spa->spa_dsl_pool))
			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);

		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
	}

	/*
	 * When clearing a FMA-diagnosed fault, we always want to
	 * unspare the device, as we assume that the original spare was
	 * done in response to the FMA fault.
	 */
	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;
}

boolean_t
vdev_is_dead(vdev_t *vd)
{
	/*
	 * Holes and missing devices are always considered "dead".
	 * This simplifies the code since we don't have to check for
	 * these types of devices in the various code paths.
	 * Instead we rely on the fact that we skip over dead devices
	 * before issuing I/O to them.
	 */
	return (vd->vdev_state < VDEV_STATE_DEGRADED ||
	    vd->vdev_ops == &vdev_hole_ops ||
	    vd->vdev_ops == &vdev_missing_ops);
}

boolean_t
vdev_readable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
}

boolean_t
vdev_writeable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_write &&
	    vdev_is_concrete(vd));
}

boolean_t
vdev_allocatable(vdev_t *vd)
{
	uint64_t state = vd->vdev_state;

	/*
	 * We currently allow allocations from vdevs which may be in the
	 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
	 * fails to reopen then we'll catch it later when we're holding
	 * the proper locks.  Note that we have to get the vdev state
	 * in a local variable because although it changes atomically,
	 * we're asking two separate questions about it.
	 */
	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
	    !vd->vdev_cant_write && vdev_is_concrete(vd) &&
	    vd->vdev_mg->mg_initialized);
}

boolean_t
vdev_accessible(vdev_t *vd, zio_t *zio)
{
	ASSERT(zio->io_vd == vd);

	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
		return (B_FALSE);

	if (zio->io_type == ZIO_TYPE_READ)
		return (!vd->vdev_cant_read);

	if (zio->io_type == ZIO_TYPE_WRITE)
		return (!vd->vdev_cant_write);

	return (B_TRUE);
}

static void
vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
{
	for (int t = 0; t < VS_ZIO_TYPES; t++) {
		vs->vs_ops[t] += cvs->vs_ops[t];
		vs->vs_bytes[t] += cvs->vs_bytes[t];
	}

	cvs->vs_scan_removing = cvd->vdev_removing;
}

/*
 * Get extended stats
 */
static void
vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
{
	int t, b;
	for (t = 0; t < ZIO_TYPES; t++) {
		for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++)
			vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b];

		for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) {
			vsx->vsx_total_histo[t][b] +=
			    cvsx->vsx_total_histo[t][b];
		}
	}

	for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
		for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) {
			vsx->vsx_queue_histo[t][b] +=
			    cvsx->vsx_queue_histo[t][b];
		}
		vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t];
		vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t];

		for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++)
			vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b];

		for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++)
			vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b];
	}
}

boolean_t
vdev_is_spacemap_addressable(vdev_t *vd)
{
	if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
		return (B_TRUE);

	/*
	 * If double-word space map entries are not enabled we assume
	 * 47 bits of the space map entry are dedicated to the entry's
	 * offset (see SM_OFFSET_BITS in space_map.h). We then use that
	 * to calculate the maximum address that can be described by a
	 * space map entry for the given device.
	 */
	uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;

	if (shift >= 63) /* detect potential overflow */
		return (B_TRUE);

	return (vd->vdev_asize < (1ULL << shift));
}

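/*
 * Worked example (illustrative): with ashift=9, shift = 9 + 47 = 56, so
 * single-word entries can address a vdev of up to 2^56 bytes (64 PiB);
 * with ashift=12 the limit is 2^59 bytes (512 PiB).
 */
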
/*
 * Get statistics for the given vdev.
 */
static void
vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
{
	int t;
	/*
	 * If we're getting stats on the root vdev, aggregate the I/O counts
	 * over all top-level vdevs (i.e. the direct children of the root).
	 */
	if (!vd->vdev_ops->vdev_op_leaf) {
		if (vs) {
			memset(vs->vs_ops, 0, sizeof (vs->vs_ops));
			memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes));
		}
		if (vsx)
			memset(vsx, 0, sizeof (*vsx));

		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			vdev_stat_t *cvs = &cvd->vdev_stat;
			vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex;

			vdev_get_stats_ex_impl(cvd, cvs, cvsx);
			if (vs)
				vdev_get_child_stat(cvd, vs, cvs);
			if (vsx)
				vdev_get_child_stat_ex(cvd, vsx, cvsx);
		}
	} else {
		/*
		 * We're a leaf.  Just copy our ZIO active queue stats in.  The
		 * other leaf stats are updated in vdev_stat_update().
		 */
		if (!vsx)
			return;

		memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));

		for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
			vsx->vsx_active_queue[t] =
			    vd->vdev_queue.vq_class[t].vqc_active;
			vsx->vsx_pend_queue[t] = avl_numnodes(
			    &vd->vdev_queue.vq_class[t].vqc_queued_tree);
		}
	}
}

void
vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
{
	vdev_t *tvd = vd->vdev_top;
	mutex_enter(&vd->vdev_stat_lock);
	if (vs) {
		bcopy(&vd->vdev_stat, vs, sizeof (*vs));
		vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
		vs->vs_state = vd->vdev_state;
		vs->vs_rsize = vdev_get_min_asize(vd);
		if (vd->vdev_ops->vdev_op_leaf) {
			vs->vs_rsize += VDEV_LABEL_START_SIZE +
			    VDEV_LABEL_END_SIZE;
			/*
			 * Report initializing progress. Since we don't
			 * have the initializing locks held, this is only
			 * an estimate (although a fairly accurate one).
			 */
			vs->vs_initialize_bytes_done =
			    vd->vdev_initialize_bytes_done;
			vs->vs_initialize_bytes_est =
			    vd->vdev_initialize_bytes_est;
			vs->vs_initialize_state = vd->vdev_initialize_state;
			vs->vs_initialize_action_time =
			    vd->vdev_initialize_action_time;

			/*
			 * Report manual TRIM progress. Since we don't have
			 * the manual TRIM locks held, this is only an
			 * estimate (although a fairly accurate one).
			 */
			vs->vs_trim_notsup = !vd->vdev_has_trim;
			vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
			vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
			vs->vs_trim_state = vd->vdev_trim_state;
			vs->vs_trim_action_time = vd->vdev_trim_action_time;
		}
		/*
		 * Report expandable space on top-level, non-auxiliary devices
		 * only. The expandable space is reported in terms of metaslab
		 * sized units since that determines how much space the pool
		 * can expand.
		 */
		if (vd->vdev_aux == NULL && tvd != NULL) {
			vs->vs_esize = P2ALIGN(
			    vd->vdev_max_asize - vd->vdev_asize,
			    1ULL << tvd->vdev_ms_shift);
		}
		if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
		    vdev_is_concrete(vd)) {
			vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
			    vd->vdev_mg->mg_fragmentation : 0;
		}
		if (vd->vdev_ops->vdev_op_leaf)
			vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
	}

	vdev_get_stats_ex_impl(vd, vs, vsx);
	mutex_exit(&vd->vdev_stat_lock);
}

void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
	return (vdev_get_stats_ex(vd, vs, NULL));
}

void
vdev_clear_stats(vdev_t *vd)
{
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_space = 0;
	vd->vdev_stat.vs_dspace = 0;
	vd->vdev_stat.vs_alloc = 0;
	mutex_exit(&vd->vdev_stat_lock);
}

void
vdev_scan_stat_init(vdev_t *vd)
{
	vdev_stat_t *vs = &vd->vdev_stat;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_scan_stat_init(vd->vdev_child[c]);

	mutex_enter(&vd->vdev_stat_lock);
	vs->vs_scan_processed = 0;
	mutex_exit(&vd->vdev_stat_lock);
}

void
vdev_stat_update(zio_t *zio, uint64_t psize)
{
	spa_t *spa = zio->io_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
	vdev_t *pvd;
	uint64_t txg = zio->io_txg;
	vdev_stat_t *vs = &vd->vdev_stat;
	vdev_stat_ex_t *vsx = &vd->vdev_stat_ex;
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	/*
	 * If this i/o is a gang leader, it didn't do any actual work.
	 */
	if (zio->io_gang_tree)
		return;

	if (zio->io_error == 0) {
		/*
		 * If this is a root i/o, don't count it -- we've already
		 * counted the top-level vdevs, and vdev_get_stats() will
		 * aggregate them when asked.  This reduces contention on
		 * the root vdev_stat_lock and implicitly handles blocks
		 * that compress away to holes, for which there is no i/o.
		 * (Holes never create vdev children, so all the counters
		 * remain zero, which is what we want.)
		 *
		 * Note: this only applies to successful i/o (io_error == 0)
		 * because unlike i/o counts, errors are not additive.
		 * When reading a ditto block, for example, failure of
		 * one top-level vdev does not imply a root-level error.
		 */
		if (vd == rvd)
			return;

		ASSERT(vd == zio->io_vd);

		if (flags & ZIO_FLAG_IO_BYPASS)
			return;

		mutex_enter(&vd->vdev_stat_lock);

		if (flags & ZIO_FLAG_IO_REPAIR) {
			if (flags & ZIO_FLAG_SCAN_THREAD) {
				dsl_scan_phys_t *scn_phys =
				    &spa->spa_dsl_pool->dp_scan->scn_phys;
				uint64_t *processed = &scn_phys->scn_processed;

				if (vd->vdev_ops->vdev_op_leaf)
					atomic_add_64(processed, psize);
				vs->vs_scan_processed += psize;
			}

			if (flags & ZIO_FLAG_SELF_HEAL)
				vs->vs_self_healed += psize;
		}

		/*
		 * The bytes/ops/histograms are recorded at the leaf level and
		 * aggregated into the higher level vdevs in vdev_get_stats().
		 */
		if (vd->vdev_ops->vdev_op_leaf &&
		    (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
			zio_type_t vs_type = type;

			/*
			 * TRIM ops and bytes are reported to user space as
			 * ZIO_TYPE_IOCTL.  This is done to preserve the
			 * vdev_stat_t structure layout for user space.
			 */
			if (type == ZIO_TYPE_TRIM)
				vs_type = ZIO_TYPE_IOCTL;

			vs->vs_ops[vs_type]++;
			vs->vs_bytes[vs_type] += psize;

			if (flags & ZIO_FLAG_DELEGATED) {
				vsx->vsx_agg_histo[zio->io_priority]
				    [RQ_HISTO(zio->io_size)]++;
			} else {
				vsx->vsx_ind_histo[zio->io_priority]
				    [RQ_HISTO(zio->io_size)]++;
			}

			if (zio->io_delta && zio->io_delay) {
				vsx->vsx_queue_histo[zio->io_priority]
				    [L_HISTO(zio->io_delta - zio->io_delay)]++;
				vsx->vsx_disk_histo[type]
				    [L_HISTO(zio->io_delay)]++;
				vsx->vsx_total_histo[type]
				    [L_HISTO(zio->io_delta)]++;
			}
		}

		mutex_exit(&vd->vdev_stat_lock);
		return;
	}

	if (flags & ZIO_FLAG_SPECULATIVE)
		return;

	/*
	 * If this is an I/O error that is going to be retried, then ignore the
	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
	 * hard errors, when in reality they can happen for any number of
	 * innocuous reasons (bus resets, MPxIO link failure, etc).
	 */
	if (zio->io_error == EIO &&
	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
		return;

	/*
	 * Intent log writes won't propagate their error to the root
	 * I/O so don't mark these types of failures as pool-level
	 * errors.
	 */
	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		return;

	if (spa->spa_load_state == SPA_LOAD_NONE &&
	    type == ZIO_TYPE_WRITE && txg != 0 &&
	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
	    (flags & ZIO_FLAG_SCAN_THREAD) ||
	    spa->spa_claiming)) {
		/*
		 * This is either a normal write (not a repair), or it's
		 * a repair induced by the scrub thread, or it's a repair
		 * made by zil_claim() during spa_load() in the first txg.
		 * In the normal case, we commit the DTL change in the same
		 * txg as the block was born.  In the scrub-induced repair
		 * case, we know that scrubs run in first-pass syncing context,
		 * so we commit the DTL change in spa_syncing_txg(spa).
		 * In the zil_claim() case, we commit in spa_first_txg(spa).
		 *
		 * We currently do not make DTL entries for failed spontaneous
		 * self-healing writes triggered by normal (non-scrubbing)
		 * reads, because we have no transactional context in which to
		 * do so -- and it's not clear that it'd be desirable anyway.
		 */
		if (vd->vdev_ops->vdev_op_leaf) {
			uint64_t commit_txg = txg;
			if (flags & ZIO_FLAG_SCAN_THREAD) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				ASSERT(spa_sync_pass(spa) == 1);
				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
				commit_txg = spa_syncing_txg(spa);
			} else if (spa->spa_claiming) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				commit_txg = spa_first_txg(spa);
			}
			ASSERT(commit_txg >= spa_syncing_txg(spa));
			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
				return;
			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
		}
		if (vd != rvd)
			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
	}
}
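/*
 * Illustrative sketch (hypothetical timings; assumes io_delta is the total
 * pipeline latency and io_delay the device service time, as their use above
 * suggests): a zio with io_delta = 8ms and io_delay = 5ms is charged as
 *
 *	vsx_queue_histo[prio][L_HISTO(3ms)]++;	// 8ms - 5ms of queue wait
 *	vsx_disk_histo[type][L_HISTO(5ms)]++;	// device service time
 *	vsx_total_histo[type][L_HISTO(8ms)]++;	// end-to-end latency
 */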
static int64_t
vdev_deflated_space(vdev_t *vd, int64_t space)
{
	ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);

	return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
}
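/*
 * Worked example (hypothetical numbers): for a plain disk the deflate
 * ratio works out to 512, since psize equals asize, so a 1M delta
 * deflates to itself:
 *
 *	(1048576 >> SPA_MINBLOCKSHIFT) * 512 = 2048 * 512 = 1048576
 *
 * A RAID-Z vdev has a smaller ratio, so the same delta shrinks by the
 * parity overhead.
 */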
/*
 * Update the in-core space usage stats for this vdev, its metaslab class,
 * and the root vdev.
 */
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
    int64_t space_delta)
{
	int64_t dspace_delta;
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(vd == vd->vdev_top);

	/*
	 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
	 * factor.  We must calculate this here and not at the root vdev
	 * because the root vdev's psize-to-asize is simply the max of its
	 * children's, thus not accurate enough for us.
	 */
	dspace_delta = vdev_deflated_space(vd, space_delta);

	mutex_enter(&vd->vdev_stat_lock);
	/* ensure we won't underflow */
	if (alloc_delta < 0) {
		ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta);
	}

	vd->vdev_stat.vs_alloc += alloc_delta;
	vd->vdev_stat.vs_space += space_delta;
	vd->vdev_stat.vs_dspace += dspace_delta;
	mutex_exit(&vd->vdev_stat_lock);

	/* every class but log contributes to root space stats */
	if (vd->vdev_mg != NULL && !vd->vdev_islog) {
		ASSERT(!vd->vdev_isl2cache);
		mutex_enter(&rvd->vdev_stat_lock);
		rvd->vdev_stat.vs_alloc += alloc_delta;
		rvd->vdev_stat.vs_space += space_delta;
		rvd->vdev_stat.vs_dspace += dspace_delta;
		mutex_exit(&rvd->vdev_stat_lock);
	}
	/* Note: metaslab_class_space_update moved to metaslab_space_update */
}
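/*
 * Usage sketch (hypothetical caller; 'asize' is illustrative): the
 * arguments are deltas, not absolute values.  Accounting an allocation
 * against a top-level vdev looks like
 *
 *	vdev_space_update(vd->vdev_top, asize, 0, 0);
 *
 * and the matching free negates the delta; space_delta is for events that
 * change the vdev's total capacity, such as expansion.
 */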
/*
 * Mark a top-level vdev's config as dirty, placing it on the dirty list
 * so that it will be written out next time the vdev configuration is synced.
 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
 */
void
vdev_config_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	ASSERT(spa_writeable(spa));

	/*
	 * If this is an aux vdev (as with l2cache and spare devices), then we
	 * update the vdev config manually and set the sync flag.
	 */
	if (vd->vdev_aux != NULL) {
		spa_aux_vdev_t *sav = vd->vdev_aux;
		nvlist_t **aux;
		uint_t naux;

		for (c = 0; c < sav->sav_count; c++) {
			if (sav->sav_vdevs[c] == vd)
				break;
		}

		if (c == sav->sav_count) {
			/*
			 * We're being removed.  There's nothing more to do.
			 */
			ASSERT(sav->sav_sync == B_TRUE);
			return;
		}

		sav->sav_sync = B_TRUE;

		if (nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
		}

		ASSERT(c < naux);

		/*
		 * Setting the nvlist in the middle of the array is a little
		 * sketchy, but it will work.
		 */
		nvlist_free(aux[c]);
		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);

		return;
	}

	/*
	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
	 * must either hold SCL_CONFIG as writer, or must be the sync thread
	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_config_dirty(rvd->vdev_child[c]);
	} else {
		ASSERT(vd == vd->vdev_top);

		if (!list_link_active(&vd->vdev_config_dirty_node) &&
		    vdev_is_concrete(vd)) {
			list_insert_head(&spa->spa_config_dirty_list, vd);
		}
	}
}
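/*
 * Caller-contract sketch (illustrative, not a quote of any caller): outside
 * syncing context the expected pattern is
 *
 *	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER);
 *	vdev_config_dirty(vd);
 *	spa_config_exit(spa, SCL_CONFIG, FTAG);
 *
 * while the sync thread may call with SCL_CONFIG held as reader only, which
 * the ASSERT above accepts because there is exactly one sync thread.
 */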
void
vdev_config_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
	list_remove(&spa->spa_config_dirty_list, vd);
}
/*
 * Mark a top-level vdev's state as dirty, so that the next pass of
 * spa_sync() can convert this into vdev_config_dirty().  We distinguish
 * the state changes from larger config changes because they require
 * much less locking, and are often needed for administrative actions.
 */
void
vdev_state_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_writeable(spa));
	ASSERT(vd == vd->vdev_top);

	/*
	 * The state list is protected by the SCL_STATE lock.  The caller
	 * must either hold SCL_STATE as writer, or must be the sync thread
	 * (which holds SCL_STATE as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	if (!list_link_active(&vd->vdev_state_dirty_node) &&
	    vdev_is_concrete(vd))
		list_insert_head(&spa->spa_state_dirty_list, vd);
}
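/*
 * Illustrative example: an administrative action such as onlining or
 * offlining a device only needs vdev_state_dirty() on its top-level vdev;
 * the next spa_sync() pass promotes the entry via vdev_config_dirty(),
 * deferring the heavier SCL_CONFIG work to syncing context.
 */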
void
vdev_state_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
	list_remove(&spa->spa_state_dirty_list, vd);
}
/*
 * Propagate vdev state up from children to parent.
 */
void
vdev_propagate_state(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int degraded = 0, faulted = 0;
	int corrupted = 0;
	vdev_t *child;

	if (vd->vdev_children > 0) {
		for (int c = 0; c < vd->vdev_children; c++) {
			child = vd->vdev_child[c];

			/*
			 * Don't factor holes or indirect vdevs into the
			 * decision.
			 */
			if (!vdev_is_concrete(child))
				continue;

			if (!vdev_readable(child) ||
			    (!vdev_writeable(child) && spa_writeable(spa))) {
				/*
				 * Root special: if there is a top-level log
				 * device, treat the root vdev as if it were
				 * degraded.
				 */
				if (child->vdev_islog && vd == rvd)
					degraded++;
				else
					faulted++;
			} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
				degraded++;
			}

			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
				corrupted++;
		}

		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);

		/*
		 * Root special: if there is a top-level vdev that cannot be
		 * opened due to corrupted metadata, then propagate the root
		 * vdev's aux state as 'corrupt' rather than 'insufficient
		 * replicas'.
		 */
		if (corrupted && vd == rvd &&
		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
	}

	if (vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}
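/*
 * Illustrative example (mirror semantics as implemented by the child ops'
 * state_change callback): a 2-way mirror with one unreadable child yields
 * faulted = 1, degraded = 0, so the mirror reports DEGRADED; only with
 * faulted == vdev_children would it become CANT_OPEN for lack of replicas.
 */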
/*
 * Set a vdev's state.  If this is during an open, we don't update the parent
 * state, because we're in the process of opening children depth-first.
 * Otherwise, we propagate the change to the parent.
 *
 * If this routine places a device in a faulted state, an appropriate ereport is
 * generated.
 */
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
	uint64_t save_state;
	spa_t *spa = vd->vdev_spa;

	if (state == vd->vdev_state) {
		/*
		 * Since vdev_offline() code path is already in an offline
		 * state we can miss a statechange event to OFFLINE.  Check
		 * the previous state to catch this condition.
		 */
		if (vd->vdev_ops->vdev_op_leaf &&
		    (state == VDEV_STATE_OFFLINE) &&
		    (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) {
			/* post an offline state change */
			zfs_post_state_change(spa, vd, vd->vdev_prevstate);
		}
		vd->vdev_stat.vs_aux = aux;
		return;
	}

	save_state = vd->vdev_state;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	/*
	 * If we are setting the vdev state to anything but an open state, then
	 * always close the underlying device unless the device has requested
	 * a delayed close (i.e. we're about to remove or fault the device).
	 * Otherwise, we keep accessible but invalid devices open forever.
	 * We don't call vdev_close() itself, because that implies some extra
	 * checks (offline, etc) that we don't want here.  This is limited to
	 * leaf devices, because otherwise closing the device will affect other
	 * children.
	 */
	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
	    vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_close(vd);

	if (vd->vdev_removed &&
	    state == VDEV_STATE_CANT_OPEN &&
	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
		/*
		 * If the previous state is set to VDEV_STATE_REMOVED, then this
		 * device was previously marked removed and someone attempted to
		 * reopen it.  If this failed due to a nonexistent device, then
		 * keep the device in the REMOVED state.  We also let this be if
		 * it is one of our special test online cases, which is only
		 * attempting to online the device and shouldn't generate an FMA
		 * fault.
		 */
		vd->vdev_state = VDEV_STATE_REMOVED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	} else if (state == VDEV_STATE_REMOVED) {
		vd->vdev_removed = B_TRUE;
	} else if (state == VDEV_STATE_CANT_OPEN) {
		/*
		 * If we fail to open a vdev during an import or recovery, we
		 * mark it as "not available", which signifies that it was
		 * never there to begin with.  Failure to open such a device
		 * is not considered an error.
		 */
		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
		    vd->vdev_ops->vdev_op_leaf)
			vd->vdev_not_present = 1;

		/*
		 * Post the appropriate ereport.  If the 'prevstate' field is
		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
		 * that this is part of a vdev_reopen().  In this case, we don't
		 * want to post the ereport if the device was already in the
		 * CANT_OPEN state beforehand.
		 *
		 * If the 'checkremove' flag is set, then this is an attempt to
		 * online the device in response to an insertion event.  If we
		 * hit this case, then we have detected an insertion event for a
		 * faulted or offline device that wasn't in the removed state.
		 * In this scenario, we don't post an ereport because we are
		 * about to replace the device, or attempt an online with
		 * vdev_forcefault, which will generate the fault for us.
		 */
		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
		    !vd->vdev_not_present && !vd->vdev_checkremove &&
		    vd != spa->spa_root_vdev) {
			const char *class;

			switch (aux) {
			case VDEV_AUX_OPEN_FAILED:
				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
				break;
			case VDEV_AUX_CORRUPT_DATA:
				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
				break;
			case VDEV_AUX_NO_REPLICAS:
				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
				break;
			case VDEV_AUX_BAD_GUID_SUM:
				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
				break;
			case VDEV_AUX_TOO_SMALL:
				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
				break;
			case VDEV_AUX_BAD_LABEL:
				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
				break;
			case VDEV_AUX_BAD_ASHIFT:
				class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT;
				break;
			default:
				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
			}

			zfs_ereport_post(class, spa, vd, NULL, NULL,
			    save_state, 0);
		}

		/* Erase any notion of persistent removed state */
		vd->vdev_removed = B_FALSE;
	} else {
		vd->vdev_removed = B_FALSE;
	}

	/*
	 * Notify ZED of any significant state-change on a leaf vdev.
	 */
	if (vd->vdev_ops->vdev_op_leaf) {
		/* preserve original state from a vdev_reopen() */
		if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
		    (vd->vdev_prevstate != vd->vdev_state) &&
		    (save_state <= VDEV_STATE_CLOSED))
			save_state = vd->vdev_prevstate;

		/* filter out state change due to initial vdev_open */
		if (save_state > VDEV_STATE_CLOSED)
			zfs_post_state_change(spa, vd, save_state);
	}

	if (!isopen && vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}
boolean_t
vdev_children_are_offline(vdev_t *vd)
{
	ASSERT(!vd->vdev_ops->vdev_op_leaf);

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE)
			return (B_FALSE);
	}

	return (B_TRUE);
}
/*
 * Check the vdev configuration to ensure that it's capable of supporting
 * a root pool. We do not support partial configuration.
 */
boolean_t
vdev_is_bootable(vdev_t *vd)
{
	if (!vd->vdev_ops->vdev_op_leaf) {
		const char *vdev_type = vd->vdev_ops->vdev_op_type;

		if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 ||
		    strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) {
			return (B_FALSE);
		}
	}

	for (int c = 0; c < vd->vdev_children; c++) {
		if (!vdev_is_bootable(vd->vdev_child[c]))
			return (B_FALSE);
	}
	return (B_TRUE);
}
boolean_t
vdev_is_concrete(vdev_t *vd)
{
	vdev_ops_t *ops = vd->vdev_ops;
	if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops ||
	    ops == &vdev_missing_ops || ops == &vdev_root_ops) {
		return (B_FALSE);
	} else {
		return (B_TRUE);
	}
}
/*
 * Determine if a log device has valid content.  If the vdev was
 * removed or faulted in the MOS config then we know that
 * the content on the log device has already been written to the pool.
 */
boolean_t
vdev_log_state_valid(vdev_t *vd)
{
	if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
	    !vd->vdev_removed)
		return (B_TRUE);

	for (int c = 0; c < vd->vdev_children; c++)
		if (vdev_log_state_valid(vd->vdev_child[c]))
			return (B_TRUE);

	return (B_FALSE);
}
/*
 * Expand a vdev if possible.
 */
void
vdev_expand(vdev_t *vd, uint64_t txg)
{
	ASSERT(vd->vdev_top == vd);
	ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	ASSERT(vdev_is_concrete(vd));

	vdev_set_deflate_ratio(vd);

	if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
	    vdev_is_concrete(vd)) {
		vdev_metaslab_group_create(vd);
		VERIFY(vdev_metaslab_init(vd, txg) == 0);
		vdev_config_dirty(vd);
	}
}
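/*
 * Worked example (hypothetical sizes): with vdev_ms_shift = 29 (512M
 * metaslabs), a top-level vdev grown from 100G to 200G has
 * (200G >> 29) = 400 potential metaslabs; if vdev_ms_count is still 200,
 * the check above passes and the new metaslabs are initialized in this txg.
 */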
void
vdev_split(vdev_t *vd)
{
	vdev_t *cvd, *pvd = vd->vdev_parent;

	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	cvd = pvd->vdev_child[0];
	if (pvd->vdev_children == 1) {
		vdev_remove_parent(cvd);
		cvd->vdev_splitting = B_TRUE;
	}
	vdev_propagate_state(cvd);
}
void
vdev_deadman(vdev_t *vd, char *tag)
{
	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		vdev_deadman(cvd, tag);
	}

	if (vd->vdev_ops->vdev_op_leaf) {
		vdev_queue_t *vq = &vd->vdev_queue;

		mutex_enter(&vq->vq_lock);
		if (avl_numnodes(&vq->vq_active_tree) > 0) {
			spa_t *spa = vd->vdev_spa;
			zio_t *fio;
			uint64_t delta;

			zfs_dbgmsg("slow vdev: %s has %d active IOs",
			    vd->vdev_path, avl_numnodes(&vq->vq_active_tree));

			/*
			 * Look at the head of all the pending queues,
			 * if any I/O has been outstanding for longer than
			 * the spa_deadman_synctime invoke the deadman logic.
			 */
			fio = avl_first(&vq->vq_active_tree);
			delta = gethrtime() - fio->io_timestamp;
			if (delta > spa_deadman_synctime(spa))
				zio_deadman(fio, tag);
		}
		mutex_exit(&vq->vq_lock);
	}
}
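/*
 * Note (values assume the stock tunables): spa_deadman_synctime(spa)
 * returns nanoseconds; with the default zfs_deadman_synctime_ms of 600000,
 * only an I/O outstanding for more than ten minutes at the head of the
 * active tree triggers zio_deadman().
 */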
void
vdev_defer_resilver(vdev_t *vd)
{
	ASSERT(vd->vdev_ops->vdev_op_leaf);

	vd->vdev_resilver_deferred = B_TRUE;
	vd->vdev_spa->spa_resilver_deferred = B_TRUE;
}
/*
 * Clears the resilver deferred flag on all leaf devs under vd.  Returns
 * B_TRUE if we have devices that need to be resilvered and are available to
 * accept resilver I/Os.
 */
boolean_t
vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
{
	boolean_t resilver_needed = B_FALSE;
	spa_t *spa = vd->vdev_spa;

	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		resilver_needed |= vdev_clear_resilver_deferred(cvd, tx);
	}

	if (vd == spa->spa_root_vdev &&
	    spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
		spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
		vdev_config_dirty(vd);
		spa->spa_resilver_deferred = B_FALSE;
		return (resilver_needed);
	}

	if (!vdev_is_concrete(vd) || vd->vdev_aux ||
	    !vd->vdev_ops->vdev_op_leaf)
		return (resilver_needed);

	vd->vdev_resilver_deferred = B_FALSE;

	return (!vdev_is_dead(vd) && !vd->vdev_offline &&
	    vdev_resilver_needed(vd, NULL, NULL));
}
/*
 * Translate a logical range to the physical range for the specified vdev_t.
 * This function is initially called with a leaf vdev and will walk each
 * parent vdev until it reaches a top-level vdev. Once the top-level is
 * reached the physical range is initialized and the recursive function
 * begins to unwind. As it unwinds it calls the parent's vdev specific
 * translation function to do the real conversion.
 */
void
vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
    range_seg64_t *physical_rs)
{
	/*
	 * Walk up the vdev tree
	 */
	if (vd != vd->vdev_top) {
		vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
	} else {
		/*
		 * We've reached the top-level vdev, initialize the
		 * physical range to the logical range and start to
		 * unwind.
		 */
		physical_rs->rs_start = logical_rs->rs_start;
		physical_rs->rs_end = logical_rs->rs_end;
		return;
	}

	vdev_t *pvd = vd->vdev_parent;
	ASSERT3P(pvd, !=, NULL);
	ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);

	/*
	 * As this recursive function unwinds, translate the logical
	 * range into its physical components by calling the
	 * vdev specific translate function.
	 */
	range_seg64_t intermediate = { 0 };
	pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);

	physical_rs->rs_start = intermediate.rs_start;
	physical_rs->rs_end = intermediate.rs_end;
}
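/*
 * Usage sketch ('leaf_vd' is a hypothetical leaf vdev): translating the
 * first 1M of a leaf's logical space, as an initialize/trim-style walker
 * might, looks like
 *
 *	range_seg64_t logical = { .rs_start = 0, .rs_end = 1ULL << 20 };
 *	range_seg64_t physical = { 0 };
 *	vdev_xlate(leaf_vd, &logical, &physical);
 *
 * For a mirror child the result equals the input; a raidz parent maps the
 * range onto one column, so the physical range can be narrower.
 */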
/*
 * Look at the vdev tree and determine whether any devices are currently being
 * replaced.
 */
boolean_t
vdev_replace_in_progress(vdev_t *vdev)
{
	ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0);

	if (vdev->vdev_ops == &vdev_replacing_ops)
		return (B_TRUE);

	/*
	 * A 'spare' vdev indicates that we have a replace in progress, unless
	 * it has exactly two children, and the second, the hot spare, has
	 * finished being resilvered.
	 */
	if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 ||
	    !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING)))
		return (B_TRUE);

	for (int i = 0; i < vdev->vdev_children; i++) {
		if (vdev_replace_in_progress(vdev->vdev_child[i]))
			return (B_TRUE);
	}

	return (B_FALSE);
}
EXPORT_SYMBOL(vdev_fault);
EXPORT_SYMBOL(vdev_degrade);
EXPORT_SYMBOL(vdev_online);
EXPORT_SYMBOL(vdev_offline);
EXPORT_SYMBOL(vdev_clear);
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, INT, ZMOD_RW,
	"Target number of metaslabs per top-level vdev");

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, INT, ZMOD_RW,
	"Default limit for metaslab size");

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, INT, ZMOD_RW,
	"Minimum number of metaslabs per top-level vdev");

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, INT, ZMOD_RW,
	"Practical upper limit of total metaslabs per top-level vdev");

ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
	"Rate limit slow IO (delay) events to this many per second");

ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
	"Rate limit checksum events to this many checksum errors per second "
	"(do not set below zed threshold)");

ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW,
	"Ignore errors during resilver/scrub");

ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
	"Bypass vdev_validate()");

ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
	"Disable cache flushes");