 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, Intel Corporation.
 */
/*
 * The ZFS retire agent is responsible for managing hot spares across all pools.
 * When we see a device fault or a device removal, we try to open the associated
 * pool and look for any hot spares.  We iterate over any available hot spares
 * and attempt a 'zpool replace' for each one.
 *
 * For vdevs diagnosed as faulty, the agent is also responsible for proactively
 * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors).
 */
#include <sys/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>

#include "zfs_agents.h"
typedef struct zfs_retire_repaired {
	struct zfs_retire_repaired	*zrr_next;
	uint64_t			zrr_pool;
	uint64_t			zrr_vdev;
} zfs_retire_repaired_t;
typedef struct zfs_retire_data {
	libzfs_handle_t			*zrd_hdl;
	zfs_retire_repaired_t		*zrd_repaired;
} zfs_retire_data_t;
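
/*
 * Free the list of vdevs we have already attempted to repair.  Called
 * whenever the per-module repaired list needs to be reset.
 */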
static void
zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp)
{
	zfs_retire_repaired_t *zrp;

	while ((zrp = zdp->zrd_repaired) != NULL) {
		zdp->zrd_repaired = zrp->zrr_next;
		fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t));
	}
}
/*
 * Find a pool with a matching GUID.
 */
typedef struct find_cbdata {
	uint64_t	cb_guid;
	const char	*cb_fru;
	zpool_handle_t	*cb_zhp;
	nvlist_t	*cb_vdev;
} find_cbdata_t;
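
/*
 * zpool_iter() callback: save the handle of the pool whose GUID matches
 * cb_guid and stop iterating; otherwise close the handle and keep going.
 */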
static int
find_pool(zpool_handle_t *zhp, void *data)
{
	find_cbdata_t *cbp = data;

	if (cbp->cb_guid ==
	    zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) {
		cbp->cb_zhp = zhp;
		return (1);
	}

	zpool_close(zhp);
	return (0);
}
/*
 * Find a vdev within a tree with a matching GUID.
 */
static nvlist_t *
find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, const char *search_fru,
    uint64_t search_guid)
{
	uint64_t guid;
	nvlist_t **child;
	uint_t c, children;
	nvlist_t *ret;
	char *fru;

	if (search_fru != NULL) {
		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &fru) == 0 &&
		    libzfs_fru_compare(zhdl, fru, search_fru))
			return (nv);
	} else {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
		    guid == search_guid) {
			fmd_hdl_debug(fmd_module_hdl("zfs-retire"),
			    "matched vdev %llu", guid);
			return (nv);
		}
	}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0)
		return (NULL);

	for (c = 0; c < children; c++) {
		if ((ret = find_vdev(zhdl, child[c], search_fru,
		    search_guid)) != NULL)
			return (ret);
	}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) != 0)
		return (NULL);

	for (c = 0; c < children; c++) {
		if ((ret = find_vdev(zhdl, child[c], search_fru,
		    search_guid)) != NULL)
			return (ret);
	}

	return (NULL);
}
/*
 * Given a (pool, vdev) GUID pair, find the matching pool and vdev.
 */
static zpool_handle_t *
find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid,
    nvlist_t **vdevp)
{
	find_cbdata_t cb;
	zpool_handle_t *zhp;
	nvlist_t *config, *nvroot;

	/*
	 * Find the corresponding pool and make sure the vdev still exists.
	 */
	cb.cb_guid = pool_guid;
	if (zpool_iter(zhdl, find_pool, &cb) != 1)
		return (NULL);

	zhp = cb.cb_zhp;
	config = zpool_get_config(zhp, NULL);
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) != 0) {
		zpool_close(zhp);
		return (NULL);
	}

	if (vdev_guid != 0) {
		if ((*vdevp = find_vdev(zhdl, nvroot, NULL,
		    vdev_guid)) == NULL) {
			zpool_close(zhp);
			return (NULL);
		}
	}

	return (zhp);
}

#ifdef HAVE_LIBTOPO
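/*
 * zpool_iter() callback used by find_by_fru(): walk the pool's vdev tree
 * looking for a vdev whose FRU matches cb_fru.
 */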
static int
search_pool(zpool_handle_t *zhp, void *data)
{
	find_cbdata_t *cbp = data;
	nvlist_t *config;
	nvlist_t *nvroot;

	config = zpool_get_config(zhp, NULL);
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) != 0) {
		zpool_close(zhp);
		return (0);
	}

	if ((cbp->cb_vdev = find_vdev(zpool_get_handle(zhp), nvroot,
	    cbp->cb_fru, 0)) != NULL) {
		cbp->cb_zhp = zhp;
		return (1);
	}

	zpool_close(zhp);
	return (0);
}
/*
 * Given a FRU FMRI, find the matching pool and vdev.
 */
static zpool_handle_t *
find_by_fru(libzfs_handle_t *zhdl, const char *fru, nvlist_t **vdevp)
{
	find_cbdata_t cb;

	cb.cb_fru = fru;
	cb.cb_zhp = NULL;
	if (zpool_iter(zhdl, search_pool, &cb) != 1)
		return (NULL);

	*vdevp = cb.cb_vdev;
	return (cb.cb_zhp);
}
#endif /* HAVE_LIBTOPO */
/*
 * Given a vdev, attempt to replace it with every known spare until one
 * succeeds.
 */
static void
replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
{
	nvlist_t *config, *nvroot, *replacement;
	nvlist_t **spares;
	uint_t s, nspares;
	char *dev_name;

	config = zpool_get_config(zhp, NULL);
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) != 0)
		return;

	/*
	 * Find out if there are any hot spares available in the pool.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return;

	replacement = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT);

	dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);

	/*
	 * Try to replace each spare, ending when we successfully
	 * replace it.
	 */
	for (s = 0; s < nspares; s++) {
		char *spare_name;

		if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
		    &spare_name) != 0)
			continue;

		(void) nvlist_add_nvlist_array(replacement,
		    ZPOOL_CONFIG_CHILDREN, &spares[s], 1);

		fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'",
		    dev_name, basename(spare_name));

		if (zpool_vdev_attach(zhp, dev_name, spare_name,
		    replacement, B_TRUE) == 0)
			break;
	}

	free(dev_name);
	nvlist_free(replacement);
}
/*
 * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and
 * ASRU is now usable.  ZFS has found the device to be present and
 * functioning.
 */
static void
zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl)
{
	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
	zfs_retire_repaired_t *zrp;
	uint64_t pool_guid, vdev_guid;
	nvlist_t *asru;

	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
	    &pool_guid) != 0 || nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
		return;

	/*
	 * Before checking the state of the ASRU, go through and see if we've
	 * already made an attempt to repair this ASRU.  This list is cleared
	 * whenever we receive any kind of list event, and is designed to
	 * prevent us from generating a feedback loop when we attempt repairs
	 * against a faulted pool.  The problem is that checking the unusable
	 * state of the ASRU can involve opening the pool, which can post
	 * statechange events but otherwise leave the pool in the faulted
	 * state.  This list allows us to detect when a statechange event is
	 * due to our own request.
	 */
	for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) {
		if (zrp->zrr_pool == pool_guid &&
		    zrp->zrr_vdev == vdev_guid)
			return;
	}

	asru = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_uint8(asru, FM_VERSION, ZFS_SCHEME_VERSION0);
	(void) nvlist_add_string(asru, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
	(void) nvlist_add_uint64(asru, FM_FMRI_ZFS_POOL, pool_guid);
	(void) nvlist_add_uint64(asru, FM_FMRI_ZFS_VDEV, vdev_guid);

	/*
	 * We explicitly check for the unusable state here to make sure we
	 * aren't responding to a transient state change.  As part of opening a
	 * vdev, it's possible to see the 'statechange' event, only to be
	 * followed by a vdev failure later.  If we don't check the current
	 * state of the vdev (or pool) before marking it repaired, then we risk
	 * generating spurious repair events followed immediately by the same
	 * diagnosis.
	 *
	 * This assumes that the ZFS scheme code associates unusable (i.e.
	 * isolated) with its own definition of faulty state.  In the case of a
	 * DEGRADED leaf vdev (due to checksum errors), this is not the case.
	 * This works, however, because the transient state change is not
	 * posted in this case.  This could be made more explicit by not
	 * relying on the scheme's unusable callback and instead directly
	 * checking the vdev state, where we could correctly account for
	 * DEGRADED state.
	 */
	if (!fmd_nvl_fmri_unusable(hdl, asru) && fmd_nvl_fmri_has_fault(hdl,
	    asru, FMD_HAS_FAULT_ASRU, NULL)) {
		topo_hdl_t *thp;
		char *fmri = NULL;
		int err;

		thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
		if (topo_fmri_nvl2str(thp, asru, &fmri, &err) == 0)
			(void) fmd_repair_asru(hdl, fmri);
		fmd_hdl_topo_rele(hdl, thp);

		topo_hdl_strfree(thp, fmri);
	}

	zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP);
	zrp->zrr_next = zdp->zrd_repaired;
	zrp->zrr_pool = pool_guid;
	zrp->zrr_vdev = vdev_guid;
	zdp->zrd_repaired = zrp;

	fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu",
	    vdev_guid, pool_guid);
}
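
/*
 * Main event entry point: dispatch removal, statechange, repair, and fault
 * list events, retiring or repairing the affected vdevs as appropriate.
 */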
static void
zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    const char *class)
{
	uint64_t pool_guid, vdev_guid;
	zpool_handle_t *zhp;
	nvlist_t *resource, *fault;
	nvlist_t **faults;
	uint_t f, nfaults;
	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
	libzfs_handle_t *zhdl = zdp->zrd_hdl;
	boolean_t fault_device, degrade_device;
	boolean_t is_repair;
	char *scheme;
	nvlist_t *vdev = NULL;
	nvlist_t *fru;
	topo_hdl_t *thp;
	char *fmri;
	char *uuid;
	int err;
	int repair_done = 0;
	boolean_t retire;
	boolean_t is_disk;
	vdev_aux_t aux;
	uint64_t state = 0;

	fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class);

	/*
	 * If this is a resource notifying us of device removal, then simply
	 * check for an available spare and continue.
	 */
	if (strcmp(class, "resource.fs.zfs.removed") == 0) {
		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
		    &pool_guid) != 0 ||
		    nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    &vdev_guid) != 0)
			return;

		if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
		    &vdev)) == NULL)
			return;

		if (fmd_prop_get_int32(hdl, "spare_on_remove"))
			replace_with_spare(hdl, zhp, vdev);
		zpool_close(zhp);
		return;
	}

	if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
		return;

	/*
	 * Note: on zfsonlinux statechange events are more than just
	 * healthy ones so we need to confirm the actual state value.
	 */
	if (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
	    nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE,
	    &state) == 0 && state == VDEV_STATE_HEALTHY) {
		zfs_vdev_repair(hdl, nvl);
		return;
	}

	if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
		zfs_vdev_repair(hdl, nvl);
		return;
	}

	zfs_retire_clear_data(hdl, zdp);

	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
		is_repair = B_TRUE;
	else
		is_repair = B_FALSE;

	/*
	 * We subscribe to zfs faults as well as all repair events.
	 */
	if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
	    &faults, &nfaults) != 0)
		return;

	for (f = 0; f < nfaults; f++) {
		fault = faults[f];

		fault_device = B_FALSE;
		degrade_device = B_FALSE;
		is_disk = B_FALSE;

		if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE,
		    &retire) == 0 && retire == 0)
			continue;

		/*
		 * While we subscribe to fault.fs.zfs.*, we only take action
		 * for faults targeting a specific vdev (open failure or SERD
		 * failure).  We also subscribe to fault.io.* events, so that
		 * faulty disks will be faulted in the ZFS configuration.
		 */
		if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) {
			fault_device = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, fault,
		    "fault.fs.zfs.vdev.checksum")) {
			degrade_device = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, fault,
		    "fault.fs.zfs.device")) {
			fault_device = B_FALSE;
		} else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) {
			is_disk = B_TRUE;
			fault_device = B_TRUE;
		} else {
			continue;
		}

		if (is_disk) {
			/*
			 * This is a disk fault.  Lookup the FRU, convert it to
			 * an FMRI string, and attempt to find a matching vdev.
			 */
			if (nvlist_lookup_nvlist(fault, FM_FAULT_FRU,
			    &fru) != 0 ||
			    nvlist_lookup_string(fru, FM_FMRI_SCHEME,
			    &scheme) != 0)
				continue;

			if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0)
				continue;

			thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION);
			if (topo_fmri_nvl2str(thp, fru, &fmri, &err) != 0) {
				fmd_hdl_topo_rele(hdl, thp);
				continue;
			}

			zhp = find_by_fru(zhdl, fmri, &vdev);
			topo_hdl_strfree(thp, fmri);
			fmd_hdl_topo_rele(hdl, thp);

			if (zhp == NULL)
				continue;

			(void) nvlist_lookup_uint64(vdev,
			    ZPOOL_CONFIG_GUID, &vdev_guid);
			aux = VDEV_AUX_EXTERNAL;
		} else {
			/*
			 * This is a ZFS fault.  Lookup the resource, and
			 * attempt to find the matching vdev.
			 */
			if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE,
			    &resource) != 0 ||
			    nvlist_lookup_string(resource, FM_FMRI_SCHEME,
			    &scheme) != 0)
				continue;

			if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0)
				continue;

			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL,
			    &pool_guid) != 0)
				continue;

			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV,
			    &vdev_guid) != 0) {
				if (is_repair)
					vdev_guid = 0;
				else
					continue;
			}

			if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
			    &vdev)) == NULL)
				continue;

			aux = VDEV_AUX_ERR_EXCEEDED;
		}

		if (vdev_guid == 0) {
			/*
			 * For pool-level repair events, clear the entire pool.
			 */
			fmd_hdl_debug(hdl, "zpool_clear of pool '%s'",
			    zpool_get_name(zhp));
			(void) zpool_clear(zhp, NULL, NULL);
			zpool_close(zhp);
			continue;
		}

		/*
		 * If this is a repair event, then mark the vdev as repaired
		 * and continue.
		 */
		if (is_repair) {
			repair_done = 1;
			fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu",
			    zpool_get_name(zhp), vdev_guid);
			(void) zpool_vdev_clear(zhp, vdev_guid);
			zpool_close(zhp);
			continue;
		}

		/*
		 * Actively fault the device if needed.
		 */
		if (fault_device)
			(void) zpool_vdev_fault(zhp, vdev_guid, aux);
		if (degrade_device)
			(void) zpool_vdev_degrade(zhp, vdev_guid, aux);

		if (fault_device || degrade_device)
			fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'",
			    fault_device ? "fault" : "degrade", vdev_guid,
			    zpool_get_name(zhp));

		/*
		 * Attempt to substitute a hot spare.
		 */
		replace_with_spare(hdl, zhp, vdev);
		zpool_close(zhp);
	}

	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done &&
	    nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0)
		fmd_case_uuresolved(hdl, uuid);
}
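
/*
 * FMD module operations: only the event receive entry point is needed.
 */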
static const fmd_hdl_ops_t fmd_ops = {
	zfs_retire_recv,	/* fmdo_recv */
	NULL,			/* fmdo_timeout */
	NULL,			/* fmdo_close */
	NULL,			/* fmdo_stats */
	NULL,			/* fmdo_gc */
};
static const fmd_prop_t fmd_props[] = {
	{ "spare_on_remove", FMD_TYPE_BOOL, "true" },
	{ NULL, 0, NULL }
};
static const fmd_hdl_info_t fmd_info = {
	"ZFS Retire Agent", "1.0", &fmd_ops, fmd_props
};
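
/*
 * Module entry point: open a libzfs handle, register with the FMD
 * framework, and stash the per-module state.
 */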
void
_zfs_retire_init(fmd_hdl_t *hdl)
{
	zfs_retire_data_t *zdp;
	libzfs_handle_t *zhdl;

	if ((zhdl = __libzfs_init()) == NULL)
		return;

	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
		__libzfs_fini(zhdl);
		return;
	}

	zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP);
	zdp->zrd_hdl = zhdl;

	fmd_hdl_setspecific(hdl, zdp);
}
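
/*
 * Module teardown: release any repaired-vdev records, the libzfs handle,
 * and the per-module state.
 */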
void
_zfs_retire_fini(fmd_hdl_t *hdl)
{
	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);

	if (zdp != NULL) {
		zfs_retire_clear_data(hdl, zdp);
		__libzfs_fini(zdp->zrd_hdl);
		fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t));
	}
}