3 # Replace a device with a hot spare in response to IO or checksum errors.
4 # The following actions will be performed automatically when the number
5 # of errors exceeds the limit set by ZED_SPARE_ON_IO_ERRORS or
6 # ZED_SPARE_ON_CHECKSUM_ERRORS.
8 # 1) FAULT the device on IO errors; no further IO will be attempted.
9 # DEGRADE the device on checksum errors, the device is still
10 # functional and can be used to service IO requests.
11 # 2) Set the SES fault beacon for the device.
12 # 3) Replace the device with a hot spare if any are available.
14 # Once the hot sparing operation is complete, either the failed device or
15 # the hot spare must be manually retired using the 'zpool detach' command.
16 # The 'autoreplace' functionality which would normally take care of this
17 # under Illumos has not yet been implemented.
19 # Full support for autoreplace is planned, but it requires that the full
20 # ZFS Diagnosis Engine be ported. In the meantime, this script provides
21 # the majority of the expected hot spare functionality.
24 # 0: replaced by hot spare
25 # 1: no hot spare device available
26 # 2: hot sparing disabled
27 # 3: already faulted or degraded
28 # 4: unsupported event class
31 test -f "${ZED_ZEDLET_DIR}/zed.rc" && .
"${ZED_ZEDLET_DIR}/zed.rc"
33 test -n "${ZEVENT_POOL}" ||
exit 5
34 test -n "${ZEVENT_SUBCLASS}" ||
exit 5
35 test -n "${ZEVENT_VDEV_PATH}" ||
exit 5
36 test -n "${ZEVENT_VDEV_GUID}" ||
exit 5
38 # Defaults to disabled; enable in the zed.rc file.
39 ZED_SPARE_ON_IO_ERRORS
=${ZED_SPARE_ON_IO_ERRORS:-0}
40 ZED_SPARE_ON_CHECKSUM_ERRORS
=${ZED_SPARE_ON_CHECKSUM_ERRORS:-0}
42 if [ ${ZED_SPARE_ON_IO_ERRORS} -eq 0 -a \
43 ${ZED_SPARE_ON_CHECKSUM_ERRORS} -eq 0 ]; then
47 # A lock file is used to serialize execution.
48 ZED_LOCKDIR
=${ZED_LOCKDIR:-/var/lock}
49 LOCKFILE
="${ZED_LOCKDIR}/zed.spare.lock"
54 # Given a <pool> and <device>, return the status (ONLINE, FAULTED, etc.).
57 local VDEV
=`basename $2`
58 local T
=' ' # tab character since '\t' isn't portable
60 ${ZPOOL} status
${POOL} |
sed -n -e \
61 "s,^[ $T]*\(.*$VDEV\(-part[0-9]\+\)\?\)[ $T]*\([A-Z]\+\).*,\1 \3,p"
65 # Fault devices after N I/O errors.
66 if [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.io" ]; then
67 ERRORS
=`expr ${ZEVENT_VDEV_READ_ERRORS} + ${ZEVENT_VDEV_WRITE_ERRORS}`
69 if [ ${ZED_SPARE_ON_IO_ERRORS} -gt 0 -a \
70 ${ERRORS} -ge ${ZED_SPARE_ON_IO_ERRORS} ]; then
73 # Degrade devices after N checksum errors.
74 elif [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.checksum" ]; then
75 ERRORS
=${ZEVENT_VDEV_CKSUM_ERRORS}
77 if [ ${ZED_SPARE_ON_CHECKSUM_ERRORS} -gt 0 -a \
78 ${ERRORS} -ge ${ZED_SPARE_ON_CHECKSUM_ERRORS} ]; then
85 if [ -n "${ACTION}" ]; then
87 # Device is already FAULTED or DEGRADED
88 set -- `vdev_status ${ZEVENT_POOL} ${ZEVENT_VDEV_PATH}`
89 ZEVENT_VDEV_PATH_FOUND
=$1
91 if [ "${STATUS}" = "FAULTED" -o "${STATUS}" = "DEGRADED" ]; then
95 # Step 1) FAULT or DEGRADE the device
97 ${ZINJECT} -d ${ZEVENT_VDEV_GUID} -A ${ACTION} ${ZEVENT_POOL}
99 # Step 2) Set the SES fault beacon.
101 # XXX: Set the 'fault' or 'ident' beacon for the device. This can
102 # be done through the sg_ses utility, the only hard part is to map
103 # the sd device to its corresponding enclosure and slot. We may
104 # be able to leverage the existing vdev_id scripts for this.
106 # $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3
107 # $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3
109 # Step 3) Replace the device with a hot spare.
111 # Round robin through the spares selecting those which are available.
113 for SPARE
in ${ZEVENT_VDEV_SPARE_PATHS}; do
114 set -- `vdev_status ${ZEVENT_POOL} ${SPARE}`
117 if [ "${STATUS}" = "AVAIL" ]; then
118 ${ZPOOL} replace
${ZEVENT_POOL} \
119 ${ZEVENT_VDEV_GUID} ${SPARE_VDEV_FOUND} && exit 0