]>
Commit | Line | Data |
---|---|---|
ea04106b AX |
1 | #!/bin/sh |
2 | # | |
3 | # Replace a device with a hot spare in response to IO or checksum errors. | |
4 | # The following actions will be performed automatically when the number | |
5 | # of errors exceeds the limit set by ZED_SPARE_ON_IO_ERRORS or | |
6 | # ZED_SPARE_ON_CHECKSUM_ERRORS. | |
7 | # | |
8 | # 1) FAULT the device on IO errors, no further IO will be attempted. | |
9 | # DEGRADE the device on checksum errors, the device is still | |
10 | # functional and can be used to service IO requests. | |
11 | # 2) Set the SES fault beacon for the device. | |
12 | # 3) Replace the device with a hot spare if any are available. | |
13 | # | |
14 | # Once the hot sparing operation is complete either the failed device or | |
15 | # the hot spare must be manually retired using the 'zpool detach' command. | |
16 | # The 'autoreplace' functionality which would normally take care of this | |
17 | # under Illumos has not yet been implemented. | |
18 | # | |
19 | # Full support for autoreplace is planned, but it requires that the full | |
20 | # ZFS Diagnosis Engine be ported. In the meantime this script provides | |
21 | # the majority of the expected hot spare functionality. | |
22 | # | |
23 | # Exit codes: | |
24 | # 0: replaced by hot spare | |
25 | # 1: no hot spare device available | |
26 | # 2: hot sparing disabled | |
27 | # 3: already faulted or degraded | |
28 | # 4: unsupported event class | |
29 | # 5: internal error | |
30 | # | |
# Load the optional ZED configuration; it may override the defaults below.
test -f "${ZED_ZEDLET_DIR}/zed.rc" && . "${ZED_ZEDLET_DIR}/zed.rc"

# All of these event fields are required.  If any is missing the event
# cannot be acted on, which is reported as an internal error (exit code 5).
test -n "${ZEVENT_POOL}" || exit 5
test -n "${ZEVENT_SUBCLASS}" || exit 5
test -n "${ZEVENT_VDEV_PATH}" || exit 5
test -n "${ZEVENT_VDEV_GUID}" || exit 5

# Defaults to disabled, enable in the zed.rc file.
ZED_SPARE_ON_IO_ERRORS=${ZED_SPARE_ON_IO_ERRORS:-0}
ZED_SPARE_ON_CHECKSUM_ERRORS=${ZED_SPARE_ON_CHECKSUM_ERRORS:-0}

# Both thresholds at zero means hot sparing is disabled (exit code 2).
# Two separate tests joined by '&&' are used instead of the obsolescent
# and ambiguous '-a' operator; the operands are quoted so they are always
# passed to test as single words.
if [ "${ZED_SPARE_ON_IO_ERRORS}" -eq 0 ] &&
    [ "${ZED_SPARE_ON_CHECKSUM_ERRORS}" -eq 0 ]; then
    exit 2
fi

# A lock file is used to serialize execution.
ZED_LOCKDIR=${ZED_LOCKDIR:-/var/lock}
LOCKFILE="${ZED_LOCKDIR}/zed.spare.lock"

# Open the lock file on file descriptor 8 for the remainder of the script
# and take an exclusive lock on that descriptor.
exec 8> "${LOCKFILE}"
flock -x 8
53 | ||
54 | # Given a <pool> and <device> return the status, (ONLINE, FAULTED, etc...). | |
55 | vdev_status() { | |
56 | local POOL=$1 | |
57 | local VDEV=`basename $2` | |
58 | local T=' ' # tab character since '\t' isn't portable | |
59 | ||
60 | ${ZPOOL} status ${POOL} | sed -n -e \ | |
61 | "s,^[ $T]*\(.*$VDEV\(-part[0-9]\+\)\?\)[ $T]*\([A-Z]\+\).*,\1 \3,p" | |
62 | return 0 | |
63 | } | |
64 | ||
# Decide which action, if any, this event calls for.  ACTION is cleared
# first so that a value inherited from the environment or from zed.rc can
# never trigger a fault when the threshold below has not been reached.
ACTION=

# Fault devices after N I/O errors.
if [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.io" ]; then
    # A counter missing from the event is treated as zero.
    ERRORS=$(( ${ZEVENT_VDEV_READ_ERRORS:-0} + ${ZEVENT_VDEV_WRITE_ERRORS:-0} ))

    if [ "${ZED_SPARE_ON_IO_ERRORS}" -gt 0 ] &&
        [ "${ERRORS}" -ge "${ZED_SPARE_ON_IO_ERRORS}" ]; then
        ACTION="fault"
    fi
# Degrade devices after N checksum errors.
elif [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.checksum" ]; then
    ERRORS=${ZEVENT_VDEV_CKSUM_ERRORS:-0}

    if [ "${ZED_SPARE_ON_CHECKSUM_ERRORS}" -gt 0 ] &&
        [ "${ERRORS}" -ge "${ZED_SPARE_ON_CHECKSUM_ERRORS}" ]; then
        ACTION="degrade"
    fi
fi

if [ -n "${ACTION}" ]; then

    # Device is already FAULTED or DEGRADED
    #
    # The command substitution is intentionally unquoted: vdev_status
    # prints "<name> <STATE>" and word splitting assigns them to $1 and $2.
    set -- $(vdev_status "${ZEVENT_POOL}" "${ZEVENT_VDEV_PATH}")
    ZEVENT_VDEV_PATH_FOUND=$1
    STATUS=$2
    if [ "${STATUS}" = "FAULTED" ] || [ "${STATUS}" = "DEGRADED" ]; then
        exit 3
    fi

    # Step 1) FAULT or DEGRADE the device
    #
    ${ZINJECT} -d "${ZEVENT_VDEV_GUID}" -A "${ACTION}" "${ZEVENT_POOL}"

    # Step 2) Set the SES fault beacon.
    #
    # XXX: Set the 'fault' or 'ident' beacon for the device.  This can
    # be done through the sg_ses utility, the only hard part is to map
    # the sd device to its corresponding enclosure and slot.  We may
    # be able to leverage the existing vdev_id scripts for this.
    #
    # $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3
    # $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3

    # Step 3) Replace the device with a hot spare.
    #
    # Try each listed spare in order and use the first one that reports
    # AVAIL and is accepted by 'zpool replace'.  ZEVENT_VDEV_SPARE_PATHS is
    # a whitespace separated list, so it is deliberately left unquoted.
    #
    for SPARE in ${ZEVENT_VDEV_SPARE_PATHS}; do
        set -- $(vdev_status "${ZEVENT_POOL}" "${SPARE}")
        SPARE_VDEV_FOUND=$1
        STATUS=$2
        if [ "${STATUS}" = "AVAIL" ]; then
            ${ZPOOL} replace "${ZEVENT_POOL}" \
                "${ZEVENT_VDEV_GUID}" "${SPARE_VDEV_FOUND}" && exit 0
        fi
    done

    # No spare was available, or every replace attempt failed.
    exit 1
fi

# No action was selected: the event class is not handled here or the error
# threshold has not been reached.
exit 4