]> git.proxmox.com Git - zfsonlinux.git/blob - zfs-patches/0042-Add-SMART-self-test-results-to-zpool-status-c.patch
revert potentially buggy zap_add change
[zfsonlinux.git] / zfs-patches / 0042-Add-SMART-self-test-results-to-zpool-status-c.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Tony Hutter <hutter2@llnl.gov>
3 Date: Tue, 27 Feb 2018 09:31:27 -0800
4 Subject: [PATCH] Add SMART self-test results to zpool status -c
5 MIME-Version: 1.0
6 Content-Type: text/plain; charset=UTF-8
7 Content-Transfer-Encoding: 8bit
8
9 Add in SMART self-test results to zpool status|iostat -c. This
10 works for both SAS and SATA drives.
11
12 Also, add plumbing to allow the 'smart' script to take smartctl
13 output from a directory of output text files instead of running
14 it against the vdevs.
15
16 Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
17 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
18 Signed-off-by: Tony Hutter <hutter2@llnl.gov>
19 Closes #7178
20 (cherry picked from commit 5e3085e360161456fe2af697494c479de0ee2085)
21 Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
22 ---
23 cmd/zpool/Makefile.am | 14 ++++-
24 cmd/zpool/zpool.d/smart | 132 +++++++++++++++++++++++++++++++++++-----
25 cmd/zpool/zpool.d/smart_test | 1 +
26 cmd/zpool/zpool.d/test_ended | 1 +
27 cmd/zpool/zpool.d/test_progress | 1 +
28 cmd/zpool/zpool.d/test_status | 1 +
29 cmd/zpool/zpool.d/test_type | 1 +
30 7 files changed, 133 insertions(+), 18 deletions(-)
31 create mode 120000 cmd/zpool/zpool.d/smart_test
32 create mode 120000 cmd/zpool/zpool.d/test_ended
33 create mode 120000 cmd/zpool/zpool.d/test_progress
34 create mode 120000 cmd/zpool/zpool.d/test_status
35 create mode 120000 cmd/zpool/zpool.d/test_type
36
37 diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am
38 index c7b8b76e3..d07f8d616 100644
39 --- a/cmd/zpool/Makefile.am
40 +++ b/cmd/zpool/Makefile.am
41 @@ -63,7 +63,12 @@ dist_zpoolexec_SCRIPTS = \
42 zpool.d/nvme_err \
43 zpool.d/pwr_cyc \
44 zpool.d/upath \
45 - zpool.d/vendor
46 + zpool.d/vendor \
47 + zpool.d/smart_test \
48 + zpool.d/test_type \
49 + zpool.d/test_status \
50 + zpool.d/test_progress \
51 + zpool.d/test_ended
52
53 zpoolconfdefaults = \
54 enc \
55 @@ -102,7 +107,12 @@ zpoolconfdefaults = \
56 nvme_err \
57 pwr_cyc \
58 upath \
59 - vendor
60 + vendor \
61 + smart_test \
62 + test_type \
63 + test_status \
64 + test_progress \
65 + test_ended
66
67 install-data-hook:
68 $(MKDIR_P) "$(DESTDIR)$(zpoolconfdir)"
69 diff --git a/cmd/zpool/zpool.d/smart b/cmd/zpool/zpool.d/smart
70 index 4bc3af39d..64b5f6e4e 100755
71 --- a/cmd/zpool/zpool.d/smart
72 +++ b/cmd/zpool/zpool.d/smart
73 @@ -24,8 +24,44 @@ ata_err: Show SMART ATA errors (ATA).
74 pwr_cyc: Show SMART power cycle count (ATA).
75 serial: Show disk serial number.
76 nvme_err: Show SMART NVMe errors (NVMe).
77 +smart_test: Show SMART self-test results summary.
78 +test_type: Show SMART self-test type (short, long... ).
79 +test_status: Show SMART self-test status.
80 +test_progress: Show SMART self-test percentage done.
81 +test_ended: Show when the last SMART self-test ended (if supported).
82 "
83
84 +# Hack for developer testing
85 +#
86 +# If you set $samples to a directory containing smartctl output text files,
87 +# we will use them instead of running smartctl on the vdevs. This can be
88 +# useful if you want to test a bunch of different smartctl outputs. Also, if
89 +# $samples is set, an additional 'file' column is added to the zpool output
90 +# showing the filename.
91 +samples=
92 +
93 +# get_filename_from_dir DIR
94 +#
95 +# Look in directory DIR and return a filename from it. The filename returned
96 +# is chosen quasi-sequentially (based off our PID). This allows us to return
97 +# a different filename every time this script is invoked (which we do for each
98 +# vdev), without having to maintain state.
99 +get_filename_from_dir()
100 +{
101 + dir=$1
102 + pid="$$"
103 + num_files=$(find "$dir" -maxdepth 1 -type f | wc -l)
104 + mod=$((pid % num_files))
105 + i=0
106 + find "$dir" -type f -printf "%f\n" | while read -r file ; do
107 + if [ "$mod" = "$i" ] ; then
108 + echo "$file"
109 + break
110 + fi
111 + i=$((i+1))
112 + done
113 +}
114 +
115 script=$(basename "$0")
116
117 if [ "$1" = "-h" ] ; then
118 @@ -35,8 +71,16 @@ fi
119
120 smartctl_path=$(which smartctl)
121
122 -if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
123 - raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH")
124 +if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then
125 + if [ -n "$samples" ] ; then
126 + # cat a smartctl output text file instead of running smartctl
127 + # on a vdev (only used for developer testing).
128 + file=$(get_filename_from_dir $samples)
129 + echo "file=$file"
130 + raw_out=$(cat "$samples/$file")
131 + else
132 + raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH")
133 + fi
134
135 # What kind of drive are we? Look for the right line in smartctl:
136 #
137 @@ -49,7 +93,6 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
138 # NVMe:
139 # SMART/Health Information (NVMe Log 0xnn, NSID 0xnn)
140 #
141 - type=$(echo "$raw_out" | grep -m 1 -Eo '^ATA|NVMe|SAS$')
142 out=$(echo "$raw_out" | awk '
143 # SAS specific
144 /read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8}
145 @@ -58,10 +101,11 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
146 /Elements in grown defect list/{print "defect="$6}
147
148 # SAS common
149 +/SAS/{type="sas"}
150 /Drive Temperature:/{print "temp="$4}
151 # Status can be a long string, substitute spaces for '_'
152 /SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i}
153 -/number of hours powered up/{print "hours_on="$7}
154 +/number of hours powered up/{print "hours_on="$7; hours_on=int($7)}
155 /Serial number:/{print "serial="$3}
156
157 # SATA specific
158 @@ -74,13 +118,16 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
159 /Power_Cycle_Count/{print "pwr_cyc="$10}
160
161 # SATA common
162 +/SATA/{type="sata"}
163 /Temperature_Celsius/{print "temp="$10}
164 /Airflow_Temperature_Cel/{print "temp="$10}
165 +/Current Temperature:/{print "temp="$3}
166 /SMART overall-health self-assessment test result:/{print "health="$6}
167 -/Power_On_Hours/{print "hours_on="$10}
168 +/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)}
169 /Serial Number:/{print "serial="$3}
170
171 # NVMe common
172 +/NVMe/{type="nvme"}
173 /Temperature:/{print "temp="$2}
174 /SMART overall-health self-assessment test result:/{print "health="$6}
175 /Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4}
176 @@ -90,39 +137,92 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
177 # NVMe specific
178 /Media and Data Integrity Errors:/{print "nvme_err="$6}
179
180 -END {ORS="\n"; print ""}
181 +# SMART self-test info
182 +/Self-test execution status:/{progress=tolower($4)} # SAS
183 +/SMART Self-test log/{test_seen=1} # SAS
184 +/SMART Extended Self-test Log/{test_seen=1} # SATA
185 +/# 1/{
186 + test_type=tolower($3"_"$4);
187 + # Status could be one word ("Completed") or multiple ("Completed: read
188 + # failure"). Look for the ":" to see if we need to grab more words.
189 +
190 + if ($5 ~ ":")
191 + status=tolower($5""$6"_"$7)
192 + else
193 + status=tolower($5)
194 + if (status=="self")
195 + status="running";
196 +
197 + if (type == "sas") {
198 + hours=int($(NF-4))
199 + } else {
200 + hours=int($(NF-1))
201 + # SATA reports percent remaining, rather than percent done
202 + # Convert it to percent done.
203 + progress=(100-int($(NF-2)))"%"
204 + }
205 + # When we int()-ify "hours", it converts stuff like "NOW" and "-" into
206 + # 0. In those cases, set it to hours_on, so they will cancel out in
207 + # the "hours_ago" calculation later on.
208 + if (hours == 0)
209 + hours=hours_on
210 +
211 + if (test_seen) {
212 + print "test="hours_on
213 + print "test_type="test_type
214 + print "test_status="status
215 + print "test_progress="progress
216 + }
217 + # Not all drives report hours_on
218 + if (hours_on && hours) {
219 + total_hours_ago=(hours_on-hours)
220 + days_ago=int(total_hours_ago/24)
221 + hours_ago=(total_hours_ago % 24)
222 + if (days_ago != 0)
223 + ago_str=days_ago"d"
224 + if (hours_ago !=0)
225 + ago_str=ago_str""hours_ago"h"
226 + print "test_ended="ago_str
227 + }
228 +}
229 +
230 +END {print "type="type; ORS="\n"; print ""}
231 ');
232 fi
233 +type=$(echo "$out" | grep '^type=' | cut -d '=' -f 2)
234
235 -# if type is not set by now, either we don't have a block device
236 -# or smartctl failed. Either way, default to ATA and set out to
237 -# nothing
238 +# If type is not set by now, either we don't have a block device
239 +# or smartctl failed. Either way, default to "sata" and set $out to
240 +# nothing.
241 if [ -z "$type" ]; then
242 - type="ATA"
243 + type="sata"
244 out=
245 fi
246
247 case $script in
248 smart)
249 # Print temperature plus common predictors of drive failure
250 - if [ "$type" = "SAS" ] ; then
251 + if [ "$type" = "sas" ] ; then
252 scripts="temp|health|r_ucor|w_ucor"
253 - elif [ "$type" = "ATA" ] ; then
254 + elif [ "$type" = "sata" ] ; then
255 scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor"
256 - elif [ "$type" = "NVMe" ] ; then
257 + elif [ "$type" = "nvme" ] ; then
258 scripts="temp|health|nvme_err"
259 fi
260 ;;
261 smartx)
262 # Print some other interesting stats
263 - if [ "$type" = "SAS" ] ; then
264 + if [ "$type" = "sas" ] ; then
265 scripts="hours_on|defect|nonmed|r_proc|w_proc"
266 - elif [ "$type" = "ATA" ] ; then
267 + elif [ "$type" = "sata" ] ; then
268 scripts="hours_on|pwr_cyc"
269 - elif [ "$type" = "NVMe" ] ; then
270 + elif [ "$type" = "nvme" ] ; then
271 scripts="hours_on|pwr_cyc"
272 fi
273 ;;
274 +smart_test)
275 + scripts="test_type|test_status|test_progress|test_ended"
276 + ;;
277 *)
278 scripts="$script"
279 esac
280 diff --git a/cmd/zpool/zpool.d/smart_test b/cmd/zpool/zpool.d/smart_test
281 new file mode 120000
282 index 000000000..94f22861f
283 --- /dev/null
284 +++ b/cmd/zpool/zpool.d/smart_test
285 @@ -0,0 +1 @@
286 +smart
287 \ No newline at end of file
288 diff --git a/cmd/zpool/zpool.d/test_ended b/cmd/zpool/zpool.d/test_ended
289 new file mode 120000
290 index 000000000..94f22861f
291 --- /dev/null
292 +++ b/cmd/zpool/zpool.d/test_ended
293 @@ -0,0 +1 @@
294 +smart
295 \ No newline at end of file
296 diff --git a/cmd/zpool/zpool.d/test_progress b/cmd/zpool/zpool.d/test_progress
297 new file mode 120000
298 index 000000000..94f22861f
299 --- /dev/null
300 +++ b/cmd/zpool/zpool.d/test_progress
301 @@ -0,0 +1 @@
302 +smart
303 \ No newline at end of file
304 diff --git a/cmd/zpool/zpool.d/test_status b/cmd/zpool/zpool.d/test_status
305 new file mode 120000
306 index 000000000..94f22861f
307 --- /dev/null
308 +++ b/cmd/zpool/zpool.d/test_status
309 @@ -0,0 +1 @@
310 +smart
311 \ No newline at end of file
312 diff --git a/cmd/zpool/zpool.d/test_type b/cmd/zpool/zpool.d/test_type
313 new file mode 120000
314 index 000000000..94f22861f
315 --- /dev/null
316 +++ b/cmd/zpool/zpool.d/test_type
317 @@ -0,0 +1 @@
318 +smart
319 \ No newline at end of file
320 --
321 2.14.2
322