]>
Commit | Line | Data |
---|---|---|
75b07eca FG |
1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
2 | From: Tony Hutter <hutter2@llnl.gov> | |
3 | Date: Tue, 27 Feb 2018 09:31:27 -0800 | |
4 | Subject: [PATCH] Add SMART self-test results to zpool status -c | |
5 | MIME-Version: 1.0 | |
6 | Content-Type: text/plain; charset=UTF-8 | |
7 | Content-Transfer-Encoding: 8bit | |
8 | ||
9 | Add in SMART self-test results to zpool status|iostat -c. This | |
10 | works for both SAS and SATA drives. | |
11 | ||
12 | Also, add plumbing to allow the 'smart' script to take smartctl | |
13 | output from a directory of output text files instead of running | |
14 | it against the vdevs. | |
15 | ||
16 | Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov> | |
17 | Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> | |
18 | Signed-off-by: Tony Hutter <hutter2@llnl.gov> | |
19 | Closes #7178 | |
20 | (cherry picked from commit 5e3085e360161456fe2af697494c479de0ee2085) | |
21 | Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com> | |
22 | --- | |
23 | cmd/zpool/Makefile.am | 14 ++++- | |
24 | cmd/zpool/zpool.d/smart | 132 +++++++++++++++++++++++++++++++++++----- | |
25 | cmd/zpool/zpool.d/smart_test | 1 + | |
26 | cmd/zpool/zpool.d/test_ended | 1 + | |
27 | cmd/zpool/zpool.d/test_progress | 1 + | |
28 | cmd/zpool/zpool.d/test_status | 1 + | |
29 | cmd/zpool/zpool.d/test_type | 1 + | |
30 | 7 files changed, 133 insertions(+), 18 deletions(-) | |
31 | create mode 120000 cmd/zpool/zpool.d/smart_test | |
32 | create mode 120000 cmd/zpool/zpool.d/test_ended | |
33 | create mode 120000 cmd/zpool/zpool.d/test_progress | |
34 | create mode 120000 cmd/zpool/zpool.d/test_status | |
35 | create mode 120000 cmd/zpool/zpool.d/test_type | |
36 | ||
37 | diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am | |
38 | index c7b8b76e3..d07f8d616 100644 | |
39 | --- a/cmd/zpool/Makefile.am | |
40 | +++ b/cmd/zpool/Makefile.am | |
41 | @@ -63,7 +63,12 @@ dist_zpoolexec_SCRIPTS = \ | |
42 | zpool.d/nvme_err \ | |
43 | zpool.d/pwr_cyc \ | |
44 | zpool.d/upath \ | |
45 | - zpool.d/vendor | |
46 | + zpool.d/vendor \ | |
47 | + zpool.d/smart_test \ | |
48 | + zpool.d/test_type \ | |
49 | + zpool.d/test_status \ | |
50 | + zpool.d/test_progress \ | |
51 | + zpool.d/test_ended | |
52 | ||
53 | zpoolconfdefaults = \ | |
54 | enc \ | |
55 | @@ -102,7 +107,12 @@ zpoolconfdefaults = \ | |
56 | nvme_err \ | |
57 | pwr_cyc \ | |
58 | upath \ | |
59 | - vendor | |
60 | + vendor \ | |
61 | + smart_test \ | |
62 | + test_type \ | |
63 | + test_status \ | |
64 | + test_progress \ | |
65 | + test_ended | |
66 | ||
67 | install-data-hook: | |
68 | $(MKDIR_P) "$(DESTDIR)$(zpoolconfdir)" | |
69 | diff --git a/cmd/zpool/zpool.d/smart b/cmd/zpool/zpool.d/smart | |
70 | index 4bc3af39d..64b5f6e4e 100755 | |
71 | --- a/cmd/zpool/zpool.d/smart | |
72 | +++ b/cmd/zpool/zpool.d/smart | |
73 | @@ -24,8 +24,44 @@ ata_err: Show SMART ATA errors (ATA). | |
74 | pwr_cyc: Show SMART power cycle count (ATA). | |
75 | serial: Show disk serial number. | |
76 | nvme_err: Show SMART NVMe errors (NVMe). | |
77 | +smart_test: Show SMART self-test results summary. | |
78 | +test_type: Show SMART self-test type (short, long... ). | |
79 | +test_status: Show SMART self-test status. | |
80 | +test_progress: Show SMART self-test percentage done. | |
81 | +test_ended: Show when the last SMART self-test ended (if supported). | |
82 | " | |
83 | ||
84 | +# Hack for developer testing | |
85 | +# | |
86 | +# If you set $samples to a directory containing smartctl output text files, | |
87 | +# we will use them instead of running smartctl on the vdevs. This can be | |
88 | +# useful if you want to test a bunch of different smartctl outputs. Also, if | |
89 | +# $samples is set, an additional 'file' column is added to the zpool output | |
90 | +# showing the filename. | |
91 | +samples= | |
92 | + | |
93 | +# get_filename_from_dir DIR | |
94 | +# | |
95 | +# Look in directory DIR and return a filename from it. The filename returned | |
96 | +# is chosen quasi-sequentially (based off our PID). This allows us to return | |
97 | +# a different filename every time this script is invoked (which we do for each | |
98 | +# vdev), without having to maintain state. | |
99 | +get_filename_from_dir() | |
100 | +{ | |
101 | + dir=$1 | |
102 | + pid="$$" | |
103 | + num_files=$(find "$dir" -maxdepth 1 -type f | wc -l) | |
104 | + mod=$((pid % num_files)) | |
105 | + i=0 | |
106 | + find "$dir" -type f -printf "%f\n" | while read -r file ; do | |
107 | + if [ "$mod" = "$i" ] ; then | |
108 | + echo "$file" | |
109 | + break | |
110 | + fi | |
111 | + i=$((i+1)) | |
112 | + done | |
113 | +} | |
114 | + | |
115 | script=$(basename "$0") | |
116 | ||
117 | if [ "$1" = "-h" ] ; then | |
118 | @@ -35,8 +71,16 @@ fi | |
119 | ||
120 | smartctl_path=$(which smartctl) | |
121 | ||
122 | -if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then | |
123 | - raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH") | |
124 | +if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then | |
125 | + if [ -n "$samples" ] ; then | |
126 | + # cat a smartctl output text file instead of running smartctl | |
127 | + # on a vdev (only used for developer testing). | |
128 | + file=$(get_filename_from_dir $samples) | |
129 | + echo "file=$file" | |
130 | + raw_out=$(cat "$samples/$file") | |
131 | + else | |
132 | + raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH") | |
133 | + fi | |
134 | ||
135 | # What kind of drive are we? Look for the right line in smartctl: | |
136 | # | |
137 | @@ -49,7 +93,6 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then | |
138 | # NVMe: | |
139 | # SMART/Health Information (NVMe Log 0xnn, NSID 0xnn) | |
140 | # | |
141 | - type=$(echo "$raw_out" | grep -m 1 -Eo '^ATA|NVMe|SAS$') | |
142 | out=$(echo "$raw_out" | awk ' | |
143 | # SAS specific | |
144 | /read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8} | |
145 | @@ -58,10 +101,11 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then | |
146 | /Elements in grown defect list/{print "defect="$6} | |
147 | ||
148 | # SAS common | |
149 | +/SAS/{type="sas"} | |
150 | /Drive Temperature:/{print "temp="$4} | |
151 | # Status can be a long string, substitute spaces for '_' | |
152 | /SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i} | |
153 | -/number of hours powered up/{print "hours_on="$7} | |
154 | +/number of hours powered up/{print "hours_on="$7; hours_on=int($7)} | |
155 | /Serial number:/{print "serial="$3} | |
156 | ||
157 | # SATA specific | |
158 | @@ -74,13 +118,16 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then | |
159 | /Power_Cycle_Count/{print "pwr_cyc="$10} | |
160 | ||
161 | # SATA common | |
162 | +/SATA/{type="sata"} | |
163 | /Temperature_Celsius/{print "temp="$10} | |
164 | /Airflow_Temperature_Cel/{print "temp="$10} | |
165 | +/Current Temperature:/{print "temp="$3} | |
166 | /SMART overall-health self-assessment test result:/{print "health="$6} | |
167 | -/Power_On_Hours/{print "hours_on="$10} | |
168 | +/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)} | |
169 | /Serial Number:/{print "serial="$3} | |
170 | ||
171 | # NVMe common | |
172 | +/NVMe/{type="nvme"} | |
173 | /Temperature:/{print "temp="$2} | |
174 | /SMART overall-health self-assessment test result:/{print "health="$6} | |
175 | /Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4} | |
176 | @@ -90,39 +137,92 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then | |
177 | # NVMe specific | |
178 | /Media and Data Integrity Errors:/{print "nvme_err="$6} | |
179 | ||
180 | -END {ORS="\n"; print ""} | |
181 | +# SMART self-test info | |
182 | +/Self-test execution status:/{progress=tolower($4)} # SAS | |
183 | +/SMART Self-test log/{test_seen=1} # SAS | |
184 | +/SMART Extended Self-test Log/{test_seen=1} # SATA | |
185 | +/# 1/{ | |
186 | + test_type=tolower($3"_"$4); | |
187 | + # Status could be one word ("Completed") or multiple ("Completed: read | |
188 | + # failure"). Look for the ":" to see if we need to grab more words. | |
189 | + | |
190 | + if ($5 ~ ":") | |
191 | + status=tolower($5""$6"_"$7) | |
192 | + else | |
193 | + status=tolower($5) | |
194 | + if (status=="self") | |
195 | + status="running"; | |
196 | + | |
197 | + if (type == "sas") { | |
198 | + hours=int($(NF-4)) | |
199 | + } else { | |
200 | + hours=int($(NF-1)) | |
201 | + # SATA reports percent remaining, rather than percent done | |
202 | + # Convert it to percent done. | |
203 | + progress=(100-int($(NF-2)))"%" | |
204 | + } | |
205 | + # When we int()-ify "hours", it converts stuff like "NOW" and "-" into | |
206 | + # 0. In those cases, set it to hours_on, so they will cancel out in | |
207 | + # the "hours_ago" calculation later on. | |
208 | + if (hours == 0) | |
209 | + hours=hours_on | |
210 | + | |
211 | + if (test_seen) { | |
212 | + print "test="hours_on | |
213 | + print "test_type="test_type | |
214 | + print "test_status="status | |
215 | + print "test_progress="progress | |
216 | + } | |
217 | + # Not all drives report hours_on | |
218 | + if (hours_on && hours) { | |
219 | + total_hours_ago=(hours_on-hours) | |
220 | + days_ago=int(total_hours_ago/24) | |
221 | + hours_ago=(total_hours_ago % 24) | |
222 | + if (days_ago != 0) | |
223 | + ago_str=days_ago"d" | |
224 | + if (hours_ago !=0) | |
225 | + ago_str=ago_str""hours_ago"h" | |
226 | + print "test_ended="ago_str | |
227 | + } | |
228 | +} | |
229 | + | |
230 | +END {print "type="type; ORS="\n"; print ""} | |
231 | '); | |
232 | fi | |
233 | +type=$(echo "$out" | grep '^type=' | cut -d '=' -f 2) | |
234 | ||
235 | -# if type is not set by now, either we don't have a block device | |
236 | -# or smartctl failed. Either way, default to ATA and set out to | |
237 | -# nothing | |
238 | +# If type is not set by now, either we don't have a block device | |
239 | +# or smartctl failed. Either way, default to "sata" and set $out to | |
240 | +# nothing. | |
241 | if [ -z "$type" ]; then | |
242 | - type="ATA" | |
243 | + type="sata" | |
244 | out= | |
245 | fi | |
246 | ||
247 | case $script in | |
248 | smart) | |
249 | # Print temperature plus common predictors of drive failure | |
250 | - if [ "$type" = "SAS" ] ; then | |
251 | + if [ "$type" = "sas" ] ; then | |
252 | scripts="temp|health|r_ucor|w_ucor" | |
253 | - elif [ "$type" = "ATA" ] ; then | |
254 | + elif [ "$type" = "sata" ] ; then | |
255 | scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor" | |
256 | - elif [ "$type" = "NVMe" ] ; then | |
257 | + elif [ "$type" = "nvme" ] ; then | |
258 | scripts="temp|health|nvme_err" | |
259 | fi | |
260 | ;; | |
261 | smartx) | |
262 | # Print some other interesting stats | |
263 | - if [ "$type" = "SAS" ] ; then | |
264 | + if [ "$type" = "sas" ] ; then | |
265 | scripts="hours_on|defect|nonmed|r_proc|w_proc" | |
266 | - elif [ "$type" = "ATA" ] ; then | |
267 | + elif [ "$type" = "sata" ] ; then | |
268 | scripts="hours_on|pwr_cyc" | |
269 | - elif [ "$type" = "NVMe" ] ; then | |
270 | + elif [ "$type" = "nvme" ] ; then | |
271 | scripts="hours_on|pwr_cyc" | |
272 | fi | |
273 | ;; | |
274 | +smart_test) | |
275 | + scripts="test_type|test_status|test_progress|test_ended" | |
276 | + ;; | |
277 | *) | |
278 | scripts="$script" | |
279 | esac | |
280 | diff --git a/cmd/zpool/zpool.d/smart_test b/cmd/zpool/zpool.d/smart_test | |
281 | new file mode 120000 | |
282 | index 000000000..94f22861f | |
283 | --- /dev/null | |
284 | +++ b/cmd/zpool/zpool.d/smart_test | |
285 | @@ -0,0 +1 @@ | |
286 | +smart | |
287 | \ No newline at end of file | |
288 | diff --git a/cmd/zpool/zpool.d/test_ended b/cmd/zpool/zpool.d/test_ended | |
289 | new file mode 120000 | |
290 | index 000000000..94f22861f | |
291 | --- /dev/null | |
292 | +++ b/cmd/zpool/zpool.d/test_ended | |
293 | @@ -0,0 +1 @@ | |
294 | +smart | |
295 | \ No newline at end of file | |
296 | diff --git a/cmd/zpool/zpool.d/test_progress b/cmd/zpool/zpool.d/test_progress | |
297 | new file mode 120000 | |
298 | index 000000000..94f22861f | |
299 | --- /dev/null | |
300 | +++ b/cmd/zpool/zpool.d/test_progress | |
301 | @@ -0,0 +1 @@ | |
302 | +smart | |
303 | \ No newline at end of file | |
304 | diff --git a/cmd/zpool/zpool.d/test_status b/cmd/zpool/zpool.d/test_status | |
305 | new file mode 120000 | |
306 | index 000000000..94f22861f | |
307 | --- /dev/null | |
308 | +++ b/cmd/zpool/zpool.d/test_status | |
309 | @@ -0,0 +1 @@ | |
310 | +smart | |
311 | \ No newline at end of file | |
312 | diff --git a/cmd/zpool/zpool.d/test_type b/cmd/zpool/zpool.d/test_type | |
313 | new file mode 120000 | |
314 | index 000000000..94f22861f | |
315 | --- /dev/null | |
316 | +++ b/cmd/zpool/zpool.d/test_type | |
317 | @@ -0,0 +1 @@ | |
318 | +smart | |
319 | \ No newline at end of file | |
320 | -- | |
321 | 2.14.2 | |
322 |