]>
Commit | Line | Data |
---|---|---|
cae5b340 AX |
1 | #!/bin/sh |
2 | # | |
3 | # Show SMART stats | |
4 | # | |
5 | ||
6 | helpstr=" | |
7 | smart: Show SMART temperature and error stats (specific to drive type) | |
8 | smartx: Show SMART extended drive stats (specific to drive type). | |
9 | temp: Show SMART drive temperature in celsius (all drives). | |
10 | health: Show reported SMART status (all drives). | |
11 | r_proc: Show SMART read GBytes processed over drive lifetime (SAS). | |
12 | w_proc: Show SMART write GBytes processed over drive lifetime (SAS). | |
13 | r_ucor: Show SMART read uncorrectable errors (SAS). | |
14 | w_ucor: Show SMART write uncorrectable errors (SAS). | |
15 | nonmed: Show SMART non-medium errors (SAS). | |
16 | defect: Show SMART grown defect list (SAS). | |
17 | hours_on: Show number of hours drive powered on (all drives). | |
18 | realloc: Show SMART reallocated sectors count (ATA). | |
19 | rep_ucor: Show SMART reported uncorrectable count (ATA). | |
20 | cmd_to: Show SMART command timeout count (ATA). | |
21 | pend_sec: Show SMART current pending sector count (ATA). | |
22 | off_ucor: Show SMART offline uncorrectable errors (ATA). | |
23 | ata_err: Show SMART ATA errors (ATA). | |
24 | pwr_cyc: Show SMART power cycle count (ATA). | |
25 | serial: Show disk serial number. | |
42f7b73b AX |
26 | nvme_err: Show SMART NVMe errors (NVMe). |
27 | smart_test: Show SMART self-test results summary. | |
28 | test_type: Show SMART self-test type (short, long... ). | |
29 | test_status: Show SMART self-test status. | |
30 | test_progress: Show SMART self-test percentage done. | |
31 | test_ended: Show when the last SMART self-test ended (if supported). | |
cae5b340 AX |
32 | " |
33 | ||
42f7b73b AX |
34 | # Hack for developer testing |
35 | # | |
36 | # If you set $samples to a directory containing smartctl output text files, | |
37 | # we will use them instead of running smartctl on the vdevs. This can be | |
38 | # useful if you want to test a bunch of different smartctl outputs. Also, if | |
39 | # $samples is set, an additional 'file' column is added to the zpool output | |
40 | # showing the filename. | |
41 | samples= | |
42 | ||
43 | # get_filename_from_dir DIR | |
44 | # | |
45 | # Look in directory DIR and return a filename from it. The filename returned | |
46 | # is chosen quasi-sequentially (based off our PID). This allows us to return | |
47 | # a different filename every time this script is invoked (which we do for each | |
48 | # vdev), without having to maintain state. | |
# get_filename_from_dir DIR
#
# Print the basename of one regular file located directly inside DIR.
# The file is picked by taking our PID modulo the number of candidate
# files, so separate invocations of this script tend to pick different
# files without any state being kept between runs.
#
# Prints nothing and returns 1 when DIR holds no regular files (this also
# avoids a modulo-by-zero arithmetic error).  Returns 0 otherwise.
get_filename_from_dir()
{
	dir=$1

	# Count and list with the same find expression so the computed index
	# always refers to an entry that the listing below will produce.
	# The $(( )) wrapper normalizes any padding wc may add to its output.
	num_files=$(($(find "$dir" -maxdepth 1 -type f | wc -l)))
	if [ "$num_files" -eq 0 ] ; then
		return 1
	fi

	mod=$(($$ % num_files))

	# Emit entry number mod (0-based) with its directory part removed.
	find "$dir" -maxdepth 1 -type f |
	    awk -v n="$((mod + 1))" 'NR == n { sub(".*/", ""); print; exit }'
}
64 | ||
cae5b340 AX |
# The name this script was invoked under.
script=$(basename "$0")

# -h: print only the description belonging to that name, taken from the
# matching "$script:" line of $helpstr (text after the first tab), and exit.
case "$1" in
-h)
	echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2-
	exit
	;;
esac
71 | ||
# parse_smartctl_output
#
# Read "smartctl -a" text on stdin and print one key=value line for every
# statistic that is recognized, followed by a "type=" line (sas, sata,
# nvme, or empty when no marker was seen) and one blank line.
#
# What kind of drive are we?  Look for the right line in smartctl:
#
# SAS:
#	Transport protocol:   SAS
#
# SATA:
#	ATA Version is:   8
#
# NVMe:
#	SMART/Health Information (NVMe Log 0xnn, NSID 0xnn)
#
# NOTE: the awk program is single-quoted; keep apostrophes out of it.
parse_smartctl_output()
{
	awk '
	# SAS specific
	/read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8}
	/write:/{print "rwr="$4"\nw_cor="$5"\nw_proc="$7"\nw_ucor="$8}
	/Non-medium error count/{print "nonmed="$4}
	/Elements in grown defect list/{print "defect="$6}

	# SAS common
	/SAS/{type="sas"}
	/Drive Temperature:/{print "temp="$4}
	# Status can be a long string; join its words with "_"
	/SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i}
	/number of hours powered up/{print "hours_on="$7; hours_on=int($7)}
	/Serial number:/{print "serial="$3}

	# SATA specific
	/Reallocated_Sector_Ct/{print "realloc="$10}
	/Reported_Uncorrect/{print "rep_ucor="$10}
	/Command_Timeout/{print "cmd_to="$10}
	/Current_Pending_Sector/{print "pend_sec="$10}
	/Offline_Uncorrectable/{print "off_ucor="$10}
	/ATA Error Count:/{print "ata_err="$4}
	/Power_Cycle_Count/{print "pwr_cyc="$10}

	# SATA common
	/SATA/{type="sata"}
	/Temperature_Celsius/{print "temp="$10}
	/Airflow_Temperature_Cel/{print "temp="$10}
	/Current Temperature:/{print "temp="$3}
	/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)}

	# Shared by SATA and NVMe (identical line text on both), so each
	# line is emitted only once.
	/SMART overall-health self-assessment test result:/{print "health="$6}
	/Serial Number:/{print "serial="$3}

	# NVMe common
	/NVMe/{type="nvme"}
	# Anchored at line start so that "Current Drive Temperature:" (SAS)
	# and "Current Temperature:" (ATA) do not produce a bogus temp= line.
	/^Temperature:/{print "temp="$2}
	/Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4}
	/Power Cycles:/{print "pwr_cyc="$3}

	# NVMe specific
	/Media and Data Integrity Errors:/{print "nvme_err="$6}

	# SMART self-test info
	/Self-test execution status:/{progress=tolower($4)}	# SAS
	/SMART Self-test log/{test_seen=1}			# SAS
	/SMART Extended Self-test Log/{test_seen=1}		# SATA
	/# 1/{
		test_type=tolower($3"_"$4);
		# Status could be one word ("Completed") or multiple
		# ("Completed: read failure").  Look for the ":" to see if we
		# need to grab more words.
		if ($5 ~ ":")
			status=tolower($5""$6"_"$7)
		else
			status=tolower($5)
		if (status=="self")
			status="running";

		if (type == "sas") {
			hours=int($(NF-4))
		} else {
			hours=int($(NF-1))
			# SATA reports percent remaining, rather than percent
			# done.  Convert it to percent done.
			progress=(100-int($(NF-2)))"%"
		}
		# When we int()-ify "hours", it converts stuff like "NOW" and
		# "-" into 0.  In those cases, set it to hours_on, so they will
		# cancel out in the "hours_ago" calculation later on.
		if (hours == 0)
			hours=hours_on

		if (test_seen) {
			# NOTE(review): the "test" key carries hours_on, not
			# the test hours; kept as-is, confirm it is intended.
			print "test="hours_on
			print "test_type="test_type
			print "test_status="status
			print "test_progress="progress
		}
		# Not all drives report hours_on
		if (hours_on && hours) {
			total_hours_ago=(hours_on-hours)
			days_ago=int(total_hours_ago/24)
			hours_ago=(total_hours_ago % 24)
			if (days_ago != 0)
				ago_str=days_ago"d"
			if (hours_ago !=0)
				ago_str=ago_str""hours_ago"h"
			print "test_ended="ago_str
		}
	}

	END {print "type="type; ORS="\n"; print ""}
	'
}

# command -v is the portable lookup; it prints nothing when smartctl is
# not installed, so the -x test below simply fails.
smartctl_path=$(command -v smartctl)

# Collect SMART data when either the vdev is a block device and smartctl is
# executable, or $samples (developer testing) is set.
if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then
	if [ -n "$samples" ] ; then
		# cat a smartctl output text file instead of running smartctl
		# on a vdev (only used for developer testing).
		file=$(get_filename_from_dir "$samples")
		echo "file=$file"
		raw_out=$(cat "$samples/$file")
	else
		# Invoke smartctl directly with quoted arguments; going through
		# eval would re-parse the device path as shell code.
		raw_out=$(sudo "$smartctl_path" -a "$VDEV_UPATH")
	fi

	out=$(echo "$raw_out" | parse_smartctl_output)
fi
type=$(echo "$out" | grep '^type=' | cut -d '=' -f 2)

# If type is not set by now, either we don't have a block device
# or smartctl failed.  Either way, default to ATA and set $out to
# nothing.
if [ -z "$type" ]; then
	type="sata"
	out=
fi
201 | ||
# Translate the invoked script name into the "|"-separated list of keys to
# report.  The aggregate names (smart, smartx, smart_test) expand to several
# keys, chosen per drive type where relevant; any other name is reported as
# a single key of the same name.
case "$script" in
smart)
	# Print temperature plus common predictors of drive failure
	case "$type" in
	sas)
		scripts="temp|health|r_ucor|w_ucor"
		;;
	sata)
		scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor"
		;;
	nvme)
		scripts="temp|health|nvme_err"
		;;
	esac
	;;
smartx)
	# Print some other interesting stats
	case "$type" in
	sas)
		scripts="hours_on|defect|nonmed|r_proc|w_proc"
		;;
	sata)
		scripts="hours_on|pwr_cyc"
		;;
	nvme)
		scripts="hours_on|pwr_cyc"
		;;
	esac
	;;
smart_test)
	scripts="test_type|test_status|test_progress|test_ended"
	;;
*)
	scripts="$script"
	;;
esac
229 | ||
# print_stats OUT KEYS
#
# OUT is a newline-separated list of key=value lines and KEYS is a
# "|"-separated list of key names.  Print every line of OUT whose key is
# one of KEYS, then print "key=" (empty value) for each requested key that
# has no line in OUT, so every requested key is always reported.
print_stats()
{
	# Anchor on "key=" so that a line is selected by its key only, never
	# because its value happens to contain a key name.
	with_vals=$(echo "$1" | grep -E "^($2)=")
	if [ -n "$with_vals" ]; then
		echo "$with_vals"
		# Drop the keys we already printed, comparing whole names as
		# fixed strings rather than as regular expressions.
		without_vals=$(echo "$2" | tr "|" "\n" |
		    grep -v -x -F "$(echo "$with_vals" | cut -d '=' -f 1)" |
		    awk '{print $0"="}')
	else
		without_vals=$(echo "$2" | tr "|" "\n" | awk '{print $0"="}')
	fi

	if [ -n "$without_vals" ]; then
		echo "$without_vals"
	fi
}

print_stats "$out" "$scripts"