]> git.proxmox.com Git - mirror_zfs-debian.git/blobdiff - cmd/zpool/zpool.d/nvme_err
New upstream version 0.7.9
[mirror_zfs-debian.git] / cmd / zpool / zpool.d / nvme_err
diff --git a/cmd/zpool/zpool.d/nvme_err b/cmd/zpool/zpool.d/nvme_err
new file mode 100755 (executable)
index 0000000..64b5f6e
--- /dev/null
@@ -0,0 +1,242 @@
+#!/bin/sh
+#
+# Show SMART stats
+#
+
+# Help text: one line per stat name, in the form "<name>:<whitespace><text>".
+# The -h handler further down greps this string for "$script:" and pipes the
+# matching line through tr/cut to print everything after the first field.
+helpstr="
+smart:         Show SMART temperature and error stats (specific to drive type)
+smartx:                Show SMART extended drive stats (specific to drive type).
+temp:          Show SMART drive temperature in celsius (all drives).
+health:                Show reported SMART status (all drives).
+r_proc:                Show SMART read GBytes processed over drive lifetime (SAS).
+w_proc:                Show SMART write GBytes processed over drive lifetime (SAS).
+r_ucor:                Show SMART read uncorrectable errors (SAS).
+w_ucor:                Show SMART write uncorrectable errors (SAS).
+nonmed:                Show SMART non-medium errors (SAS).
+defect:                Show SMART grown defect list (SAS).
+hours_on:      Show number of hours drive powered on (all drives).
+realloc:       Show SMART reallocated sectors count (ATA).
+rep_ucor:      Show SMART reported uncorrectable count (ATA).
+cmd_to:                Show SMART command timeout count (ATA).
+pend_sec:      Show SMART current pending sector count (ATA).
+off_ucor:      Show SMART offline uncorrectable errors (ATA).
+ata_err:       Show SMART ATA errors (ATA).
+pwr_cyc:       Show SMART power cycle count (ATA).
+serial:                Show disk serial number.
+nvme_err:      Show SMART NVMe errors (NVMe).
+smart_test:    Show SMART self-test results summary.
+test_type:     Show SMART self-test type (short, long... ).
+test_status:   Show SMART self-test status.
+test_progress: Show SMART self-test percentage done.
+test_ended:    Show when the last SMART self-test ended (if supported).
+"
+
+# Hack for developer testing
+#
+# If you set $samples to a directory containing smartctl output text files,
+# we will use them instead of running smartctl on the vdevs.  This can be
+# useful if you want to test a bunch of different smartctl outputs.  Also, if
+# $samples is set, an additional 'file' column is added to the zpool output
+# showing the filename.
+samples=
+
+# get_filename_from_dir DIR
+#
+# Look in directory DIR and print the name of one regular file from it.  The
+# file is chosen quasi-sequentially (based off our PID).  This allows us to
+# return a different filename every time this script is invoked (which we do
+# for each vdev), without having to maintain state.
+#
+# Only files directly inside DIR are considered: the printed name has no
+# directory component, so a file found in a subdirectory could not be opened
+# as "DIR/<name>".  Prints nothing and returns 1 if DIR contains no files.
+#
+# Note: this is a /bin/sh script, so the variables assigned here are not
+# function-local.
+get_filename_from_dir()
+{
+       dir=$1
+       pid="$$"
+       num_files=$(find "$dir" -maxdepth 1 -type f | wc -l)
+       # Nothing to choose from; this also avoids a modulo-by-zero below.
+       if [ "$num_files" -eq 0 ] ; then
+               return 1
+       fi
+       mod=$((pid % num_files))
+       i=0
+       # Walk exactly the same set of files that was counted above and stop
+       # at entry number $mod.
+       find "$dir" -maxdepth 1 -type f -printf "%f\n" | while read -r file ; do
+               if [ "$mod" = "$i" ] ; then
+                       echo "$file"
+                       break
+               fi
+               i=$((i+1))
+       done
+}
+
+# The name we were invoked as decides which stat(s) get printed.
+script=$(basename "$0")
+
+# With -h, print only this script's description from $helpstr and quit.
+case "$1" in
+-h)
+        echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2-
+        exit
+        ;;
+esac
+
+# Locate smartctl.  'command -v' is the POSIX-specified way to look up a
+# command; 'which' is an external tool that is not guaranteed to exist.
+smartctl_path=$(command -v smartctl)
+
+# Collect stats when we have a block device and an executable smartctl, or
+# when developer sample files are configured via $samples.
+if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then
+       if [ -n "$samples" ] ; then
+               # cat a smartctl output text file instead of running smartctl
+               # on a vdev (only used for developer testing).
+               file=$(get_filename_from_dir "$samples")
+               echo "file=$file"
+               raw_out=$(cat "$samples/$file")
+       else
+               # Invoke smartctl directly with quoted arguments instead of
+               # going through eval, so the device path is never re-parsed
+               # by the shell.
+               raw_out=$(sudo "$smartctl_path" -a "$VDEV_UPATH")
+       fi
+
+       # What kind of drive are we?  Look for the right line in smartctl:
+       #
+       # SAS:
+       #       Transport protocol:   SAS
+       #
+       # SATA:
+       #       ATA Version is:   8
+       #
+       # NVMe:
+       #       SMART/Health Information (NVMe Log 0xnn, NSID 0xnn)
+       #
+       # The awk program below is a single-quoted shell string, so its
+       # comments must not contain single quotes.  It prints "name=value"
+       # lines for the stats it recognises, followed by a "type=..." line
+       # from its END rule.
+       out=$(echo "$raw_out" | awk '
+# SAS specific
+/read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8}
+/write:/{print "rwr="$4"\nw_cor="$5"\nw_proc="$7"\nw_ucor="$8}
+/Non-medium error count/{print "nonmed="$4}
+/Elements in grown defect list/{print "defect="$6}
+
+# SAS common
+/SAS/{type="sas"}
+/Drive Temperature:/{print "temp="$4}
+# Status can be a long string, substitute spaces for "_"
+/SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i}
+/number of hours powered up/{print "hours_on="$7; hours_on=int($7)}
+/Serial number:/{print "serial="$3}
+
+# SATA specific
+/Reallocated_Sector_Ct/{print "realloc="$10}
+/Reported_Uncorrect/{print "rep_ucor="$10}
+/Command_Timeout/{print "cmd_to="$10}
+/Current_Pending_Sector/{print "pend_sec="$10}
+/Offline_Uncorrectable/{print "off_ucor="$10}
+/ATA Error Count:/{print "ata_err="$4}
+/Power_Cycle_Count/{print "pwr_cyc="$10}
+
+# SATA common
+/SATA/{type="sata"}
+/Temperature_Celsius/{print "temp="$10}
+/Airflow_Temperature_Cel/{print "temp="$10}
+/Current Temperature:/{print "temp="$3}
+/SMART overall-health self-assessment test result:/{print "health="$6}
+/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)}
+/Serial Number:/{print "serial="$3}
+
+# NVMe common
+/NVMe/{type="nvme"}
+/Temperature:/{print "temp="$2}
+/SMART overall-health self-assessment test result:/{print "health="$6}
+/Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4}
+/Serial Number:/{print "serial="$3}
+/Power Cycles:/{print "pwr_cyc="$3}
+
+# NVMe specific
+/Media and Data Integrity Errors:/{print "nvme_err="$6}
+
+# SMART self-test info
+/Self-test execution status:/{progress=tolower($4)} # SAS
+/SMART Self-test log/{test_seen=1} # SAS
+/SMART Extended Self-test Log/{test_seen=1} # SATA
+/# 1/{
+       test_type=tolower($3"_"$4);
+       # Status could be one word ("Completed") or multiple ("Completed: read
+       # failure").  Look for the ":" to see if we need to grab more words.
+
+       if ($5 ~ ":")
+               status=tolower($5""$6"_"$7)
+       else
+               status=tolower($5)
+       if (status=="self")
+               status="running";
+
+       if (type == "sas") {
+               hours=int($(NF-4))
+       } else {
+               hours=int($(NF-1))
+               # SATA reports percent remaining, rather than percent done
+               # Convert it to percent done.
+               progress=(100-int($(NF-2)))"%"
+       }
+       # When we int()-ify "hours", it converts stuff like "NOW" and "-" into
+       # 0.  In those cases, set it to hours_on, so they will cancel out in
+       # the "hours_ago" calculation later on.
+       if (hours == 0)
+               hours=hours_on
+
+       if (test_seen) {
+               print "test="hours_on
+               print "test_type="test_type
+               print "test_status="status
+               print "test_progress="progress
+       }
+       # Not all drives report hours_on
+       if (hours_on && hours) {
+               total_hours_ago=(hours_on-hours)
+               days_ago=int(total_hours_ago/24)
+               hours_ago=(total_hours_ago % 24)
+               if (days_ago != 0)
+                       ago_str=days_ago"d"
+               if (hours_ago !=0)
+                       ago_str=ago_str""hours_ago"h"
+               print "test_ended="ago_str
+       }
+}
+
+END {print "type="type; ORS="\n"; print ""}
+');
+fi
+# Pull the drive type out of the "type=<value>" line in $out.
+type=$(echo "$out" | grep '^type=' | cut -d '=' -f 2)
+
+# An empty type means either we don't have a block device or smartctl
+# failed.  Either way, fall back to ATA and throw away $out.
+[ -n "$type" ] || {
+       type="sata"
+       out=
+}
+
+# Translate the invoked script name into the '|'-separated list of stat
+# names to report.  The umbrella names (smart, smartx, smart_test) expand to
+# several stats; any other name stands for itself.
+case $script in
+smart)
+       # Temperature plus the common predictors of drive failure
+       case "$type" in
+       sas)
+               scripts="temp|health|r_ucor|w_ucor"
+               ;;
+       sata)
+               scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor"
+               ;;
+       nvme)
+               scripts="temp|health|nvme_err"
+               ;;
+       esac
+       ;;
+smartx)
+       # Some other interesting stats
+       case "$type" in
+       sas)
+               scripts="hours_on|defect|nonmed|r_proc|w_proc"
+               ;;
+       sata|nvme)
+               scripts="hours_on|pwr_cyc"
+               ;;
+       esac
+       ;;
+smart_test)
+       scripts="test_type|test_status|test_progress|test_ended"
+       ;;
+*)
+       scripts="$script"
+esac
+
+# Lines of $out that match any of the requested stat names.
+with_vals=$(echo "$out" | grep -E "$scripts")
+if [ -z "$with_vals" ]; then
+       # Nothing matched: report every requested stat with a blank value.
+       without_vals=$(echo "$scripts" | tr "|" "\n" | awk '{print $0"="}')
+else
+       echo "$with_vals"
+       # Names that did get a value, one per line; grep treats each line of
+       # the pattern as an alternative, so these are filtered out below.
+       seen=$(echo "$with_vals" |
+               awk -F "=" '{print $1}')
+       without_vals=$(echo "$scripts" | tr "|" "\n" |
+               grep -v -E "$seen" | awk '{print $0"="}')
+fi
+
+# Emit "name=" for each requested stat that has no value.
+[ -z "$without_vals" ] || echo "$without_vals"