Add slow disk diagnosis to ZED
author Don Brady <don.brady@delphix.com>
Thu, 8 Feb 2024 17:19:52 +0000 (10:19 -0700)
committer GitHub <noreply@github.com>
Thu, 8 Feb 2024 17:19:52 +0000 (09:19 -0800)
Slow disk response times can be indicative of a failing drive. ZFS
currently tracks slow I/Os (slower than zio_slow_io_ms) and generates
events (ereport.fs.zfs.delay).  However, unlike checksum or I/O
errors, ZED takes no action on them.  This change adds slow disk
diagnosis to ZED, which is opt-in via two new VDEV properties:
  VDEV_PROP_SLOW_IO_N
  VDEV_PROP_SLOW_IO_T
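
As an illustrative example, opting a leaf vdev in so that 10 slow
I/Os within 30 seconds degrade it (the pool and disk names here are
hypothetical):

  zpool set slow_io_n=10 tank sdb
  zpool set slow_io_t=30 tank sdb
  zpool get slow_io_n,slow_io_t tank sdb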

If multiple VDEVs in a pool are undergoing slow I/Os, ZED skips
zpool_vdev_degrade().
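
The diagnosis can be exercised with zinject's delay injection, which
now accepts an I/O type (a sketch with hypothetical names, assuming
the zio_slow_io_ms threshold is lowered below the injected 10 ms
latency so the delayed reads register as slow):

  zinject -d sdb -D10:1 -T read tank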

Sponsored-By: OpenDrives Inc.
Sponsored-By: Klara Inc.
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Rob Wing <rob.wing@klarasystems.com>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Closes #15469

29 files changed:
cmd/zed/agents/fmd_api.c
cmd/zed/agents/fmd_api.h
cmd/zed/agents/fmd_serd.c
cmd/zed/agents/fmd_serd.h
cmd/zed/agents/zfs_diagnosis.c
cmd/zed/agents/zfs_retire.c
cmd/zinject/zinject.c
cmd/zpool/zpool_main.c
include/sys/fm/fs/zfs.h
include/sys/fs/zfs.h
include/sys/vdev_impl.h
lib/libzfs/libzfs.abi
lib/libzfs/libzfs_pool.c
lib/libzfs/libzfs_util.c
man/man7/vdevprops.7
man/man7/zpoolconcepts.7
man/man8/zinject.8
module/zcommon/zpool_prop.c
module/zfs/vdev.c
module/zfs/zfs_fm.c
module/zfs/zio_inject.c
tests/runfiles/linux.run
tests/zfs-tests/tests/Makefile.am
tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
tests/zfs-tests/tests/functional/events/cleanup.ksh
tests/zfs-tests/tests/functional/events/zed_slow_io.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/fault/cleanup.ksh
tests/zfs-tests/tests/functional/fault/setup.ksh

diff --git a/cmd/zed/agents/fmd_api.c b/cmd/zed/agents/fmd_api.c
index 4a6cfbf8c05c1f51c2bdf29fd0613f7bb7e339c7..fe43e2ab971e05e9c2857f7c466973f6498ee243 100644 (file)
@@ -22,6 +22,7 @@
  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  *
  * Copyright (c) 2016, Intel Corporation.
+ * Copyright (c) 2023, Klara Inc.
  */
 
 /*
@@ -231,28 +232,6 @@ fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name)
        if (strcmp(name, "spare_on_remove") == 0)
                return (1);
 
-       if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0)
-               return (10);    /* N = 10 events */
-
-       return (0);
-}
-
-int64_t
-fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name)
-{
-       (void) hdl;
-
-       /*
-        * These can be looked up in mp->modinfo->fmdi_props
-        * For now we just hard code for phase 2. In the
-        * future, there can be a ZED based override.
-        */
-       if (strcmp(name, "remove_timeout") == 0)
-               return (15ULL * 1000ULL * 1000ULL * 1000ULL);   /* 15 sec */
-
-       if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0)
-               return (1000ULL * 1000ULL * 1000ULL * 600ULL);  /* 10 min */
-
        return (0);
 }
 
@@ -535,20 +514,31 @@ fmd_serd_exists(fmd_hdl_t *hdl, const char *name)
        return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL);
 }
 
-void
-fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
+int
+fmd_serd_active(fmd_hdl_t *hdl, const char *name)
 {
        fmd_module_t *mp = (fmd_module_t *)hdl;
        fmd_serd_eng_t *sgp;
 
        if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
                zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
-               return;
+               return (0);
        }
+       return (fmd_serd_eng_fired(sgp) || !fmd_serd_eng_empty(sgp));
+}
 
-       fmd_serd_eng_reset(sgp);
+void
+fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
+{
+       fmd_module_t *mp = (fmd_module_t *)hdl;
+       fmd_serd_eng_t *sgp;
 
-       fmd_hdl_debug(hdl, "serd_reset %s", name);
+       if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
+               zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
+       } else {
+               fmd_serd_eng_reset(sgp);
+               fmd_hdl_debug(hdl, "serd_reset %s", name);
+       }
 }
 
 int
@@ -556,16 +546,21 @@ fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep)
 {
        fmd_module_t *mp = (fmd_module_t *)hdl;
        fmd_serd_eng_t *sgp;
-       int err;
 
        if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
                zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'",
                    name);
                return (0);
        }
-       err = fmd_serd_eng_record(sgp, ep->ev_hrt);
+       return (fmd_serd_eng_record(sgp, ep->ev_hrt));
+}
+
+void
+fmd_serd_gc(fmd_hdl_t *hdl)
+{
+       fmd_module_t *mp = (fmd_module_t *)hdl;
 
-       return (err);
+       fmd_serd_hash_apply(&mp->mod_serds, fmd_serd_eng_gc, NULL);
 }
 
 /* FMD Timers */
@@ -579,7 +574,7 @@ _timer_notify(union sigval sv)
        const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
        struct itimerspec its;
 
-       fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid);
+       fmd_hdl_debug(hdl, "%s timer fired (%p)", mp->mod_name, ftp->ft_tid);
 
        /* disarm the timer */
        memset(&its, 0, sizeof (struct itimerspec));
diff --git a/cmd/zed/agents/fmd_api.h b/cmd/zed/agents/fmd_api.h
index b940d0d395ecec8414ac03e71173e6ce9d948bb2..8471feecf33f8330da28ee2b73de9c9c46330140 100644 (file)
@@ -151,7 +151,6 @@ extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list);
 extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...);
 
 extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *);
-extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *);
 
 #define        FMD_STAT_NOALLOC        0x0     /* fmd should use caller's memory */
 #define        FMD_STAT_ALLOC          0x1     /* fmd should allocate stats memory */
@@ -195,10 +194,12 @@ extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *);
 extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t);
 extern void fmd_serd_destroy(fmd_hdl_t *, const char *);
 extern int fmd_serd_exists(fmd_hdl_t *, const char *);
+extern int fmd_serd_active(fmd_hdl_t *, const char *);
 extern void fmd_serd_reset(fmd_hdl_t *, const char *);
 extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *);
 extern int fmd_serd_fired(fmd_hdl_t *, const char *);
 extern int fmd_serd_empty(fmd_hdl_t *, const char *);
+extern void fmd_serd_gc(fmd_hdl_t *);
 
 extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t);
 extern void fmd_timer_remove(fmd_hdl_t *, id_t);
diff --git a/cmd/zed/agents/fmd_serd.c b/cmd/zed/agents/fmd_serd.c
index 0bb2c535f094db7bc092ba903850ab1003ecad36..f942e62b3f48a254cddf206504526d7ee8c88e04 100644 (file)
@@ -310,8 +310,9 @@ fmd_serd_eng_reset(fmd_serd_eng_t *sgp)
 }
 
 void
-fmd_serd_eng_gc(fmd_serd_eng_t *sgp)
+fmd_serd_eng_gc(fmd_serd_eng_t *sgp, void *arg)
 {
+       (void) arg;
        fmd_serd_elem_t *sep, *nep;
        hrtime_t hrt;
 
diff --git a/cmd/zed/agents/fmd_serd.h b/cmd/zed/agents/fmd_serd.h
index 25b6888e61f29fa2bda520e7aa8fc5fd104b1c62..80ff9a3b25b87b737e5fa1d263328111eccd6d46 100644 (file)
@@ -77,7 +77,7 @@ extern int fmd_serd_eng_fired(fmd_serd_eng_t *);
 extern int fmd_serd_eng_empty(fmd_serd_eng_t *);
 
 extern void fmd_serd_eng_reset(fmd_serd_eng_t *);
-extern void fmd_serd_eng_gc(fmd_serd_eng_t *);
+extern void fmd_serd_eng_gc(fmd_serd_eng_t *, void *);
 
 #ifdef __cplusplus
 }
diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c
index f6ba334a3ba3aabb88e6feb7562bc3dcfc8c3c20..e0ad00800add0a1c534a9f677f5d2a596129e28f 100644 (file)
@@ -23,6 +23,7 @@
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2016, Intel Corporation.
+ * Copyright (c) 2023, Klara Inc.
  */
 
 #include <stddef.h>
 #define        DEFAULT_CHECKSUM_T      600     /* seconds */
 #define        DEFAULT_IO_N            10      /* events */
 #define        DEFAULT_IO_T            600     /* seconds */
+#define        DEFAULT_SLOW_IO_N       10      /* events */
+#define        DEFAULT_SLOW_IO_T       30      /* seconds */
+
+#define        CASE_GC_TIMEOUT_SECS    43200   /* 12 hours */
 
 /*
- * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'.  This
- * #define reserves enough space for two 64-bit hex values plus the length of
- * the longest string.
+ * Our serd engines are named in the following format:
+ *     'zfs_<pool_guid>_<vdev_guid>_{checksum,io,slow_io}'
+ * This #define reserves enough space for two 64-bit hex values plus the
+ * length of the longest string.
  */
 #define        MAX_SERDLEN     (16 * 2 + sizeof ("zfs___checksum"))
 
@@ -68,6 +74,7 @@ typedef struct zfs_case_data {
        int             zc_pool_state;
        char            zc_serd_checksum[MAX_SERDLEN];
        char            zc_serd_io[MAX_SERDLEN];
+       char            zc_serd_slow_io[MAX_SERDLEN];
        int             zc_has_remove_timer;
 } zfs_case_data_t;
 
@@ -114,7 +121,8 @@ zfs_de_stats_t zfs_stats = {
        { "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
 };
 
-static hrtime_t zfs_remove_timeout;
+/* wait 15 seconds after a removal */
+static hrtime_t zfs_remove_timeout = SEC2NSEC(15);
 
 uu_list_pool_t *zfs_case_pool;
 uu_list_t *zfs_cases;
@@ -124,6 +132,8 @@ uu_list_t *zfs_cases;
 #define        ZFS_MAKE_EREPORT(type)  \
     FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
 
+static void zfs_purge_cases(fmd_hdl_t *hdl);
+
 /*
  * Write out the persistent representation of an active case.
  */
@@ -170,6 +180,42 @@ zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
        return (zcp);
 }
 
+/*
+ * count other unique slow-io cases in a pool
+ */
+static uint_t
+zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case)
+{
+       zfs_case_t *zcp;
+       uint_t cases = 0;
+       static hrtime_t next_check = 0;
+
+       /*
+        * Note that plumbing in some external GC would require adding locking,
+        * since most of this module code is not thread safe and assumes there
+        * is only one thread running against the module. So we perform GC here
+        * inline periodically so that future delay induced faults will be
+        * possible once the issue causing multiple vdev delays is resolved.
+        */
+       if (gethrestime_sec() > next_check) {
+               /* Periodically purge old SERD entries and stale cases */
+               fmd_serd_gc(hdl);
+               zfs_purge_cases(hdl);
+               next_check = gethrestime_sec() + CASE_GC_TIMEOUT_SECS;
+       }
+
+       for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+           zcp = uu_list_next(zfs_cases, zcp)) {
+               if (zcp->zc_data.zc_pool_guid == zfs_case->zc_pool_guid &&
+                   zcp->zc_data.zc_vdev_guid != zfs_case->zc_vdev_guid &&
+                   zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
+                   fmd_serd_active(hdl, zcp->zc_data.zc_serd_slow_io)) {
+                       cases++;
+               }
+       }
+       return (cases);
+}
+
 /*
  * Iterate over any active cases.  If any cases are associated with a pool or
  * vdev which is no longer present on the system, close the associated case.
@@ -376,6 +422,14 @@ zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
            (long long unsigned int)vdev_guid, type);
 }
 
+static void
+zfs_case_retire(fmd_hdl_t *hdl, zfs_case_t *zcp)
+{
+       fmd_hdl_debug(hdl, "retiring case");
+
+       fmd_case_close(hdl, zcp->zc_case);
+}
+
 /*
  * Solve a given ZFS case.  This first checks to make sure the diagnosis is
  * still valid, as well as cleaning up any pending timer associated with the
@@ -632,9 +686,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
                if (strcmp(class,
                    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
                    strcmp(class,
-                   ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 ||
-                   strcmp(class,
-                   ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) {
+                   ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0) {
                        zfs_stats.resource_drops.fmds_value.ui64++;
                        return;
                }
@@ -702,6 +754,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
                        if (zcp->zc_data.zc_serd_checksum[0] != '\0')
                                fmd_serd_reset(hdl,
                                    zcp->zc_data.zc_serd_checksum);
+                       if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
+                               fmd_serd_reset(hdl,
+                                   zcp->zc_data.zc_serd_slow_io);
                } else if (fmd_nvl_class_match(hdl, nvl,
                    ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
                        uint64_t state = 0;
@@ -730,7 +785,11 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
        if (fmd_case_solved(hdl, zcp->zc_case))
                return;
 
-       fmd_hdl_debug(hdl, "error event '%s'", class);
+       if (vdev_guid)
+               fmd_hdl_debug(hdl, "error event '%s', vdev %llu", class,
+                   vdev_guid);
+       else
+               fmd_hdl_debug(hdl, "error event '%s'", class);
 
        /*
         * Determine if we should solve the case and generate a fault.  We solve
@@ -779,6 +838,8 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
            fmd_nvl_class_match(hdl, nvl,
            ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
            fmd_nvl_class_match(hdl, nvl,
+           ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) ||
+           fmd_nvl_class_match(hdl, nvl,
            ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
                const char *failmode = NULL;
                boolean_t checkremove = B_FALSE;
@@ -814,6 +875,51 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
                        }
                        if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
                                checkremove = B_TRUE;
+               } else if (fmd_nvl_class_match(hdl, nvl,
+                   ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) {
+                       uint64_t slow_io_n, slow_io_t;
+
+                       /*
+                        * Create a slow io SERD engine when the VDEV has the
+                        * 'vdev_slow_io_n' and 'vdev_slow_io_t' properties.
+                        */
+                       if (zcp->zc_data.zc_serd_slow_io[0] == '\0' &&
+                           nvlist_lookup_uint64(nvl,
+                           FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
+                           &slow_io_n) == 0 &&
+                           nvlist_lookup_uint64(nvl,
+                           FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
+                           &slow_io_t) == 0) {
+                               zfs_serd_name(zcp->zc_data.zc_serd_slow_io,
+                                   pool_guid, vdev_guid, "slow_io");
+                               fmd_serd_create(hdl,
+                                   zcp->zc_data.zc_serd_slow_io,
+                                   slow_io_n,
+                                   SEC2NSEC(slow_io_t));
+                               zfs_case_serialize(zcp);
+                       }
+                       /* Pass event to SERD engine and see if this triggers */
+                       if (zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
+                           fmd_serd_record(hdl, zcp->zc_data.zc_serd_slow_io,
+                           ep)) {
+                               /*
+                                * Ignore a slow io diagnosis when other
+                                * VDEVs in the pool show signs of being slow.
+                                */
+                               if (zfs_other_slow_cases(hdl, &zcp->zc_data)) {
+                                       zfs_case_retire(hdl, zcp);
+                                       fmd_hdl_debug(hdl, "pool %llu has "
+                                           "multiple slow io cases -- skip "
+                                           "degrading vdev %llu",
+                                           (u_longlong_t)
+                                           zcp->zc_data.zc_pool_guid,
+                                           (u_longlong_t)
+                                           zcp->zc_data.zc_vdev_guid);
+                               } else {
+                                       zfs_case_solve(hdl, zcp,
+                                           "fault.fs.zfs.vdev.slow_io");
+                               }
+                       }
                } else if (fmd_nvl_class_match(hdl, nvl,
                    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
                        /*
@@ -924,6 +1030,8 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
                fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
        if (zcp->zc_data.zc_serd_io[0] != '\0')
                fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
+       if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
+               fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_slow_io);
        if (zcp->zc_data.zc_has_remove_timer)
                fmd_timer_remove(hdl, zcp->zc_remove_timer);
 
@@ -932,30 +1040,15 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
        fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
 }
 
-/*
- * We use the fmd gc entry point to look for old cases that no longer apply.
- * This allows us to keep our set of case data small in a long running system.
- */
-static void
-zfs_fm_gc(fmd_hdl_t *hdl)
-{
-       zfs_purge_cases(hdl);
-}
-
 static const fmd_hdl_ops_t fmd_ops = {
        zfs_fm_recv,    /* fmdo_recv */
        zfs_fm_timeout, /* fmdo_timeout */
        zfs_fm_close,   /* fmdo_close */
        NULL,           /* fmdo_stats */
-       zfs_fm_gc,      /* fmdo_gc */
+       NULL,   /* fmdo_gc */
 };
 
 static const fmd_prop_t fmd_props[] = {
-       { "checksum_N", FMD_TYPE_UINT32, "10" },
-       { "checksum_T", FMD_TYPE_TIME, "10min" },
-       { "io_N", FMD_TYPE_UINT32, "10" },
-       { "io_T", FMD_TYPE_TIME, "10min" },
-       { "remove_timeout", FMD_TYPE_TIME, "15sec" },
        { NULL, 0, NULL }
 };
 
@@ -996,8 +1089,6 @@ _zfs_diagnosis_init(fmd_hdl_t *hdl)
 
        (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
            sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);
-
-       zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
 }
 
 void
diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
index a0e377a4a0c80ef577236c2f3bbc9b4c616dc391..1ef5c631a4383bab49c3f59acd53f69905189613 100644 (file)
@@ -523,6 +523,9 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
                } else if (fmd_nvl_class_match(hdl, fault,
                    "fault.fs.zfs.vdev.checksum")) {
                        degrade_device = B_TRUE;
+               } else if (fmd_nvl_class_match(hdl, fault,
+                   "fault.fs.zfs.vdev.slow_io")) {
+                       degrade_device = B_TRUE;
                } else if (fmd_nvl_class_match(hdl, fault,
                    "fault.fs.zfs.device")) {
                        fault_device = B_FALSE;
diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c
index f1262ed772de8c33a86af84759b0a777a7ef4911..a11b6d0b7fac8aa92cf963bc10b5ed580f21a9b5 100644 (file)
@@ -1083,6 +1083,22 @@ main(int argc, char **argv)
                        libzfs_fini(g_zfs);
                        return (1);
                }
+
+               if (record.zi_nlanes) {
+                       switch (io_type) {
+                       case ZIO_TYPE_READ:
+                       case ZIO_TYPE_WRITE:
+                       case ZIO_TYPES:
+                               break;
+                       default:
+                               (void) fprintf(stderr, "I/O type for a delay "
+                                   "must be 'read' or 'write'\n");
+                               usage();
+                               libzfs_fini(g_zfs);
+                               return (1);
+                       }
+               }
+
                if (!error)
                        error = ENXIO;
 
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 8753d7263914c1592c49e092e8c0581c9f97a8b6..0783271f4734e32a1e16e4325e91fe1e721bfedd 100644 (file)
@@ -2569,7 +2569,13 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
                        break;
 
                case VDEV_AUX_ERR_EXCEEDED:
-                       (void) printf(gettext("too many errors"));
+                       if (vs->vs_read_errors + vs->vs_write_errors +
+                           vs->vs_checksum_errors == 0 && children == 0 &&
+                           vs->vs_slow_ios > 0) {
+                               (void) printf(gettext("too many slow I/Os"));
+                       } else {
+                               (void) printf(gettext("too many errors"));
+                       }
                        break;
 
                case VDEV_AUX_IO_FAILURE:
diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h
index fb9e8649221e314c3eb2ce12133a48a258001f5d..c746600cd2d544f3928dce773bb56bd848a325c8 100644 (file)
@@ -82,6 +82,8 @@ extern "C" {
 #define        FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T     "vdev_cksum_t"
 #define        FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N        "vdev_io_n"
 #define        FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T        "vdev_io_t"
+#define        FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N   "vdev_slow_io_n"
+#define        FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T   "vdev_slow_io_t"
 #define        FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS      "vdev_delays"
 #define        FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID      "parent_guid"
 #define        FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE      "parent_type"
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index c6f7dcca78b3c665e684a4ff6ee29e2113e2e6fc..025567e2183f3ccc1bffb2b3b75c558eccccfe29 100644 (file)
@@ -366,6 +366,8 @@ typedef enum {
        VDEV_PROP_IO_N,
        VDEV_PROP_IO_T,
        VDEV_PROP_RAIDZ_EXPANDING,
+       VDEV_PROP_SLOW_IO_N,
+       VDEV_PROP_SLOW_IO_T,
        VDEV_NUM_PROPS
 } vdev_prop_t;
 
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index dafab66c70f964d533486aa3a01dd885b7631a83..f39ebf031ceab7a41118c01ef738ce872384fc92 100644 (file)
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2023, Klara Inc.
  */
 
 #ifndef _SYS_VDEV_IMPL_H
@@ -454,12 +455,14 @@ struct vdev {
        zfs_ratelimit_t vdev_checksum_rl;
 
        /*
-        * Checksum and IO thresholds for tuning ZED
+        * Vdev properties for tuning ZED
         */
        uint64_t        vdev_checksum_n;
        uint64_t        vdev_checksum_t;
        uint64_t        vdev_io_n;
        uint64_t        vdev_io_t;
+       uint64_t        vdev_slow_io_n;
+       uint64_t        vdev_slow_io_t;
 };
 
 #define        VDEV_PAD_SIZE           (8 << 10)
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 7c39b134d1ef0ac9417bb2f765e5d49bb490094a..cdd2f04c26290006167ef9b2ed3520b55ff5bdff 100644 (file)
       <enumerator name='VDEV_PROP_IO_N' value='44'/>
       <enumerator name='VDEV_PROP_IO_T' value='45'/>
       <enumerator name='VDEV_PROP_RAIDZ_EXPANDING' value='46'/>
-      <enumerator name='VDEV_NUM_PROPS' value='47'/>
+      <enumerator name='VDEV_PROP_SLOW_IO_N' value='47'/>
+      <enumerator name='VDEV_PROP_SLOW_IO_T' value='48'/>
+      <enumerator name='VDEV_NUM_PROPS' value='49'/>
     </enum-decl>
     <typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
     <class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index c7b8617ef35e506896fd105510a5bfb92e3f8bb4..402c14a6baee85b7a7a6d4e8ca5f9c1eec374b57 100644 (file)
@@ -5264,6 +5264,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
                case VDEV_PROP_CHECKSUM_T:
                case VDEV_PROP_IO_N:
                case VDEV_PROP_IO_T:
+               case VDEV_PROP_SLOW_IO_N:
+               case VDEV_PROP_SLOW_IO_T:
                        if (intval == UINT64_MAX) {
                                (void) strlcpy(buf, "-", len);
                        } else {
diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index d0a63a4daa8c81743b27b42260fb87332df99c99..8e70af2e5830a5ed519837ddb9ea62fae03430e1 100644 (file)
@@ -1704,7 +1704,9 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop,
                    (prop == VDEV_PROP_CHECKSUM_N ||
                    prop == VDEV_PROP_CHECKSUM_T ||
                    prop == VDEV_PROP_IO_N ||
-                   prop == VDEV_PROP_IO_T)) {
+                   prop == VDEV_PROP_IO_T ||
+                   prop == VDEV_PROP_SLOW_IO_N ||
+                   prop == VDEV_PROP_SLOW_IO_T)) {
                        *ivalp = UINT64_MAX;
                }
 
diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7
index 6eebfa0060de788bef84fd44af9585a7d1457879..3d3ebc072915a0e0946f11d378a85c366b1a672b 100644 (file)
@@ -44,7 +44,7 @@ section, below.
 Every vdev has a set of properties that export statistics about the vdev
 as well as control various behaviors.
 Properties are not inherited from top-level vdevs, with the exception of
-checksum_n, checksum_t, io_n, and io_t.
+checksum_n, checksum_t, io_n, io_t, slow_io_n, and slow_io_t.
 .Pp
 The values of numeric properties can be specified using human-readable suffixes
 .Po for example,
@@ -117,7 +117,7 @@ If this device is currently being removed from the pool
 .Pp
 The following native properties can be used to change the behavior of a vdev.
 .Bl -tag -width "allocating"
-.It Sy checksum_n , checksum_t , io_n , io_t
+.It Sy checksum_n , checksum_t , io_n , io_t , slow_io_n , slow_io_t
 Tune the fault management daemon by specifying checksum/io thresholds of <N>
 errors in <T> seconds, respectively.
 These properties can be set on leaf and top-level vdevs.
diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7
index 98f3ee7cd660307d5e59cbadb74ba686eff0dd5f..18dfca6dc8acb662f4c712845599aeffcaca4acb 100644 (file)
@@ -260,8 +260,8 @@ sufficient replicas exist to continue functioning.
 The underlying conditions are as follows:
 .Bl -bullet -compact
 .It
-The number of checksum errors exceeds acceptable levels and the device is
-degraded as an indication that something may be wrong.
+The number of checksum errors or slow I/Os exceeds acceptable levels and the
+device is degraded as an indication that something may be wrong.
 ZFS continues to use the device as necessary.
 .It
 The number of I/O errors exceeds acceptable levels.
diff --git a/man/man8/zinject.8 b/man/man8/zinject.8
index 4f0bbae81212462ba216a9cddd2b8a00b9070740..b692f12130a816700f598944d80423a9d8693a3e 100644 (file)
@@ -69,6 +69,7 @@ Force a vdev into the DEGRADED or FAULTED state.
 .Nm zinject
 .Fl d Ar vdev
 .Fl D Ar latency : Ns Ar lanes
+.Op Fl T Ar read|write
 .Ar pool
 .Xc
 Add an artificial delay to I/O requests on a particular
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index e98063e8bc570d5bbf0d9df38c42d71389bdab14..e2e3bf5be69e9d1537540858e234b5449c080fba 100644 (file)
@@ -431,6 +431,12 @@ vdev_prop_init(void)
        zprop_register_number(VDEV_PROP_IO_T, "io_t", UINT64_MAX,
            PROP_DEFAULT, ZFS_TYPE_VDEV, "<seconds>", "IO_T", B_FALSE,
            sfeatures);
+       zprop_register_number(VDEV_PROP_SLOW_IO_N, "slow_io_n", UINT64_MAX,
+           PROP_DEFAULT, ZFS_TYPE_VDEV, "<events>", "SLOW_IO_N", B_FALSE,
+           sfeatures);
+       zprop_register_number(VDEV_PROP_SLOW_IO_T, "slow_io_t", UINT64_MAX,
+           PROP_DEFAULT, ZFS_TYPE_VDEV, "<seconds>", "SLOW_IO_T", B_FALSE,
+           sfeatures);
 
        /* default index (boolean) properties */
        zprop_register_index(VDEV_PROP_REMOVING, "removing", 0,
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index d6286dc5920bd8294022f25ab5d29a16c000a5b5..ebba453e2b144a3fa2faf70dc43454f896103f93 100644 (file)
@@ -677,6 +677,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
        vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T);
        vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N);
        vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T);
+       vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
+       vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
 
        list_link_init(&vd->vdev_config_dirty_node);
        list_link_init(&vd->vdev_state_dirty_node);
@@ -3755,6 +3757,18 @@ vdev_load(vdev_t *vd)
                if (error && error != ENOENT)
                        vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
                            "failed [error=%d]", (u_longlong_t)zapobj, error);
+
+               error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N,
+                   &vd->vdev_slow_io_n);
+               if (error && error != ENOENT)
+                       vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+                           "failed [error=%d]", (u_longlong_t)zapobj, error);
+
+               error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T,
+                   &vd->vdev_slow_io_t);
+               if (error && error != ENOENT)
+                       vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+                           "failed [error=%d]", (u_longlong_t)zapobj, error);
        }
 
        /*
@@ -5970,6 +5984,20 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
                        }
                        vd->vdev_io_t = intval;
                        break;
+               case VDEV_PROP_SLOW_IO_N:
+                       if (nvpair_value_uint64(elem, &intval) != 0) {
+                               error = EINVAL;
+                               break;
+                       }
+                       vd->vdev_slow_io_n = intval;
+                       break;
+               case VDEV_PROP_SLOW_IO_T:
+                       if (nvpair_value_uint64(elem, &intval) != 0) {
+                               error = EINVAL;
+                               break;
+                       }
+                       vd->vdev_slow_io_t = intval;
+                       break;
                default:
                        /* Most processing is done in vdev_props_set_sync */
                        break;
@@ -6313,6 +6341,8 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
                        case VDEV_PROP_CHECKSUM_T:
                        case VDEV_PROP_IO_N:
                        case VDEV_PROP_IO_T:
+                       case VDEV_PROP_SLOW_IO_N:
+                       case VDEV_PROP_SLOW_IO_T:
                                err = vdev_prop_get_int(vd, prop, &intval);
                                if (err && err != ENOENT)
                                        break;
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index c4eb74e873db2b6bd9fadea01e8c5c3f5b73f38d..481af2ba826b295075d230a863b0fef44399a43f 100644 (file)
@@ -222,6 +222,12 @@ vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop)
                case VDEV_PROP_IO_T:
                        propval = vd->vdev_io_t;
                        break;
+               case VDEV_PROP_SLOW_IO_N:
+                       propval = vd->vdev_slow_io_n;
+                       break;
+               case VDEV_PROP_SLOW_IO_T:
+                       propval = vd->vdev_slow_io_t;
+                       break;
                default:
                        propval = propdef;
                        break;
@@ -741,6 +747,26 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
                            NULL);
        }
 
+       if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
+               uint64_t slow_io_n, slow_io_t;
+
+               slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N);
+               if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N))
+                       fm_payload_set(ereport,
+                           FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
+                           DATA_TYPE_UINT64,
+                           slow_io_n,
+                           NULL);
+
+               slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T);
+               if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T))
+                       fm_payload_set(ereport,
+                           FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
+                           DATA_TYPE_UINT64,
+                           slow_io_t,
+                           NULL);
+       }
+
        mutex_exit(&spa->spa_errlist_lock);
 
        *ereport_out = ereport;
diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c
index 3598351c499dc3ff2dba886793b6ab9199fcb498..609182f4a2cd0826db6c5b0070ff21f57e817090 100644 (file)
@@ -605,6 +605,10 @@ zio_handle_io_delay(zio_t *zio)
                if (vd->vdev_guid != handler->zi_record.zi_guid)
                        continue;
 
+               if (handler->zi_record.zi_iotype != ZIO_TYPES &&
+                   handler->zi_record.zi_iotype != zio->io_type)
+                               continue;
+
                /*
                 * Defensive; should never happen as the array allocation
                 * occurs prior to inserting this handler on the list.
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 6a4cd3fe691cf78b809ada882fdc11989a5ced77..a0b74ef4a8c6daa8f42dfe5781c8b77e82356e05 100644 (file)
@@ -104,7 +104,8 @@ tags = ['functional', 'devices']
 
 [tests/functional/events:Linux]
 tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill',
-    'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config']
+    'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config',
+    'zed_slow_io', 'zed_slow_io_many_vdevs']
 tags = ['functional', 'events']
 
 [tests/functional/fadvise:Linux]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 01af258d59fed4bcd16020b968a382ca645499fa..fe9c92108725dcf55b5d0a0e53dea42f6198579d 100644 (file)
@@ -1447,6 +1447,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
        functional/events/zed_fd_spill.ksh \
        functional/events/zed_io_config.ksh \
        functional/events/zed_rc_filter.ksh \
+       functional/events/zed_slow_io.ksh \
+       functional/events/zed_slow_io_many_vdevs.ksh \
        functional/exec/cleanup.ksh \
        functional/exec/exec_001_pos.ksh \
        functional/exec/exec_002_neg.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
index 71a64d4fae7a6fd4c47bea4f745bc5b9213edd10..c3b9efd6464afb0a9109a0dfb214674472255f52 100644 (file)
@@ -70,4 +70,6 @@ typeset -a properties=(
     checksum_t
     io_n
     io_t
+    slow_io_n
+    slow_io_t
 )
diff --git a/tests/zfs-tests/tests/functional/events/cleanup.ksh b/tests/zfs-tests/tests/functional/events/cleanup.ksh
index ef6e098cf42aefbce0aa615cec1545e256ea98d3..669b8ae99456a17d862ffef4108c319637560c81 100755 (executable)
 
 . $STF_SUITE/include/libtest.shlib
 
+zed_stop
+
 zed_cleanup all-debug.sh all-syslog.sh all-dumpfds
 
-zed_stop
+zed_events_drain
 
 default_cleanup
diff --git a/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh b/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh
new file mode 100755 (executable)
index 0000000..d9fabb2
--- /dev/null
@@ -0,0 +1,205 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2023, Klara Inc.
+#
+
+# DESCRIPTION:
+#      Verify that vdev properties, slow_io_n and slow_io_t, work with ZED.
+#
+# STRATEGY:
+#      1. Create a pool with single vdev
+#      2. Set slow_io_n/slow_io_t to non-default values
+#      3. Inject slow io errors
+#      4. Verify that ZED degrades vdev
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+TESTDIR="$TEST_BASE_DIR/zed_slow_io"
+VDEV="$TEST_BASE_DIR/vdevfile.$$"
+TESTPOOL="slow_io_pool"
+FILEPATH="$TESTDIR/slow_io.testfile"
+
+OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)
+OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND)
+
+verify_runnable "both"
+
+function do_setup
+{
+       log_must truncate -s 1G $VDEV
+       default_setup_noexit $VDEV
+       zed_events_drain
+       log_must zfs set compression=off $TESTPOOL
+       log_must zfs set primarycache=none $TESTPOOL
+       log_must zfs set prefetch=none $TESTPOOL
+       log_must zfs set recordsize=512 $TESTPOOL
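+       # create 10 single-record files; with caching and prefetch off,
+       # each later read of one issues a single disk I/O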
+       for i in {1..10}; do
+               dd if=/dev/urandom of=${FILEPATH}$i bs=512 count=1 2>/dev/null
+       done
+       zpool sync
+}
+
+# intermediate cleanup
+function do_clean
+{
+       log_must zinject -c all
+       log_must zpool destroy $TESTPOOL
+       log_must rm -f $VDEV
+}
+
+# final cleanup
+function cleanup
+{
+       log_must zinject -c all
+
+       # if pool still exists then something failed so log additional info
+       if poolexists $TESTPOOL ; then
+               log_note "$(zpool status -s $TESTPOOL)"
+               echo "=================== zed log search ==================="
+               grep "Diagnosis Engine" $ZEDLET_DIR/zed.log
+               destroy_pool $TESTPOOL
+       fi
+       log_must zed_stop
+
+       log_must rm -f $VDEV
+
+       log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
+       log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
+}
+
+function start_slow_io
+{
+       zpool sync
+       log_must set_tunable64 ZIO_SLOW_IO_MS 10
+       log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000
+
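+       # -D10:1 injects 10ms of latency with a single lane (one I/O at a time)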
+       log_must zinject -d $VDEV -D10:1 -T read $TESTPOOL
+       zpool sync
+}
+
+function stop_slow_io
+{
+       log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
+       log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
+
+       log_must zinject -c all
+}
+
+# Test default ZED settings:
+#    inject 10 events over 2.5 seconds, should not degrade.
+function default_degrade
+{
+       do_setup
+
+       start_slow_io
+       for i in {1..10}; do
+               dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null
+               sleep 0.25
+       done
+       stop_slow_io
+       log_note "$(zpool status -s $TESTPOOL)"
+
+       # give slow ZED a chance to process the delay events
+       sleep 18
+       log_note "$(zpool status -s $TESTPOOL)"
+
+       degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l)
+       log_note $degrades vdev degrades in ZED log
+       [ $degrades -eq "0" ] || \
+               log_fail "expecting no degrade events, found $degrades"
+
+       do_clean
+}
+
+# change slow_io_n, slow_io_t to 5 events in 60 seconds
+# fire more than 5 events, should degrade
+function slow_io_degrade
+{
+       do_setup
+
+       zpool set slow_io_n=5 $TESTPOOL $VDEV
+       zpool set slow_io_t=60 $TESTPOOL $VDEV
+
+       start_slow_io
+       for i in {1..16}; do
+               dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null
+               sleep 0.5
+       done
+       stop_slow_io
+       zpool sync
+
+       #
+       # wait up to 60 seconds for kernel to produce at least 5 delay events
+       #
+       typeset -i i=0
+       typeset -i events=0
+       while [[ $i -lt 60 ]]; do
+               events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l)
+               [[ $events -ge "5" ]] && break
+               i=$((i+1))
+               sleep 1
+       done
+       log_note "$events delay events found"
+
+       if [[ $events -ge "5" ]]; then
+               log_must wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 10
+       fi
+
+       do_clean
+}
+
+# change slow_io_n, slow_io_t to 10 events in 1 second
+# inject events spaced 0.5 seconds apart, should not degrade
+function slow_io_no_degrade
+{
+       do_setup
+
+       zpool set slow_io_n=10 $TESTPOOL $VDEV
+       zpool set slow_io_t=1 $TESTPOOL $VDEV
+
+       start_slow_io
+       for i in {1..16}; do
+               dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null
+               sleep 0.5
+       done
+       stop_slow_io
+       zpool sync
+
+       log_mustnot wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 45
+
+       do_clean
+}
+
+log_assert "Test ZED slow io configurability"
+log_onexit cleanup
+
+log_must zed_events_drain
+log_must zed_start
+
+default_degrade
+slow_io_degrade
+slow_io_no_degrade
+
+log_pass "Test ZED slow io configurability"
diff --git a/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh b/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh
new file mode 100755 (executable)
index 0000000..3357ae2
--- /dev/null
@@ -0,0 +1,177 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2023, Klara Inc.
+#
+
+# DESCRIPTION:
+#      Verify that delay events from multiple vdevs don't trigger a degrade
+#
+# STRATEGY:
+#      1. Create a pool with a 3 disk raidz vdev
+#      2. Inject slow io errors
+#      3. Verify that ZED detects slow I/Os but doesn't degrade any vdevs
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+TESTDIR="$TEST_BASE_DIR/zed_slow_io"
+VDEV1="$TEST_BASE_DIR/vdevfile1.$$"
+VDEV2="$TEST_BASE_DIR/vdevfile2.$$"
+VDEV3="$TEST_BASE_DIR/vdevfile3.$$"
+VDEV4="$TEST_BASE_DIR/vdevfile4.$$"
+VDEVS="$VDEV1 $VDEV2 $VDEV3 $VDEV4"
+TESTPOOL="slow_io_pool"
+FILEPATH="$TESTDIR/slow_io.testfile"
+
+OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)
+OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND)
+
+verify_runnable "both"
+
+function cleanup
+{
+       log_must zinject -c all
+
+       # if pool still exists then something failed so log additional info
+       if poolexists $TESTPOOL ; then
+               log_note "$(zpool status -s $TESTPOOL)"
+               echo "=================== zed log search ==================="
+               grep "Diagnosis Engine" $ZEDLET_DIR/zed.log
+               destroy_pool $TESTPOOL
+       fi
+       log_must zed_stop
+
+       log_must rm -f $VDEVS
+       log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
+       log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
+}
+
+function start_slow_io
+{
+       for vdev in $VDEVS
+       do
+               log_must zpool set slow_io_n=4 $TESTPOOL $vdev
+               log_must zpool set slow_io_t=60 $TESTPOOL $vdev
+       done
+       zpool sync
+
+       log_must set_tunable64 ZIO_SLOW_IO_MS 10
+       log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000
+
+       for vdev in $VDEVS
+       do
+               log_must zinject -d $vdev -D10:1 $TESTPOOL
+       done
+       zpool sync
+}
+
+function stop_slow_io
+{
+       log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
+       log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
+
+       log_must zinject -c all
+}
+
+function multiple_slow_vdevs_test
+{
+       log_must truncate -s 1G $VDEVS
+       default_raidz_setup_noexit $VDEVS
+
+       log_must zpool events -c
+       log_must zfs set compression=off $TESTPOOL
+       log_must zfs set primarycache=none $TESTPOOL
+       log_must zfs set recordsize=4K $TESTPOOL
+
+       log_must dd if=/dev/urandom of=$FILEPATH bs=1M count=20
+       zpool sync
+
+       #
+       # Read the file with slow io injected on the disks
+       # This will cause multiple errors on each disk to trip ZED SERD
+       #
+       #   pool: slow_io_pool
+       #  state: ONLINE
+       # config:
+       #
+       #         NAME                           STATE  READ WRITE CKSUM  SLOW
+       #         slow_io_pool                   ONLINE    0     0     0     -
+       #           raidz1-0                     ONLINE    0     0     0     -
+       #             /var/tmp/vdevfile1.499278  ONLINE    0     0     0   113
+       #             /var/tmp/vdevfile2.499278  ONLINE    0     0     0   109
+       #             /var/tmp/vdevfile3.499278  ONLINE    0     0     0    96
+       #             /var/tmp/vdevfile4.499278  ONLINE    0     0     0   109
+       #
+       start_slow_io
+       dd if=$FILEPATH of=/dev/null bs=1M count=20 2>/dev/null
+       stop_slow_io
+
+       # count events available for processing
+       typeset -i i=0
+       typeset -i events=0
+       while [[ $i -lt 60 ]]; do
+               events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l)
+               [[ $events -ge "50" ]] && break
+               i=$((i+1))
+               sleep 1
+       done
+       log_note "$events delay events found"
+       if [[ $events -lt "50" ]]; then
+               log_note "bailing: not enough events to complete the test"
+               destroy_pool $TESTPOOL
+               return
+       fi
+
+       #
+       # give slow ZED a chance to process the delay events
+       #
+       typeset -i i=0
+       typeset -i skips=0
+       while [[ $i -lt 75 ]]; do
+               skips=$(grep "retiring case" \
+                       $ZEDLET_DIR/zed.log | wc -l)
+               [[ $skips -gt "0" ]] && break
+               i=$((i+1))
+               sleep 1
+       done
+
+       log_note $skips degrade skips in ZED log after $i seconds
+       [ $skips -gt "0" ] || log_fail "expecting to see skips"
+
+       degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l)
+       log_note $degrades vdev degrades in ZED log
+       [ $degrades -eq "0" ] || \
+               log_fail "expecting no degrade events, found $degrades"
+
+       destroy_pool $TESTPOOL
+}
+
+log_assert "Test ZED slow io across multiple vdevs"
+log_onexit cleanup
+
+log_must zed_events_drain
+log_must zed_start
+multiple_slow_vdevs_test
+
+log_pass "Test ZED slow io across multiple vdevs"
diff --git a/tests/zfs-tests/tests/functional/fault/cleanup.ksh b/tests/zfs-tests/tests/functional/fault/cleanup.ksh
index 654343c0cf00d94f7a6c64db9df3e47c0889677c..2959236b59a37e24de42613d94f236d08f8fef7f 100755 (executable)
@@ -32,5 +32,6 @@ cleanup_devices $DISKS
 
 zed_stop
 zed_cleanup resilver_finish-start-scrub.sh
+zed_events_drain
 
 log_pass
diff --git a/tests/zfs-tests/tests/functional/fault/setup.ksh b/tests/zfs-tests/tests/functional/fault/setup.ksh
index 62f1c8ab56cb5856c09256cd80102c10a9338fae..61b9206ec1a65624b4bb9c34763e49c9463a256a 100755 (executable)
@@ -28,6 +28,7 @@
 
 verify_runnable "global"
 
+zed_events_drain
 zed_setup resilver_finish-start-scrub.sh
 zed_start