git.proxmox.com Git - mirror_zfs.git/commitdiff
Avoid fault diagnosis if multiple vdevs have errors
author    Don Brady <don.brady@klarasystems.com>
          Wed, 18 Sep 2024 18:36:48 +0000 (12:36 -0600)
committer GitHub <noreply@github.com>
          Wed, 18 Sep 2024 18:36:48 +0000 (11:36 -0700)
When multiple drives are throwing errors, it is likely not
a drive failing but rather a failure above the drives, such
as a controller.  The active cases on a drive's peers are
now taken into account when making a diagnosis.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Closes #16531
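
For illustration, here is a minimal standalone sketch of the decision rule the
diff below adds to the zed diagnosis engine.  The struct and function names
(demo_case_t, other_active_peer_cases, should_fault) are hypothetical stand-ins,
not the real FMA/zed API; only the rule itself mirrors zfs_other_serd_cases()
and zfs_fm_serd_record(): a SERD engine firing for a leaf vdev produces a fault
only when no sibling under the same parent vdev has an active case of its own,
otherwise the case is retired on the assumption that a shared component such as
a controller is at fault.

/*
 * Minimal standalone sketch of the peer-aware fault suppression added in
 * this commit.  The types and helpers here are illustrative stand-ins,
 * not the zed agent code; only the decision rule follows the diff.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct demo_case {
	uint64_t pool_guid;
	uint64_t parent_guid;	/* parent vdev, e.g. the raidz group */
	uint64_t vdev_guid;	/* leaf vdev */
	bool	 serd_active;	/* a SERD engine of this type is tracking it */
} demo_case_t;

/* Count other leaves under the same parent that have an active case. */
static unsigned
other_active_peer_cases(const demo_case_t *cases, size_t n,
    const demo_case_t *me)
{
	unsigned peers = 0;

	for (size_t i = 0; i < n; i++) {
		const demo_case_t *c = &cases[i];

		/* must be same pool and parent vdev but a different leaf */
		if (c->pool_guid != me->pool_guid ||
		    c->parent_guid != me->parent_guid ||
		    c->vdev_guid == me->vdev_guid)
			continue;
		if (c->serd_active)
			peers++;
	}
	return (peers);
}

/* Returns true only when the leaf vdev should actually be faulted. */
static bool
should_fault(const demo_case_t *cases, size_t n, const demo_case_t *me,
    bool serd_fired)
{
	if (!serd_fired)
		return (false);
	if (other_active_peer_cases(cases, n, me) > 0) {
		/* peers are failing too: suspect the shared path, retire */
		return (false);
	}
	return (true);
}

int
main(void)
{
	/* four leaves of one raidz group, all tripping their SERD engines */
	demo_case_t cases[] = {
		{ 1, 100, 1001, true },
		{ 1, 100, 1002, true },
		{ 1, 100, 1003, true },
		{ 1, 100, 1004, true },
	};

	printf("fault vdev 1001? %s\n",
	    should_fault(cases, 4, &cases[0], true) ? "yes" : "no");
	return (0);
}

Built as an ordinary C program, this prints "fault vdev 1001? no" because all
four leaves of the raidz group are reporting errors, matching the new behavior
of skipping the individual-drive fault when the peers look just as sick.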

cmd/zed/agents/zfs_diagnosis.c
tests/runfiles/linux.run
tests/zfs-tests/tests/Makefile.am
tests/zfs-tests/tests/functional/events/zed_diagnose_multiple.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh

index e35cd0756c60b35e53ed03bb9cdf69c150cf77c3..b45b89af430fb12266160db35bc4fe7e063c54f3 100644 (file)
@@ -71,6 +71,7 @@ typedef struct zfs_case_data {
        uint64_t        zc_ena;
        uint64_t        zc_pool_guid;
        uint64_t        zc_vdev_guid;
+       uint64_t        zc_parent_guid;
        int             zc_pool_state;
        char            zc_serd_checksum[MAX_SERDLEN];
        char            zc_serd_io[MAX_SERDLEN];
@@ -181,10 +182,10 @@ zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
 }
 
 /*
- * count other unique slow-io cases in a pool
+ * Return count of other unique SERD cases under same vdev parent
  */
 static uint_t
-zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case)
+zfs_other_serd_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case)
 {
        zfs_case_t *zcp;
        uint_t cases = 0;
@@ -206,10 +207,32 @@ zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case)
 
        for (zcp = uu_list_first(zfs_cases); zcp != NULL;
            zcp = uu_list_next(zfs_cases, zcp)) {
-               if (zcp->zc_data.zc_pool_guid == zfs_case->zc_pool_guid &&
-                   zcp->zc_data.zc_vdev_guid != zfs_case->zc_vdev_guid &&
-                   zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
-                   fmd_serd_active(hdl, zcp->zc_data.zc_serd_slow_io)) {
+               zfs_case_data_t *zcd = &zcp->zc_data;
+
+               /*
+                * must be same pool and parent vdev but different leaf vdev
+                */
+               if (zcd->zc_pool_guid != zfs_case->zc_pool_guid ||
+                   zcd->zc_parent_guid != zfs_case->zc_parent_guid ||
+                   zcd->zc_vdev_guid == zfs_case->zc_vdev_guid) {
+                       continue;
+               }
+
+               /*
+                * Check if there is another active serd case besides zfs_case
+                *
+                * Only one serd engine will be assigned to the case
+                */
+               if (zcd->zc_serd_checksum[0] == zfs_case->zc_serd_checksum[0] &&
+                   fmd_serd_active(hdl, zcd->zc_serd_checksum)) {
+                       cases++;
+               }
+               if (zcd->zc_serd_io[0] == zfs_case->zc_serd_io[0] &&
+                   fmd_serd_active(hdl, zcd->zc_serd_io)) {
+                       cases++;
+               }
+               if (zcd->zc_serd_slow_io[0] == zfs_case->zc_serd_slow_io[0] &&
+                   fmd_serd_active(hdl, zcd->zc_serd_slow_io)) {
                        cases++;
                }
        }
@@ -502,6 +525,34 @@ zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when)
        }
 }
 
+/*
+ * Record the specified event in the SERD engine and return a
+ * boolean value indicating whether or not the engine fired as
+ * the result of inserting this event.
+ *
+ * When the pool has similar active cases on other vdevs, then
+ * the fired state is disregarded and the case is retired.
+ */
+static int
+zfs_fm_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep,
+    zfs_case_t *zcp, const char *err_type)
+{
+       int fired = fmd_serd_record(hdl, name, ep);
+       int peers = 0;
+
+       if (fired && (peers = zfs_other_serd_cases(hdl, &zcp->zc_data)) > 0) {
+               fmd_hdl_debug(hdl, "pool %llu is tracking %d other %s cases "
+                   "-- skip faulting the vdev %llu",
+                   (u_longlong_t)zcp->zc_data.zc_pool_guid,
+                   peers, err_type,
+                   (u_longlong_t)zcp->zc_data.zc_vdev_guid);
+               zfs_case_retire(hdl, zcp);
+               fired = 0;
+       }
+
+       return (fired);
+}
+
 /*
  * Main fmd entry point.
  */
@@ -510,7 +561,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 {
        zfs_case_t *zcp, *dcp;
        int32_t pool_state;
-       uint64_t ena, pool_guid, vdev_guid;
+       uint64_t ena, pool_guid, vdev_guid, parent_guid;
        uint64_t checksum_n, checksum_t;
        uint64_t io_n, io_t;
        er_timeval_t pool_load;
@@ -600,6 +651,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
        if (nvlist_lookup_uint64(nvl,
            FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
                vdev_guid = 0;
+       if (nvlist_lookup_uint64(nvl,
+           FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, &parent_guid) != 0)
+               parent_guid = 0;
        if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
                ena = 0;
 
@@ -710,6 +764,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
                data.zc_ena = ena;
                data.zc_pool_guid = pool_guid;
                data.zc_vdev_guid = vdev_guid;
+               data.zc_parent_guid = parent_guid;
                data.zc_pool_state = (int)pool_state;
 
                fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));
@@ -872,8 +927,10 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
                                    SEC2NSEC(io_t));
                                zfs_case_serialize(zcp);
                        }
-                       if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
+                       if (zfs_fm_serd_record(hdl, zcp->zc_data.zc_serd_io,
+                           ep, zcp, "io error")) {
                                checkremove = B_TRUE;
+                       }
                } else if (fmd_nvl_class_match(hdl, nvl,
                    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) {
                        uint64_t slow_io_n, slow_io_t;
@@ -899,25 +956,10 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
                        }
                        /* Pass event to SERD engine and see if this triggers */
                        if (zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
-                           fmd_serd_record(hdl, zcp->zc_data.zc_serd_slow_io,
-                           ep)) {
-                               /*
-                                * Ignore a slow io diagnosis when other
-                                * VDEVs in the pool show signs of being slow.
-                                */
-                               if (zfs_other_slow_cases(hdl, &zcp->zc_data)) {
-                                       zfs_case_retire(hdl, zcp);
-                                       fmd_hdl_debug(hdl, "pool %llu has "
-                                           "multiple slow io cases -- skip "
-                                           "degrading vdev %llu",
-                                           (u_longlong_t)
-                                           zcp->zc_data.zc_pool_guid,
-                                           (u_longlong_t)
-                                           zcp->zc_data.zc_vdev_guid);
-                               } else {
-                                       zfs_case_solve(hdl, zcp,
-                                           "fault.fs.zfs.vdev.slow_io");
-                               }
+                           zfs_fm_serd_record(hdl,
+                           zcp->zc_data.zc_serd_slow_io, ep, zcp, "slow io")) {
+                               zfs_case_solve(hdl, zcp,
+                                   "fault.fs.zfs.vdev.slow_io");
                        }
                } else if (fmd_nvl_class_match(hdl, nvl,
                    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
@@ -968,8 +1010,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
                                    SEC2NSEC(checksum_t));
                                zfs_case_serialize(zcp);
                        }
-                       if (fmd_serd_record(hdl,
-                           zcp->zc_data.zc_serd_checksum, ep)) {
+                       if (zfs_fm_serd_record(hdl,
+                           zcp->zc_data.zc_serd_checksum, ep, zcp,
+                           "checksum")) {
                                zfs_case_solve(hdl, zcp,
                                    "fault.fs.zfs.vdev.checksum");
                        }
index 4613c895b0cdc4ae02abc3119422b40cdd707d63..d791ce57c3febe9ce35acfcc818ea448fe9250e3 100644 (file)
@@ -109,7 +109,7 @@ tags = ['functional', 'direct']
 [tests/functional/events:Linux]
 tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill',
     'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config',
-    'zed_slow_io', 'zed_slow_io_many_vdevs']
+    'zed_slow_io', 'zed_slow_io_many_vdevs', 'zed_diagnose_multiple']
 tags = ['functional', 'events']
 
 [tests/functional/fadvise:Linux]
index 053a2c09f64944501eb9c80875e32059457719bb..131624c2e8efc9108d30eca913d1f7f80b165b31 100644 (file)
@@ -1490,6 +1490,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
        functional/events/setup.ksh \
        functional/events/zed_cksum_config.ksh \
        functional/events/zed_cksum_reported.ksh \
+       functional/events/zed_diagnose_multiple.ksh \
        functional/events/zed_fd_spill.ksh \
        functional/events/zed_io_config.ksh \
        functional/events/zed_rc_filter.ksh \
diff --git a/tests/zfs-tests/tests/functional/events/zed_diagnose_multiple.ksh b/tests/zfs-tests/tests/functional/events/zed_diagnose_multiple.ksh
new file mode 100755 (executable)
index 0000000..aea9f1d
--- /dev/null
@@ -0,0 +1,168 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024, Klara Inc.
+#
+
+# DESCRIPTION:
+#	Verify that simultaneous io error events from multiple vdevs
+#	don't generate a fault
+#
+# STRATEGY:
+#      1. Create a pool with a 4 disk raidz vdev
+#      2. Inject io errors
+#      3. Verify that ZED detects the errors but doesn't fault any vdevs
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+TESTDIR="$TEST_BASE_DIR/zed_error_multiple"
+VDEV1="$TEST_BASE_DIR/vdevfile1.$$"
+VDEV2="$TEST_BASE_DIR/vdevfile2.$$"
+VDEV3="$TEST_BASE_DIR/vdevfile3.$$"
+VDEV4="$TEST_BASE_DIR/vdevfile4.$$"
+VDEVS="$VDEV1 $VDEV2 $VDEV3 $VDEV4"
+TESTPOOL="zed_test_pool"
+FILEPATH="$TESTDIR/zed.testfile"
+
+verify_runnable "both"
+
+function cleanup
+{
+       log_must zinject -c all
+
+       # if pool still exists then something failed so log additional info
+       if poolexists $TESTPOOL ; then
+               log_note "$(zpool status -s $TESTPOOL)"
+               echo "=================== zed log search ==================="
+               grep "Diagnosis Engine" $ZEDLET_DIR/zed.log
+               destroy_pool $TESTPOOL
+       fi
+       log_must zed_stop
+
+       log_must rm -f $VDEVS
+}
+
+function start_io_errors
+{
+       for vdev in $VDEVS
+       do
+               log_must zpool set io_n=4 $TESTPOOL $vdev
+               log_must zpool set io_t=60 $TESTPOOL $vdev
+       done
+       zpool sync
+
+       for vdev in $VDEVS
+       do
+               log_must zinject -d $vdev -e io $TESTPOOL
+       done
+       zpool sync
+}
+
+function multiple_slow_vdevs_test
+{
+       log_must truncate -s 1G $VDEVS
+       default_raidz_setup_noexit $VDEVS
+
+       log_must zpool events -c
+       log_must zfs set compression=off $TESTPOOL
+       log_must zfs set primarycache=none $TESTPOOL
+       log_must zfs set recordsize=4K $TESTPOOL
+
+       log_must dd if=/dev/urandom of=$FILEPATH bs=1M count=4
+       zpool sync
+
+       #
+       # Read the file with io errors injected on the disks
+       # This will cause multiple errors on each disk to trip ZED SERD
+       #
+       #   pool: zed_test_pool
+       #  state: ONLINE
+       # status: One or more devices has experienced an unrecoverable error.  An
+       #         attempt was made to correct the error.  Applications are unaffected.
+       # action: Determine if the device needs to be replaced, and clear the errors
+       #         using 'zpool clear' or replace the device with 'zpool replace'.
+       #    see: https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-9P
+       # config:
+       #
+       #         NAME                            STATE     READ WRITE CKSUM
+       #         zed_test_pool                   ONLINE       0     0     0
+       #           raidz1-0                      ONLINE       0     0     0
+       #             /var/tmp/vdevfile1.1547063  ONLINE     532   561     0
+       #             /var/tmp/vdevfile2.1547063  ONLINE     547   594     0
+       #             /var/tmp/vdevfile3.1547063  ONLINE   1.05K 1.10K     0
+       #             /var/tmp/vdevfile4.1547063  ONLINE   1.05K 1.00K     0
+       #
+
+       start_io_errors
+       dd if=$FILEPATH of=/dev/null bs=1M count=4 2>/dev/null
+       log_must zinject -c all
+
+       # count io error events available for processing
+       typeset -i i=0
+       typeset -i events=0
+       while [[ $i -lt 60 ]]; do
+               events=$(zpool events | grep "ereport\.fs\.zfs.io" | wc -l)
+               [[ $events -ge "50" ]] && break
+               i=$((i+1))
+               sleep 1
+       done
+       log_note "$events io error events found"
+       if [[ $events -lt "50" ]]; then
+               log_note "bailing: not enough events to complete the test"
+               destroy_pool $TESTPOOL
+               return
+       fi
+
+       #
+	# give slow ZED a chance to process the io error events
+       #
+       typeset -i i=0
+       typeset -i skips=0
+       while [[ $i -lt 75 ]]; do
+               skips=$(grep "retiring case" \
+                       $ZEDLET_DIR/zed.log | wc -l)
+               [[ $skips -gt "0" ]] && break
+               i=$((i+1))
+               sleep 1
+       done
+
+       log_note $skips fault skips in ZED log after $i seconds
+       [ $skips -gt "0" ] || log_fail "expecting to see skips"
+
+       fault=$(grep "zpool_vdev_fault" $ZEDLET_DIR/zed.log | wc -l)
+       log_note $fault vdev fault in ZED log
+       [ $fault -eq "0" ] || \
+               log_fail "expecting no fault events, found $fault"
+
+       destroy_pool $TESTPOOL
+}
+
+log_assert "Test ZED io errors across multiple vdevs"
+log_onexit cleanup
+
+log_must zed_events_drain
+log_must zed_start
+multiple_slow_vdevs_test
+
+log_pass "Test ZED io errors across multiple vdevs"
index 3357ae2e35106a8c70f338d123adbd9306f41c25..e27ef91cbdc7d6f1732ade7baa79bee5ddf705d9 100755 (executable)
 #
 
 # DESCRIPTION:
-#      Verify that delay events from multiple vdevs doesnt degrade
+#      Verify that delay events from multiple vdevs doesn't degrade
 #
 # STRATEGY:
-#      1. Create a pool with a 3 disk raidz vdev
+#      1. Create a pool with a 4 disk raidz vdev
 #      2. Inject slow io errors
 #      3. Verify that ZED detects slow I/Os but doesn't degrade any vdevs
 #