git.proxmox.com Git - mirror_zfs.git/blobdiff - module/zfs/spa_misc.c
Extend import_progress kstat with a notes field
[mirror_zfs.git] / module / zfs / spa_misc.c
index 53763e915ca83b2a959bc5efde2ec0425179a178..1e5ab59eb4d06ed25ce32ee86c9e6a4a19d62770 100644 (file)
@@ -27,6 +27,7 @@
  * Copyright (c) 2017 Datto Inc.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright (c) 2023, Klara Inc.
  */
 
 #include <sys/zfs_context.h>
@@ -57,6 +58,7 @@
 #include <sys/fs/zfs.h>
 #include <sys/metaslab_impl.h>
 #include <sys/arc.h>
+#include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/kstat.h>
 #include "zfs_prop.h"
@@ -386,8 +388,17 @@ uint_t spa_asize_inflation = 24;
 uint_t spa_slop_shift = 5;
 static const uint64_t spa_min_slop = 128ULL * 1024 * 1024;
 static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
-static const int spa_allocators = 4;
 
+/*
+ * Number of allocators to use, per spa instance
+ */
+static int spa_num_allocators = 4;
+
+/*
+ * Spa active allocator.
+ * Valid values are zfs_active_allocator=<dynamic|cursor|new-dynamic>.
+ */
+const char *zfs_active_allocator = "dynamic";
 
 void
 spa_load_failed(spa_t *spa, const char *fmt, ...)
@@ -415,6 +426,8 @@ spa_load_note(spa_t *spa, const char *fmt, ...)
 
        zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
            spa->spa_trust_config ? "trusted" : "untrusted", buf);
+
+       spa_import_progress_set_notes_nolog(spa, "%s", buf);
 }
 
 /*
@@ -492,8 +505,9 @@ spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw)
        return (1);
 }
 
-void
-spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
+static void
+spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw,
+    int mmp_flag)
 {
        (void) tag;
        int wlocks_held = 0;
@@ -508,7 +522,8 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
                        continue;
                mutex_enter(&scl->scl_lock);
                if (rw == RW_READER) {
-                       while (scl->scl_writer || scl->scl_write_wanted) {
+                       while (scl->scl_writer ||
+                           (!mmp_flag && scl->scl_write_wanted)) {
                                cv_wait(&scl->scl_cv, &scl->scl_lock);
                        }
                } else {
@@ -526,6 +541,27 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
        ASSERT3U(wlocks_held, <=, locks);
 }
 
+void
+spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
+{
+       spa_config_enter_impl(spa, locks, tag, rw, 0);
+}
+
+/*
+ * The spa_config_enter_mmp() allows the mmp thread to cut in front of
+ * outstanding write lock requests. This is needed since the mmp updates are
+ * time sensitive and failure to service them promptly will result in a
+ * suspended pool. This pool suspension has been seen in practice when there is
+ * a single disk in a pool that is responding slowly and presumably about to
+ * fail.
+ */
+
+void
+spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw)
+{
+       spa_config_enter_impl(spa, locks, tag, rw, 1);
+}
+
 void
 spa_config_exit(spa_t *spa, int locks, const void *tag)
 {
@@ -686,6 +722,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
        spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
        spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms);
        spa_set_deadman_failmode(spa, zfs_deadman_failmode);
+       spa_set_allocator(spa, zfs_active_allocator);
 
        zfs_refcount_create(&spa->spa_refcount);
        spa_config_lock_init(spa);
@@ -699,15 +736,18 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
        if (altroot)
                spa->spa_root = spa_strdup(altroot);
 
-       spa->spa_alloc_count = spa_allocators;
+       /* Do not allow more allocators than CPUs. */
+       spa->spa_alloc_count = MIN(MAX(spa_num_allocators, 1), boot_ncpus);
+
        spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
            sizeof (spa_alloc_t), KM_SLEEP);
        for (int i = 0; i < spa->spa_alloc_count; i++) {
                mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
                    NULL);
                avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
-                   sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+                   sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
        }
+
        avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
            sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
        avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg,
@@ -748,6 +788,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
        spa->spa_min_ashift = INT_MAX;
        spa->spa_max_ashift = 0;
        spa->spa_min_alloc = INT_MAX;
+       spa->spa_gcd_alloc = INT_MAX;
 
        /* Reset cached value */
        spa->spa_dedup_dspace = ~0ULL;
@@ -790,8 +831,7 @@ spa_remove(spa_t *spa)
        if (spa->spa_root)
                spa_strfree(spa->spa_root);
 
-       while ((dp = list_head(&spa->spa_config_list)) != NULL) {
-               list_remove(&spa->spa_config_list, dp);
+       while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) {
                if (dp->scd_path != NULL)
                        spa_strfree(dp->scd_path);
                kmem_free(dp, sizeof (spa_config_dirent_t));
@@ -1834,7 +1874,7 @@ void
 spa_update_dspace(spa_t *spa)
 {
        spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
-           ddt_get_dedup_dspace(spa);
+           ddt_get_dedup_dspace(spa) + brt_get_dspace(spa);
        if (spa->spa_nonallocating_dspace > 0) {
                /*
                 * Subtract the space provided by all non-allocating vdevs that
@@ -2146,6 +2186,7 @@ typedef struct spa_import_progress {
        uint64_t                pool_guid;      /* unique id for updates */
        char                    *pool_name;
        spa_load_state_t        spa_load_state;
+       char                    *spa_load_notes;
        uint64_t                mmp_sec_remaining;      /* MMP activity check */
        uint64_t                spa_load_max_txg;       /* rewind txg */
        procfs_list_node_t      smh_node;
@@ -2156,9 +2197,9 @@ spa_history_list_t *spa_import_progress_list = NULL;
 static int
 spa_import_progress_show_header(struct seq_file *f)
 {
-       seq_printf(f, "%-20s %-14s %-14s %-12s %s\n", "pool_guid",
+       seq_printf(f, "%-20s %-14s %-14s %-12s %-16s %s\n", "pool_guid",
            "load_state", "multihost_secs", "max_txg",
-           "pool_name");
+           "pool_name", "notes");
        return (0);
 }
 
@@ -2167,11 +2208,12 @@ spa_import_progress_show(struct seq_file *f, void *data)
 {
        spa_import_progress_t *sip = (spa_import_progress_t *)data;
 
-       seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n",
+       seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %-16s %s\n",
            (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state,
            (u_longlong_t)sip->mmp_sec_remaining,
            (u_longlong_t)sip->spa_load_max_txg,
-           (sip->pool_name ? sip->pool_name : "-"));
+           (sip->pool_name ? sip->pool_name : "-"),
+           (sip->spa_load_notes ? sip->spa_load_notes : "-"));
 
        return (0);
 }
@@ -2185,6 +2227,8 @@ spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size)
                sip = list_remove_head(&shl->procfs_list.pl_list);
                if (sip->pool_name)
                        spa_strfree(sip->pool_name);
+               if (sip->spa_load_notes)
+                       kmem_strfree(sip->spa_load_notes);
                kmem_free(sip, sizeof (spa_import_progress_t));
                shl->size--;
        }
@@ -2240,6 +2284,10 @@ spa_import_progress_set_state(uint64_t pool_guid,
            sip = list_prev(&shl->procfs_list.pl_list, sip)) {
                if (sip->pool_guid == pool_guid) {
                        sip->spa_load_state = load_state;
+                       if (sip->spa_load_notes != NULL) {
+                               kmem_strfree(sip->spa_load_notes);
+                               sip->spa_load_notes = NULL;
+                       }
                        error = 0;
                        break;
                }
@@ -2249,6 +2297,59 @@ spa_import_progress_set_state(uint64_t pool_guid,
        return (error);
 }
 
+static void
+spa_import_progress_set_notes_impl(spa_t *spa, boolean_t log_dbgmsg,
+    const char *fmt, va_list adx)
+{
+       spa_history_list_t *shl = spa_import_progress_list;
+       spa_import_progress_t *sip;
+       uint64_t pool_guid = spa_guid(spa);
+
+       if (shl->size == 0)
+               return;
+
+       char *notes = kmem_vasprintf(fmt, adx);
+
+       mutex_enter(&shl->procfs_list.pl_lock);
+       for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
+           sip = list_prev(&shl->procfs_list.pl_list, sip)) {
+               if (sip->pool_guid == pool_guid) {
+                       if (sip->spa_load_notes != NULL) {
+                               kmem_strfree(sip->spa_load_notes);
+                               sip->spa_load_notes = NULL;
+                       }
+                       sip->spa_load_notes = notes;
+                       if (log_dbgmsg)
+                               zfs_dbgmsg("'%s' %s", sip->pool_name, notes);
+                       notes = NULL;
+                       break;
+               }
+       }
+       mutex_exit(&shl->procfs_list.pl_lock);
+       if (notes != NULL)
+               kmem_strfree(notes);
+}
+
+void
+spa_import_progress_set_notes(spa_t *spa, const char *fmt, ...)
+{
+       va_list adx;
+
+       va_start(adx, fmt);
+       spa_import_progress_set_notes_impl(spa, B_TRUE, fmt, adx);
+       va_end(adx);
+}
+
+void
+spa_import_progress_set_notes_nolog(spa_t *spa, const char *fmt, ...)
+{
+       va_list adx;
+
+       va_start(adx, fmt);
+       spa_import_progress_set_notes_impl(spa, B_FALSE, fmt, adx);
+       va_end(adx);
+}
+
 int
 spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg)
 {
@@ -2306,7 +2407,7 @@ spa_import_progress_add(spa_t *spa)
 {
        spa_history_list_t *shl = spa_import_progress_list;
        spa_import_progress_t *sip;
-       char *poolname = NULL;
+       const char *poolname = NULL;
 
        sip = kmem_zalloc(sizeof (spa_import_progress_t), KM_SLEEP);
        sip->pool_guid = spa_guid(spa);
@@ -2317,6 +2418,7 @@ spa_import_progress_add(spa_t *spa)
                poolname = spa_name(spa);
        sip->pool_name = spa_strdup(poolname);
        sip->spa_load_state = spa_load_state(spa);
+       sip->spa_load_notes = NULL;
 
        mutex_enter(&shl->procfs_list.pl_lock);
        procfs_list_add(&shl->procfs_list, sip);
@@ -2336,6 +2438,8 @@ spa_import_progress_remove(uint64_t pool_guid)
                if (sip->pool_guid == pool_guid) {
                        if (sip->pool_name)
                                spa_strfree(sip->pool_name);
+                       if (sip->spa_load_notes)
+                               spa_strfree(sip->spa_load_notes);
                        list_remove(&shl->procfs_list.pl_list, sip);
                        shl->size--;
                        kmem_free(sip, sizeof (spa_import_progress_t));
@@ -2410,11 +2514,11 @@ spa_init(spa_mode_t mode)
        unique_init();
        zfs_btree_init();
        metaslab_stat_init();
+       brt_init();
        ddt_init();
        zio_init();
        dmu_init();
        zil_init();
-       vdev_cache_stat_init();
        vdev_mirror_stat_init();
        vdev_raidz_math_init();
        vdev_file_init();
@@ -2438,7 +2542,6 @@ spa_fini(void)
        spa_evict_all();
 
        vdev_file_fini();
-       vdev_cache_stat_fini();
        vdev_mirror_stat_fini();
        vdev_raidz_math_fini();
        chksum_fini();
@@ -2446,6 +2549,7 @@ spa_fini(void)
        dmu_fini();
        zio_fini();
        ddt_fini();
+       brt_fini();
        metaslab_stat_fini();
        zfs_btree_fini();
        unique_fini();
@@ -2553,9 +2657,18 @@ spa_scan_stat_init(spa_t *spa)
                spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start;
        else
                spa->spa_scan_pass_scrub_pause = 0;
+
+       if (dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan))
+               spa->spa_scan_pass_errorscrub_pause = spa->spa_scan_pass_start;
+       else
+               spa->spa_scan_pass_errorscrub_pause = 0;
+
        spa->spa_scan_pass_scrub_spent_paused = 0;
        spa->spa_scan_pass_exam = 0;
        spa->spa_scan_pass_issued = 0;
+
+       // error scrub stats
+       spa->spa_scan_pass_errorscrub_spent_paused = 0;
 }
 
 /*
@@ -2566,8 +2679,10 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
 {
        dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
 
-       if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
+       if (scn == NULL || (scn->scn_phys.scn_func == POOL_SCAN_NONE &&
+           scn->errorscrub_phys.dep_func == POOL_SCAN_NONE))
                return (SET_ERROR(ENOENT));
+
        memset(ps, 0, sizeof (pool_scan_stat_t));
 
        /* data stored on disk */
@@ -2577,7 +2692,7 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
        ps->pss_end_time = scn->scn_phys.scn_end_time;
        ps->pss_to_examine = scn->scn_phys.scn_to_examine;
        ps->pss_examined = scn->scn_phys.scn_examined;
-       ps->pss_to_process = scn->scn_phys.scn_to_process;
+       ps->pss_skipped = scn->scn_phys.scn_skipped;
        ps->pss_processed = scn->scn_phys.scn_processed;
        ps->pss_errors = scn->scn_phys.scn_errors;
 
@@ -2590,6 +2705,18 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
        ps->pss_issued =
            scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
 
+       /* error scrub data stored on disk */
+       ps->pss_error_scrub_func = scn->errorscrub_phys.dep_func;
+       ps->pss_error_scrub_state = scn->errorscrub_phys.dep_state;
+       ps->pss_error_scrub_start = scn->errorscrub_phys.dep_start_time;
+       ps->pss_error_scrub_end = scn->errorscrub_phys.dep_end_time;
+       ps->pss_error_scrub_examined = scn->errorscrub_phys.dep_examined;
+       ps->pss_error_scrub_to_be_examined =
+           scn->errorscrub_phys.dep_to_examine;
+
+       /* error scrub data not stored on disk */
+       ps->pss_pass_error_scrub_pause = spa->spa_scan_pass_errorscrub_pause;
+
        return (0);
 }
 
@@ -2709,8 +2836,7 @@ spa_state_to_name(spa_t *spa)
        vdev_state_t state = rvd->vdev_state;
        vdev_aux_t aux = rvd->vdev_stat.vs_aux;
 
-       if (spa_suspended(spa) &&
-           (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE))
+       if (spa_suspended(spa))
                return ("SUSPENDED");
 
        switch (state) {
@@ -2956,3 +3082,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
        param_get_uint, ZMOD_RW, "Reserved free space in pool");
+
+ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW,
+       "Number of allocators per spa, capped by ncpus");