Improve ZFS objset sync parallelism

author ednadolski-ix <137826107+ednadolski-ix@users.noreply.github.com>

Mon, 6 Nov 2023 18:38:42 +0000 (11:38 -0700)

committer GitHub <noreply@github.com>

Mon, 6 Nov 2023 18:38:42 +0000 (10:38 -0800)
author ednadolski-ix <137826107+ednadolski-ix@users.noreply.github.com>
Mon, 6 Nov 2023 18:38:42 +0000 (11:38 -0700)
committer GitHub <noreply@github.com>
Mon, 6 Nov 2023 18:38:42 +0000 (10:38 -0800)
diff --git a/include/os/freebsd/spl/sys/taskq.h b/include/os/freebsd/spl/sys/taskq.h

index b23a939b3aa717a31c8d760b2560c3444dc2c08d..0f23eafe3d4e60061360908b43670e3ef25a0570 100644 (file)
--- a/include/os/freebsd/spl/sys/taskq.h
+++ b/include/os/freebsd/spl/sys/taskq.h
@@ -42,6 +42,7 @@ extern "C" {
  
  typedef struct taskq {
         struct taskqueue        *tq_queue;
+       int                     tq_nthreads;
  } taskq_t;
  
  typedef uintptr_t taskqid_t;
@@ -93,6 +94,8 @@ extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
      taskq_ent_t *);
  extern int taskq_empty_ent(taskq_ent_t *);
  taskq_t        *taskq_create(const char *, int, pri_t, int, int, uint_t);
+taskq_t        *taskq_create_synced(const char *, int, pri_t, int, int, uint_t,
+    kthread_t ***);
  taskq_t        *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t);
  taskq_t        *taskq_create_proc(const char *, int, pri_t, int, int,
      struct proc *, uint_t);
diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h

index 6c1b4377a98a71a939fa52286e2c1436ca16a371..aa5860c56e83776363fb98776f9c63e2181b2756 100644 (file)
--- a/include/os/linux/spl/sys/taskq.h
+++ b/include/os/linux/spl/sys/taskq.h
@@ -150,6 +150,8 @@ extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
  extern int taskq_empty_ent(taskq_ent_t *);
  extern void taskq_init_ent(taskq_ent_t *);
  extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
+extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t,
+    kthread_t ***);
  extern void taskq_destroy(taskq_t *);
  extern void taskq_wait_id(taskq_t *, taskqid_t);
  extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
diff --git a/include/sys/spa.h b/include/sys/spa.h

index 88ef510b744b43b99b5a907b84ee77dd62937006..cef7933df44139b664db636169306c5a8c4b9214 100644 (file)
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -825,6 +825,11 @@ extern void spa_sync_allpools(void);
  
  extern uint_t zfs_sync_pass_deferred_free;
  
+/* spa sync taskqueues */
+taskq_t *spa_sync_tq_create(spa_t *spa, const char *name);
+void spa_sync_tq_destroy(spa_t *spa);
+void spa_select_allocator(zio_t *zio);
+
  /* spa namespace global mutex */
  extern kmutex_t spa_namespace_lock;
  
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h

index 094258d47a48d1054501f6a0d58a5da9a130ce36..b1eb06f94fcc246f2d8565c097b4fc6faa31f0f4 100644 (file)
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -188,6 +188,12 @@ typedef struct spa_taskqs {
         taskq_t **stqs_taskq;
  } spa_taskqs_t;
  
+/* one for each thread in the spa sync taskq */
+typedef struct spa_syncthread_info {
+       kthread_t       *sti_thread;
+       taskq_t         *sti_wr_iss_tq;         /* assigned wr_iss taskq */
+} spa_syncthread_info_t;
+
  typedef enum spa_all_vdev_zap_action {
         AVZ_ACTION_NONE = 0,
         AVZ_ACTION_DESTROY,     /* Destroy all per-vdev ZAPs and the AVZ. */
@@ -265,6 +271,10 @@ struct spa {
         int             spa_alloc_count;
         int             spa_active_allocator;   /* selectable allocator */
  
+       /* per-allocator sync thread taskqs */
+       taskq_t         *spa_sync_tq;
+       spa_syncthread_info_t *spa_syncthreads;
+
         spa_aux_vdev_t  spa_spares;             /* hot spares */
         spa_aux_vdev_t  spa_l2cache;            /* L2ARC cache devices */
         nvlist_t        *spa_label_features;    /* Features for reading MOS */
@@ -456,7 +466,7 @@ extern char *spa_config_path;
  extern const char *zfs_deadman_failmode;
  extern uint_t spa_slop_shift;
  extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
-    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);
+    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, zio_t *zio);
  extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q,
      task_func_t *func, void *arg, uint_t flags);
  extern void spa_load_spares(spa_t *spa);
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h

index 750ca612b962c0653851322e56c91f89bda2ccf5..9ec2f73b366ca5980db56005c7df008f51344fd2 100644 (file)
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -496,6 +496,8 @@ extern taskq_t *system_taskq;
  extern taskq_t *system_delay_taskq;
  
  extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
+extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t,
+    kthread_t ***);
  #define        taskq_create_proc(a, b, c, d, e, p, f) \
             (taskq_create(a, b, c, d, e, f))
  #define        taskq_create_sysdc(a, b, d, e, p, dc, f) \
diff --git a/include/sys/zio.h b/include/sys/zio.h

index e1f4d5c0449909455470618a90535432ff642532..25a4b221f05e870f844affde24e22d32ff85eb40 100644 (file)
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -223,6 +223,9 @@ typedef uint64_t zio_flag_t;
  #define        ZIO_FLAG_REEXECUTED     (1ULL << 29)
  #define        ZIO_FLAG_DELEGATED      (1ULL << 30)
  
+#define        ZIO_ALLOCATOR_NONE      (-1)
+#define        ZIO_HAS_ALLOCATOR(zio)  ((zio)->io_allocator != ZIO_ALLOCATOR_NONE)
+
  #define        ZIO_FLAG_MUSTSUCCEED            0
  #define        ZIO_FLAG_RAW    (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)
  
@@ -526,6 +529,9 @@ struct zio {
  
         /* Taskq dispatching state */
         taskq_ent_t     io_tqent;
+
+       /* write issue taskq selection, based upon sync thread */
+       taskq_t         *io_wr_iss_tq;
  };
  
  enum blk_verify_flag {
diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c

index a2e457ef9e6005e62ccb4b9af82e3e4d21cf654e..99a181ec3c93ddfdd6b98c52771613a45a9277ab 100644 (file)
--- a/lib/libzpool/taskq.c
+++ b/lib/libzpool/taskq.c
@@ -337,6 +337,36 @@ taskq_destroy(taskq_t *tq)
         kmem_free(tq, sizeof (taskq_t));
  }
  
+/*
+ * Create a taskq with a specified number of pool threads. Allocate
+ * and return an array of nthreads kthread_t pointers, one for each
+ * thread in the pool. The array is not ordered and must be freed
+ * by the caller.
+ */
+taskq_t *
+taskq_create_synced(const char *name, int nthreads, pri_t pri,
+    int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp)
+{
+       taskq_t *tq;
+       kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads,
+           KM_SLEEP);
+
+       (void) pri; (void) minalloc; (void) maxalloc;
+
+       flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH);
+
+       tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX,
+           flags | TASKQ_PREPOPULATE);
+       VERIFY(tq != NULL);
+       VERIFY(tq->tq_nthreads == nthreads);
+
+       for (int i = 0; i < nthreads; i++) {
+               kthreads[i] = tq->tq_threadlist[i];
+       }
+       *ktpp = kthreads;
+       return (tq);
+}
+
  int
  taskq_member(taskq_t *tq, kthread_t *t)
  {
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4

index ddad00be412dfbfc6006ffa7b517b2dfb2c82d08..f9824ac170ea4ee5555d2ae6f8d83d2009e695ee 100644 (file)
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -496,6 +496,13 @@ If we have less than this amount of free space,
  most ZPL operations (e.g. write, create) will return
  .Sy ENOSPC .
  .
+.It Sy spa_num_allocators Ns = Ns Sy 4 Pq int
+Determines the number of block allocators to use per spa instance.
+Capped by the number of actual CPUs in the system.
+.Pp
+Note that setting this value too high could result in performance
+degradation and/or excess fragmentation.
+.
  .It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint
  Limits the number of on-disk error log entries that will be converted to the
  new format when enabling the
@@ -1974,13 +1981,6 @@ and may need to load new metaslabs to satisfy these allocations.
  .It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq uint
  Rewrite new block pointers starting in this pass.
  .
-.It Sy zfs_sync_taskq_batch_pct Ns = Ns Sy 75 Ns % Pq int
-This controls the number of threads used by
-.Sy dp_sync_taskq .
-The default value of
-.Sy 75%
-will create a maximum of one thread per CPU.
-.
  .It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint
  Maximum size of TRIM command.
  Larger ranges will be split into chunks no larger than this value before
@@ -2265,6 +2265,14 @@ If
  .Sy 0 ,
  generate a system-dependent value close to 6 threads per taskq.
  .
+.It Sy zio_taskq_wr_iss_ncpus Ns = Ns Sy 0 Pq uint
+Determines the number of CPUs per write issue taskq (one taskq is created for each such group of CPUs).
+.Pp
+When 0 (the default), the value to use is computed internally
+as the number of actual CPUs in the system divided by the
+.Sy spa_num_allocators
+value.
+.
  .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
  Do not create zvol device nodes.
  This may slightly improve startup time on
diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c

index 842b80ade1fbd020c83dec21faf741a81ce09650..6912b220a94e315a0979600d8e8ec746d9c506e1 100644 (file)
--- a/module/os/freebsd/spl/spl_taskq.c
+++ b/module/os/freebsd/spl/spl_taskq.c
@@ -220,6 +220,7 @@ taskq_create_impl(const char *name, int nthreads, pri_t pri,
                 nthreads = MAX((mp_ncpus * nthreads) / 100, 1);
  
         tq = kmem_alloc(sizeof (*tq), KM_SLEEP);
+       tq->tq_nthreads = nthreads;
         tq->tq_queue = taskqueue_create(name, M_WAITOK,
             taskqueue_thread_enqueue, &tq->tq_queue);
         taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT,
@@ -254,6 +255,87 @@ taskq_destroy(taskq_t *tq)
         kmem_free(tq, sizeof (*tq));
  }
  
+static void taskq_sync_assign(void *arg);
+
+typedef struct taskq_sync_arg {
+       kthread_t       *tqa_thread;
+       kcondvar_t      tqa_cv;
+       kmutex_t        tqa_lock;
+       int             tqa_ready;
+} taskq_sync_arg_t;
+
+static void
+taskq_sync_assign(void *arg)
+{
+       taskq_sync_arg_t *tqa = arg;
+
+       mutex_enter(&tqa->tqa_lock);
+       tqa->tqa_thread = curthread;
+       tqa->tqa_ready = 1;
+       cv_signal(&tqa->tqa_cv);
+       while (tqa->tqa_ready == 1)
+               cv_wait(&tqa->tqa_cv, &tqa->tqa_lock);
+       mutex_exit(&tqa->tqa_lock);
+}
+
+/*
+ * Create a taskq with a specified number of pool threads. Allocate
+ * and return an array of nthreads kthread_t pointers, one for each
+ * thread in the pool. The array is not ordered and must be freed
+ * by the caller.
+ */
+taskq_t *
+taskq_create_synced(const char *name, int nthreads, pri_t pri,
+    int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp)
+{
+       taskq_t *tq;
+       taskq_sync_arg_t *tqs = kmem_zalloc(sizeof (*tqs) * nthreads, KM_SLEEP);
+       kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads,
+           KM_SLEEP);
+
+       flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH);
+
+       tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX,
+           flags | TASKQ_PREPOPULATE);
+       VERIFY(tq != NULL);
+       VERIFY(tq->tq_nthreads == nthreads);
+
+       /* spawn all syncthreads */
+       for (int i = 0; i < nthreads; i++) {
+               cv_init(&tqs[i].tqa_cv, NULL, CV_DEFAULT, NULL);
+               mutex_init(&tqs[i].tqa_lock, NULL, MUTEX_DEFAULT, NULL);
+               (void) taskq_dispatch(tq, taskq_sync_assign,
+                   &tqs[i], TQ_FRONT);
+       }
+
+       /* wait on all syncthreads to start */
+       for (int i = 0; i < nthreads; i++) {
+               mutex_enter(&tqs[i].tqa_lock);
+               while (tqs[i].tqa_ready == 0)
+                       cv_wait(&tqs[i].tqa_cv, &tqs[i].tqa_lock);
+               mutex_exit(&tqs[i].tqa_lock);
+       }
+
+       /* let all syncthreads resume, finish */
+       for (int i = 0; i < nthreads; i++) {
+               mutex_enter(&tqs[i].tqa_lock);
+               tqs[i].tqa_ready = 2;
+               cv_broadcast(&tqs[i].tqa_cv);
+               mutex_exit(&tqs[i].tqa_lock);
+       }
+       taskq_wait(tq);
+
+       for (int i = 0; i < nthreads; i++) {
+               kthreads[i] = tqs[i].tqa_thread;
+               mutex_destroy(&tqs[i].tqa_lock);
+               cv_destroy(&tqs[i].tqa_cv);
+       }
+       kmem_free(tqs, sizeof (*tqs) * nthreads);
+
+       *ktpp = kthreads;
+       return (tq);
+}
+
  int
  taskq_member(taskq_t *tq, kthread_t *thread)
  {
diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c

index d18f935b167caa51b55cfe2abc62e9c99c42496f..79a1a8e5a5aa63be07c410517f892bcc14f58022 100644 (file)
--- a/module/os/linux/spl/spl-taskq.c
+++ b/module/os/linux/spl/spl-taskq.c
@@ -1262,6 +1262,42 @@ taskq_destroy(taskq_t *tq)
  }
  EXPORT_SYMBOL(taskq_destroy);
  
+/*
+ * Create a taskq with a specified number of pool threads. Allocate
+ * and return an array of nthreads kthread_t pointers, one for each
+ * thread in the pool. The array is not ordered and must be freed
+ * by the caller.
+ */
+taskq_t *
+taskq_create_synced(const char *name, int nthreads, pri_t pri,
+    int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp)
+{
+       taskq_t *tq;
+       taskq_thread_t *tqt;
+       int i = 0;
+       kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads,
+           KM_SLEEP);
+
+       flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH);
+
+       /* taskq_create spawns all the threads before returning */
+       tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX,
+           flags | TASKQ_PREPOPULATE);
+       VERIFY(tq != NULL);
+       VERIFY(tq->tq_nthreads == nthreads);
+
+       list_for_each_entry(tqt, &tq->tq_thread_list, tqt_thread_list) {
+               kthreads[i] = tqt->tqt_thread;
+               i++;
+       }
+
+       ASSERT3S(i, ==, nthreads);
+       *ktpp = kthreads;
+
+       return (tq);
+}
+EXPORT_SYMBOL(taskq_create_synced);
+
  static unsigned int spl_taskq_kick = 0;
  
  /*
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c

index c0c2692c113a9ec6b51e0a979ffb0f9f7584d081..0a179fffb16a65f12160c727391be86dd65d0c0e 100644 (file)
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -4587,6 +4587,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
         }
  }
  
+/*
+ * Syncs out a range of dirty records for indirect or leaf dbufs.  May be
+ * called recursively from dbuf_sync_indirect().
+ */
  void
  dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
  {
@@ -5005,7 +5009,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
  }
  
  
-/* Issue I/O to commit a dirty buffer to disk. */
+/*
+ * Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
+ * Caller is responsible for issuing the zio_[no]wait(dr->dr_zio).
+ */
  static void
  dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
  {
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c

index 76e65b5506a96ddea296a3dcf8afc57a929aa069..f098e1daa44bda5cc9e56b85854cbf784047a2e5 100644 (file)
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1639,28 +1639,90 @@ dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
         kmem_free(bp, sizeof (*bp));
  }
  
+typedef struct sync_objset_arg {
+       zio_t           *soa_zio;
+       objset_t        *soa_os;
+       dmu_tx_t        *soa_tx;
+       kmutex_t        soa_mutex;
+       int             soa_count;
+       taskq_ent_t     soa_tq_ent;
+} sync_objset_arg_t;
+
  typedef struct sync_dnodes_arg {
-       multilist_t *sda_list;
-       int sda_sublist_idx;
-       multilist_t *sda_newlist;
-       dmu_tx_t *sda_tx;
+       multilist_t     *sda_list;
+       int             sda_sublist_idx;
+       multilist_t     *sda_newlist;
+       sync_objset_arg_t *sda_soa;
  } sync_dnodes_arg_t;
  
+static void sync_meta_dnode_task(void *arg);
+
  static void
  sync_dnodes_task(void *arg)
  {
         sync_dnodes_arg_t *sda = arg;
+       sync_objset_arg_t *soa = sda->sda_soa;
+       objset_t *os = soa->soa_os;
  
         multilist_sublist_t *ms =
             multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
  
-       dmu_objset_sync_dnodes(ms, sda->sda_tx);
+       dmu_objset_sync_dnodes(ms, soa->soa_tx);
  
         multilist_sublist_unlock(ms);
  
         kmem_free(sda, sizeof (*sda));
+
+       mutex_enter(&soa->soa_mutex);
+       ASSERT(soa->soa_count != 0);
+       if (--soa->soa_count != 0) {
+               mutex_exit(&soa->soa_mutex);
+               return;
+       }
+       mutex_exit(&soa->soa_mutex);
+
+       taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq,
+           sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent);
  }
  
+/*
+ * Issue the zio_nowait() for all dirty record zios on the meta dnode,
+ * then trigger the callback for the zil_sync. This runs once for each
+ * objset, only after any/all sublists in the objset have been synced.
+ */
+static void
+sync_meta_dnode_task(void *arg)
+{
+       sync_objset_arg_t *soa = arg;
+       objset_t *os = soa->soa_os;
+       dmu_tx_t *tx = soa->soa_tx;
+       int txgoff = tx->tx_txg & TXG_MASK;
+       dbuf_dirty_record_t *dr;
+
+       ASSERT0(soa->soa_count);
+
+       list_t *list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
+       while ((dr = list_remove_head(list)) != NULL) {
+               ASSERT0(dr->dr_dbuf->db_level);
+               zio_nowait(dr->dr_zio);
+       }
+
+       /* Enable dnode backfill if enough objects have been freed. */
+       if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
+               os->os_rescan_dnodes = B_TRUE;
+               os->os_freed_dnodes = 0;
+       }
+
+       /*
+        * Free intent log blocks up to this tx.
+        */
+       zil_sync(os->os_zil, tx);
+       os->os_phys->os_zil_header = os->os_zil_header;
+       zio_nowait(soa->soa_zio);
+
+       mutex_destroy(&soa->soa_mutex);
+       kmem_free(soa, sizeof (*soa));
+}
  
  /* called from dsl */
  void
@@ -1670,8 +1732,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
         zbookmark_phys_t zb;
         zio_prop_t zp;
         zio_t *zio;
-       list_t *list;
-       dbuf_dirty_record_t *dr;
         int num_sublists;
         multilist_t *ml;
         blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
@@ -1758,39 +1818,49 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
                     offsetof(dnode_t, dn_dirty_link[txgoff]));
         }
  
+       /*
+        * zio_nowait(zio) is done after any/all sublist and meta dnode
+        * zios have been nowaited, and the zil_sync() has been performed.
+        * The soa is freed at the end of sync_meta_dnode_task.
+        */
+       sync_objset_arg_t *soa = kmem_alloc(sizeof (*soa), KM_SLEEP);
+       soa->soa_zio = zio;
+       soa->soa_os = os;
+       soa->soa_tx = tx;
+       taskq_init_ent(&soa->soa_tq_ent);
+       mutex_init(&soa->soa_mutex, NULL, MUTEX_DEFAULT, NULL);
+
         ml = &os->os_dirty_dnodes[txgoff];
-       num_sublists = multilist_get_num_sublists(ml);
+       soa->soa_count = num_sublists = multilist_get_num_sublists(ml);
+
         for (int i = 0; i < num_sublists; i++) {
                 if (multilist_sublist_is_empty_idx(ml, i))
-                       continue;
-               sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP);
-               sda->sda_list = ml;
-               sda->sda_sublist_idx = i;
-               sda->sda_tx = tx;
-               (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
-                   sync_dnodes_task, sda, 0);
-               /* callback frees sda */
-       }
-       taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);
-
-       list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
-       while ((dr = list_remove_head(list)) != NULL) {
-               ASSERT0(dr->dr_dbuf->db_level);
-               zio_nowait(dr->dr_zio);
+                       soa->soa_count--;
         }
  
-       /* Enable dnode backfill if enough objects have been freed. */
-       if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
-               os->os_rescan_dnodes = B_TRUE;
-               os->os_freed_dnodes = 0;
+       if (soa->soa_count == 0) {
+               taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq,
+                   sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent);
+       } else {
+               /*
+                * Sync sublists in parallel. The last to finish
+                * (i.e., when soa->soa_count reaches zero) must
+                *  dispatch sync_meta_dnode_task.
+                */
+               for (int i = 0; i < num_sublists; i++) {
+                       if (multilist_sublist_is_empty_idx(ml, i))
+                               continue;
+                       sync_dnodes_arg_t *sda =
+                           kmem_alloc(sizeof (*sda), KM_SLEEP);
+                       sda->sda_list = ml;
+                       sda->sda_sublist_idx = i;
+                       sda->sda_soa = soa;
+                       (void) taskq_dispatch(
+                           dmu_objset_pool(os)->dp_sync_taskq,
+                           sync_dnodes_task, sda, 0);
+                       /* sync_dnodes_task frees sda */
+               }
         }
-
-       /*
-        * Free intent log blocks up to this tx.
-        */
-       zil_sync(os->os_zil, tx);
-       os->os_phys->os_zil_header = os->os_zil_header;
-       zio_nowait(zio);
  }
  
  boolean_t
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c

index 8e39af83bb0a9a8fda5aead96fbc16fe9c155b74..8cffbdb9d20bf647535d44b99fbf2b55b998ba05 100644 (file)
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -627,6 +627,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
  
  /*
   * Write out the dnode's dirty buffers.
+ * Does not wait for zio completions.
   */
  void
  dnode_sync(dnode_t *dn, dmu_tx_t *tx)
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c

index d6db617292234ef77d18ad37b6b6dc4cb9745a13..62a1649d3786412da6e00176e1ea0845a275ddbf 100644 (file)
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -2069,8 +2069,9 @@ dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
         return (error);
  }
  
+/* Nonblocking dataset sync. Assumes dataset:objset is always 1:1 */
  void
-dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
+dsl_dataset_sync(dsl_dataset_t *ds, zio_t *rio, dmu_tx_t *tx)
  {
         ASSERT(dmu_tx_is_syncing(tx));
         ASSERT(ds->ds_objset != NULL);
@@ -2098,7 +2099,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
                 ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
         }
  
-       dmu_objset_sync(ds->ds_objset, zio, tx);
+       dmu_objset_sync(ds->ds_objset, rio, tx);
  }
  
  /*
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c

index 17b971248283048946160b174c3a868bbbb31645..370c6a010dcad74332876cceef51f36111405a46 100644 (file)
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -140,11 +140,6 @@ uint_t zfs_delay_min_dirty_percent = 60;
   */
  uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
  
-/*
- * This determines the number of threads used by the dp_sync_taskq.
- */
-static int zfs_sync_taskq_batch_pct = 75;
-
  /*
   * These tunables determine the behavior of how zil_itxg_clean() is
   * called via zil_clean() in the context of spa_sync(). When an itxg
@@ -214,9 +209,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
         txg_list_create(&dp->dp_early_sync_tasks, spa,
             offsetof(dsl_sync_task_t, dst_node));
  
-       dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
-           zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
-           TASKQ_THREADS_CPU_PCT);
+       dp->dp_sync_taskq = spa_sync_tq_create(spa, "dp_sync_taskq");
  
         dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
             zfs_zil_clean_taskq_nthr_pct, minclsyspri,
@@ -409,7 +402,7 @@ dsl_pool_close(dsl_pool_t *dp)
         txg_list_destroy(&dp->dp_dirty_dirs);
  
         taskq_destroy(dp->dp_zil_clean_taskq);
-       taskq_destroy(dp->dp_sync_taskq);
+       spa_sync_tq_destroy(dp->dp_spa);
  
         /*
          * We can't set retry to TRUE since we're explicitly specifying
@@ -674,7 +667,7 @@ dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
  void
  dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
  {
-       zio_t *zio;
+       zio_t *rio;     /* root zio for all dirty dataset syncs */
         dmu_tx_t *tx;
         dsl_dir_t *dd;
         dsl_dataset_t *ds;
@@ -704,9 +697,10 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
         }
  
         /*
-        * Write out all dirty blocks of dirty datasets.
+        * Write out all dirty blocks of dirty datasets. Note, this could
+        * create a very large (+10k) zio tree.
          */
-       zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+       rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
         while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
                 /*
                  * We must not sync any non-MOS datasets twice, because
@@ -715,9 +709,9 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
                  */
                 ASSERT(!list_link_active(&ds->ds_synced_link));
                 list_insert_tail(&synced_datasets, ds);
-               dsl_dataset_sync(ds, zio, tx);
+               dsl_dataset_sync(ds, rio, tx);
         }
-       VERIFY0(zio_wait(zio));
+       VERIFY0(zio_wait(rio));
  
         /*
          * Update the long range free counter after
@@ -748,13 +742,13 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
          * user accounting information (and we won't get confused
          * about which blocks are part of the snapshot).
          */
-       zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+       rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
         while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
                 objset_t *os = ds->ds_objset;
  
                 ASSERT(list_link_active(&ds->ds_synced_link));
                 dmu_buf_rele(ds->ds_dbuf, ds);
-               dsl_dataset_sync(ds, zio, tx);
+               dsl_dataset_sync(ds, rio, tx);
  
                 /*
                  * Release any key mappings created by calls to
@@ -767,7 +761,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
                         key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
                 }
         }
-       VERIFY0(zio_wait(zio));
+       VERIFY0(zio_wait(rio));
  
         /*
          * Now that the datasets have been completely synced, we can
@@ -1481,9 +1475,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, UINT, ZMOD_RW,
  ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW,
         "How quickly delay approaches infinity");
  
-ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW,
-       "Max percent of CPUs that are used to sync dirty data");
-
  ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW,
         "Max percent of CPUs that are used per dp_sync_taskq");
  
diff --git a/module/zfs/spa.c b/module/zfs/spa.c

index aa97144f16e48bac3889800723d89c1c71c7f3aa..68f367c1c7442d2a0a85c93a43649153fc68a2d2 100644 (file)
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -99,6 +99,7 @@
  
  #include "zfs_prop.h"
  #include "zfs_comutil.h"
+#include <cityhash.h>
  
  /*
   * spa_thread() existed on Illumos as a parent thread for the various worker
@@ -128,16 +129,16 @@ int zfs_ccw_retry_interval = 300;
  
  typedef enum zti_modes {
         ZTI_MODE_FIXED,                 /* value is # of threads (min 1) */
-       ZTI_MODE_BATCH,                 /* cpu-intensive; value is ignored */
         ZTI_MODE_SCALE,                 /* Taskqs scale with CPUs. */
+       ZTI_MODE_SYNC,                  /* sync thread assigned */
         ZTI_MODE_NULL,                  /* don't create a taskq */
         ZTI_NMODES
  } zti_modes_t;
  
  #define        ZTI_P(n, q)     { ZTI_MODE_FIXED, (n), (q) }
  #define        ZTI_PCT(n)      { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
-#define        ZTI_BATCH       { ZTI_MODE_BATCH, 0, 1 }
  #define        ZTI_SCALE       { ZTI_MODE_SCALE, 0, 1 }
+#define        ZTI_SYNC        { ZTI_MODE_SYNC, 0, 1 }
  #define        ZTI_NULL        { ZTI_MODE_NULL, 0, 0 }
  
  #define        ZTI_N(n)        ZTI_P(n, 1)
@@ -158,14 +159,14 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
   * initializing a pool, we use this table to create an appropriately sized
   * taskq. Some operations are low volume and therefore have a small, static
   * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
- * macros. Other operations process a large amount of data; the ZTI_BATCH
+ * macros. Other operations process a large amount of data; the ZTI_SCALE
   * macro causes us to create a taskq oriented for throughput. Some operations
   * are so high frequency and short-lived that the taskq itself can become a
   * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
   * additional degree of parallelism specified by the number of threads per-
   * taskq and the number of taskqs; when dispatching an event in this case, the
- * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH,
- * but with number of taskqs also scaling with number of CPUs.
+ * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs
+ * that scales with the number of CPUs.
   *
   * The different taskq priorities are to handle the different contexts (issue
   * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
@@ -175,7 +176,7 @@ static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
         /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* NULL */
         { ZTI_N(8),     ZTI_NULL,       ZTI_SCALE,      ZTI_NULL }, /* READ */
-       { ZTI_BATCH,    ZTI_N(5),       ZTI_SCALE,      ZTI_N(5) }, /* WRITE */
+       { ZTI_SYNC,     ZTI_N(5),       ZTI_SCALE,      ZTI_N(5) }, /* WRITE */
         { ZTI_SCALE,    ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* FREE */
         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* CLAIM */
         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* IOCTL */
@@ -206,6 +207,8 @@ static const uint_t zio_taskq_basedc = 80;    /* base duty cycle */
  static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
  #endif
  
+static uint_t  zio_taskq_wr_iss_ncpus = 0;
+
  /*
   * Report any spa_load_verify errors found, but do not fail spa_load.
   * This is used by zdb to analyze non-idle pools.
@@ -1054,21 +1057,34 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
         uint_t count = ztip->zti_count;
         spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
         uint_t cpus, flags = TASKQ_DYNAMIC;
-#ifdef HAVE_SYSDC
-       boolean_t batch = B_FALSE;
-#endif
  
         switch (mode) {
         case ZTI_MODE_FIXED:
                 ASSERT3U(value, >, 0);
                 break;
  
-       case ZTI_MODE_BATCH:
-#ifdef HAVE_SYSDC
-               batch = B_TRUE;
-#endif
+       case ZTI_MODE_SYNC:
+
+               /*
+                * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus',
+                * not to exceed the number of spa allocators.
+                */
+               if (zio_taskq_wr_iss_ncpus == 0) {
+                       count = MAX(boot_ncpus / spa->spa_alloc_count, 1);
+               } else {
+                       count = MAX(1,
+                           boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus));
+               }
+               count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
+               count = MIN(count, spa->spa_alloc_count);
+
+               /*
+                * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
+                * single taskq may have more threads than 100% of online cpus.
+                */
+               value = (zio_taskq_batch_pct + count / 2) / count;
+               value = MIN(value, 100);
                 flags |= TASKQ_THREADS_CPU_PCT;
-               value = MIN(zio_taskq_batch_pct, 100);
                 break;
  
         case ZTI_MODE_SCALE:
@@ -1115,7 +1131,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
  
         default:
                 panic("unrecognized mode for %s_%s taskq (%u:%u) in "
-                   "spa_activate()",
+                   "spa_taskqs_init()",
                     zio_type_name[t], zio_taskq_types[q], mode, value);
                 break;
         }
@@ -1137,9 +1153,6 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
  
  #ifdef HAVE_SYSDC
                 if (zio_taskq_sysdc && spa->spa_proc != &p0) {
-                       if (batch)
-                               flags |= TASKQ_DC_BATCH;
-
                         (void) zio_taskq_basedc;
                         tq = taskq_create_sysdc(name, value, 50, INT_MAX,
                             spa->spa_proc, zio_taskq_basedc, flags);
@@ -1200,12 +1213,11 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
  /*
   * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
   * Note that a type may have multiple discrete taskqs to avoid lock contention
- * on the taskq itself. In that case we choose which taskq at random by using
- * the low bits of gethrtime().
+ * on the taskq itself.
   */
-void
-spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
-    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
+static taskq_t *
+spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+    zio_t *zio)
  {
         spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
         taskq_t *tq;
@@ -1213,12 +1225,27 @@ spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
         ASSERT3P(tqs->stqs_taskq, !=, NULL);
         ASSERT3U(tqs->stqs_count, !=, 0);
  
+       if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
+           (zio != NULL) && (zio->io_wr_iss_tq != NULL)) {
+               /* dispatch to assigned write issue taskq */
+               tq = zio->io_wr_iss_tq;
+               return (tq);
+       }
+
         if (tqs->stqs_count == 1) {
                 tq = tqs->stqs_taskq[0];
         } else {
                 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
         }
+       return (tq);
+}
  
+void
+spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent,
+    zio_t *zio)
+{
+       taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio);
         taskq_dispatch_ent(tq, func, arg, flags, ent);
  }
  
@@ -1229,20 +1256,8 @@ void
  spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
      task_func_t *func, void *arg, uint_t flags)
  {
-       spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
-       taskq_t *tq;
-       taskqid_t id;
-
-       ASSERT3P(tqs->stqs_taskq, !=, NULL);
-       ASSERT3U(tqs->stqs_count, !=, 0);
-
-       if (tqs->stqs_count == 1) {
-               tq = tqs->stqs_taskq[0];
-       } else {
-               tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
-       }
-
-       id = taskq_dispatch(tq, func, arg, flags);
+       taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL);
+       taskqid_t id = taskq_dispatch(tq, func, arg, flags);
         if (id)
                 taskq_wait_id(tq, id);
  }
@@ -9649,6 +9664,104 @@ spa_sync_allpools(void)
         mutex_exit(&spa_namespace_lock);
  }
  
+taskq_t *
+spa_sync_tq_create(spa_t *spa, const char *name)
+{
+       kthread_t **kthreads;
+
+       ASSERT(spa->spa_sync_tq == NULL);
+       ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus);
+
+       /*
+        * Use one sync taskq thread per allocator.  The allocator count is
+        * capped at the cpu count (there may be more cpus than allocators),
+        * so the thread count never exceeds either allocators or cpus.
+        */
+       int nthreads = spa->spa_alloc_count;
+       spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) *
+           nthreads, KM_SLEEP);
+
+       spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri,
+           nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads);
+       VERIFY(spa->spa_sync_tq != NULL);
+       VERIFY(kthreads != NULL);
+
+       spa_taskqs_t *tqs =
+           &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE];
+
+       spa_syncthread_info_t *ti = spa->spa_syncthreads;
+       for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) {
+               ti->sti_thread = kthreads[i];
+               if (w == tqs->stqs_count) {
+                       w = 0;
+               }
+               ti->sti_wr_iss_tq = tqs->stqs_taskq[w];
+       }
+
+       kmem_free(kthreads, sizeof (*kthreads) * nthreads);
+       return (spa->spa_sync_tq);
+}
+
+void
+spa_sync_tq_destroy(spa_t *spa)
+{
+       ASSERT(spa->spa_sync_tq != NULL);
+
+       taskq_wait(spa->spa_sync_tq);
+       taskq_destroy(spa->spa_sync_tq);
+       kmem_free(spa->spa_syncthreads,
+           sizeof (spa_syncthread_info_t) * spa->spa_alloc_count);
+       spa->spa_sync_tq = NULL;
+}
+
+void
+spa_select_allocator(zio_t *zio)
+{
+       zbookmark_phys_t *bm = &zio->io_bookmark;
+       spa_t *spa = zio->io_spa;
+
+       ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+       /*
+        * A gang block (for example) may have inherited its parent's
+        * allocator, in which case there is nothing further to do here.
+        */
+       if (ZIO_HAS_ALLOCATOR(zio))
+               return;
+
+       ASSERT(spa != NULL);
+       ASSERT(bm != NULL);
+
+       /*
+        * First try to use an allocator assigned to the syncthread, and set
+        * the corresponding write issue taskq for the allocator.
+        * Note, we must have an open pool to do this.
+        */
+       if (spa->spa_sync_tq != NULL) {
+               spa_syncthread_info_t *ti = spa->spa_syncthreads;
+               for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
+                       if (ti->sti_thread == curthread) {
+                               zio->io_allocator = i;
+                               zio->io_wr_iss_tq = ti->sti_wr_iss_tq;
+                               return;
+                       }
+               }
+       }
+
+       /*
+        * We want to try to use as many allocators as possible to help improve
+        * performance, but we also want logically adjacent IOs to be physically
+        * adjacent to improve sequential read performance. We chunk each object
+        * into 2^20 block regions, and then hash based on the objset, object,
+        * level, and region to accomplish both of these goals.
+        */
+       uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level,
+           bm->zb_blkid >> 20);
+
+       zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
+       zio->io_wr_iss_tq = NULL;
+}
+
  /*
   * ==========================================================================
   * Miscellaneous routines
@@ -10242,3 +10355,6 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
         "Whether extra ALLOC blkptrs were added to a livelist entry while it "
         "was being condensed");
  /* END CSTYLED */
+
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW,
+       "Number of CPUs to run write issue taskqs");
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c

index c7472f972cc2d5ffe9577e961d0c9e4b5e543299..3990af98c732d0d912b66ce333c44854244e03f1 100644 (file)
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -388,7 +388,11 @@ uint_t spa_asize_inflation = 24;
  uint_t spa_slop_shift = 5;
  static const uint64_t spa_min_slop = 128ULL * 1024 * 1024;
  static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
-static const int spa_allocators = 4;
+
+/*
+ * Number of allocators to use, per spa instance
+ */
+static int spa_num_allocators = 4;
  
  /*
   * Spa active allocator.
@@ -730,7 +734,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
         if (altroot)
                 spa->spa_root = spa_strdup(altroot);
  
-       spa->spa_alloc_count = spa_allocators;
+       /* Do not allow more allocators than CPUs. */
+       spa->spa_alloc_count = MIN(MAX(spa_num_allocators, 1), boot_ncpus);
+
         spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
             sizeof (spa_alloc_t), KM_SLEEP);
         for (int i = 0; i < spa->spa_alloc_count; i++) {
@@ -739,6 +745,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
                 avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
                     sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
         }
+
         avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
             sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
         avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg,
@@ -3009,3 +3016,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW,
  
  ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
         param_get_uint, ZMOD_RW, "Reserved free space in pool");
+
+ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW,
+       "Number of allocators per spa, capped by ncpus");
diff --git a/module/zfs/zio.c b/module/zfs/zio.c

index 4eb276352a23984b6377db83f7e0ae573b39381e..2f5b423ee72e52a246c84e5eb0060d0dacd45d1f 100644 (file)
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -899,6 +899,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
         zio->io_orig_stage = zio->io_stage = stage;
         zio->io_orig_pipeline = zio->io_pipeline = pipeline;
         zio->io_pipeline_trace = ZIO_STAGE_OPEN;
+       zio->io_allocator = ZIO_ALLOCATOR_NONE;
  
         zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) ||
             (pipeline & ZIO_STAGE_READY) == 0;
@@ -2007,7 +2008,7 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
          */
         ASSERT(taskq_empty_ent(&zio->io_tqent));
         spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags,
-           &zio->io_tqent);
+           &zio->io_tqent, zio);
  }
  
  static boolean_t
@@ -2032,8 +2033,8 @@ zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
  static zio_t *
  zio_issue_async(zio_t *zio)
  {
+       ASSERT((zio->io_type != ZIO_TYPE_WRITE) || ZIO_HAS_ALLOCATOR(zio));
         zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
-
         return (NULL);
  }
  
@@ -2347,6 +2348,9 @@ zio_wait(zio_t *zio)
         ASSERT0(zio->io_queued_timestamp);
         zio->io_queued_timestamp = gethrtime();
  
+       if (zio->io_type == ZIO_TYPE_WRITE) {
+               spa_select_allocator(zio);
+       }
         __zio_execute(zio);
  
         mutex_enter(&zio->io_lock);
@@ -2399,6 +2403,9 @@ zio_nowait(zio_t *zio)
  
         ASSERT0(zio->io_queued_timestamp);
         zio->io_queued_timestamp = gethrtime();
+       if (zio->io_type == ZIO_TYPE_WRITE) {
+               spa_select_allocator(zio);
+       }
         __zio_execute(zio);
  }
  
@@ -2863,6 +2870,13 @@ zio_gang_issue(zio_t *zio)
         return (zio);
  }
  
+static void
+zio_gang_inherit_allocator(zio_t *pio, zio_t *cio)
+{
+       cio->io_allocator = pio->io_allocator;
+       cio->io_wr_iss_tq = pio->io_wr_iss_tq;
+}
+
  static void
  zio_write_gang_member_ready(zio_t *zio)
  {
@@ -2934,6 +2948,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
                 gbh_copies = MIN(2, spa_max_replication(spa));
         }
  
+       ASSERT(ZIO_HAS_ALLOCATOR(pio));
         int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
         if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
                 ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
@@ -2997,6 +3012,8 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
             zio_write_gang_done, NULL, pio->io_priority,
             ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
  
+       zio_gang_inherit_allocator(pio, zio);
+
         /*
          * Create and nowait the gang children.
          */
@@ -3027,6 +3044,8 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
                     zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
                     ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
  
+               zio_gang_inherit_allocator(zio, cio);
+
                 if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
                         ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
                         ASSERT(has_data);
@@ -3539,6 +3558,7 @@ zio_io_to_allocate(spa_t *spa, int allocator)
                 return (NULL);
  
         ASSERT(IO_IS_ALLOCATING(zio));
+       ASSERT(ZIO_HAS_ALLOCATOR(zio));
  
         /*
          * Try to place a reservation for this zio. If we're unable to
@@ -3575,21 +3595,12 @@ zio_dva_throttle(zio_t *zio)
         }
  
         ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+       ASSERT(ZIO_HAS_ALLOCATOR(zio));
         ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
         ASSERT3U(zio->io_queued_timestamp, >, 0);
         ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
  
-       zbookmark_phys_t *bm = &zio->io_bookmark;
-       /*
-        * We want to try to use as many allocators as possible to help improve
-        * performance, but we also want logically adjacent IOs to be physically
-        * adjacent to improve sequential read performance. We chunk each object
-        * into 2^20 block regions, and then hash based on the objset, object,
-        * level, and region to accomplish both of these goals.
-        */
-       int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object,
-           bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
-       zio->io_allocator = allocator;
+       int allocator = zio->io_allocator;
         zio->io_metaslab_class = mc;
         mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
         avl_add(&spa->spa_allocs[allocator].spaa_tree, zio);
@@ -3663,6 +3674,7 @@ zio_dva_allocate(zio_t *zio)
          * sync write performance.  If a log allocation fails, we will fall
          * back to spa_sync() which is abysmal for performance.
          */
+       ASSERT(ZIO_HAS_ALLOCATOR(zio));
         error = metaslab_alloc(spa, mc, zio->io_size, bp,
             zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
             &zio->io_alloc_list, zio, zio->io_allocator);
@@ -4515,6 +4527,7 @@ zio_ready(zio_t *zio)
                         ASSERT(IO_IS_ALLOCATING(zio));
                         ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
                         ASSERT(zio->io_metaslab_class != NULL);
+                       ASSERT(ZIO_HAS_ALLOCATOR(zio));
  
                         /*
                          * We were unable to allocate anything, unreserve and
@@ -4601,6 +4614,7 @@ zio_dva_throttle_done(zio_t *zio)
         }
  
         ASSERT(IO_IS_ALLOCATING(pio));
+       ASSERT(ZIO_HAS_ALLOCATOR(pio));
         ASSERT3P(zio, !=, zio->io_logical);
         ASSERT(zio->io_logical != NULL);
         ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
@@ -4663,6 +4677,7 @@ zio_done(zio_t *zio)
                 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
                 ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
                 ASSERT(zio->io_bp != NULL);
+               ASSERT(ZIO_HAS_ALLOCATOR(zio));
  
                 metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
                     zio->io_allocator);
@@ -4928,7 +4943,7 @@ zio_done(zio_t *zio)
                         ASSERT(taskq_empty_ent(&zio->io_tqent));
                         spa_taskq_dispatch_ent(zio->io_spa,
                             ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
-                           zio_reexecute, zio, 0, &zio->io_tqent);
+                           zio_reexecute, zio, 0, &zio->io_tqent, NULL);
                 }
                 return (NULL);
         }
author	ednadolski-ix <137826107+ednadolski-ix@users.noreply.github.com>
	Mon, 6 Nov 2023 18:38:42 +0000 (11:38 -0700)
committer	GitHub <noreply@github.com>
	Mon, 6 Nov 2023 18:38:42 +0000 (10:38 -0800)
include/os/freebsd/spl/sys/taskq.h		patch \| blob \| blame \| history
include/os/linux/spl/sys/taskq.h		patch \| blob \| blame \| history
include/sys/spa.h		patch \| blob \| blame \| history
include/sys/spa_impl.h		patch \| blob \| blame \| history
include/sys/zfs_context.h		patch \| blob \| blame \| history
include/sys/zio.h		patch \| blob \| blame \| history
lib/libzpool/taskq.c		patch \| blob \| blame \| history
man/man4/zfs.4		patch \| blob \| blame \| history
module/os/freebsd/spl/spl_taskq.c		patch \| blob \| blame \| history
module/os/linux/spl/spl-taskq.c		patch \| blob \| blame \| history
module/zfs/dbuf.c		patch \| blob \| blame \| history
module/zfs/dmu_objset.c		patch \| blob \| blame \| history
module/zfs/dnode_sync.c		patch \| blob \| blame \| history
module/zfs/dsl_dataset.c		patch \| blob \| blame \| history
module/zfs/dsl_pool.c		patch \| blob \| blame \| history
module/zfs/spa.c		patch \| blob \| blame \| history
module/zfs/spa_misc.c		patch \| blob \| blame \| history
module/zfs/zio.c		patch \| blob \| blame \| history