typedef struct taskq {
struct taskqueue *tq_queue;
+ int tq_nthreads;
} taskq_t;
typedef uintptr_t taskqid_t;
taskq_ent_t *);
extern int taskq_empty_ent(taskq_ent_t *);
taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
+taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t,
+ kthread_t ***);
taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t);
taskq_t *taskq_create_proc(const char *, int, pri_t, int, int,
struct proc *, uint_t);
extern int taskq_empty_ent(taskq_ent_t *);
extern void taskq_init_ent(taskq_ent_t *);
extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
+extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t,
+ kthread_t ***);
extern void taskq_destroy(taskq_t *);
extern void taskq_wait_id(taskq_t *, taskqid_t);
extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
extern uint_t zfs_sync_pass_deferred_free;
+/* spa sync taskqueues */
+taskq_t *spa_sync_tq_create(spa_t *spa, const char *name);
+void spa_sync_tq_destroy(spa_t *spa);
+void spa_select_allocator(zio_t *zio);
+
/* spa namespace global mutex */
extern kmutex_t spa_namespace_lock;
taskq_t **stqs_taskq;
} spa_taskqs_t;
+/* one for each thread in the spa sync taskq */
+typedef struct spa_syncthread_info {
+ kthread_t *sti_thread;
+ taskq_t *sti_wr_iss_tq; /* assigned wr_iss taskq */
+} spa_syncthread_info_t;
+
typedef enum spa_all_vdev_zap_action {
AVZ_ACTION_NONE = 0,
AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. */
int spa_alloc_count;
int spa_active_allocator; /* selectable allocator */
+ /* per-allocator sync thread taskqs */
+ taskq_t *spa_sync_tq;
+ spa_syncthread_info_t *spa_syncthreads;
+
spa_aux_vdev_t spa_spares; /* hot spares */
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
nvlist_t *spa_label_features; /* Features for reading MOS */
extern const char *zfs_deadman_failmode;
extern uint_t spa_slop_shift;
extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
- task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);
+ task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, zio_t *zio);
extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, void *arg, uint_t flags);
extern void spa_load_spares(spa_t *spa);
extern taskq_t *system_delay_taskq;
extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
+extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t,
+ kthread_t ***);
#define taskq_create_proc(a, b, c, d, e, p, f) \
(taskq_create(a, b, c, d, e, f))
#define taskq_create_sysdc(a, b, d, e, p, dc, f) \
#define ZIO_FLAG_REEXECUTED (1ULL << 29)
#define ZIO_FLAG_DELEGATED (1ULL << 30)
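+/*
+ * Sentinel for zio->io_allocator: no allocator assigned yet. Writes are
+ * assigned one by spa_select_allocator(), or inherit the parent's via
+ * zio_gang_inherit_allocator().
+ */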
+#define ZIO_ALLOCATOR_NONE (-1)
+#define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE)
+
#define ZIO_FLAG_MUSTSUCCEED 0
#define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)
/* Taskq dispatching state */
taskq_ent_t io_tqent;
+
+ /* write issue taskq selection, based upon sync thread */
+ taskq_t *io_wr_iss_tq;
};
enum blk_verify_flag {
kmem_free(tq, sizeof (taskq_t));
}
+/*
+ * Create a taskq with a specified number of pool threads. Allocate
+ * and return an array of nthreads kthread_t pointers, one for each
+ * thread in the pool. The array is not ordered and must be freed
+ * by the caller.
+ */
+taskq_t *
+taskq_create_synced(const char *name, int nthreads, pri_t pri,
+ int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp)
+{
+ taskq_t *tq;
+ kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads,
+ KM_SLEEP);
+
+ (void) pri; (void) minalloc; (void) maxalloc;
+
+ flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH);
+
+ tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX,
+ flags | TASKQ_PREPOPULATE);
+ VERIFY(tq != NULL);
+ VERIFY(tq->tq_nthreads == nthreads);
+
+ for (int i = 0; i < nthreads; i++) {
+ kthreads[i] = tq->tq_threadlist[i];
+ }
+ *ktpp = kthreads;
+ return (tq);
+}
+
int
taskq_member(taskq_t *tq, kthread_t *t)
{
most ZPL operations (e.g. write, create) will return
.Sy ENOSPC .
.
+.It Sy spa_num_allocators Ns = Ns Sy 4 Pq int
+Determines the number of block allocators to use per spa instance.
+Capped by the number of actual CPUs in the system.
+.Pp
+Note that setting this value too high could result in performance
+degradation and/or excess fragmentation.
+.
.It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint
Limits the number of on-disk error log entries that will be converted to the
new format when enabling the
.It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq uint
Rewrite new block pointers starting in this pass.
.
-.It Sy zfs_sync_taskq_batch_pct Ns = Ns Sy 75 Ns % Pq int
-This controls the number of threads used by
-.Sy dp_sync_taskq .
-The default value of
-.Sy 75%
-will create a maximum of one thread per CPU.
-.
.It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint
Maximum size of TRIM command.
Larger ranges will be split into chunks no larger than this value before
.Sy 0 ,
generate a system-dependent value close to 6 threads per taskq.
.
+.It Sy zio_taskq_wr_iss_ncpus Ns = Ns Sy 0 Pq uint
+Determines the number of CPUs to serve each write issue taskq;
+the number of write issue taskqs created is the number of CPUs
+divided by this value, capped at one taskq per allocator.
+.Pp
+When 0 (the default), the value to use is computed internally
+as the number of actual CPUs in the system divided by the
+.Sy spa_num_allocators
+value.
+.
.It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
Do not create zvol device nodes.
This may slightly improve startup time on
nthreads = MAX((mp_ncpus * nthreads) / 100, 1);
tq = kmem_alloc(sizeof (*tq), KM_SLEEP);
+ tq->tq_nthreads = nthreads;
tq->tq_queue = taskqueue_create(name, M_WAITOK,
taskqueue_thread_enqueue, &tq->tq_queue);
taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT,
kmem_free(tq, sizeof (*tq));
}
+static void taskq_sync_assign(void *arg);
+
+typedef struct taskq_sync_arg {
+ kthread_t *tqa_thread;
+ kcondvar_t tqa_cv;
+ kmutex_t tqa_lock;
+ int tqa_ready;
+} taskq_sync_arg_t;
+
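+/*
+ * Runs once on each pool thread: record the thread's kthread_t, signal the
+ * creator that this thread has started, then block until the creator
+ * releases it, so no thread can pick up a second assignment task.
+ */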
+static void
+taskq_sync_assign(void *arg)
+{
+ taskq_sync_arg_t *tqa = arg;
+
+ mutex_enter(&tqa->tqa_lock);
+ tqa->tqa_thread = curthread;
+ tqa->tqa_ready = 1;
+ cv_signal(&tqa->tqa_cv);
+ while (tqa->tqa_ready == 1)
+ cv_wait(&tqa->tqa_cv, &tqa->tqa_lock);
+ mutex_exit(&tqa->tqa_lock);
+}
+
+/*
+ * Create a taskq with a specified number of pool threads. Allocate
+ * and return an array of nthreads kthread_t pointers, one for each
+ * thread in the pool. The array is not ordered and must be freed
+ * by the caller.
+ */
+taskq_t *
+taskq_create_synced(const char *name, int nthreads, pri_t pri,
+ int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp)
+{
+ taskq_t *tq;
+ taskq_sync_arg_t *tqs = kmem_zalloc(sizeof (*tqs) * nthreads, KM_SLEEP);
+ kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads,
+ KM_SLEEP);
+
+ flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH);
+
+ tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX,
+ flags | TASKQ_PREPOPULATE);
+ VERIFY(tq != NULL);
+ VERIFY(tq->tq_nthreads == nthreads);
+
+ /* spawn all syncthreads */
+ for (int i = 0; i < nthreads; i++) {
+ cv_init(&tqs[i].tqa_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&tqs[i].tqa_lock, NULL, MUTEX_DEFAULT, NULL);
+ (void) taskq_dispatch(tq, taskq_sync_assign,
+ &tqs[i], TQ_FRONT);
+ }
+
+ /* wait on all syncthreads to start */
+ for (int i = 0; i < nthreads; i++) {
+ mutex_enter(&tqs[i].tqa_lock);
+ while (tqs[i].tqa_ready == 0)
+ cv_wait(&tqs[i].tqa_cv, &tqs[i].tqa_lock);
+ mutex_exit(&tqs[i].tqa_lock);
+ }
+
+ /* let all syncthreads resume, finish */
+ for (int i = 0; i < nthreads; i++) {
+ mutex_enter(&tqs[i].tqa_lock);
+ tqs[i].tqa_ready = 2;
+ cv_broadcast(&tqs[i].tqa_cv);
+ mutex_exit(&tqs[i].tqa_lock);
+ }
+ taskq_wait(tq);
+
+ for (int i = 0; i < nthreads; i++) {
+ kthreads[i] = tqs[i].tqa_thread;
+ mutex_destroy(&tqs[i].tqa_lock);
+ cv_destroy(&tqs[i].tqa_cv);
+ }
+ kmem_free(tqs, sizeof (*tqs) * nthreads);
+
+ *ktpp = kthreads;
+ return (tq);
+}
+
int
taskq_member(taskq_t *tq, kthread_t *thread)
{
}
EXPORT_SYMBOL(taskq_destroy);
+/*
+ * Create a taskq with a specified number of pool threads. Allocate
+ * and return an array of nthreads kthread_t pointers, one for each
+ * thread in the pool. The array is not ordered and must be freed
+ * by the caller.
+ */
+taskq_t *
+taskq_create_synced(const char *name, int nthreads, pri_t pri,
+ int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp)
+{
+ taskq_t *tq;
+ taskq_thread_t *tqt;
+ int i = 0;
+ kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads,
+ KM_SLEEP);
+
+ flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH);
+
+ /* taskq_create spawns all the threads before returning */
+ tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX,
+ flags | TASKQ_PREPOPULATE);
+ VERIFY(tq != NULL);
+ VERIFY(tq->tq_nthreads == nthreads);
+
+ list_for_each_entry(tqt, &tq->tq_thread_list, tqt_thread_list) {
+ kthreads[i] = tqt->tqt_thread;
+ i++;
+ }
+
+ ASSERT3S(i, ==, nthreads);
+ *ktpp = kthreads;
+
+ return (tq);
+}
+EXPORT_SYMBOL(taskq_create_synced);
+
static unsigned int spl_taskq_kick = 0;
/*
}
}
+/*
+ * Syncs out a range of dirty records for indirect or leaf dbufs. May be
+ * called recursively from dbuf_sync_indirect().
+ */
void
dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
{
}
-/* Issue I/O to commit a dirty buffer to disk. */
+/*
+ * Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
+ * Caller is responsible for issuing the zio_[no]wait(dr->dr_zio).
+ */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
kmem_free(bp, sizeof (*bp));
}
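+/*
+ * Per-objset state shared by the sublist sync tasks. soa_count tracks the
+ * outstanding sync_dnodes_task() calls; the last task to finish dispatches
+ * sync_meta_dnode_task(), which frees this structure.
+ */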
+typedef struct sync_objset_arg {
+ zio_t *soa_zio;
+ objset_t *soa_os;
+ dmu_tx_t *soa_tx;
+ kmutex_t soa_mutex;
+ int soa_count;
+ taskq_ent_t soa_tq_ent;
+} sync_objset_arg_t;
+
typedef struct sync_dnodes_arg {
- multilist_t *sda_list;
- int sda_sublist_idx;
- multilist_t *sda_newlist;
- dmu_tx_t *sda_tx;
+ multilist_t *sda_list;
+ int sda_sublist_idx;
+ multilist_t *sda_newlist;
+ sync_objset_arg_t *sda_soa;
} sync_dnodes_arg_t;
+static void sync_meta_dnode_task(void *arg);
+
static void
sync_dnodes_task(void *arg)
{
sync_dnodes_arg_t *sda = arg;
+ sync_objset_arg_t *soa = sda->sda_soa;
+ objset_t *os = soa->soa_os;
multilist_sublist_t *ms =
multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
- dmu_objset_sync_dnodes(ms, sda->sda_tx);
+ dmu_objset_sync_dnodes(ms, soa->soa_tx);
multilist_sublist_unlock(ms);
kmem_free(sda, sizeof (*sda));
+
+ mutex_enter(&soa->soa_mutex);
+ ASSERT(soa->soa_count != 0);
+ if (--soa->soa_count != 0) {
+ mutex_exit(&soa->soa_mutex);
+ return;
+ }
+ mutex_exit(&soa->soa_mutex);
+
+ taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq,
+ sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent);
}
+/*
+ * Issue the zio_nowait() for all dirty record zios on the meta dnode,
+ * then call zil_sync() and zio_nowait() the objset's zio. This runs once
+ * for each objset, only after all of its dnode sublists have been synced.
+ */
+static void
+sync_meta_dnode_task(void *arg)
+{
+ sync_objset_arg_t *soa = arg;
+ objset_t *os = soa->soa_os;
+ dmu_tx_t *tx = soa->soa_tx;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ dbuf_dirty_record_t *dr;
+
+ ASSERT0(soa->soa_count);
+
+ list_t *list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
+ while ((dr = list_remove_head(list)) != NULL) {
+ ASSERT0(dr->dr_dbuf->db_level);
+ zio_nowait(dr->dr_zio);
+ }
+
+ /* Enable dnode backfill if enough objects have been freed. */
+ if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
+ os->os_rescan_dnodes = B_TRUE;
+ os->os_freed_dnodes = 0;
+ }
+
+ /*
+ * Free intent log blocks up to this tx.
+ */
+ zil_sync(os->os_zil, tx);
+ os->os_phys->os_zil_header = os->os_zil_header;
+ zio_nowait(soa->soa_zio);
+
+ mutex_destroy(&soa->soa_mutex);
+ kmem_free(soa, sizeof (*soa));
+}
/* called from dsl */
void
zbookmark_phys_t zb;
zio_prop_t zp;
zio_t *zio;
- list_t *list;
- dbuf_dirty_record_t *dr;
int num_sublists;
multilist_t *ml;
blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
offsetof(dnode_t, dn_dirty_link[txgoff]));
}
+ /*
+ * zio_nowait(zio) is done after any/all sublist and meta dnode
+ * zios have been nowaited, and the zil_sync() has been performed.
+ * The soa is freed at the end of sync_meta_dnode_task.
+ */
+ sync_objset_arg_t *soa = kmem_alloc(sizeof (*soa), KM_SLEEP);
+ soa->soa_zio = zio;
+ soa->soa_os = os;
+ soa->soa_tx = tx;
+ taskq_init_ent(&soa->soa_tq_ent);
+ mutex_init(&soa->soa_mutex, NULL, MUTEX_DEFAULT, NULL);
+
ml = &os->os_dirty_dnodes[txgoff];
- num_sublists = multilist_get_num_sublists(ml);
+ soa->soa_count = num_sublists = multilist_get_num_sublists(ml);
+
for (int i = 0; i < num_sublists; i++) {
if (multilist_sublist_is_empty_idx(ml, i))
- continue;
- sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP);
- sda->sda_list = ml;
- sda->sda_sublist_idx = i;
- sda->sda_tx = tx;
- (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
- sync_dnodes_task, sda, 0);
- /* callback frees sda */
- }
- taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);
-
- list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
- while ((dr = list_remove_head(list)) != NULL) {
- ASSERT0(dr->dr_dbuf->db_level);
- zio_nowait(dr->dr_zio);
+ soa->soa_count--;
}
- /* Enable dnode backfill if enough objects have been freed. */
- if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
- os->os_rescan_dnodes = B_TRUE;
- os->os_freed_dnodes = 0;
+ if (soa->soa_count == 0) {
+ taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq,
+ sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent);
+ } else {
+ /*
+ * Sync sublists in parallel. The last to finish
+ * (i.e., when soa->soa_count reaches zero) must
+ * dispatch sync_meta_dnode_task.
+ */
+ for (int i = 0; i < num_sublists; i++) {
+ if (multilist_sublist_is_empty_idx(ml, i))
+ continue;
+ sync_dnodes_arg_t *sda =
+ kmem_alloc(sizeof (*sda), KM_SLEEP);
+ sda->sda_list = ml;
+ sda->sda_sublist_idx = i;
+ sda->sda_soa = soa;
+ (void) taskq_dispatch(
+ dmu_objset_pool(os)->dp_sync_taskq,
+ sync_dnodes_task, sda, 0);
+ /* sync_dnodes_task frees sda */
+ }
}
-
- /*
- * Free intent log blocks up to this tx.
- */
- zil_sync(os->os_zil, tx);
- os->os_phys->os_zil_header = os->os_zil_header;
- zio_nowait(zio);
}
boolean_t
/*
* Write out the dnode's dirty buffers.
+ * Does not wait for zio completions.
*/
void
dnode_sync(dnode_t *dn, dmu_tx_t *tx)
return (error);
}
+/* Nonblocking dataset sync. Assumes dataset:objset is always 1:1 */
void
-dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
+dsl_dataset_sync(dsl_dataset_t *ds, zio_t *rio, dmu_tx_t *tx)
{
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(ds->ds_objset != NULL);
ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
}
- dmu_objset_sync(ds->ds_objset, zio, tx);
+ dmu_objset_sync(ds->ds_objset, rio, tx);
}
/*
*/
uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
-/*
- * This determines the number of threads used by the dp_sync_taskq.
- */
-static int zfs_sync_taskq_batch_pct = 75;
-
/*
* These tunables determine the behavior of how zil_itxg_clean() is
* called via zil_clean() in the context of spa_sync(). When an itxg
txg_list_create(&dp->dp_early_sync_tasks, spa,
offsetof(dsl_sync_task_t, dst_node));
- dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
- zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
- TASKQ_THREADS_CPU_PCT);
+ dp->dp_sync_taskq = spa_sync_tq_create(spa, "dp_sync_taskq");
dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
zfs_zil_clean_taskq_nthr_pct, minclsyspri,
txg_list_destroy(&dp->dp_dirty_dirs);
taskq_destroy(dp->dp_zil_clean_taskq);
- taskq_destroy(dp->dp_sync_taskq);
+ spa_sync_tq_destroy(dp->dp_spa);
/*
* We can't set retry to TRUE since we're explicitly specifying
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
- zio_t *zio;
+ zio_t *rio; /* root zio for all dirty dataset syncs */
dmu_tx_t *tx;
dsl_dir_t *dd;
dsl_dataset_t *ds;
}
/*
- * Write out all dirty blocks of dirty datasets.
+	 * Write out all dirty blocks of dirty datasets. Note, this could
+	 * create a very large (10k+) zio tree.
*/
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
/*
* We must not sync any non-MOS datasets twice, because
*/
ASSERT(!list_link_active(&ds->ds_synced_link));
list_insert_tail(&synced_datasets, ds);
- dsl_dataset_sync(ds, zio, tx);
+ dsl_dataset_sync(ds, rio, tx);
}
- VERIFY0(zio_wait(zio));
+ VERIFY0(zio_wait(rio));
/*
* Update the long range free counter after
* user accounting information (and we won't get confused
* about which blocks are part of the snapshot).
*/
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
objset_t *os = ds->ds_objset;
ASSERT(list_link_active(&ds->ds_synced_link));
dmu_buf_rele(ds->ds_dbuf, ds);
- dsl_dataset_sync(ds, zio, tx);
+ dsl_dataset_sync(ds, rio, tx);
/*
* Release any key mappings created by calls to
key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
}
}
- VERIFY0(zio_wait(zio));
+ VERIFY0(zio_wait(rio));
/*
* Now that the datasets have been completely synced, we can
ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW,
"How quickly delay approaches infinity");
-ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW,
- "Max percent of CPUs that are used to sync dirty data");
-
ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW,
"Max percent of CPUs that are used per dp_sync_taskq");
#include "zfs_prop.h"
#include "zfs_comutil.h"
+#include <cityhash.h>
/*
* spa_thread() existed on Illumos as a parent thread for the various worker
typedef enum zti_modes {
ZTI_MODE_FIXED, /* value is # of threads (min 1) */
- ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */
+ ZTI_MODE_SYNC, /* sync thread assigned */
ZTI_MODE_NULL, /* don't create a taskq */
ZTI_NMODES
} zti_modes_t;
#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
#define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
-#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
#define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 }
+#define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 }
#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
#define ZTI_N(n) ZTI_P(n, 1)
* initializing a pool, we use this table to create an appropriately sized
* taskq. Some operations are low volume and therefore have a small, static
* number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
- * macros. Other operations process a large amount of data; the ZTI_BATCH
+ * macros. Other operations process a large amount of data; the ZTI_SCALE
* macro causes us to create a taskq oriented for throughput. Some operations
* are so high frequency and short-lived that the taskq itself can become a
* point of lock contention. The ZTI_P(#, #) macro indicates that we need an
* additional degree of parallelism specified by the number of threads per-
* taskq and the number of taskqs; when dispatching an event in this case, the
- * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH,
- * but with number of taskqs also scaling with number of CPUs.
+ * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs
+ * that scales with the number of CPUs.
*
* The different taskq priorities are to handle the different contexts (issue
* and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
{ ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */
- { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */
+ { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */
{ ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
#endif
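+/*
+ * Number of CPUs per write issue taskq. A value of 0 (the default) means
+ * the number of CPUs divided by the number of allocators.
+ */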
+static uint_t zio_taskq_wr_iss_ncpus = 0;
+
/*
* Report any spa_load_verify errors found, but do not fail spa_load.
* This is used by zdb to analyze non-idle pools.
uint_t count = ztip->zti_count;
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
uint_t cpus, flags = TASKQ_DYNAMIC;
-#ifdef HAVE_SYSDC
- boolean_t batch = B_FALSE;
-#endif
switch (mode) {
case ZTI_MODE_FIXED:
ASSERT3U(value, >, 0);
break;
- case ZTI_MODE_BATCH:
-#ifdef HAVE_SYSDC
- batch = B_TRUE;
-#endif
+ case ZTI_MODE_SYNC:
+
+ /*
+ * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus',
+ * not to exceed the number of spa allocators.
+ */
+ if (zio_taskq_wr_iss_ncpus == 0) {
+ count = MAX(boot_ncpus / spa->spa_alloc_count, 1);
+ } else {
+ count = MAX(1,
+ boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus));
+ }
+ count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
+ count = MIN(count, spa->spa_alloc_count);
+
+ /*
+ * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
+ * single taskq may have more threads than 100% of online cpus.
+ */
+ value = (zio_taskq_batch_pct + count / 2) / count;
+ value = MIN(value, 100);
flags |= TASKQ_THREADS_CPU_PCT;
- value = MIN(zio_taskq_batch_pct, 100);
break;
case ZTI_MODE_SCALE:
default:
panic("unrecognized mode for %s_%s taskq (%u:%u) in "
- "spa_activate()",
+ "spa_taskqs_init()",
zio_type_name[t], zio_taskq_types[q], mode, value);
break;
}
#ifdef HAVE_SYSDC
if (zio_taskq_sysdc && spa->spa_proc != &p0) {
- if (batch)
- flags |= TASKQ_DC_BATCH;
-
(void) zio_taskq_basedc;
tq = taskq_create_sysdc(name, value, 50, INT_MAX,
spa->spa_proc, zio_taskq_basedc, flags);
/*
* Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
* Note that a type may have multiple discrete taskqs to avoid lock contention
- * on the taskq itself. In that case we choose which taskq at random by using
- * the low bits of gethrtime().
+ * on the taskq itself.
*/
-void
-spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
- task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
+static taskq_t *
+spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+ zio_t *zio)
{
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
taskq_t *tq;
ASSERT3P(tqs->stqs_taskq, !=, NULL);
ASSERT3U(tqs->stqs_count, !=, 0);
+ if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
+ (zio != NULL) && (zio->io_wr_iss_tq != NULL)) {
+ /* dispatch to assigned write issue taskq */
+ tq = zio->io_wr_iss_tq;
+ return (tq);
+ }
+
if (tqs->stqs_count == 1) {
tq = tqs->stqs_taskq[0];
} else {
tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
}
+ return (tq);
+}
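+/*
+ * Dispatch an ent-based task. The optional zio lets write issue tasks run
+ * on the taskq assigned to the syncing thread's allocator.
+ */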
+void
+spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+ task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent,
+ zio_t *zio)
+{
+ taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio);
taskq_dispatch_ent(tq, func, arg, flags, ent);
}
spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, void *arg, uint_t flags)
{
- spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
- taskq_t *tq;
- taskqid_t id;
-
- ASSERT3P(tqs->stqs_taskq, !=, NULL);
- ASSERT3U(tqs->stqs_count, !=, 0);
-
- if (tqs->stqs_count == 1) {
- tq = tqs->stqs_taskq[0];
- } else {
- tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
- }
-
- id = taskq_dispatch(tq, func, arg, flags);
+ taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL);
+ taskqid_t id = taskq_dispatch(tq, func, arg, flags);
if (id)
taskq_wait_id(tq, id);
}
mutex_exit(&spa_namespace_lock);
}
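+/*
+ * Create the spa sync taskq with one thread per allocator, capture each
+ * thread's kthread_t, and assign each sync thread a write issue taskq in
+ * round-robin order.
+ */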
+taskq_t *
+spa_sync_tq_create(spa_t *spa, const char *name)
+{
+ kthread_t **kthreads;
+
+ ASSERT(spa->spa_sync_tq == NULL);
+ ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus);
+
+ /*
+ * - do not allow more allocators than cpus.
+ * - there may be more cpus than allocators.
+ * - do not allow more sync taskq threads than allocators or cpus.
+ */
+ int nthreads = spa->spa_alloc_count;
+ spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) *
+ nthreads, KM_SLEEP);
+
+ spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri,
+ nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads);
+ VERIFY(spa->spa_sync_tq != NULL);
+ VERIFY(kthreads != NULL);
+
+ spa_taskqs_t *tqs =
+ &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE];
+
+ spa_syncthread_info_t *ti = spa->spa_syncthreads;
+ for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) {
+ ti->sti_thread = kthreads[i];
+ if (w == tqs->stqs_count) {
+ w = 0;
+ }
+ ti->sti_wr_iss_tq = tqs->stqs_taskq[w];
+ }
+
+ kmem_free(kthreads, sizeof (*kthreads) * nthreads);
+ return (spa->spa_sync_tq);
+}
+
+void
+spa_sync_tq_destroy(spa_t *spa)
+{
+ ASSERT(spa->spa_sync_tq != NULL);
+
+ taskq_wait(spa->spa_sync_tq);
+ taskq_destroy(spa->spa_sync_tq);
+ kmem_free(spa->spa_syncthreads,
+ sizeof (spa_syncthread_info_t) * spa->spa_alloc_count);
+ spa->spa_sync_tq = NULL;
+}
+
+void
+spa_select_allocator(zio_t *zio)
+{
+ zbookmark_phys_t *bm = &zio->io_bookmark;
+ spa_t *spa = zio->io_spa;
+
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+ /*
+ * A gang block (for example) may have inherited its parent's
+ * allocator, in which case there is nothing further to do here.
+ */
+ if (ZIO_HAS_ALLOCATOR(zio))
+ return;
+
+ ASSERT(spa != NULL);
+ ASSERT(bm != NULL);
+
+ /*
+ * First try to use an allocator assigned to the syncthread, and set
+ * the corresponding write issue taskq for the allocator.
+ * Note, we must have an open pool to do this.
+ */
+ if (spa->spa_sync_tq != NULL) {
+ spa_syncthread_info_t *ti = spa->spa_syncthreads;
+ for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
+ if (ti->sti_thread == curthread) {
+ zio->io_allocator = i;
+ zio->io_wr_iss_tq = ti->sti_wr_iss_tq;
+ return;
+ }
+ }
+ }
+
+ /*
+ * We want to try to use as many allocators as possible to help improve
+ * performance, but we also want logically adjacent IOs to be physically
+ * adjacent to improve sequential read performance. We chunk each object
+ * into 2^20 block regions, and then hash based on the objset, object,
+ * level, and region to accomplish both of these goals.
+ */
+ uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level,
+ bm->zb_blkid >> 20);
+
+ zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
+ zio->io_wr_iss_tq = NULL;
+}
+
/*
* ==========================================================================
* Miscellaneous routines
"Whether extra ALLOC blkptrs were added to a livelist entry while it "
"was being condensed");
/* END CSTYLED */
+
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW,
+ "Number of CPUs to run write issue taskqs");
uint_t spa_slop_shift = 5;
static const uint64_t spa_min_slop = 128ULL * 1024 * 1024;
static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
-static const int spa_allocators = 4;
+
+/*
+ * Number of allocators to use, per spa instance
+ */
+static int spa_num_allocators = 4;
/*
* Spa active allocator.
if (altroot)
spa->spa_root = spa_strdup(altroot);
- spa->spa_alloc_count = spa_allocators;
+ /* Do not allow more allocators than CPUs. */
+ spa->spa_alloc_count = MIN(MAX(spa_num_allocators, 1), boot_ncpus);
+
spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
sizeof (spa_alloc_t), KM_SLEEP);
for (int i = 0; i < spa->spa_alloc_count; i++) {
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
}
+
avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg,
ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
param_get_uint, ZMOD_RW, "Reserved free space in pool");
+
+ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW,
+ "Number of allocators per spa, capped by ncpus");
zio->io_orig_stage = zio->io_stage = stage;
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
zio->io_pipeline_trace = ZIO_STAGE_OPEN;
+ zio->io_allocator = ZIO_ALLOCATOR_NONE;
zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) ||
(pipeline & ZIO_STAGE_READY) == 0;
*/
ASSERT(taskq_empty_ent(&zio->io_tqent));
spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags,
- &zio->io_tqent);
+ &zio->io_tqent, zio);
}
static boolean_t
static zio_t *
zio_issue_async(zio_t *zio)
{
+ ASSERT((zio->io_type != ZIO_TYPE_WRITE) || ZIO_HAS_ALLOCATOR(zio));
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
-
return (NULL);
}
ASSERT0(zio->io_queued_timestamp);
zio->io_queued_timestamp = gethrtime();
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ spa_select_allocator(zio);
+ }
__zio_execute(zio);
mutex_enter(&zio->io_lock);
ASSERT0(zio->io_queued_timestamp);
zio->io_queued_timestamp = gethrtime();
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ spa_select_allocator(zio);
+ }
__zio_execute(zio);
}
return (zio);
}
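+/*
+ * Gang children inherit the parent's allocator and write issue taskq, so
+ * every member of a gang block uses the same allocator.
+ */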
+static void
+zio_gang_inherit_allocator(zio_t *pio, zio_t *cio)
+{
+ cio->io_allocator = pio->io_allocator;
+ cio->io_wr_iss_tq = pio->io_wr_iss_tq;
+}
+
static void
zio_write_gang_member_ready(zio_t *zio)
{
gbh_copies = MIN(2, spa_max_replication(spa));
}
+ ASSERT(ZIO_HAS_ALLOCATOR(pio));
int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
zio_write_gang_done, NULL, pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+ zio_gang_inherit_allocator(pio, zio);
+
/*
* Create and nowait the gang children.
*/
zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+ zio_gang_inherit_allocator(zio, cio);
+
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(has_data);
return (NULL);
ASSERT(IO_IS_ALLOCATING(zio));
+ ASSERT(ZIO_HAS_ALLOCATOR(zio));
/*
* Try to place a reservation for this zio. If we're unable to
}
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(ZIO_HAS_ALLOCATOR(zio));
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
ASSERT3U(zio->io_queued_timestamp, >, 0);
ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
- zbookmark_phys_t *bm = &zio->io_bookmark;
- /*
- * We want to try to use as many allocators as possible to help improve
- * performance, but we also want logically adjacent IOs to be physically
- * adjacent to improve sequential read performance. We chunk each object
- * into 2^20 block regions, and then hash based on the objset, object,
- * level, and region to accomplish both of these goals.
- */
- int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object,
- bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
- zio->io_allocator = allocator;
+ int allocator = zio->io_allocator;
zio->io_metaslab_class = mc;
mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
avl_add(&spa->spa_allocs[allocator].spaa_tree, zio);
* sync write performance. If a log allocation fails, we will fall
* back to spa_sync() which is abysmal for performance.
*/
+ ASSERT(ZIO_HAS_ALLOCATOR(zio));
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
&zio->io_alloc_list, zio, zio->io_allocator);
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(zio->io_metaslab_class != NULL);
+ ASSERT(ZIO_HAS_ALLOCATOR(zio));
/*
* We were unable to allocate anything, unreserve and
}
ASSERT(IO_IS_ALLOCATING(pio));
+ ASSERT(ZIO_HAS_ALLOCATOR(pio));
ASSERT3P(zio, !=, zio->io_logical);
ASSERT(zio->io_logical != NULL);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(zio->io_bp != NULL);
+ ASSERT(ZIO_HAS_ALLOCATOR(zio));
metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
zio->io_allocator);
ASSERT(taskq_empty_ent(&zio->io_tqent));
spa_taskq_dispatch_ent(zio->io_spa,
ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
- zio_reexecute, zio, 0, &zio->io_tqent);
+ zio_reexecute, zio, 0, &zio->io_tqent, NULL);
}
return (NULL);
}