*/
/*
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
uint32_t zfs_vdev_scrub_max_active = 2;
uint32_t zfs_vdev_removal_min_active = 1;
uint32_t zfs_vdev_removal_max_active = 2;
+uint32_t zfs_vdev_initializing_min_active = 1;
+uint32_t zfs_vdev_initializing_max_active = 1;
+uint32_t zfs_vdev_trim_min_active = 1;
+uint32_t zfs_vdev_trim_max_active = 2;
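/*
 * As with the existing queue classes, each min/max pair above bounds how
 * many i/os of that class the per-vdev scheduler keeps in flight:
 * roughly, at least *_min_active are issued while the class has queued
 * i/os, and never more than *_max_active.
 */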
/*
* To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
* For read I/Os, we also aggregate across small adjacency gaps; for writes
* we include spans of optional I/Os to aid aggregation at the disk even when
* they aren't able to help us aggregate at this level.
*/
int zfs_vdev_aggregation_limit = 1 << 20;
+int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE;
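/*
 * Note: SPA_OLD_MAXBLOCKSIZE above is 128 KiB (1 << 17), a deliberately
 * smaller default than the 1 MiB rotational limit, since non-rotational
 * media pay no seek penalty and gain little from very large aggregates.
 */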
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;
int zfs_vdev_queue_depth_pct = 300;
#endif
+/*
+ * When performing allocations for a given metaslab, we want to make sure that
+ * there are enough IOs to aggregate together to improve throughput. We want to
+ * ensure that there are at least 128k worth of IOs that can be aggregated, and
+ * we assume that the average allocation size is 4k, so we need the queue depth
+ * to be 32 per allocator to get good aggregation of sequential writes.
+ */
+int zfs_vdev_def_queue_depth = 32;
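/*
 * A quick sanity check (illustrative only, not part of this change) of
 * the arithmetic above: a 128k aggregation target divided by an assumed
 * 4k average allocation size yields the default depth of 32.
 */
#if 0
#define EXAMPLE_AGG_TARGET (128 << 10) /* 128k of aggregatable i/o */
#define EXAMPLE_AVG_ALLOC (4 << 10) /* assumed 4k average allocation */
CTASSERT(EXAMPLE_AGG_TARGET / EXAMPLE_AVG_ALLOC == 32);
#endif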
+
+/*
+ * Allow TRIM I/Os to be aggregated. This should normally not be needed since
+ * TRIM I/Os for extents up to zfs_trim_extent_bytes_max (128M) can be
+ * submitted by the TRIM code in vdev_trim.c.
+ */
+int zfs_vdev_aggregate_trim = 0;
int
vdev_queue_offset_compare(const void *x1, const void *x2)
static inline avl_tree_t *
vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
{
- ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE);
+ ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM);
if (t == ZIO_TYPE_READ)
return (&vq->vq_read_offset_tree);
- else
+ else if (t == ZIO_TYPE_WRITE)
return (&vq->vq_write_offset_tree);
+ else
+ return (&vq->vq_trim_offset_tree);
}
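/*
 * With the per-type trees above, each i/o type has its own offset-sorted
 * tree, so the aggregation code can only coalesce a TRIM with adjacent
 * TRIMs, never with reads or writes.
 */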
int
return (zfs_vdev_scrub_min_active);
case ZIO_PRIORITY_REMOVAL:
return (zfs_vdev_removal_min_active);
+ case ZIO_PRIORITY_INITIALIZING:
+ return (zfs_vdev_initializing_min_active);
+ case ZIO_PRIORITY_TRIM:
+ return (zfs_vdev_trim_min_active);
default:
panic("invalid priority %u", p);
return (0);
return (zfs_vdev_scrub_max_active);
case ZIO_PRIORITY_REMOVAL:
return (zfs_vdev_removal_max_active);
+ case ZIO_PRIORITY_INITIALIZING:
+ return (zfs_vdev_initializing_max_active);
+ case ZIO_PRIORITY_TRIM:
+ return (zfs_vdev_trim_max_active);
default:
panic("invalid priority %u", p);
return (0);
avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));
+ avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM),
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
int (*compfn) (const void *, const void *);
/*
- * The synchronous i/o queues are dispatched in FIFO rather
+ * The synchronous/trim i/o queues are dispatched in FIFO rather
* than LBA order. This provides more consistent latency for
* these i/os.
*/
- if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
+ if (p == ZIO_PRIORITY_SYNC_READ ||
+ p == ZIO_PRIORITY_SYNC_WRITE ||
+ p == ZIO_PRIORITY_TRIM) {
compfn = vdev_queue_timestamp_compare;
- else
+ } else {
compfn = vdev_queue_offset_compare;
+ }
avl_create(vdev_queue_class_tree(vq, p), compfn,
sizeof (zio_t), offsetof(struct zio, io_queue_node));
}
avl_destroy(&vq->vq_active_tree);
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
+ avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));
mutex_destroy(&vq->vq_lock);
}
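/*
 * For reference, a minimal sketch (illustrative only) of what a FIFO
 * comparator such as vdev_queue_timestamp_compare() boils down to:
 * order by queue-entry timestamp, with the zio pointer as a tiebreaker
 * so the AVL ordering stays total. The in-tree comparator is
 * authoritative.
 */
#if 0
static int
example_timestamp_compare(const void *x1, const void *x2)
{
const zio_t *z1 = x1;
const zio_t *z2 = x2;

if (z1->io_timestamp != z2->io_timestamp)
return (z1->io_timestamp < z2->io_timestamp ? -1 : 1);
if (z1 == z2)
return (0);
return (z1 < z2 ? -1 : 1);
}
#endif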
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
- spa_stats_history_t *ssh = &spa->spa_stats.io_history;
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
- if (ssh->kstat != NULL) {
- mutex_enter(&ssh->lock);
- kstat_waitq_enter(ssh->kstat->ks_data);
- mutex_exit(&ssh->lock);
+ if (shk->kstat != NULL) {
+ mutex_enter(&shk->lock);
+ kstat_waitq_enter(shk->kstat->ks_data);
+ mutex_exit(&shk->lock);
}
}
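/*
 * The kstat_waitq_enter()/kstat_waitq_exit() calls here, and the runq
 * equivalents below, feed the per-pool io_history kstat: they separate
 * time an i/o spends queued from time it spends actively in flight,
 * which is what the iostat-style latency statistics report.
 */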
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
- spa_stats_history_t *ssh = &spa->spa_stats.io_history;
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
- if (ssh->kstat != NULL) {
- mutex_enter(&ssh->lock);
- kstat_waitq_exit(ssh->kstat->ks_data);
- mutex_exit(&ssh->lock);
+ if (shk->kstat != NULL) {
+ mutex_enter(&shk->lock);
+ kstat_waitq_exit(shk->kstat->ks_data);
+ mutex_exit(&shk->lock);
}
}
vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
- spa_stats_history_t *ssh = &spa->spa_stats.io_history;
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
vq->vq_class[zio->io_priority].vqc_active++;
avl_add(&vq->vq_active_tree, zio);
- if (ssh->kstat != NULL) {
- mutex_enter(&ssh->lock);
- kstat_runq_enter(ssh->kstat->ks_data);
- mutex_exit(&ssh->lock);
+ if (shk->kstat != NULL) {
+ mutex_enter(&shk->lock);
+ kstat_runq_enter(shk->kstat->ks_data);
+ mutex_exit(&shk->lock);
}
}
vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
- spa_stats_history_t *ssh = &spa->spa_stats.io_history;
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
vq->vq_class[zio->io_priority].vqc_active--;
avl_remove(&vq->vq_active_tree, zio);
- if (ssh->kstat != NULL) {
- kstat_io_t *ksio = ssh->kstat->ks_data;
+ if (shk->kstat != NULL) {
+ kstat_io_t *ksio = shk->kstat->ks_data;
- mutex_enter(&ssh->lock);
+ mutex_enter(&shk->lock);
kstat_runq_exit(ksio);
if (zio->io_type == ZIO_TYPE_READ) {
ksio->reads++;
ksio->writes++;
ksio->nwritten += zio->io_size;
}
- mutex_exit(&ssh->lock);
+ mutex_exit(&shk->lock);
}
}
abd_t *abd;
maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa);
- limit = MAX(MIN(zfs_vdev_aggregation_limit, maxblocksize), 0);
+ if (vq->vq_vdev->vdev_nonrot)
+ limit = zfs_vdev_aggregation_limit_non_rotating;
+ else
+ limit = zfs_vdev_aggregation_limit;
+ limit = MAX(MIN(limit, maxblocksize), 0);
if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
return (NULL);
+ /*
+ * While TRIM commands could be aggregated based on offset this
+ * behavior is disabled until it's determined to be beneficial.
+ */
+ if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
+ return (NULL);
+
first = last = zio;
if (zio->io_type == ZIO_TYPE_READ)
}
/*
- * For LBA-ordered queues (async / scrub), issue the i/o which follows
- * the most recently issued i/o in LBA (offset) order.
+ * For LBA-ordered queues (async / scrub / initializing), issue the
+ * i/o which follows the most recently issued i/o in LBA (offset) order.
*
- * For FIFO queues (sync), issue the i/o with the lowest timestamp.
+ * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp.
*/
tree = vdev_queue_class_tree(vq, p);
vq->vq_io_search.io_timestamp = 0;
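/*
 * A timestamp of zero sorts the search key ahead of every queued i/o,
 * so in the FIFO (timestamp-ordered) trees the AVL_AFTER neighbor is
 * the oldest entry; in the LBA-ordered trees only the offset key (set
 * from vq_last_offset, elided in this excerpt) decides where the
 * search lands.
 */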
* not match the child's i/o type. Fix it up here.
*/
if (zio->io_type == ZIO_TYPE_READ) {
+ ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM);
+
if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
zio->io_priority != ZIO_PRIORITY_SCRUB &&
- zio->io_priority != ZIO_PRIORITY_REMOVAL)
+ zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+ zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
- } else {
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ }
+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
+ ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM);
+
if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
- zio->io_priority != ZIO_PRIORITY_REMOVAL)
+ zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+ zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
+ }
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_TRIM);
+ ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM);
}
zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
avl_tree_t *tree;
+ /*
+ * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio
+ * code to issue IOs without adding them to the vdev queue. In this
+ * case, the zio is already going to be issued as quickly as possible
+ * and so it doesn't need any reprioritization to help.
+ */
+ if (zio->io_priority == ZIO_PRIORITY_NOW)
+ return;
+
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
return (vd->vdev_queue.vq_last_offset);
}
-#if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_KERNEL)
module_param(zfs_vdev_aggregation_limit, int, 0644);
MODULE_PARM_DESC(zfs_vdev_aggregation_limit, "Max vdev I/O aggregation size");
+module_param(zfs_vdev_aggregation_limit_non_rotating, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_aggregation_limit_non_rotating,
+ "Max vdev I/O aggregation size for non-rotating media");
+
+module_param(zfs_vdev_aggregate_trim, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_aggregate_trim, "Allow TRIM I/O to be aggregated");
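/*
 * Example (illustrative): because these parameters are registered with
 * 0644 permissions, they can be read and changed at runtime via sysfs
 * on Linux, e.g.
 *
 *   echo 1 > /sys/module/zfs/parameters/zfs_vdev_aggregate_trim
 *   cat /sys/module/zfs/parameters/zfs_vdev_trim_max_active
 *
 * or persisted with an "options zfs ..." line in /etc/modprobe.d/zfs.conf.
 */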
+
module_param(zfs_vdev_read_gap_limit, int, 0644);
MODULE_PARM_DESC(zfs_vdev_read_gap_limit, "Aggregate read I/O over gap");
MODULE_PARM_DESC(zfs_vdev_async_write_min_active,
"Min active async write I/Os per vdev");
+module_param(zfs_vdev_initializing_max_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_initializing_max_active,
+ "Max active initializing I/Os per vdev");
+
+module_param(zfs_vdev_initializing_min_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_initializing_min_active,
+ "Min active initializing I/Os per vdev");
+
+module_param(zfs_vdev_removal_max_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_removal_max_active,
+ "Max active removal I/Os per vdev");
+
+module_param(zfs_vdev_removal_min_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_removal_min_active,
+ "Min active removal I/Os per vdev");
+
module_param(zfs_vdev_scrub_max_active, int, 0644);
-MODULE_PARM_DESC(zfs_vdev_scrub_max_active, "Max active scrub I/Os per vdev");
+MODULE_PARM_DESC(zfs_vdev_scrub_max_active,
+ "Max active scrub I/Os per vdev");
module_param(zfs_vdev_scrub_min_active, int, 0644);
-MODULE_PARM_DESC(zfs_vdev_scrub_min_active, "Min active scrub I/Os per vdev");
+MODULE_PARM_DESC(zfs_vdev_scrub_min_active,
+ "Min active scrub I/Os per vdev");
module_param(zfs_vdev_sync_read_max_active, int, 0644);
MODULE_PARM_DESC(zfs_vdev_sync_read_max_active,
MODULE_PARM_DESC(zfs_vdev_sync_write_min_active,
"Min active sync write I/Os per vdev");
+module_param(zfs_vdev_trim_max_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_trim_max_active,
+ "Max active trim/discard I/Os per vdev");
+
+module_param(zfs_vdev_trim_min_active, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_trim_min_active,
+ "Min active trim/discard I/Os per vdev");
+
module_param(zfs_vdev_queue_depth_pct, int, 0644);
MODULE_PARM_DESC(zfs_vdev_queue_depth_pct,
"Queue depth percentage for each top-level vdev");