#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
+#ifdef HAVE_BLK_MQ
+#include <linux/blk-mq.h>
+#endif
+
+static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
+ struct request *rq, boolean_t force_sync);
+
static unsigned int zvol_major = ZVOL_MAJOR;
static unsigned int zvol_request_sync = 0;
static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;
-static unsigned int zvol_threads = 32;
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static const unsigned int zvol_open_timeout_ms = 1000;
#endif
+static unsigned int zvol_threads = 0;
+#ifdef HAVE_BLK_MQ
+static unsigned int zvol_blk_mq_threads = 0;
+static unsigned int zvol_blk_mq_actual_threads;
+static boolean_t zvol_use_blk_mq = B_FALSE;
+
+/*
+ * The maximum number of volblocksize blocks to process per thread. Typically,
+ * write-heavy workloads perform better with higher values here, and read-heavy
+ * workloads perform better with lower values, but that's not a hard and fast
+ * rule. It's basically a knob to tune between "less overhead with less
+ * parallelism" and "more overhead, but more parallelism".
+ *
+ * '8' was chosen as a reasonable, balanced default based on sequential read
+ * and write tests to a zvol in an NVMe pool (with 16 CPUs).
+ */
+static unsigned int zvol_blk_mq_blocks_per_thread = 8;
+#endif
+
+#ifndef BLKDEV_DEFAULT_RQ
+/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
+#define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
+#endif
+
+/*
+ * Finalize our BIO or request: complete the BIO directly on the bio-based
+ * path, or end the blk-mq request (converting the errno to a blk_status_t)
+ * on the blk-mq path.
+ */
+#ifdef HAVE_BLK_MQ
+#define END_IO(zv, bio, rq, error) do { \
+ if (bio) { \
+ BIO_END_IO(bio, error); \
+ } else { \
+ blk_mq_end_request(rq, errno_to_bi_status(error)); \
+ } \
+} while (0)
+#else
+#define END_IO(zv, bio, rq, error) BIO_END_IO(bio, error)
+#endif
+
+#ifdef HAVE_BLK_MQ
+static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
+static unsigned int zvol_actual_blk_mq_queue_depth;
+#endif
+
struct zvol_state_os {
struct gendisk *zvo_disk; /* generic disk */
struct request_queue *zvo_queue; /* request queue */
dev_t zvo_dev; /* device id */
+
+#ifdef HAVE_BLK_MQ
+ struct blk_mq_tag_set tag_set;
+#endif
+
+ /* Set from the global 'zvol_use_blk_mq' at zvol load */
+ boolean_t use_blk_mq;
};
taskq_t *zvol_taskq;
typedef struct zv_request_stack {
zvol_state_t *zv;
struct bio *bio;
+ struct request *rq;
} zv_request_t;
+typedef struct zv_work {
+ struct request *rq;
+ struct work_struct work;
+} zv_work_t;
+
typedef struct zv_request_task {
zv_request_t zvr;
taskq_ent_t ent;
kmem_free(task, sizeof (*task));
}
+#ifdef HAVE_BLK_MQ
+
+/*
+ * This is called when a new block multiqueue request comes in. A request
+ * contains one or more BIOs.
+ */
+static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct request *rq = bd->rq;
+ zvol_state_t *zv = rq->q->queuedata;
+
+ /* Tell the kernel that we are starting to process this request */
+ blk_mq_start_request(rq);
+
+ if (blk_rq_is_passthrough(rq)) {
+ /* Reject any passthrough (non-filesystem) request */
+ blk_mq_end_request(rq, BLK_STS_IOERR);
+ return (BLK_STS_IOERR);
+ }
+
+ zvol_request_impl(zv, NULL, rq, 0);
+
+ /* Acknowledge acceptance of the request; it is completed via END_IO() */
+ return (BLK_STS_OK);
+}
+
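+/*
+ * Only .queue_rq is mandatory in blk_mq_ops; the remaining callbacks are
+ * optional and left unset here.
+ */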
+static struct blk_mq_ops zvol_blk_mq_queue_ops = {
+ .queue_rq = zvol_mq_queue_rq,
+};
+
+/* Initialize our blk-mq struct */
+static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
+{
+ struct zvol_state_os *zso = zv->zv_zso;
+
+ memset(&zso->tag_set, 0, sizeof (zso->tag_set));
+
+ /* Initialize tag set. */
+ zso->tag_set.ops = &zvol_blk_mq_queue_ops;
+ zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads;
+ zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth;
+ zso->tag_set.numa_node = NUMA_NO_NODE;
+ zso->tag_set.cmd_size = 0;
+
+ /*
+ * We need BLK_MQ_F_BLOCKING here since we do blocking calls in
+ * zvol_request_impl()
+ */
+ zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
+ zso->tag_set.driver_data = zv;
+
+ return (blk_mq_alloc_tag_set(&zso->tag_set));
+}
+#endif /* HAVE_BLK_MQ */
+
/*
* Given a path, return TRUE if path is a ZVOL.
*/
zvol_write(zv_request_t *zvr)
{
struct bio *bio = zvr->bio;
+ struct request *rq = zvr->rq;
int error = 0;
zfs_uio_t uio;
-
- zfs_uio_bvec_init(&uio, bio);
-
zvol_state_t *zv = zvr->zv;
+ struct request_queue *q;
+ struct gendisk *disk;
+ unsigned long start_time = 0;
+ boolean_t acct = B_FALSE;
+
ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
ASSERT3P(zv->zv_zilog, !=, NULL);
+ q = zv->zv_zso->zvo_queue;
+ disk = zv->zv_zso->zvo_disk;
+
- /* bio marked as FLUSH need to flush before write */
+ /* A bio or request marked as FLUSH must flush before the write */
- if (bio_is_flush(bio))
+ if (io_is_flush(bio, rq))
zil_commit(zv->zv_zilog, ZVOL_OBJ);
/* Some requests are just for flush and nothing else. */
- if (uio.uio_resid == 0) {
+ if (io_size(bio, rq) == 0) {
rw_exit(&zv->zv_suspend_lock);
- BIO_END_IO(bio, 0);
+ END_IO(zv, bio, rq, 0);
return;
}
- struct request_queue *q = zv->zv_zso->zvo_queue;
- struct gendisk *disk = zv->zv_zso->zvo_disk;
+ zfs_uio_bvec_init(&uio, bio, rq);
+
ssize_t start_resid = uio.uio_resid;
- unsigned long start_time;
- boolean_t acct = blk_queue_io_stat(q);
- if (acct)
- start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);
+ /*
+ * With use_blk_mq, accounting is done by blk_mq_start_request()
+ * and blk_mq_end_request(), so we can skip it here.
+ */
+ if (bio) {
+ acct = blk_queue_io_stat(q);
+ if (acct) {
+ start_time = blk_generic_start_io_acct(q, disk, WRITE,
+ bio);
+ }
+ }
boolean_t sync =
- bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+ io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
uio.uio_loffset, uio.uio_resid, RL_WRITER);
rw_exit(&zv->zv_suspend_lock);
- if (acct)
+ if (bio && acct) {
blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
+ }
- BIO_END_IO(bio, -error);
+ END_IO(zv, bio, rq, -error);
}
static void
zvol_discard(zv_request_t *zvr)
{
struct bio *bio = zvr->bio;
+ struct request *rq = zvr->rq;
zvol_state_t *zv = zvr->zv;
- uint64_t start = BIO_BI_SECTOR(bio) << 9;
- uint64_t size = BIO_BI_SIZE(bio);
+ uint64_t start = io_offset(bio, rq);
+ uint64_t size = io_size(bio, rq);
uint64_t end = start + size;
boolean_t sync;
int error = 0;
dmu_tx_t *tx;
+ struct request_queue *q = zv->zv_zso->zvo_queue;
+ struct gendisk *disk = zv->zv_zso->zvo_disk;
+ unsigned long start_time = 0;
+
+ boolean_t acct = B_FALSE;
ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
ASSERT3P(zv->zv_zilog, !=, NULL);
- struct request_queue *q = zv->zv_zso->zvo_queue;
- struct gendisk *disk = zv->zv_zso->zvo_disk;
- unsigned long start_time;
-
- boolean_t acct = blk_queue_io_stat(q);
- if (acct)
- start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);
+ if (bio) {
+ acct = blk_queue_io_stat(q);
+ if (acct) {
+ start_time = blk_generic_start_io_acct(q, disk, WRITE,
+ bio);
+ }
+ }
- sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+ sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
if (end > zv->zv_volsize) {
error = SET_ERROR(EIO);
* the unaligned parts which is slow (read-modify-write) and useless
* since we are not freeing any space by doing so.
*/
- if (!bio_is_secure_erase(bio)) {
+ if (!io_is_secure_erase(bio, rq)) {
start = P2ROUNDUP(start, zv->zv_volblocksize);
end = P2ALIGN(end, zv->zv_volblocksize);
size = end - start;
unlock:
rw_exit(&zv->zv_suspend_lock);
- if (acct)
- blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
+ if (bio && acct) {
+ blk_generic_end_io_acct(q, disk, WRITE, bio,
+ start_time);
+ }
- BIO_END_IO(bio, -error);
+ END_IO(zv, bio, rq, -error);
}
static void
zvol_read(zv_request_t *zvr)
{
struct bio *bio = zvr->bio;
+ struct request *rq = zvr->rq;
int error = 0;
zfs_uio_t uio;
-
- zfs_uio_bvec_init(&uio, bio);
-
+ boolean_t acct = B_FALSE;
zvol_state_t *zv = zvr->zv;
+ struct request_queue *q;
+ struct gendisk *disk;
+ unsigned long start_time = 0;
+
ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
- struct request_queue *q = zv->zv_zso->zvo_queue;
- struct gendisk *disk = zv->zv_zso->zvo_disk;
+ zfs_uio_bvec_init(&uio, bio, rq);
+
+ q = zv->zv_zso->zvo_queue;
+ disk = zv->zv_zso->zvo_disk;
+
ssize_t start_resid = uio.uio_resid;
- unsigned long start_time;
- boolean_t acct = blk_queue_io_stat(q);
- if (acct)
- start_time = blk_generic_start_io_acct(q, disk, READ, bio);
+ /*
+ * When blk-mq is being used, accounting is done by
+ * blk_mq_start_request() and blk_mq_end_request().
+ */
+ if (bio) {
+ acct = blk_queue_io_stat(q);
+ if (acct)
+ start_time = blk_generic_start_io_acct(q, disk, READ,
+ bio);
+ }
zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
uio.uio_loffset, uio.uio_resid, RL_READER);
uint64_t volsize = zv->zv_volsize;
+
while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
rw_exit(&zv->zv_suspend_lock);
- if (acct)
+ if (bio && acct) {
blk_generic_end_io_acct(q, disk, READ, bio, start_time);
+ }
- BIO_END_IO(bio, -error);
+ END_IO(zv, bio, rq, -error);
}
static void
zv_request_task_free(task);
}
-#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
-#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
+
+/*
+ * Process a BIO or request
+ *
+ * Either 'bio' or 'rq' should be set depending on whether we are processing
+ * a bio or a request (both should not be set).
+ *
+ * force_sync: Set to 0 to defer processing to a background taskq
+ * Set to 1 to process data synchronously
+ */
static void
-zvol_submit_bio(struct bio *bio)
-#else
-static blk_qc_t
-zvol_submit_bio(struct bio *bio)
-#endif
-#else
-static MAKE_REQUEST_FN_RET
-zvol_request(struct request_queue *q, struct bio *bio)
-#endif
+zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
+ boolean_t force_sync)
{
-#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
-#if defined(HAVE_BIO_BDEV_DISK)
- struct request_queue *q = bio->bi_bdev->bd_disk->queue;
-#else
- struct request_queue *q = bio->bi_disk->queue;
-#endif
-#endif
- zvol_state_t *zv = q->queuedata;
fstrans_cookie_t cookie = spl_fstrans_mark();
- uint64_t offset = BIO_BI_SECTOR(bio) << 9;
- uint64_t size = BIO_BI_SIZE(bio);
- int rw = bio_data_dir(bio);
+ uint64_t offset = io_offset(bio, rq);
+ uint64_t size = io_size(bio, rq);
+ int rw = io_data_dir(bio, rq);
- if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
- printk(KERN_INFO
- "%s: bad access: offset=%llu, size=%lu\n",
+ if (zvol_request_sync)
+ force_sync = 1;
+
+ zv_request_t zvr = {
+ .zv = zv,
+ .bio = bio,
+ .rq = rq,
+ };
+
+ if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
+ printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
zv->zv_zso->zvo_disk->disk_name,
(long long unsigned)offset,
(long unsigned)size);
- BIO_END_IO(bio, -SET_ERROR(EIO));
+ END_IO(zv, bio, rq, -SET_ERROR(EIO));
goto out;
}
- zv_request_t zvr = {
- .zv = zv,
- .bio = bio,
- };
zv_request_task_t *task;
if (rw == WRITE) {
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
- BIO_END_IO(bio, -SET_ERROR(EROFS));
+ END_IO(zv, bio, rq, -SET_ERROR(EROFS));
goto out;
}
* i/o may be a ZIL write (via zil_commit()), or a read of an
* indirect block, or a read of a data block (if this is a
* partial-block write). We will indicate that the i/o is
- * complete by calling BIO_END_IO() from the taskq callback.
+ * complete by calling END_IO() from the taskq callback.
*
* This design allows the calling thread to continue and
* initiate more concurrent operations by calling
* of one i/o at a time per zvol. However, an even better
* design would be for zvol_request() to initiate the zio
* directly, and then be notified by the zio_done callback,
- * which would call BIO_END_IO(). Unfortunately, the DMU/ZIL
+ * which would call END_IO(). Unfortunately, the DMU/ZIL
* interfaces lack this functionality (they block waiting for
* the i/o to complete).
*/
- if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
- if (zvol_request_sync) {
+ if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
+ if (force_sync) {
zvol_discard(&zvr);
} else {
task = zv_request_task_create(zvr);
zvol_discard_task, task, 0, &task->ent);
}
} else {
- if (zvol_request_sync) {
+ if (force_sync) {
zvol_write(&zvr);
} else {
task = zv_request_task_create(zvr);
* data and require no additional handling.
*/
if (size == 0) {
- BIO_END_IO(bio, 0);
+ END_IO(zv, bio, rq, 0);
goto out;
}
rw_enter(&zv->zv_suspend_lock, RW_READER);
/* See comment in WRITE case above. */
- if (zvol_request_sync) {
+ if (force_sync) {
zvol_read(&zvr);
} else {
task = zv_request_task_create(zvr);
out:
spl_fstrans_unmark(cookie);
-#if (defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
- defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)) && \
+}
+
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
+static void
+zvol_submit_bio(struct bio *bio)
+#else
+static blk_qc_t
+zvol_submit_bio(struct bio *bio)
+#endif
+#else
+static MAKE_REQUEST_FN_RET
+zvol_request(struct request_queue *q, struct bio *bio)
+#endif
+{
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+#if defined(HAVE_BIO_BDEV_DISK)
+ struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+#else
+ struct request_queue *q = bio->bi_disk->queue;
+#endif
+#endif
+ zvol_state_t *zv = q->queuedata;
+
+ zvol_request_impl(zv, bio, NULL, 0);
+#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
+ (defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
+ !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID))
return (BLK_QC_T_NONE);
#endif
- return (0);
}
+/*
+ * Why have two separate block_device_operations structs?
+ *
+ * Normally we'd just have one, and assign 'submit_bio' as needed. However,
+ * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
+ * can't just change submit_bio dynamically at runtime. So just create two
+ * separate structs to get around this.
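+ *
+ * Note that zvol_ops_blk_mq deliberately omits .submit_bio: with blk-mq,
+ * I/O reaches the driver through the tag set's .queue_rq callback instead.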
+ */
+static const struct block_device_operations zvol_ops_blk_mq = {
+ .open = zvol_open,
+ .release = zvol_release,
+ .ioctl = zvol_ioctl,
+ .compat_ioctl = zvol_compat_ioctl,
+ .check_events = zvol_check_events,
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
+ .revalidate_disk = zvol_revalidate_disk,
+#endif
+ .getgeo = zvol_getgeo,
+ .owner = THIS_MODULE,
+};
+
static const struct block_device_operations zvol_ops = {
.open = zvol_open,
.release = zvol_release,
#endif
};
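+/*
+ * Allocate the disk and request queue for the bio-based (non-blk-mq) code
+ * paths, covering both the submit_bio and legacy request-queue interfaces.
+ */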
+static int
+zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
+{
+#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
+#if defined(HAVE_BLK_ALLOC_DISK)
+ zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
+ if (zso->zvo_disk == NULL)
+ return (1);
+
+ zso->zvo_disk->minors = ZVOL_MINORS;
+ zso->zvo_queue = zso->zvo_disk->queue;
+#else
+ zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
+ if (zso->zvo_queue == NULL)
+ return (1);
+
+ zso->zvo_disk = alloc_disk(ZVOL_MINORS);
+ if (zso->zvo_disk == NULL) {
+ blk_cleanup_queue(zso->zvo_queue);
+ return (1);
+ }
+
+ zso->zvo_disk->queue = zso->zvo_queue;
+#endif /* HAVE_BLK_ALLOC_DISK */
+#else
+ zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
+ if (zso->zvo_queue == NULL)
+ return (1);
+
+ zso->zvo_disk = alloc_disk(ZVOL_MINORS);
+ if (zso->zvo_disk == NULL) {
+ blk_cleanup_queue(zso->zvo_queue);
+ return (1);
+ }
+
+ zso->zvo_disk->queue = zso->zvo_queue;
+#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
+ return (0);
+}
+
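+/*
+ * Allocate the disk and request queue for the blk-mq code path. On kernels
+ * without blk-mq support this compiles down to a no-op that returns success.
+ */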
+static int
+zvol_alloc_blk_mq(zvol_state_t *zv)
+{
+#ifdef HAVE_BLK_MQ
+ struct zvol_state_os *zso = zv->zv_zso;
+
+ /* Allocate our blk-mq tag_set */
+ if (zvol_blk_mq_alloc_tag_set(zv) != 0)
+ return (1);
+
+#if defined(HAVE_BLK_ALLOC_DISK)
+ zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
+ /* blk_mq_alloc_disk() returns an ERR_PTR, not NULL, on failure */
+ if (IS_ERR(zso->zvo_disk)) {
+ blk_mq_free_tag_set(&zso->tag_set);
+ return (1);
+ }
+ zso->zvo_queue = zso->zvo_disk->queue;
+ zso->zvo_disk->minors = ZVOL_MINORS;
+#else
+ zso->zvo_disk = alloc_disk(ZVOL_MINORS);
+ if (zso->zvo_disk == NULL) {
+ /* The queue has not been allocated yet, so only free the tag set */
+ blk_mq_free_tag_set(&zso->tag_set);
+ return (1);
+ }
+ /* Allocate queue */
+ zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
+ if (IS_ERR(zso->zvo_queue)) {
+ put_disk(zso->zvo_disk);
+ blk_mq_free_tag_set(&zso->tag_set);
+ return (1);
+ }
+
+ /* Our queue is now created, assign it to our disk */
+ zso->zvo_disk->queue = zso->zvo_queue;
+
+#endif
+#endif
+ return (0);
+}
+
/*
* Allocate memory for a new zvol_state_t and setup the required
* request queue and generic disk structures for the block device.
zvol_state_t *zv;
struct zvol_state_os *zso;
uint64_t volmode;
+ int ret;
if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
return (NULL);
list_link_init(&zv->zv_next);
mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
-#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
-#ifdef HAVE_BLK_ALLOC_DISK
- zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
- if (zso->zvo_disk == NULL)
- goto out_kmem;
-
- zso->zvo_disk->minors = ZVOL_MINORS;
- zso->zvo_queue = zso->zvo_disk->queue;
-#else
- zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
- if (zso->zvo_queue == NULL)
- goto out_kmem;
+#ifdef HAVE_BLK_MQ
+ zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
+#endif
- zso->zvo_disk = alloc_disk(ZVOL_MINORS);
- if (zso->zvo_disk == NULL) {
- blk_cleanup_queue(zso->zvo_queue);
- goto out_kmem;
+ /*
+ * The block layer has 3 interfaces for getting BIOs:
+ *
+ * 1. blk-mq request queues (new)
+ * 2. submit_bio() (oldest)
+ * 3. regular request queues (old).
+ *
+ * Each of those interfaces has two permutations:
+ *
+ * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
+ * both the disk and its queue (5.14 kernel or newer)
+ *
+ * b) We don't have blk_*alloc_disk(), and have to allocate the
+ * disk and the queue separately. (5.13 kernel or older)
+ */
+ if (zv->zv_zso->use_blk_mq) {
+ ret = zvol_alloc_blk_mq(zv);
+ if (ret == 0)
+ zso->zvo_disk->fops = &zvol_ops_blk_mq;
+ } else {
+ ret = zvol_alloc_non_blk_mq(zso);
+ if (ret == 0)
+ zso->zvo_disk->fops = &zvol_ops;
}
-
- zso->zvo_disk->queue = zso->zvo_queue;
-#endif /* HAVE_BLK_ALLOC_DISK */
-#else
- zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
- if (zso->zvo_queue == NULL)
+ if (ret != 0)
goto out_kmem;
- zso->zvo_disk = alloc_disk(ZVOL_MINORS);
- if (zso->zvo_disk == NULL) {
- blk_cleanup_queue(zso->zvo_queue);
- goto out_kmem;
- }
-
- zso->zvo_disk->queue = zso->zvo_queue;
-#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
-
blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);
/* Limit read-ahead to a single page to prevent over-prefetching. */
blk_queue_set_read_ahead(zso->zvo_queue, 1);
- /* Disable write merging in favor of the ZIO pipeline. */
- blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
+ if (!zv->zv_zso->use_blk_mq) {
+ /* Disable write merging in favor of the ZIO pipeline. */
+ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
+ }
/* Enable /proc/diskstats */
blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);
}
zso->zvo_disk->first_minor = (dev & MINORMASK);
- zso->zvo_disk->fops = &zvol_ops;
zso->zvo_disk->private_data = zv;
snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
ZVOL_DEV_NAME, (dev & MINORMASK));
put_disk(zv->zv_zso->zvo_disk);
#endif
+#ifdef HAVE_BLK_MQ
+ if (zv->zv_zso->use_blk_mq)
+ blk_mq_free_tag_set(&zv->zv_zso->tag_set);
+#endif
+
ida_simple_remove(&zvol_ida,
MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
(DMU_MAX_ACCESS / 4) >> 9);
- blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
- blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
+
+ if (zv->zv_zso->use_blk_mq) {
+ /*
+ * IO requests can be really big (1MB). When an IO request
+ * comes in, it is passed off to zvol_read() or zvol_write()
+ * in a new thread, where it is chunked up into 'volblocksize'
+ * sized pieces and processed. So for example, if the request
+ * is a 1MB write and your volblocksize is 128k, one zvol_write
+ * thread will take that request and sequentially do eight 128k
+ * IOs. This is due to the fact that the thread needs to lock
+ * each volblocksize-sized block. So you might be wondering:
+ * "instead of passing the whole 1MB request to one thread,
+ * why not pass eight individual 128k chunks to eight threads
+ * and process the whole write in parallel?" The short answer is
+ * that there's a sweet spot number of chunks that balances
+ * the greater parallelism with the added overhead of more
+ * threads. The sweet spot can be different depending on whether
+ * you have a read- or write-heavy workload. Writes typically
+ * want high chunk counts while reads typically want lower ones.
+ * On a test pool with 6 NVMe drives in a 3x 2-disk mirror
+ * configuration, with volblocksize=8k, the sweet spot for good
+ * sequential reads and writes was at 8 chunks.
+ */
+
+ /*
+ * Below we tell the kernel how big we want our requests
+ * to be. You would think that blk_queue_io_opt() would be
+ * used to do this since it is used to "set optimal request
+ * size for the queue", but that doesn't seem to do
+ * anything - the kernel still gives you huge requests
+ * with tons of little PAGE_SIZE segments contained within it.
+ *
+ * Knowing that the kernel will just give you PAGE_SIZE segments
+ * no matter what, you can say "ok, I want PAGE_SIZE byte
+ * segments, and I want 'N' of them per request", where N is
+ * the correct number of segments for the volblocksize and
+ * number of chunks you want.
+ */
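+ /*
+ * For example, assuming 4k pages: with volblocksize=128k and the
+ * default of 8 blocks per thread, max_segment_size is PAGE_SIZE
+ * and max_segments is (131072 * 8) / 4096 = 256, capping each
+ * request at 1MB.
+ */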
+#ifdef HAVE_BLK_MQ
+ if (zvol_blk_mq_blocks_per_thread != 0) {
+ unsigned int chunks;
+ chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
+
+ blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
+ PAGE_SIZE);
+ blk_queue_max_segments(zv->zv_zso->zvo_queue,
+ (zv->zv_volblocksize * chunks) / PAGE_SIZE);
+ } else {
+ /*
+ * Special case: zvol_blk_mq_blocks_per_thread = 0
+ * Max everything out.
+ */
+ blk_queue_max_segments(zv->zv_zso->zvo_queue,
+ UINT16_MAX);
+ blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
+ UINT_MAX);
+ }
+#endif
+ } else {
+ blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
+ blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
+ }
+
blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
zv->zv_volblocksize);
blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
zvol_init(void)
{
int error;
- int threads = MIN(MAX(zvol_threads, 1), 1024);
+
+ /*
+ * zvol_threads is the module param the user passes in.
+ *
+ * zvol_actual_threads is what we use internally, since the user can
+ * pass zvol_threads = 0 to mean "use all the CPUs" (the default).
+ */
+ unsigned int zvol_actual_threads;
+
+ if (zvol_threads == 0) {
+ /*
+ * See dde9380a1 for why 32 was chosen here. This should
+ * probably be refined to be some multiple of the number
+ * of CPUs.
+ */
+ zvol_actual_threads = MAX(num_online_cpus(), 32);
+ } else {
+ zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
+ }
error = register_blkdev(zvol_major, ZVOL_DRIVER);
if (error) {
printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
return (error);
}
- zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
- threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+
+#ifdef HAVE_BLK_MQ
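+ /*
+ * Clamp the user-supplied values. Zero selects the defaults:
+ * BLKDEV_DEFAULT_RQ for the queue depth, and one thread per
+ * online CPU for the blk-mq thread count.
+ */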
+ if (zvol_blk_mq_queue_depth == 0) {
+ zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
+ } else {
+ zvol_actual_blk_mq_queue_depth =
+ MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
+ }
+
+ if (zvol_blk_mq_threads == 0) {
+ zvol_blk_mq_actual_threads = num_online_cpus();
+ } else {
+ zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
+ 1024);
+ }
+#endif
+ zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri,
+ zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
if (zvol_taskq == NULL) {
unregister_blkdev(zvol_major, ZVOL_DRIVER);
return (-ENOMEM);
}
+
zvol_init_impl();
ida_init(&zvol_ida);
return (0);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
module_param(zvol_threads, uint, 0444);
-MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");
+MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set"
+ " to 0 to use all active CPUs");
module_param(zvol_request_sync, uint, 0644);
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
module_param(zvol_volmode, uint, 0644);
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
+
+#ifdef HAVE_BLK_MQ
+module_param(zvol_blk_mq_queue_depth, uint, 0644);
+MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
+
+module_param(zvol_use_blk_mq, uint, 0644);
+MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");
+
+module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
+MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
+ "Process volblocksize blocks per thread");
+#endif
+
/* END CSTYLED */