From f70ced09170761acb69840cafaace4abc72cba4b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Sep 2014 23:23:47 +0800 Subject: [PATCH] blk-mq: support per-distpatch_queue flush machinery This patch supports to run one single flush machinery for each blk-mq dispatch queue, so that: - current init_request and exit_request callbacks can cover flush request too, then the buggy copying way of initializing flush request's pdu can be fixed - flushing performance gets improved in case of multi hw-queue In fio sync write test over virtio-blk(4 hw queues, ioengine=sync, iodepth=64, numjobs=4, bs=4K), it is observed that througput gets increased a lot over my test environment: - throughput: +70% in case of virtio-blk over null_blk - throughput: +30% in case of virtio-blk over SSD image The multi virtqueue feature isn't merged to QEMU yet, and patches for the feature can be found in below tree: git://kernel.ubuntu.com/ming/qemu.git v2.1.0-mq.4 And simply passing 'num_queues=4 vectors=5' should be enough to enable multi queue(quad queue) feature for QEMU virtio-blk. Suggested-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-flush.c | 21 ++++++++++++------ block/blk-mq.c | 50 ++++++++++++++++++++---------------------- block/blk-sysfs.c | 4 ++-- block/blk.h | 16 +++++++++++--- include/linux/blk-mq.h | 6 +++++ 6 files changed, 60 insertions(+), 39 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index b1dd4e086740..e1c2775c7597 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -704,7 +704,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (!q) return NULL; - q->fq = blk_alloc_flush_queue(q); + q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0); if (!q->fq) return NULL; diff --git a/block/blk-flush.c b/block/blk-flush.c index 004d95e4098e..20badd7b9d1b 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -305,8 +305,15 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) fq->flush_pending_idx ^= 1; blk_rq_init(q, flush_rq); - if (q->mq_ops) - blk_mq_clone_flush_request(flush_rq, first_rq); + + /* + * Borrow tag from the first request since they can't + * be in flight at the same time. + */ + if (q->mq_ops) { + flush_rq->mq_ctx = first_rq->mq_ctx; + flush_rq->tag = first_rq->tag; + } flush_rq->cmd_type = REQ_TYPE_FS; flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; @@ -480,22 +487,22 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, } EXPORT_SYMBOL(blkdev_issue_flush); -struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q) +struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, + int node, int cmd_size) { struct blk_flush_queue *fq; int rq_sz = sizeof(struct request); - fq = kzalloc(sizeof(*fq), GFP_KERNEL); + fq = kzalloc_node(sizeof(*fq), GFP_KERNEL, node); if (!fq) goto fail; if (q->mq_ops) { spin_lock_init(&fq->mq_flush_lock); - rq_sz = round_up(rq_sz + q->tag_set->cmd_size, - cache_line_size()); + rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); } - fq->flush_rq = kzalloc(rq_sz, GFP_KERNEL); + fq->flush_rq = kzalloc_node(rq_sz, GFP_KERNEL, node); if (!fq->flush_rq) goto fail_rq; diff --git a/block/blk-mq.c b/block/blk-mq.c index 53b6def12fc4..4e7a31466139 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -281,26 +281,6 @@ void blk_mq_free_request(struct request *rq) __blk_mq_free_request(hctx, ctx, rq); } -/* - * Clone all relevant state from a request that has been put on hold in - * the flush state machine into the preallocated flush request that hangs - * off the request queue. - * - * For a driver the flush request should be invisible, that's why we are - * impersonating the original request here. - */ -void blk_mq_clone_flush_request(struct request *flush_rq, - struct request *orig_rq) -{ - struct blk_mq_hw_ctx *hctx = - orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu); - - flush_rq->mq_ctx = orig_rq->mq_ctx; - flush_rq->tag = orig_rq->tag; - memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq), - hctx->cmd_size); -} - inline void __blk_mq_end_request(struct request *rq, int error) { blk_account_io_done(rq); @@ -1516,12 +1496,20 @@ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { + unsigned flush_start_tag = set->queue_depth; + blk_mq_tag_idle(hctx); + if (set->ops->exit_request) + set->ops->exit_request(set->driver_data, + hctx->fq->flush_rq, hctx_idx, + flush_start_tag + hctx_idx); + if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + blk_free_flush_queue(hctx->fq); kfree(hctx->ctxs); blk_mq_free_bitmap(&hctx->ctx_map); } @@ -1556,6 +1544,7 @@ static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { int node; + unsigned flush_start_tag = set->queue_depth; node = hctx->numa_node; if (node == NUMA_NO_NODE) @@ -1594,8 +1583,23 @@ static int blk_mq_init_hctx(struct request_queue *q, set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto free_bitmap; + hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); + if (!hctx->fq) + goto exit_hctx; + + if (set->ops->init_request && + set->ops->init_request(set->driver_data, + hctx->fq->flush_rq, hctx_idx, + flush_start_tag + hctx_idx, node)) + goto free_fq; + return 0; + free_fq: + kfree(hctx->fq); + exit_hctx: + if (set->ops->exit_hctx) + set->ops->exit_hctx(hctx, hctx_idx); free_bitmap: blk_mq_free_bitmap(&hctx->ctx_map); free_ctxs: @@ -1862,16 +1866,10 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) blk_mq_add_queue_tag_set(set, q); - q->fq = blk_alloc_flush_queue(q); - if (!q->fq) - goto err_hw_queues; - blk_mq_map_swqueue(q); return q; -err_hw_queues: - blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); err_hw: blk_cleanup_queue(q); err_hctxs: diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 718cffc4c678..e8f38a36c625 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -517,10 +517,10 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); - blk_free_flush_queue(q->fq); - if (q->mq_ops) blk_mq_free_queue(q); + else + blk_free_flush_queue(q->fq); blk_trace_shutdown(q); diff --git a/block/blk.h b/block/blk.h index 7ecdd8517e69..43b036185712 100644 --- a/block/blk.h +++ b/block/blk.h @@ -2,6 +2,8 @@ #define BLK_INTERNAL_H #include +#include +#include "blk-mq.h" /* Amount of time in which a process may batch requests */ #define BLK_BATCH_TIME (HZ/50UL) @@ -31,7 +33,14 @@ extern struct ida blk_queue_ida; static inline struct blk_flush_queue *blk_get_flush_queue( struct request_queue *q, struct blk_mq_ctx *ctx) { - return q->fq; + struct blk_mq_hw_ctx *hctx; + + if (!q->mq_ops) + return q->fq; + + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + return hctx->fq; } static inline void __blk_get_queue(struct request_queue *q) @@ -39,8 +48,9 @@ static inline void __blk_get_queue(struct request_queue *q) kobject_get(&q->kobj); } -struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q); -void blk_free_flush_queue(struct blk_flush_queue *fq); +struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, + int node, int cmd_size); +void blk_free_flush_queue(struct blk_flush_queue *q); int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 325349559fb0..02c5d950f444 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -4,6 +4,7 @@ #include struct blk_mq_tags; +struct blk_flush_queue; struct blk_mq_cpu_notifier { struct list_head list; @@ -34,6 +35,7 @@ struct blk_mq_hw_ctx { struct request_queue *queue; unsigned int queue_num; + struct blk_flush_queue *fq; void *driver_data; @@ -119,6 +121,10 @@ struct blk_mq_ops { /* * Called for every command allocated by the block layer to allow * the driver to set up driver specific data. + * + * Tag greater than or equal to queue_depth is for setting up + * flush request. + * * Ditto for exit/teardown. */ init_request_fn *init_request; -- 2.39.5