blk-mq-sched: improve dispatching from sw queue
author     Ming Lei <ming.lei@redhat.com>
           Sat, 14 Oct 2017 09:22:30 +0000 (17:22 +0800)
committer  Jens Axboe <axboe@kernel.dk>
           Wed, 1 Nov 2017 14:20:02 +0000 (08:20 -0600)
SCSI devices use a host-wide tagset, and the shared driver tag space is
often quite big. However, there is also a queue depth for each LUN
(.cmd_per_lun), which is often small; for example, on both lpfc and
qla2xxx, .cmd_per_lun is just 3.
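
For illustration only, the two limits sit side by side in a driver's
scsi_host_template; the template name and the .can_queue value below are
made up, while the .cmd_per_lun of 3 is the value cited above for lpfc
and qla2xxx:

	#include <scsi/scsi_host.h>

	/*
	 * Hypothetical host template: the host-wide tag space
	 * (.can_queue) is large and shared by all LUNs, while the
	 * per-LUN queue depth (.cmd_per_lun) is tiny.
	 */
	static struct scsi_host_template example_fc_sht = {
		.name		= "example-fc-hba",	/* made-up driver */
		.can_queue	= 4096,			/* hypothetical shared tag space */
		.cmd_per_lun	= 3,			/* small per-LUN depth, as on lpfc/qla2xxx */
	};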

So lots of requests may stay in the sw queue, and we always flush all
of those belonging to the same hw queue and dispatch them all to the
driver. Unfortunately this easily makes the queue busy because of the
small .cmd_per_lun. Once these requests are flushed out, they have to
stay on hctx->dispatch, no bio merging can happen on them, and
sequential I/O performance is harmed.
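
For reference, the no-scheduler path before this patch (still visible
as the final else branch in the blk-mq-sched.c hunk below) simply did:

	/*
	 * Pre-patch behaviour: empty every sw queue mapped to this hw
	 * queue and hand the whole list to the driver in one go. With a
	 * small .cmd_per_lun most of it bounces back to hctx->dispatch,
	 * where later bios can no longer be merged into it.
	 */
	blk_mq_flush_busy_ctxs(hctx, &rq_list);
	blk_mq_dispatch_rq_list(q, &rq_list, false);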

This patch introduces blk_mq_dequeue_from_ctx() for dequeuing requests
from a sw queue one at a time, so that we can dispatch them in the
scheduler's way. We can then avoid dequeuing too many requests from the
sw queue, since we don't flush ->dispatch completely.
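
In sketch form (simplified from blk_mq_do_dispatch_ctx() in the diff
below; the budget-failure rerun handling is omitted), the new loop
dequeues one request at a time and walks the sw queues round robin:

	/* Simplified sketch of the per-ctx dispatch loop. */
	do {
		struct request *rq;

		if (!sbitmap_any_bit_set(&hctx->ctx_map))
			break;				/* all sw queues empty */

		if (blk_mq_get_dispatch_budget(hctx) != BLK_STS_OK)
			break;				/* no budget, back off */

		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(hctx);
			break;
		}

		list_add(&rq->queuelist, &rq_list);
		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);	/* round robin */
	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));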

This patch improves dispatching from the sw queue by using the
.get_budget and .put_budget callbacks.
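
The two hooks sit in struct blk_mq_ops. As a rough, hypothetical sketch
(not the actual SCSI implementation; the example_* names and the
per-device counter are invented, and the blk_status_t-returning
signature is the one this series uses), a driver could account budget
against its small per-LUN depth like this:

	#include <linux/atomic.h>
	#include <linux/blk-mq.h>

	struct example_dev {
		atomic_t	inflight;	/* commands currently owning budget */
		int		queue_depth;	/* e.g. the small .cmd_per_lun */
	};

	static blk_status_t example_get_budget(struct blk_mq_hw_ctx *hctx)
	{
		struct example_dev *dev = hctx->queue->queuedata;

		/* Out of budget: tell the core to back off instead of
		 * flooding hctx->dispatch with requests it cannot queue. */
		if (atomic_inc_return(&dev->inflight) > dev->queue_depth) {
			atomic_dec(&dev->inflight);
			return BLK_STS_RESOURCE;
		}
		return BLK_STS_OK;
	}

	static void example_put_budget(struct blk_mq_hw_ctx *hctx)
	{
		struct example_dev *dev = hctx->queue->queuedata;

		atomic_dec(&dev->inflight);
	}

	static const struct blk_mq_ops example_mq_ops = {
		.get_budget	= example_get_budget,
		.put_budget	= example_put_budget,
		/* .queue_rq and friends omitted */
	};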

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
block/blk-mq-sched.c
block/blk-mq.c
block/blk-mq.h
include/linux/blk-mq.h

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 8e525e66a0d971aaa50d5b57cd27471f7cc96e6e..df8581bb0a37083d3d579ee5f70b4e653fc1457e 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -128,6 +128,61 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
        return false;
 }
 
+static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
+                                         struct blk_mq_ctx *ctx)
+{
+       unsigned idx = ctx->index_hw;
+
+       if (++idx == hctx->nr_ctx)
+               idx = 0;
+
+       return hctx->ctxs[idx];
+}
+
+/* return true if hctx need to run again */
+static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+{
+       struct request_queue *q = hctx->queue;
+       LIST_HEAD(rq_list);
+       struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+
+       do {
+               struct request *rq;
+               blk_status_t ret;
+
+               if (!sbitmap_any_bit_set(&hctx->ctx_map))
+                       break;
+
+               ret = blk_mq_get_dispatch_budget(hctx);
+               if (ret == BLK_STS_RESOURCE)
+                       return true;
+
+               rq = blk_mq_dequeue_from_ctx(hctx, ctx);
+               if (!rq) {
+                       blk_mq_put_dispatch_budget(hctx);
+                       break;
+               } else if (ret != BLK_STS_OK) {
+                       blk_mq_end_request(rq, ret);
+                       continue;
+               }
+
+               /*
+                * Now this rq owns the budget which has to be released
+                * if this rq won't be queued to driver via .queue_rq()
+                * in blk_mq_dispatch_rq_list().
+                */
+               list_add(&rq->queuelist, &rq_list);
+
+               /* round robin for fair dispatch */
+               ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
+
+       } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+       WRITE_ONCE(hctx->dispatch_from, ctx);
+
+       return false;
+}
+
 /* return true if hw queue need to be run again */
 bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
@@ -169,11 +224,24 @@ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
         */
        if (!list_empty(&rq_list)) {
                blk_mq_sched_mark_restart_hctx(hctx);
-               if (blk_mq_dispatch_rq_list(q, &rq_list, false) &&
-                               has_sched_dispatch)
-                       run_queue = blk_mq_do_dispatch_sched(hctx);
+               if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
+                       if (has_sched_dispatch)
+                               run_queue = blk_mq_do_dispatch_sched(hctx);
+                       else
+                               run_queue = blk_mq_do_dispatch_ctx(hctx);
+               }
        } else if (has_sched_dispatch) {
                run_queue = blk_mq_do_dispatch_sched(hctx);
+       } else if (q->mq_ops->get_budget) {
+               /*
+                * If we need to get budget before queuing request, we
+                * dequeue request one by one from sw queue for avoiding
+                * to mess up I/O merge when dispatch runs out of resource.
+                *
+                * TODO: get more budgets, and dequeue more requests in
+                * one time.
+                */
+               run_queue = blk_mq_do_dispatch_ctx(hctx);
        } else {
                blk_mq_flush_busy_ctxs(hctx, &rq_list);
                blk_mq_dispatch_rq_list(q, &rq_list, false);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index dcb467369999e3fbb4cb192e7b853bbe18b04b3f..097ca3ece716ee126960b95c2b1824fc60425c1f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -914,6 +914,45 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 }
 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
 
+struct dispatch_rq_data {
+       struct blk_mq_hw_ctx *hctx;
+       struct request *rq;
+};
+
+static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
+               void *data)
+{
+       struct dispatch_rq_data *dispatch_data = data;
+       struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
+       struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+
+       spin_lock(&ctx->lock);
+       if (unlikely(!list_empty(&ctx->rq_list))) {
+               dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
+               list_del_init(&dispatch_data->rq->queuelist);
+               if (list_empty(&ctx->rq_list))
+                       sbitmap_clear_bit(sb, bitnr);
+       }
+       spin_unlock(&ctx->lock);
+
+       return !dispatch_data->rq;
+}
+
+struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
+                                       struct blk_mq_ctx *start)
+{
+       unsigned off = start ? start->index_hw : 0;
+       struct dispatch_rq_data data = {
+               .hctx = hctx,
+               .rq   = NULL,
+       };
+
+       __sbitmap_for_each_set(&hctx->ctx_map, off,
+                              dispatch_rq_from_ctx, &data);
+
+       return data.rq;
+}
+
 static inline unsigned int queued_to_index(unsigned int queued)
 {
        if (!queued)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index e413b732374e6cd045051818e410d17d68eaf922..522b420dedc07ad8bd32cce618935087b81b4a8b 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -35,6 +35,8 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
 bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
                                bool wait);
+struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
+                                       struct blk_mq_ctx *start);
 
 /*
  * Internal helpers for allocating/freeing the request map
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 901457df3d64dd74bea45088c5035d502d461f08..e5e6becd57d3c5a93776d70145889151b77afc68 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -30,6 +30,8 @@ struct blk_mq_hw_ctx {
 
        struct sbitmap          ctx_map;
 
+       struct blk_mq_ctx       *dispatch_from;
+
        struct blk_mq_ctx       **ctxs;
        unsigned int            nr_ctx;