blk-mq: improve DM's blk-mq IO merging via blk_insert_cloned_request feedback
author Ming Lei <ming.lei@redhat.com>
Wed, 27 Nov 2019 20:18:09 +0000 (17:18 -0300)
committer Marcelo Henrique Cerri <marcelo.cerri@canonical.com>
Fri, 17 Jan 2020 17:23:11 +0000 (14:23 -0300)
BugLink: https://bugs.launchpad.net/bugs/1848739
blk_insert_cloned_request() is called in the fast path of a dm-rq driver
(e.g. blk-mq request-based DM mpath).  blk_insert_cloned_request() uses
blk_mq_request_bypass_insert() to directly append the request to the
blk-mq hctx->dispatch_list of the underlying queue.
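
For reference, on the blk-mq path this previously reduced to roughly the
following (simplified excerpt of the pre-patch blk_insert_cloned_request(),
see the first hunk below):

	if (q->mq_ops) {
		/*
		 * Append the clone straight to hctx->dispatch_list under
		 * the hctx lock, bypassing the bottom device's elevator;
		 * the caller never learns whether the request could
		 * actually be dispatched.
		 */
		blk_mq_request_bypass_insert(rq, true);
		return BLK_STS_OK;
	}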

1) This path isn't efficient because the hctx spinlock must always be
taken.

2) With blk_insert_cloned_request(), we completely bypass the underlying
queue's elevator and depend on the upper-level dm-rq driver's elevator
to schedule IO.  But dm-rq currently gets no dispatch feedback from the
underlying queue at all.  Without knowing whether a request was actually
issued (e.g. because the underlying queue was busy), the dm-rq elevator
cannot provide effective IO merging: dm-rq currently destages a request
from its elevator unconditionally, only to requeue it after a delay,
which kills any opportunity for merging.  This results in very poor
sequential IO performance.

Fix this by updating blk_insert_cloned_request() to use
blk_mq_request_direct_issue().  blk_mq_request_direct_issue() allows a
request to be issued directly to the underlying queue and returns the
dispatch feedback (blk_status_t).  If blk_mq_request_direct_issue()
returns BLK_STS_RESOURCE the dm-rq driver will now use DM_MAPIO_REQUEUE
to _not_ destage the request, thereby preserving the opportunity to
merge IO.
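
In outline the new flow looks like this (condensed from the blk-core.c
and dm-rq.c hunks below; names are as in map_request()):

	/* block/blk-core.c: propagate the underlying queue's dispatch result */
	if (q->mq_ops)
		return blk_mq_request_direct_issue(rq);

	/*
	 * drivers/md/dm-rq.c: on BLK_STS_RESOURCE, release the clone and
	 * let dm-rq requeue the original request via its own elevator, so
	 * it can still be merged with later IO.
	 */
	ret = dm_dispatch_clone_request(clone, rq);
	if (ret == BLK_STS_RESOURCE) {
		blk_rq_unprep_clone(clone);
		tio->ti->type->release_clone_rq(clone);
		tio->clone = NULL;
		r = rq->q->mq_ops ? DM_MAPIO_REQUEUE : DM_MAPIO_DELAY_REQUEUE;
		goto check_again;
	}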

With this, request-based DM's blk-mq sequential IO performance is vastly
improved (as much as 3X in mpath/virtio-scsi testing).

Signed-off-by: Ming Lei <ming.lei@redhat.com>
[blk-mq.c changes heavily influenced by Ming Lei's initial solution, but
they were refactored to make them less fragile and easier to read/review]
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
(cherry picked from commit 396eaf21ee17c476e8f66249fb1f4a39003d0ab4)
Signed-off-by: Marcelo Henrique Cerri <marcelo.cerri@canonical.com>
Acked-by: Stefan Bader <stefan.bader@canonical.com>
Acked-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
block/blk-core.c
block/blk-mq.c
block/blk-mq.h
drivers/md/dm-rq.c

diff --git a/block/blk-core.c b/block/blk-core.c
index 6cce18587bdd1056b6b0d26fdafba24720b49097..1ae88e0a7ed4dde83f28f61e46ae6dea4d8c89b9 100644
@@ -2507,8 +2507,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
                 * bypass a potential scheduler on the bottom device for
                 * insert.
                 */
-               blk_mq_request_bypass_insert(rq, true);
-               return BLK_STS_OK;
+               return blk_mq_request_direct_issue(rq);
        }
 
        spin_lock_irqsave(q->queue_lock, flags);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6d15a2e285e9c8ab94278add8b464350774b4c8b..d417804b7f8ffa4959331c8c12c9a07b43188ec1 100644
@@ -1734,15 +1734,19 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
 
 static void __blk_mq_fallback_to_insert(struct blk_mq_hw_ctx *hctx,
                                        struct request *rq,
-                                       bool run_queue)
+                                       bool run_queue, bool bypass_insert)
 {
-       blk_mq_sched_insert_request(rq, false, run_queue, false,
-                                       hctx->flags & BLK_MQ_F_BLOCKING);
+       if (!bypass_insert)
+               blk_mq_sched_insert_request(rq, false, run_queue, false,
+                                           hctx->flags & BLK_MQ_F_BLOCKING);
+       else
+               blk_mq_request_bypass_insert(rq, run_queue);
 }
 
 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                                                struct request *rq,
-                                               blk_qc_t *cookie)
+                                               blk_qc_t *cookie,
+                                               bool bypass_insert)
 {
        struct request_queue *q = rq->q;
        bool run_queue = true;
@@ -1753,7 +1757,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                goto insert;
        }
 
-       if (q->elevator)
+       if (q->elevator && !bypass_insert)
                goto insert;
 
        if (!blk_mq_get_dispatch_budget(hctx))
@@ -1766,7 +1770,9 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 
        return __blk_mq_issue_directly(hctx, rq, cookie);
 insert:
-       __blk_mq_fallback_to_insert(hctx, rq, run_queue);
+       __blk_mq_fallback_to_insert(hctx, rq, run_queue, bypass_insert);
+       if (bypass_insert)
+               return BLK_STS_RESOURCE;
 
        return BLK_STS_OK;
 }
@@ -1781,15 +1787,30 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 
        hctx_lock(hctx, &srcu_idx);
 
-       ret = __blk_mq_try_issue_directly(hctx, rq, cookie);
+       ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
        if (ret == BLK_STS_RESOURCE)
-               __blk_mq_fallback_to_insert(hctx, rq, true);
+               __blk_mq_fallback_to_insert(hctx, rq, true, false);
        else if (ret != BLK_STS_OK)
                blk_mq_end_request(rq, ret);
 
        hctx_unlock(hctx, srcu_idx);
 }
 
+blk_status_t blk_mq_request_direct_issue(struct request *rq)
+{
+       blk_status_t ret;
+       int srcu_idx;
+       blk_qc_t unused_cookie;
+       struct blk_mq_ctx *ctx = rq->mq_ctx;
+       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+
+       hctx_lock(hctx, &srcu_idx);
+       ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
+       hctx_unlock(hctx, srcu_idx);
+
+       return ret;
+}
+
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
        const int is_sync = op_is_sync(bio->bi_opf);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 7c528c07fe07a379b4d04f5a77a5dca20239c7e6..0daa9f2c3d61b641dbeec6dd02b3356d8fe0c2ac 100644
@@ -60,6 +60,9 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
                                struct list_head *list);
 
+/* Used by blk_insert_cloned_request() to issue request directly */
+blk_status_t blk_mq_request_direct_issue(struct request *rq);
+
 /*
  * CPU -> queue mappings
  */
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 6f3c88b26d558a35f2bcc0d6d984cd2c6c7053ad..ecbf1f422f949ed700587f84f409b94cd1ecf1ed 100644
@@ -395,7 +395,7 @@ static void end_clone_request(struct request *clone, blk_status_t error)
        dm_complete_request(tio->orig, error);
 }
 
-static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
+static blk_status_t dm_dispatch_clone_request(struct request *clone, struct request *rq)
 {
        blk_status_t r;
 
@@ -404,9 +404,10 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 
        clone->start_time = jiffies;
        r = blk_insert_cloned_request(clone->q, clone);
-       if (r)
+       if (r != BLK_STS_OK && r != BLK_STS_RESOURCE)
                /* must complete clone in terms of original request */
                dm_complete_request(rq, r);
+       return r;
 }
 
 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
@@ -476,8 +477,10 @@ static int map_request(struct dm_rq_target_io *tio)
        struct mapped_device *md = tio->md;
        struct request *rq = tio->orig;
        struct request *clone = NULL;
+       blk_status_t ret;
 
        r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
+check_again:
        switch (r) {
        case DM_MAPIO_SUBMITTED:
                /* The target has taken the I/O to submit by itself later */
@@ -492,7 +495,17 @@ static int map_request(struct dm_rq_target_io *tio)
                /* The target has remapped the I/O so dispatch it */
                trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
                                     blk_rq_pos(rq));
-               dm_dispatch_clone_request(clone, rq);
+               ret = dm_dispatch_clone_request(clone, rq);
+               if (ret == BLK_STS_RESOURCE) {
+                       blk_rq_unprep_clone(clone);
+                       tio->ti->type->release_clone_rq(clone);
+                       tio->clone = NULL;
+                       if (!rq->q->mq_ops)
+                               r = DM_MAPIO_DELAY_REQUEUE;
+                       else
+                               r = DM_MAPIO_REQUEUE;
+                       goto check_again;
+               }
                break;
        case DM_MAPIO_REQUEUE:
                /* The target wants to requeue the I/O */