diff --git a/block/blk-mq.c b/block/blk-mq.c
index c3400b5444a7da9842622cb4b0c94b2f1b5ddd64..6f35b6fd47990b4821847180bd7719cc19a6ea32 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -20,6 +20,8 @@
 #include <linux/cpu.h>
 #include <linux/cache.h>
 #include <linux/sched/sysctl.h>
+#include <linux/sched/topology.h>
+#include <linux/sched/signal.h>
 #include <linux/delay.h>
 #include <linux/crash_dump.h>
 #include <linux/prefetch.h>
@@ -32,6 +34,7 @@
 #include "blk-mq-tag.h"
 #include "blk-stat.h"
 #include "blk-wbt.h"
+#include "blk-mq-sched.h"
 
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
@@ -39,9 +42,11 @@ static LIST_HEAD(all_q_list);
 /*
  * Check if any of the ctx's have pending work in this hardware queue
  */
-static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
+bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
-       return sbitmap_any_bit_set(&hctx->ctx_map);
+       return sbitmap_any_bit_set(&hctx->ctx_map) ||
+                       !list_empty_careful(&hctx->dispatch) ||
+                       blk_mq_sched_has_work(hctx);
 }
 
 /*
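blk_mq_hctx_has_pending() is both un-static'ed and widened here: a hardware queue now
counts as busy if its ctx bitmap has bits set, if its dispatch list holds requeued
requests, or if the attached scheduler has work queued. Callers then need only a single
predicate; the blk_mq_run_hw_queues() hunk further down reduces to this shape (sketch of
the resulting loop):

	queue_for_each_hw_ctx(q, hctx, i) {
		/*
		 * One check now covers the ctx bitmap, the dispatch
		 * list and scheduler work; stopped queues still skip.
		 */
		if (!blk_mq_hctx_has_pending(hctx) ||
		    blk_mq_hctx_stopped(hctx))
			continue;

		blk_mq_run_hw_queue(hctx, async);
	}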
@@ -167,8 +172,8 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 }
 EXPORT_SYMBOL(blk_mq_can_queue);
 
-static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
-                              struct request *rq, unsigned int op)
+void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
+                       struct request *rq, unsigned int op)
 {
        INIT_LIST_HEAD(&rq->queuelist);
        /* csd/requeue_work/fifo_time is initialized before use */
@@ -196,13 +201,7 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
        rq->special = NULL;
        /* tag was already set */
        rq->errors = 0;
-
-       rq->cmd = rq->__cmd;
-
        rq->extra_len = 0;
-       rq->sense_len = 0;
-       rq->resid_len = 0;
-       rq->sense = NULL;
 
        INIT_LIST_HEAD(&rq->timeout_list);
        rq->timeout = 0;
@@ -213,53 +212,58 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 
        ctx->rq_dispatched[op_is_sync(op)]++;
 }
+EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init);
 
-static struct request *
-__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op)
+struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
+                                      unsigned int op)
 {
        struct request *rq;
        unsigned int tag;
 
        tag = blk_mq_get_tag(data);
        if (tag != BLK_MQ_TAG_FAIL) {
-               rq = data->hctx->tags->rqs[tag];
+               struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 
-               if (blk_mq_tag_busy(data->hctx)) {
-                       rq->rq_flags = RQF_MQ_INFLIGHT;
-                       atomic_inc(&data->hctx->nr_active);
+               rq = tags->static_rqs[tag];
+
+               if (data->flags & BLK_MQ_REQ_INTERNAL) {
+                       rq->tag = -1;
+                       rq->internal_tag = tag;
+               } else {
+                       if (blk_mq_tag_busy(data->hctx)) {
+                               rq->rq_flags = RQF_MQ_INFLIGHT;
+                               atomic_inc(&data->hctx->nr_active);
+                       }
+                       rq->tag = tag;
+                       rq->internal_tag = -1;
                }
 
-               rq->tag = tag;
                blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
                return rq;
        }
 
        return NULL;
 }
+EXPORT_SYMBOL_GPL(__blk_mq_alloc_request);
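__blk_mq_alloc_request() now distinguishes driver tags from scheduler-internal tags.
It relies on the companion helper blk_mq_tags_from_data(), added to blk-mq.h in this
series; a minimal sketch:

	static inline struct blk_mq_tags *
	blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
	{
		/* internal allocations draw from the scheduler's tag set */
		if (data->flags & BLK_MQ_REQ_INTERNAL)
			return data->hctx->sched_tags;

		return data->hctx->tags;
	}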
 
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
                unsigned int flags)
 {
-       struct blk_mq_ctx *ctx;
-       struct blk_mq_hw_ctx *hctx;
+       struct blk_mq_alloc_data alloc_data = { .flags = flags };
        struct request *rq;
-       struct blk_mq_alloc_data alloc_data;
        int ret;
 
        ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
        if (ret)
                return ERR_PTR(ret);
 
-       ctx = blk_mq_get_ctx(q);
-       hctx = blk_mq_map_queue(q, ctx->cpu);
-       blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-       rq = __blk_mq_alloc_request(&alloc_data, rw);
-       blk_mq_put_ctx(ctx);
+       rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
 
-       if (!rq) {
-               blk_queue_exit(q);
+       blk_mq_put_ctx(alloc_data.ctx);
+       blk_queue_exit(q);
+
+       if (!rq)
                return ERR_PTR(-EWOULDBLOCK);
-       }
 
        rq->__data_len = 0;
        rq->__sector = (sector_t) -1;
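Request allocation is funneled through blk_mq_sched_get_request() (blk-mq-sched.c),
which picks the ctx/hctx pair itself and marks the allocation as internal when an
elevator is attached. In outline, with the flush special case and icq setup omitted:

	struct request *blk_mq_sched_get_request(struct request_queue *q,
						 struct bio *bio, unsigned int op,
						 struct blk_mq_alloc_data *data)
	{
		struct elevator_queue *e = q->elevator;
		struct blk_mq_ctx *ctx;
		struct blk_mq_hw_ctx *hctx;
		struct request *rq;

		blk_queue_enter_live(q);
		ctx = blk_mq_get_ctx(q);
		hctx = blk_mq_map_queue(q, ctx->cpu);
		blk_mq_set_alloc_data(data, q, data->flags, ctx, hctx);

		if (e) {
			/* back the request with a scheduler tag */
			data->flags |= BLK_MQ_REQ_INTERNAL;
			if (e->type->ops.mq.get_request)
				rq = e->type->ops.mq.get_request(q, op, data);
			else
				rq = __blk_mq_alloc_request(data, op);
		} else {
			rq = __blk_mq_alloc_request(data, op);
		}

		if (!rq)
			blk_queue_exit(q);
		return rq;
	}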
@@ -319,10 +323,10 @@ out_queue_exit:
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 
-static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
-                                 struct blk_mq_ctx *ctx, struct request *rq)
+void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+                            struct request *rq)
 {
-       const int tag = rq->tag;
+       const int sched_tag = rq->internal_tag;
        struct request_queue *q = rq->q;
 
        if (rq->rq_flags & RQF_MQ_INFLIGHT)
@@ -333,23 +337,31 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 
        clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
        clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
-       blk_mq_put_tag(hctx, ctx, tag);
+       if (rq->tag != -1)
+               blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
+       if (sched_tag != -1)
+               blk_mq_sched_completed_request(hctx, rq);
+       blk_mq_sched_restart_queues(hctx);
        blk_queue_exit(q);
 }
 
-void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx,
+                                    struct request *rq)
 {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
 
        ctx->rq_completed[rq_is_sync(rq)]++;
-       __blk_mq_free_request(hctx, ctx, rq);
+       __blk_mq_finish_request(hctx, ctx, rq);
+}
 
+void blk_mq_finish_request(struct request *rq)
+{
+       blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
 }
-EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
 
 void blk_mq_free_request(struct request *rq)
 {
-       blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
+       blk_mq_sched_put_request(rq);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
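Freeing is split the same way: blk_mq_free_request() defers to
blk_mq_sched_put_request(), which lets the elevator take the request back before the
tags are returned via blk_mq_finish_request(). A sketch, with icq/private-data
teardown omitted:

	void blk_mq_sched_put_request(struct request *rq)
	{
		struct elevator_queue *e = rq->q->elevator;

		if (e && e->type->ops.mq.put_request)
			e->type->ops.mq.put_request(rq);
		else
			blk_mq_finish_request(rq);
	}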
 
@@ -467,11 +479,9 @@ void blk_mq_start_request(struct request *rq)
 {
        struct request_queue *q = rq->q;
 
-       trace_block_rq_issue(q, rq);
+       blk_mq_sched_started_request(rq);
 
-       rq->resid_len = blk_rq_bytes(rq);
-       if (unlikely(blk_bidi_rq(rq)))
-               rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
+       trace_block_rq_issue(q, rq);
 
        if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
                blk_stat_set_issue_time(&rq->issue_stat);
@@ -515,6 +525,7 @@ static void __blk_mq_requeue_request(struct request *rq)
 
        trace_block_rq_requeue(q, rq);
        wbt_requeue(q->rq_wb, &rq->issue_stat);
+       blk_mq_sched_requeue_request(rq);
 
        if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
                if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -549,13 +560,13 @@ static void blk_mq_requeue_work(struct work_struct *work)
 
                rq->rq_flags &= ~RQF_SOFTBARRIER;
                list_del_init(&rq->queuelist);
-               blk_mq_insert_request(rq, true, false, false);
+               blk_mq_sched_insert_request(rq, true, false, false, true);
        }
 
        while (!list_empty(&rq_list)) {
                rq = list_entry(rq_list.next, struct request, queuelist);
                list_del_init(&rq->queuelist);
-               blk_mq_insert_request(rq, false, false, false);
+               blk_mq_sched_insert_request(rq, false, false, false, true);
        }
 
        blk_mq_run_hw_queues(q, false);
@@ -639,7 +650,7 @@ struct blk_mq_timeout_data {
 
 void blk_mq_rq_timed_out(struct request *req, bool reserved)
 {
-       struct blk_mq_ops *ops = req->q->mq_ops;
+       const struct blk_mq_ops *ops = req->q->mq_ops;
        enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
 
        /*
@@ -754,7 +765,7 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
        int checked = 8;
 
        list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
-               int el_ret;
+               bool merged = false;
 
                if (!checked--)
                        break;
@@ -762,20 +773,25 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
                if (!blk_rq_merge_ok(rq, bio))
                        continue;
 
-               el_ret = blk_try_merge(rq, bio);
-               if (el_ret == ELEVATOR_BACK_MERGE) {
-                       if (bio_attempt_back_merge(q, rq, bio)) {
-                               ctx->rq_merged++;
-                               return true;
-                       }
+               switch (blk_try_merge(rq, bio)) {
+               case ELEVATOR_BACK_MERGE:
+                       if (blk_mq_sched_allow_merge(q, rq, bio))
+                               merged = bio_attempt_back_merge(q, rq, bio);
                        break;
-               } else if (el_ret == ELEVATOR_FRONT_MERGE) {
-                       if (bio_attempt_front_merge(q, rq, bio)) {
-                               ctx->rq_merged++;
-                               return true;
-                       }
+               case ELEVATOR_FRONT_MERGE:
+                       if (blk_mq_sched_allow_merge(q, rq, bio))
+                               merged = bio_attempt_front_merge(q, rq, bio);
                        break;
+               case ELEVATOR_DISCARD_MERGE:
+                       merged = bio_attempt_discard_merge(q, rq, bio);
+                       break;
+               default:
+                       continue;
                }
+
+               if (merged)
+                       ctx->rq_merged++;
+               return merged;
        }
 
        return false;
@@ -803,7 +819,7 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
  * Process software queues that have been marked busy, splicing them
 * to the for-dispatch list
  */
-static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
+void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
        struct flush_busy_ctx_data data = {
                .hctx = hctx,
@@ -812,6 +828,7 @@ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 
        sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
 }
+EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
 
 static inline unsigned int queued_to_index(unsigned int queued)
 {
@@ -821,6 +838,112 @@ static inline unsigned int queued_to_index(unsigned int queued)
        return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 }
 
+bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
+                          bool wait)
+{
+       struct blk_mq_alloc_data data = {
+               .q = rq->q,
+               .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
+               .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
+       };
+
+       if (rq->tag != -1) {
+done:
+               if (hctx)
+                       *hctx = data.hctx;
+               return true;
+       }
+
+       rq->tag = blk_mq_get_tag(&data);
+       if (rq->tag >= 0) {
+               if (blk_mq_tag_busy(data.hctx)) {
+                       rq->rq_flags |= RQF_MQ_INFLIGHT;
+                       atomic_inc(&data.hctx->nr_active);
+               }
+               data.hctx->tags->rqs[rq->tag] = rq;
+               goto done;
+       }
+
+       return false;
+}
+
+static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
+                                 struct request *rq)
+{
+       if (rq->tag == -1 || rq->internal_tag == -1)
+               return;
+
+       blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
+       rq->tag = -1;
+
+       if (rq->rq_flags & RQF_MQ_INFLIGHT) {
+               rq->rq_flags &= ~RQF_MQ_INFLIGHT;
+               atomic_dec(&hctx->nr_active);
+       }
+}
+
+/*
+ * If we fail getting a driver tag because all the driver tags are already
+ * assigned and on the dispatch list, BUT the first entry does not have a
+ * tag, then we could deadlock. For that case, move entries with assigned
+ * driver tags to the front, leaving the set of tagged requests in the
+ * same order, and the untagged set in the same order.
+ */
+static bool reorder_tags_to_front(struct list_head *list)
+{
+       struct request *rq, *tmp, *first = NULL;
+
+       list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
+               if (rq == first)
+                       break;
+               if (rq->tag != -1) {
+                       list_move(&rq->queuelist, list);
+                       if (!first)
+                               first = rq;
+               }
+       }
+
+       return first != NULL;
+}
+
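A worked example of reorder_tags_to_front(), with hypothetical tag values: given the
dispatch list A(-1), B(5), C(-1), D(9) (rq->tag in parentheses), the reverse walk
list_move()s D and then B to the head:

	/* before: A(-1) -> B(5) -> C(-1) -> D(9)
	 * walk:   D moved to head, C skipped, B moved to head, A skipped
	 * after:  B(5) -> D(9) -> A(-1) -> C(-1)
	 */

Both subsequences keep their relative order, and the list now leads with a request
that already holds a driver tag, so the dispatch loop below can make progress.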
+static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags,
+                               void *key)
+{
+       struct blk_mq_hw_ctx *hctx;
+
+       hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
+
+       list_del(&wait->task_list);
+       clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
+       blk_mq_run_hw_queue(hctx, true);
+       return 1;
+}
+
+static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
+{
+       struct sbq_wait_state *ws;
+
+       /*
+        * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
+        * The thread which wins the race to grab this bit adds the hardware
+        * queue to the wait queue.
+        */
+       if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
+           test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
+               return false;
+
+       init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
+       ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
+
+       /*
+        * As soon as this returns, it's no longer safe to fiddle with
+        * hctx->dispatch_wait, since a completion can wake up the wait queue
+        * and unlock the bit.
+        */
+       add_wait_queue(&ws->wait, &hctx->dispatch_wait);
+       return true;
+}
+
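To summarize the hand-off these two helpers implement (assuming only the dispatch path
ever sets the bit):

	/* Lifecycle of BLK_MQ_S_TAG_WAITING for one hctx:
	 *
	 *   tag allocation fails during dispatch
	 *     -> test_and_set_bit_lock() winner queues hctx->dispatch_wait
	 *        on the sbitmap wait queue
	 *   a driver tag is freed, the wait queue is woken
	 *     -> blk_mq_dispatch_wake(): remove the wait entry,
	 *        clear_bit_unlock(), rerun the hardware queue
	 */

Note the retry in blk_mq_dispatch_rq_list() below: a tag freed in the window between
the failed allocation and add_wait_queue() would otherwise be missed, so the dispatch
loop attempts blk_mq_get_driver_tag() once more after winning the bit.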
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
        struct request_queue *q = hctx->queue;
@@ -843,6 +966,27 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
                struct blk_mq_queue_data bd;
 
                rq = list_first_entry(list, struct request, queuelist);
+               if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
+                       if (!queued && reorder_tags_to_front(list))
+                               continue;
+
+                       /*
+                        * The initial allocation attempt failed, so we need to
+                        * rerun the hardware queue when a tag is freed.
+                        */
+                       if (blk_mq_dispatch_wait_add(hctx)) {
+                               /*
+                                * It's possible that a tag was freed in the
+                                * window between the allocation failure and
+                                * adding the hardware queue to the wait queue.
+                                */
+                               if (!blk_mq_get_driver_tag(rq, &hctx, false))
+                                       break;
+                       } else {
+                               break;
+                       }
+               }
+
                list_del_init(&rq->queuelist);
 
                bd.rq = rq;
@@ -855,6 +999,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
                        queued++;
                        break;
                case BLK_MQ_RQ_QUEUE_BUSY:
+                       blk_mq_put_driver_tag(hctx, rq);
                        list_add(&rq->queuelist, list);
                        __blk_mq_requeue_request(rq);
                        break;
@@ -885,7 +1030,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
         */
        if (!list_empty(list)) {
                spin_lock(&hctx->lock);
-               list_splice(list, &hctx->dispatch);
+               list_splice_init(list, &hctx->dispatch);
                spin_unlock(&hctx->lock);
 
                /*
@@ -896,45 +1041,16 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
                 * the requests in rq_list might get lost.
                 *
                 * blk_mq_run_hw_queue() already checks the STOPPED bit
-                **/
-               blk_mq_run_hw_queue(hctx, true);
-       }
-
-       return ret != BLK_MQ_RQ_QUEUE_BUSY;
-}
-
-/*
- * Run this hardware queue, pulling any software queues mapped to it in.
- * Note that this function currently has various problems around ordering
- * of IO. In particular, we'd like FIFO behaviour on handling existing
- * items on the hctx->dispatch list. Ignore that for now.
- */
-static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
-{
-       LIST_HEAD(rq_list);
-
-       if (unlikely(blk_mq_hctx_stopped(hctx)))
-               return;
-
-       hctx->run++;
-
-       /*
-        * Touch any software queue that has pending entries.
-        */
-       flush_busy_ctxs(hctx, &rq_list);
-
-       /*
-        * If we have previous entries on our dispatch list, grab them
-        * and stuff them at the front for more fair dispatch.
-        */
-       if (!list_empty_careful(&hctx->dispatch)) {
-               spin_lock(&hctx->lock);
-               if (!list_empty(&hctx->dispatch))
-                       list_splice_init(&hctx->dispatch, &rq_list);
-               spin_unlock(&hctx->lock);
+                *
+                * If RESTART or TAG_WAITING is set, then let completion restart
+                * the queue instead of potentially looping here.
+                */
+               if (!blk_mq_sched_needs_restart(hctx) &&
+                   !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
+                       blk_mq_run_hw_queue(hctx, true);
        }
 
-       blk_mq_dispatch_rq_list(hctx, &rq_list);
+       return queued != 0;
 }
 
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
@@ -946,11 +1062,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
        if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
                rcu_read_lock();
-               blk_mq_process_rq_list(hctx);
+               blk_mq_sched_dispatch_requests(hctx);
                rcu_read_unlock();
        } else {
                srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
-               blk_mq_process_rq_list(hctx);
+               blk_mq_sched_dispatch_requests(hctx);
                srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
        }
 }
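The rcu/srcu read sections here pair with blk_mq_quiesce_queue(), which waits for
whichever flavor each hardware queue uses. For reference, its shape in this kernel is
roughly:

	void blk_mq_quiesce_queue(struct request_queue *q)
	{
		struct blk_mq_hw_ctx *hctx;
		unsigned int i;
		bool rcu = false;

		blk_mq_stop_hw_queues(q);

		queue_for_each_hw_ctx(q, hctx, i) {
			if (hctx->flags & BLK_MQ_F_BLOCKING)
				synchronize_srcu(&hctx->queue_rq_srcu);
			else
				rcu = true;
		}
		if (rcu)
			synchronize_rcu();
	}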
@@ -1006,8 +1122,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
        int i;
 
        queue_for_each_hw_ctx(q, hctx, i) {
-               if ((!blk_mq_hctx_has_pending(hctx) &&
-                   list_empty_careful(&hctx->dispatch)) ||
+               if (!blk_mq_hctx_has_pending(hctx) ||
                    blk_mq_hctx_stopped(hctx))
                        continue;
 
@@ -1116,6 +1231,7 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
        if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
                return;
 
+       blk_mq_stop_hw_queue(hctx);
        kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
                        &hctx->delay_work, msecs_to_jiffies(msecs));
 }
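Stopping the queue before scheduling the delayed work keeps anything else from running
the hardware queue during the delay window; the delayed worker elsewhere in this file
then clears the bit and runs the queue, along these lines:

	static void blk_mq_delay_work_fn(struct work_struct *work)
	{
		struct blk_mq_hw_ctx *hctx;

		hctx = container_of(work, struct blk_mq_hw_ctx,
				    delay_work.work);

		if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
			__blk_mq_run_hw_queue(hctx);
	}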
@@ -1135,8 +1251,8 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
                list_add_tail(&rq->queuelist, &ctx->rq_list);
 }
 
-static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
-                                   struct request *rq, bool at_head)
+void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+                            bool at_head)
 {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
 
@@ -1144,32 +1260,10 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
        blk_mq_hctx_mark_pending(hctx, ctx);
 }
 
-void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
-                          bool async)
-{
-       struct blk_mq_ctx *ctx = rq->mq_ctx;
-       struct request_queue *q = rq->q;
-       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
-       spin_lock(&ctx->lock);
-       __blk_mq_insert_request(hctx, rq, at_head);
-       spin_unlock(&ctx->lock);
-
-       if (run_queue)
-               blk_mq_run_hw_queue(hctx, async);
-}
-
-static void blk_mq_insert_requests(struct request_queue *q,
-                                    struct blk_mq_ctx *ctx,
-                                    struct list_head *list,
-                                    int depth,
-                                    bool from_schedule)
+void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+                           struct list_head *list)
 
 {
-       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
-       trace_block_unplug(q, depth, !from_schedule);
-
        /*
         * preemption doesn't flush plug list, so it's possible ctx->cpu is
         * offline now
@@ -1185,8 +1279,6 @@ static void blk_mq_insert_requests(struct request_queue *q,
        }
        blk_mq_hctx_mark_pending(hctx, ctx);
        spin_unlock(&ctx->lock);
-
-       blk_mq_run_hw_queue(hctx, from_schedule);
 }
 
 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -1222,9 +1314,10 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
                BUG_ON(!rq->q);
                if (rq->mq_ctx != this_ctx) {
                        if (this_ctx) {
-                               blk_mq_insert_requests(this_q, this_ctx,
-                                                       &ctx_list, depth,
-                                                       from_schedule);
+                               trace_block_unplug(this_q, depth, from_schedule);
+                               blk_mq_sched_insert_requests(this_q, this_ctx,
+                                                               &ctx_list,
+                                                               from_schedule);
                        }
 
                        this_ctx = rq->mq_ctx;
@@ -1241,8 +1334,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
         * on 'ctx_list'. Do those.
         */
        if (this_ctx) {
-               blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
-                                      from_schedule);
+               trace_block_unplug(this_q, depth, from_schedule);
+               blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
+                                               from_schedule);
        }
 }
 
@@ -1280,46 +1374,39 @@ insert_rq:
                }
 
                spin_unlock(&ctx->lock);
-               __blk_mq_free_request(hctx, ctx, rq);
+               __blk_mq_finish_request(hctx, ctx, rq);
                return true;
        }
 }
 
-static struct request *blk_mq_map_request(struct request_queue *q,
-                                         struct bio *bio,
-                                         struct blk_mq_alloc_data *data)
+static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
-       struct blk_mq_hw_ctx *hctx;
-       struct blk_mq_ctx *ctx;
-       struct request *rq;
-
-       blk_queue_enter_live(q);
-       ctx = blk_mq_get_ctx(q);
-       hctx = blk_mq_map_queue(q, ctx->cpu);
+       if (rq->tag != -1)
+               return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
 
-       trace_block_getrq(q, bio, bio->bi_opf);
-       blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
-       rq = __blk_mq_alloc_request(data, bio->bi_opf);
-
-       data->hctx->queued++;
-       return rq;
+       return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
 }
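request_to_qc_t() depends on blk_tag_to_qc_t() having grown a third argument, so a poll
cookie records whether it carries a driver tag or a scheduler-internal tag. The
encoding in blk_types.h of this series looks roughly like:

	#define BLK_QC_T_SHIFT		16
	#define BLK_QC_T_INTERNAL	(1U << 31)

	static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag,
					       unsigned int queue_num,
					       bool internal)
	{
		blk_qc_t ret = tag | (queue_num << BLK_QC_T_SHIFT);

		if (internal)
			ret |= BLK_QC_T_INTERNAL;

		return ret;
	}

blk_qc_t_is_internal() tests that same bit; the blk_mq_poll() hunk at the end of this
diff uses it to pick between hctx->tags and hctx->sched_tags when resolving a cookie
back to a request.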
 
 static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 {
-       int ret;
        struct request_queue *q = rq->q;
-       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
        struct blk_mq_queue_data bd = {
                .rq = rq,
                .list = NULL,
                .last = 1
        };
-       blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
+       struct blk_mq_hw_ctx *hctx;
+       blk_qc_t new_cookie;
+       int ret;
 
-       if (blk_mq_hctx_stopped(hctx))
+       if (q->elevator)
                goto insert;
 
+       if (!blk_mq_get_driver_tag(rq, &hctx, false))
+               goto insert;
+
+       new_cookie = request_to_qc_t(hctx, rq);
+
        /*
         * For OK queue, we are done. For error, kill it. Any other
         * error (busy), just add it to our list as we previously
@@ -1341,7 +1428,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
        }
 
 insert:
-       blk_mq_insert_request(rq, false, true, true);
+       blk_mq_sched_insert_request(rq, false, true, true, false);
 }
 
 /*
@@ -1352,8 +1439,8 @@ insert:
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
        const int is_sync = op_is_sync(bio->bi_opf);
-       const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
-       struct blk_mq_alloc_data data;
+       const int is_flush_fua = op_is_flush(bio->bi_opf);
+       struct blk_mq_alloc_data data = { .flags = 0 };
        struct request *rq;
        unsigned int request_count = 0, srcu_idx;
        struct blk_plug *plug;
@@ -1374,9 +1461,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
            blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
                return BLK_QC_T_NONE;
 
+       if (blk_mq_sched_bio_merge(q, bio))
+               return BLK_QC_T_NONE;
+
        wb_acct = wbt_wait(q->rq_wb, bio, NULL);
 
-       rq = blk_mq_map_request(q, bio, &data);
+       trace_block_getrq(q, bio, bio->bi_opf);
+
+       rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
        if (unlikely(!rq)) {
                __wbt_done(q->rq_wb, wb_acct);
                return BLK_QC_T_NONE;
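Both make_request variants now offer the bio to the scheduler for merging before any
request is allocated. blk_mq_sched_bio_merge() is a thin wrapper over the elevator
hook, roughly:

	bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
	{
		struct elevator_queue *e = q->elevator;

		if (e && e->type->ops.mq.bio_merge) {
			struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
			struct blk_mq_hw_ctx *hctx =
					blk_mq_map_queue(q, ctx->cpu);

			blk_mq_put_ctx(ctx);
			return e->type->ops.mq.bio_merge(hctx, bio);
		}

		return false;
	}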
@@ -1384,9 +1476,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
        wbt_track(&rq->issue_stat, wb_acct);
 
-       cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
+       cookie = request_to_qc_t(data.hctx, rq);
 
        if (unlikely(is_flush_fua)) {
+               if (q->elevator)
+                       goto elv_insert;
                blk_mq_bio_to_request(rq, bio);
                blk_insert_flush(rq);
                goto run_queue;
@@ -1438,6 +1532,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                goto done;
        }
 
+       if (q->elevator) {
+elv_insert:
+               blk_mq_put_ctx(data.ctx);
+               blk_mq_bio_to_request(rq, bio);
+               blk_mq_sched_insert_request(rq, false, true,
+                                               !is_sync || is_flush_fua, true);
+               goto done;
+       }
        if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
                /*
                 * For a SYNC request, send it to the hardware immediately. For
@@ -1460,10 +1562,10 @@ done:
 static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 {
        const int is_sync = op_is_sync(bio->bi_opf);
-       const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
+       const int is_flush_fua = op_is_flush(bio->bi_opf);
        struct blk_plug *plug;
        unsigned int request_count = 0;
-       struct blk_mq_alloc_data data;
+       struct blk_mq_alloc_data data = { .flags = 0 };
        struct request *rq;
        blk_qc_t cookie;
        unsigned int wb_acct;
@@ -1483,9 +1585,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
        } else
                request_count = blk_plug_queued_count(q);
 
+       if (blk_mq_sched_bio_merge(q, bio))
+               return BLK_QC_T_NONE;
+
        wb_acct = wbt_wait(q->rq_wb, bio, NULL);
 
-       rq = blk_mq_map_request(q, bio, &data);
+       trace_block_getrq(q, bio, bio->bi_opf);
+
+       rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
        if (unlikely(!rq)) {
                __wbt_done(q->rq_wb, wb_acct);
                return BLK_QC_T_NONE;
@@ -1493,9 +1600,11 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 
        wbt_track(&rq->issue_stat, wb_acct);
 
-       cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
+       cookie = request_to_qc_t(data.hctx, rq);
 
        if (unlikely(is_flush_fua)) {
+               if (q->elevator)
+                       goto elv_insert;
                blk_mq_bio_to_request(rq, bio);
                blk_insert_flush(rq);
                goto run_queue;
@@ -1535,6 +1644,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
                return cookie;
        }
 
+       if (q->elevator) {
+elv_insert:
+               blk_mq_put_ctx(data.ctx);
+               blk_mq_bio_to_request(rq, bio);
+               blk_mq_sched_insert_request(rq, false, true,
+                                               !is_sync || is_flush_fua, true);
+               goto done;
+       }
        if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
                /*
                 * For a SYNC request, send it to the hardware immediately. For
@@ -1547,11 +1664,12 @@ run_queue:
        }
 
        blk_mq_put_ctx(data.ctx);
+done:
        return cookie;
 }
 
-static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
-               struct blk_mq_tags *tags, unsigned int hctx_idx)
+void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+                    unsigned int hctx_idx)
 {
        struct page *page;
 
@@ -1559,11 +1677,13 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
                int i;
 
                for (i = 0; i < tags->nr_tags; i++) {
-                       if (!tags->rqs[i])
+                       struct request *rq = tags->static_rqs[i];
+
+                       if (!rq)
                                continue;
-                       set->ops->exit_request(set->driver_data, tags->rqs[i],
+                       set->ops->exit_request(set->driver_data, rq,
                                                hctx_idx, i);
-                       tags->rqs[i] = NULL;
+                       tags->static_rqs[i] = NULL;
                }
        }
 
@@ -1577,33 +1697,32 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
                kmemleak_free(page_address(page));
                __free_pages(page, page->private);
        }
+}
 
+void blk_mq_free_rq_map(struct blk_mq_tags *tags)
+{
        kfree(tags->rqs);
+       tags->rqs = NULL;
+       kfree(tags->static_rqs);
+       tags->static_rqs = NULL;
 
        blk_mq_free_tags(tags);
 }
 
-static size_t order_to_size(unsigned int order)
-{
-       return (size_t)PAGE_SIZE << order;
-}
-
-static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
-               unsigned int hctx_idx)
+struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
+                                       unsigned int hctx_idx,
+                                       unsigned int nr_tags,
+                                       unsigned int reserved_tags)
 {
        struct blk_mq_tags *tags;
-       unsigned int i, j, entries_per_page, max_order = 4;
-       size_t rq_size, left;
 
-       tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
+       tags = blk_mq_init_tags(nr_tags, reserved_tags,
                                set->numa_node,
                                BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
        if (!tags)
                return NULL;
 
-       INIT_LIST_HEAD(&tags->page_list);
-
-       tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
+       tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                                 set->numa_node);
        if (!tags->rqs) {
@@ -1611,15 +1730,40 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
                return NULL;
        }
 
+       tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *),
+                                GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
+                                set->numa_node);
+       if (!tags->static_rqs) {
+               kfree(tags->rqs);
+               blk_mq_free_tags(tags);
+               return NULL;
+       }
+
+       return tags;
+}
+
+static size_t order_to_size(unsigned int order)
+{
+       return (size_t)PAGE_SIZE << order;
+}
+
+int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+                    unsigned int hctx_idx, unsigned int depth)
+{
+       unsigned int i, j, entries_per_page, max_order = 4;
+       size_t rq_size, left;
+
+       INIT_LIST_HEAD(&tags->page_list);
+
        /*
         * rq_size is the size of the request plus driver payload, rounded
         * to the cacheline size
         */
        rq_size = round_up(sizeof(struct request) + set->cmd_size,
                                cache_line_size());
-       left = rq_size * set->queue_depth;
+       left = rq_size * depth;
 
-       for (i = 0; i < set->queue_depth; ) {
+       for (i = 0; i < depth; ) {
                int this_order = max_order;
                struct page *page;
                int to_do;
@@ -1653,15 +1797,17 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
                 */
                kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
                entries_per_page = order_to_size(this_order) / rq_size;
-               to_do = min(entries_per_page, set->queue_depth - i);
+               to_do = min(entries_per_page, depth - i);
                left -= to_do * rq_size;
                for (j = 0; j < to_do; j++) {
-                       tags->rqs[i] = p;
+                       struct request *rq = p;
+
+                       tags->static_rqs[i] = rq;
                        if (set->ops->init_request) {
                                if (set->ops->init_request(set->driver_data,
-                                               tags->rqs[i], hctx_idx, i,
+                                               rq, hctx_idx, i,
                                                set->numa_node)) {
-                                       tags->rqs[i] = NULL;
+                                       tags->static_rqs[i] = NULL;
                                        goto fail;
                                }
                        }
@@ -1670,11 +1816,11 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
                        i++;
                }
        }
-       return tags;
+       return 0;
 
 fail:
-       blk_mq_free_rq_map(set, tags, hctx_idx);
-       return NULL;
+       blk_mq_free_rqs(set, tags, hctx_idx);
+       return -ENOMEM;
 }
 
 /*
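The point of this hunk is the split between tags->static_rqs and tags->rqs:
static_rqs holds every preallocated request, indexed by its (possibly scheduler)
tag, while rqs is populated at dispatch time to map a driver tag to its in-flight
request (see blk_mq_get_driver_tag() above). A caller pairs the two allocators the
way __blk_mq_alloc_rq_map() below does; sketch:

	struct blk_mq_tags *tags;

	tags = blk_mq_alloc_rq_map(set, hctx_idx, set->queue_depth,
				   set->reserved_tags);
	if (!tags)
		return -ENOMEM;

	if (blk_mq_alloc_rqs(set, tags, hctx_idx, set->queue_depth)) {
		blk_mq_free_rq_map(tags);
		return -ENOMEM;
	}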
@@ -1866,6 +2012,35 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
        }
 }
 
+static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
+{
+       int ret = 0;
+
+       set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
+                                       set->queue_depth, set->reserved_tags);
+       if (!set->tags[hctx_idx])
+               return false;
+
+       ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
+                               set->queue_depth);
+       if (!ret)
+               return true;
+
+       blk_mq_free_rq_map(set->tags[hctx_idx]);
+       set->tags[hctx_idx] = NULL;
+       return false;
+}
+
+static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
+                                        unsigned int hctx_idx)
+{
+       if (set->tags[hctx_idx]) {
+               blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
+               blk_mq_free_rq_map(set->tags[hctx_idx]);
+               set->tags[hctx_idx] = NULL;
+       }
+}
+
 static void blk_mq_map_swqueue(struct request_queue *q,
                               const struct cpumask *online_mask)
 {
@@ -1894,17 +2069,15 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 
                hctx_idx = q->mq_map[i];
                /* unmapped hw queue can be remapped after CPU topo changed */
-               if (!set->tags[hctx_idx]) {
-                       set->tags[hctx_idx] = blk_mq_init_rq_map(set, hctx_idx);
-
+               if (!set->tags[hctx_idx] &&
+                   !__blk_mq_alloc_rq_map(set, hctx_idx)) {
                        /*
                         * If tags initialization fail for some hctx,
                         * that hctx won't be brought online.  In this
                         * case, remap the current ctx to hctx[0] which
                         * is guaranteed to always have tags allocated
                         */
-                       if (!set->tags[hctx_idx])
-                               q->mq_map[i] = 0;
+                       q->mq_map[i] = 0;
                }
 
                ctx = per_cpu_ptr(q->queue_ctx, i);
@@ -1927,10 +2100,9 @@ static void blk_mq_map_swqueue(struct request_queue *q,
                         * fallback in case of a new remap fails
                         * allocation
                         */
-                       if (i && set->tags[i]) {
-                               blk_mq_free_rq_map(set, set->tags[i], i);
-                               set->tags[i] = NULL;
-                       }
+                       if (i && set->tags[i])
+                               blk_mq_free_map_and_requests(set, i);
+
                        hctx->tags = NULL;
                        continue;
                }
@@ -2023,6 +2195,8 @@ void blk_mq_release(struct request_queue *q)
        struct blk_mq_hw_ctx *hctx;
        unsigned int i;
 
+       blk_mq_sched_teardown(q);
+
        /* hctx kobj stays in hctx */
        queue_for_each_hw_ctx(q, hctx, i) {
                if (!hctx)
@@ -2097,10 +2271,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx = hctxs[j];
 
                if (hctx) {
-                       if (hctx->tags) {
-                               blk_mq_free_rq_map(set, hctx->tags, j);
-                               set->tags[j] = NULL;
-                       }
+                       if (hctx->tags)
+                               blk_mq_free_map_and_requests(set, j);
                        blk_mq_exit_hctx(q, set, hctx, j);
                        free_cpumask_var(hctx->cpumask);
                        kobject_put(&hctx->kobj);
@@ -2181,6 +2353,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        mutex_unlock(&all_q_mutex);
        put_online_cpus();
 
+       if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
+               int ret;
+
+               ret = blk_mq_sched_init(q);
+               if (ret)
+                       return ERR_PTR(ret);
+       }
+
        return q;
 
 err_hctxs:
@@ -2279,10 +2459,10 @@ static int blk_mq_queue_reinit_dead(unsigned int cpu)
  * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list
  * and set bit0 in pending bitmap as ctx1->index_hw is still zero.
  *
- * And then while running hw queue, flush_busy_ctxs() finds bit0 is set in
- * pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list.
- * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list
- * is ignored.
+ * And then while running hw queue, blk_mq_flush_busy_ctxs() finds bit0 is set
+ * in pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list.
+ * But hctx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list is
+ * ignored.
  */
 static int blk_mq_queue_reinit_prepare(unsigned int cpu)
 {
@@ -2296,17 +2476,15 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 {
        int i;
 
-       for (i = 0; i < set->nr_hw_queues; i++) {
-               set->tags[i] = blk_mq_init_rq_map(set, i);
-               if (!set->tags[i])
+       for (i = 0; i < set->nr_hw_queues; i++)
+               if (!__blk_mq_alloc_rq_map(set, i))
                        goto out_unwind;
-       }
 
        return 0;
 
 out_unwind:
        while (--i >= 0)
-               blk_mq_free_rq_map(set, set->tags[i], i);
+               blk_mq_free_rq_map(set->tags[i]);
 
        return -ENOMEM;
 }
@@ -2430,10 +2608,8 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 {
        int i;
 
-       for (i = 0; i < nr_cpu_ids; i++) {
-               if (set->tags[i])
-                       blk_mq_free_rq_map(set, set->tags[i], i);
-       }
+       for (i = 0; i < nr_cpu_ids; i++)
+               blk_mq_free_map_and_requests(set, i);
 
        kfree(set->mq_map);
        set->mq_map = NULL;
@@ -2449,14 +2625,28 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
        struct blk_mq_hw_ctx *hctx;
        int i, ret;
 
-       if (!set || nr > set->queue_depth)
+       if (!set)
                return -EINVAL;
 
+       blk_mq_freeze_queue(q);
+       blk_mq_quiesce_queue(q);
+
        ret = 0;
        queue_for_each_hw_ctx(q, hctx, i) {
                if (!hctx->tags)
                        continue;
-               ret = blk_mq_tag_update_depth(hctx->tags, nr);
+               /*
+                * If we're using an MQ scheduler, just update the scheduler
+                * queue depth. This is similar to what the old code would do.
+                */
+               if (!hctx->sched_tags) {
+                       ret = blk_mq_tag_update_depth(hctx, &hctx->tags,
+                                                       min(nr, set->queue_depth),
+                                                       false);
+               } else {
+                       ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
+                                                       nr, true);
+               }
                if (ret)
                        break;
        }
@@ -2464,6 +2654,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
        if (!ret)
                q->nr_requests = nr;
 
+       blk_mq_unfreeze_queue(q);
+       blk_mq_start_stopped_hw_queues(q, true);
+
        return ret;
 }
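blk_mq_update_nr_requests() is reached from the queue's sysfs nr_requests attribute.
With a scheduler attached it resizes sched_tags rather than the driver tag set, which
is why nr is no longer capped at set->queue_depth up front. The store hook in
blk-sysfs.c calls it roughly like:

	/* e.g. echo N > /sys/block/<dev>/queue/nr_requests */
	ret = queue_var_store(&nr, page, count);
	if (ret < 0)
		return ret;

	if (nr < BLKDEV_MIN_RQ)
		nr = BLKDEV_MIN_RQ;

	err = blk_mq_update_nr_requests(q, nr);
	if (err)
		return err;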
 
@@ -2483,10 +2676,14 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_realloc_hw_ctxs(set, q);
 
+               /*
+                * Manually set the make_request_fn as blk_queue_make_request
+                * resets a lot of the queue settings.
+                */
                if (q->nr_hw_queues > 1)
-                       blk_queue_make_request(q, blk_mq_make_request);
+                       q->make_request_fn = blk_mq_make_request;
                else
-                       blk_queue_make_request(q, blk_sq_make_request);
+                       q->make_request_fn = blk_sq_make_request;
 
                blk_mq_queue_reinit(q, cpu_online_mask);
        }
@@ -2649,7 +2846,10 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
                blk_flush_plug_list(plug, false);
 
        hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
-       rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+       if (!blk_qc_t_is_internal(cookie))
+               rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+       else
+               rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
 
        return __blk_mq_poll(hctx, rq);
 }