diff --git a/block/blk-mq.c b/block/blk-mq.c
index f3d27a6dee09dfa48dd7b78bc024f4a9809e8f88..4bf850e8d6b5abf5a1f5252ab3bfe6dd10de6111 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -30,6 +30,8 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
+#include "blk-stat.h"
+#include "blk-wbt.h"
 
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
@@ -115,6 +117,33 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 
+/**
+ * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished
+ * @q: request queue.
+ *
+ * Note: this function does not prevent the struct request end_io()
+ * callback from being invoked. Additionally, it does not prevent new
+ * queue_rq() calls from occurring unless the queue has been stopped first.
+ */
+void blk_mq_quiesce_queue(struct request_queue *q)
+{
+       struct blk_mq_hw_ctx *hctx;
+       unsigned int i;
+       bool rcu = false;
+
+       blk_mq_stop_hw_queues(q);
+
+       queue_for_each_hw_ctx(q, hctx, i) {
+               if (hctx->flags & BLK_MQ_F_BLOCKING)
+                       synchronize_srcu(&hctx->queue_rq_srcu);
+               else
+                       rcu = true;
+       }
+       if (rcu)
+               synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
+
 void blk_mq_wake_waiters(struct request_queue *q)
 {
        struct blk_mq_hw_ctx *hctx;
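
For illustration only (not part of this patch), a driver that needs to reconfigure itself without any concurrent ->queue_rq() calls could pair the new helper with the existing restart API roughly as follows; the function name below is hypothetical:

static void example_driver_pause_and_reconfigure(struct request_queue *q)
{
	/* stops the hw queues and waits for in-flight ->queue_rq() calls */
	blk_mq_quiesce_queue(q);

	/* driver state may be changed here; note that end_io() can still run */

	/* restart the queues that blk_mq_quiesce_queue() stopped */
	blk_mq_start_stopped_hw_queues(q, true);
}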
@@ -139,17 +168,15 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 EXPORT_SYMBOL(blk_mq_can_queue);
 
 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
-                              struct request *rq, int op,
-                              unsigned int op_flags)
+                              struct request *rq, unsigned int op)
 {
-       if (blk_queue_io_stat(q))
-               op_flags |= REQ_IO_STAT;
-
        INIT_LIST_HEAD(&rq->queuelist);
        /* csd/requeue_work/fifo_time is initialized before use */
        rq->q = q;
        rq->mq_ctx = ctx;
-       req_set_op_attrs(rq, op, op_flags);
+       rq->cmd_flags = op;
+       if (blk_queue_io_stat(q))
+               rq->rq_flags |= RQF_IO_STAT;
        /* do not touch atomic flags, it needs atomic ops against the timer */
        rq->cpu = -1;
        INIT_HLIST_NODE(&rq->hash);
@@ -184,11 +211,11 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
        rq->end_io_data = NULL;
        rq->next_rq = NULL;
 
-       ctx->rq_dispatched[rw_is_sync(op, op_flags)]++;
+       ctx->rq_dispatched[op_is_sync(op)]++;
 }
 
 static struct request *
-__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags)
+__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op)
 {
        struct request *rq;
        unsigned int tag;
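
As a hedged reminder of the flags split these hunks rely on: operation and I/O flags (REQ_OP_*, REQ_SYNC, ...) stay in rq->cmd_flags, while block-layer-internal state moves to the new rq->rq_flags (RQF_*). A hypothetical check combining both fields might look like:

static bool example_inflight_sync_rq(struct request *rq)
{
	return (rq->rq_flags & RQF_MQ_INFLIGHT) &&	/* internal RQF_* state */
	       op_is_sync(rq->cmd_flags);		/* op and flag bits */
}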
@@ -198,12 +225,12 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags)
                rq = data->hctx->tags->rqs[tag];
 
                if (blk_mq_tag_busy(data->hctx)) {
-                       rq->cmd_flags = REQ_MQ_INFLIGHT;
+                       rq->rq_flags = RQF_MQ_INFLIGHT;
                        atomic_inc(&data->hctx->nr_active);
                }
 
                rq->tag = tag;
-               blk_mq_rq_ctx_init(data->q, data->ctx, rq, op, op_flags);
+               blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
                return rq;
        }
 
@@ -226,7 +253,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
        ctx = blk_mq_get_ctx(q);
        hctx = blk_mq_map_queue(q, ctx->cpu);
        blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-       rq = __blk_mq_alloc_request(&alloc_data, rw, 0);
+       rq = __blk_mq_alloc_request(&alloc_data, rw);
        blk_mq_put_ctx(ctx);
 
        if (!rq) {
@@ -278,7 +305,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
        ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
 
        blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-       rq = __blk_mq_alloc_request(&alloc_data, rw, 0);
+       rq = __blk_mq_alloc_request(&alloc_data, rw);
        if (!rq) {
                ret = -EWOULDBLOCK;
                goto out_queue_exit;
@@ -298,11 +325,14 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
        const int tag = rq->tag;
        struct request_queue *q = rq->q;
 
-       if (rq->cmd_flags & REQ_MQ_INFLIGHT)
+       if (rq->rq_flags & RQF_MQ_INFLIGHT)
                atomic_dec(&hctx->nr_active);
-       rq->cmd_flags = 0;
+
+       wbt_done(q->rq_wb, &rq->issue_stat);
+       rq->rq_flags = 0;
 
        clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+       clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
        blk_mq_put_tag(hctx, ctx, tag);
        blk_queue_exit(q);
 }
@@ -328,6 +358,7 @@ inline void __blk_mq_end_request(struct request *rq, int error)
        blk_account_io_done(rq);
 
        if (rq->end_io) {
+               wbt_done(rq->q->rq_wb, &rq->issue_stat);
                rq->end_io(rq, error);
        } else {
                if (unlikely(blk_bidi_rq(rq)))
@@ -378,10 +409,27 @@ static void blk_mq_ipi_complete_request(struct request *rq)
        put_cpu();
 }
 
+static void blk_mq_stat_add(struct request *rq)
+{
+       if (rq->rq_flags & RQF_STATS) {
+               /*
+                * We could use rq->mq_ctx here, but there's less risk
+                * of races if we have the completion event add the stats
+                * to the local software queue.
+                */
+               struct blk_mq_ctx *ctx;
+
+               ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id());
+               blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq);
+       }
+}
+
 static void __blk_mq_complete_request(struct request *rq)
 {
        struct request_queue *q = rq->q;
 
+       blk_mq_stat_add(rq);
+
        if (!q->softirq_done_fn)
                blk_mq_end_request(rq, rq->errors);
        else
@@ -425,6 +473,12 @@ void blk_mq_start_request(struct request *rq)
        if (unlikely(blk_bidi_rq(rq)))
                rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
 
+       if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
+               blk_stat_set_issue_time(&rq->issue_stat);
+               rq->rq_flags |= RQF_STATS;
+               wbt_issue(q->rq_wb, &rq->issue_stat);
+       }
+
        blk_add_timer(rq);
 
        /*
@@ -460,6 +514,7 @@ static void __blk_mq_requeue_request(struct request *rq)
        struct request_queue *q = rq->q;
 
        trace_block_rq_requeue(q, rq);
+       wbt_requeue(q->rq_wb, &rq->issue_stat);
 
        if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
                if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -467,12 +522,12 @@ static void __blk_mq_requeue_request(struct request *rq)
        }
 }
 
-void blk_mq_requeue_request(struct request *rq)
+void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 {
        __blk_mq_requeue_request(rq);
 
        BUG_ON(blk_queued_rq(rq));
-       blk_mq_add_to_requeue_list(rq, true);
+       blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
 }
 EXPORT_SYMBOL(blk_mq_requeue_request);
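
Sketch of how a caller might use the new kick_requeue_list argument (illustrative only; the helper and the failed list are hypothetical): requeue a batch of requests without kicking, then kick the requeue list once.

static void example_requeue_batch(struct request_queue *q,
				  struct list_head *failed)
{
	struct request *rq, *next;

	list_for_each_entry_safe(rq, next, failed, queuelist) {
		list_del_init(&rq->queuelist);
		/* false: don't kick the requeue list for every request */
		blk_mq_requeue_request(rq, false);
	}
	/* run the requeue work once for the whole batch */
	blk_mq_kick_requeue_list(q);
}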
 
@@ -489,10 +544,10 @@ static void blk_mq_requeue_work(struct work_struct *work)
        spin_unlock_irqrestore(&q->requeue_lock, flags);
 
        list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
-               if (!(rq->cmd_flags & REQ_SOFTBARRIER))
+               if (!(rq->rq_flags & RQF_SOFTBARRIER))
                        continue;
 
-               rq->cmd_flags &= ~REQ_SOFTBARRIER;
+               rq->rq_flags &= ~RQF_SOFTBARRIER;
                list_del_init(&rq->queuelist);
                blk_mq_insert_request(rq, true, false, false);
        }
@@ -503,14 +558,11 @@ static void blk_mq_requeue_work(struct work_struct *work)
                blk_mq_insert_request(rq, false, false, false);
        }
 
-       /*
-        * Use the start variant of queue running here, so that running
-        * the requeue work will kick stopped queues.
-        */
-       blk_mq_start_hw_queues(q);
+       blk_mq_run_hw_queues(q, false);
 }
 
-void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
+void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
+                               bool kick_requeue_list)
 {
        struct request_queue *q = rq->q;
        unsigned long flags;
@@ -519,24 +571,21 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
         * We abuse this flag that is otherwise used by the I/O scheduler to
         * request head insertion from the workqueue.
         */
-       BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
+       BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
 
        spin_lock_irqsave(&q->requeue_lock, flags);
        if (at_head) {
-               rq->cmd_flags |= REQ_SOFTBARRIER;
+               rq->rq_flags |= RQF_SOFTBARRIER;
                list_add(&rq->queuelist, &q->requeue_list);
        } else {
                list_add_tail(&rq->queuelist, &q->requeue_list);
        }
        spin_unlock_irqrestore(&q->requeue_lock, flags);
-}
-EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 
-void blk_mq_cancel_requeue_work(struct request_queue *q)
-{
-       cancel_delayed_work_sync(&q->requeue_work);
+       if (kick_requeue_list)
+               blk_mq_kick_requeue_list(q);
 }
-EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work);
+EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 
 void blk_mq_kick_requeue_list(struct request_queue *q)
 {
@@ -772,44 +821,13 @@ static inline unsigned int queued_to_index(unsigned int queued)
        return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 }
 
-/*
- * Run this hardware queue, pulling any software queues mapped to it in.
- * Note that this function currently has various problems around ordering
- * of IO. In particular, we'd like FIFO behaviour on handling existing
- * items on the hctx->dispatch list. Ignore that for now.
- */
-static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
        struct request_queue *q = hctx->queue;
        struct request *rq;
-       LIST_HEAD(rq_list);
        LIST_HEAD(driver_list);
        struct list_head *dptr;
-       int queued;
-
-       if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
-               return;
-
-       WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
-               cpu_online(hctx->next_cpu));
-
-       hctx->run++;
-
-       /*
-        * Touch any software queue that has pending entries.
-        */
-       flush_busy_ctxs(hctx, &rq_list);
-
-       /*
-        * If we have previous entries on our dispatch list, grab them
-        * and stuff them at the front for more fair dispatch.
-        */
-       if (!list_empty_careful(&hctx->dispatch)) {
-               spin_lock(&hctx->lock);
-               if (!list_empty(&hctx->dispatch))
-                       list_splice_init(&hctx->dispatch, &rq_list);
-               spin_unlock(&hctx->lock);
-       }
+       int queued, ret = BLK_MQ_RQ_QUEUE_OK;
 
        /*
         * Start off with dptr being NULL, so we start the first request
@@ -821,16 +839,15 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
         * Now process all the entries, sending them to the driver.
         */
        queued = 0;
-       while (!list_empty(&rq_list)) {
+       while (!list_empty(list)) {
                struct blk_mq_queue_data bd;
-               int ret;
 
-               rq = list_first_entry(&rq_list, struct request, queuelist);
+               rq = list_first_entry(list, struct request, queuelist);
                list_del_init(&rq->queuelist);
 
                bd.rq = rq;
                bd.list = dptr;
-               bd.last = list_empty(&rq_list);
+               bd.last = list_empty(list);
 
                ret = q->mq_ops->queue_rq(hctx, &bd);
                switch (ret) {
@@ -838,7 +855,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
                        queued++;
                        break;
                case BLK_MQ_RQ_QUEUE_BUSY:
-                       list_add(&rq->queuelist, &rq_list);
+                       list_add(&rq->queuelist, list);
                        __blk_mq_requeue_request(rq);
                        break;
                default:
@@ -856,7 +873,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
                 * We've done the first request. If we have more than 1
                 * left in the list, set dptr to defer issue.
                 */
-               if (!dptr && rq_list.next != rq_list.prev)
+               if (!dptr && list->next != list->prev)
                        dptr = &driver_list;
        }
 
@@ -866,10 +883,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
         * Any items that need requeuing? Stuff them into hctx->dispatch,
         * that is where we will continue on next queue run.
         */
-       if (!list_empty(&rq_list)) {
+       if (!list_empty(list)) {
                spin_lock(&hctx->lock);
-               list_splice(&rq_list, &hctx->dispatch);
+               list_splice(list, &hctx->dispatch);
                spin_unlock(&hctx->lock);
+
                /*
                 * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but
                 * it's possible the queue is stopped and restarted again
@@ -881,6 +899,61 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
                 **/
                blk_mq_run_hw_queue(hctx, true);
        }
+
+       return ret != BLK_MQ_RQ_QUEUE_BUSY;
+}
+
+/*
+ * Run this hardware queue, pulling any software queues mapped to it in.
+ * Note that this function currently has various problems around ordering
+ * of IO. In particular, we'd like FIFO behaviour on handling existing
+ * items on the hctx->dispatch list. Ignore that for now.
+ */
+static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
+{
+       LIST_HEAD(rq_list);
+       LIST_HEAD(driver_list);
+
+       if (unlikely(blk_mq_hctx_stopped(hctx)))
+               return;
+
+       hctx->run++;
+
+       /*
+        * Touch any software queue that has pending entries.
+        */
+       flush_busy_ctxs(hctx, &rq_list);
+
+       /*
+        * If we have previous entries on our dispatch list, grab them
+        * and stuff them at the front for more fair dispatch.
+        */
+       if (!list_empty_careful(&hctx->dispatch)) {
+               spin_lock(&hctx->lock);
+               if (!list_empty(&hctx->dispatch))
+                       list_splice_init(&hctx->dispatch, &rq_list);
+               spin_unlock(&hctx->lock);
+       }
+
+       blk_mq_dispatch_rq_list(hctx, &rq_list);
+}
+
+static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
+{
+       int srcu_idx;
+
+       WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
+               cpu_online(hctx->next_cpu));
+
+       if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
+               rcu_read_lock();
+               blk_mq_process_rq_list(hctx);
+               rcu_read_unlock();
+       } else {
+               srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
+               blk_mq_process_rq_list(hctx);
+               srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
+       }
 }
 
 /*
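
To make the RCU/SRCU split above concrete, a hedged sketch of the driver side (names are hypothetical): a driver whose ->queue_rq() may sleep opts into the SRCU-protected dispatch path by setting BLK_MQ_F_BLOCKING in its tag set.

static void example_init_tag_set_flags(struct blk_mq_tag_set *set)
{
	/* ->queue_rq() may block, so dispatch must run under queue_rq_srcu */
	set->flags = BLK_MQ_F_BLOCKING | BLK_MQ_F_SHOULD_MERGE;
}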
@@ -895,7 +968,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
                return WORK_CPU_UNBOUND;
 
        if (--hctx->next_cpu_batch <= 0) {
-               int cpu = hctx->next_cpu, next_cpu;
+               int next_cpu;
 
                next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
                if (next_cpu >= nr_cpu_ids)
@@ -903,8 +976,6 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 
                hctx->next_cpu = next_cpu;
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
-
-               return cpu;
        }
 
        return hctx->next_cpu;
@@ -912,8 +983,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
-       if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
-           !blk_mq_hw_queue_mapped(hctx)))
+       if (unlikely(blk_mq_hctx_stopped(hctx) ||
+                    !blk_mq_hw_queue_mapped(hctx)))
                return;
 
        if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
@@ -938,7 +1009,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
        queue_for_each_hw_ctx(q, hctx, i) {
                if ((!blk_mq_hctx_has_pending(hctx) &&
                    list_empty_careful(&hctx->dispatch)) ||
-                   test_bit(BLK_MQ_S_STOPPED, &hctx->state))
+                   blk_mq_hctx_stopped(hctx))
                        continue;
 
                blk_mq_run_hw_queue(hctx, async);
@@ -946,6 +1017,26 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 }
 EXPORT_SYMBOL(blk_mq_run_hw_queues);
 
+/**
+ * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
+ * @q: request queue.
+ *
+ * The caller is responsible for serializing this function against
+ * blk_mq_{start,stop}_hw_queue().
+ */
+bool blk_mq_queue_stopped(struct request_queue *q)
+{
+       struct blk_mq_hw_ctx *hctx;
+       int i;
+
+       queue_for_each_hw_ctx(q, hctx, i)
+               if (blk_mq_hctx_stopped(hctx))
+                       return true;
+
+       return false;
+}
+EXPORT_SYMBOL(blk_mq_queue_stopped);
+
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
        cancel_work(&hctx->run_work);
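
A minimal caller sketch for the new helper (illustrative only, assuming the caller already serializes against blk_mq_{start,stop}_hw_queue() as the kernel-doc requires):

static void example_kick_unless_stopped(struct request_queue *q)
{
	/* only kick the hardware queues if none of them has been stopped */
	if (!blk_mq_queue_stopped(q))
		blk_mq_run_hw_queues(q, true);
}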
@@ -982,18 +1073,23 @@ void blk_mq_start_hw_queues(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queues);
 
+void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
+{
+       if (!blk_mq_hctx_stopped(hctx))
+               return;
+
+       clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+       blk_mq_run_hw_queue(hctx, async);
+}
+EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
+
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
 {
        struct blk_mq_hw_ctx *hctx;
        int i;
 
-       queue_for_each_hw_ctx(q, hctx, i) {
-               if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
-                       continue;
-
-               clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
-               blk_mq_run_hw_queue(hctx, async);
-       }
+       queue_for_each_hw_ctx(q, hctx, i)
+               blk_mq_start_stopped_hw_queue(hctx, async);
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
 
@@ -1155,7 +1251,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 {
        init_request_from_bio(rq, bio);
 
-       blk_account_io_start(rq, 1);
+       blk_account_io_start(rq, true);
 }
 
 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
@@ -1190,40 +1286,27 @@ insert_rq:
        }
 }
 
-struct blk_map_ctx {
-       struct blk_mq_hw_ctx *hctx;
-       struct blk_mq_ctx *ctx;
-};
-
 static struct request *blk_mq_map_request(struct request_queue *q,
                                          struct bio *bio,
-                                         struct blk_map_ctx *data)
+                                         struct blk_mq_alloc_data *data)
 {
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        struct request *rq;
-       int op = bio_data_dir(bio);
-       int op_flags = 0;
-       struct blk_mq_alloc_data alloc_data;
 
        blk_queue_enter_live(q);
        ctx = blk_mq_get_ctx(q);
        hctx = blk_mq_map_queue(q, ctx->cpu);
 
-       if (rw_is_sync(bio_op(bio), bio->bi_opf))
-               op_flags |= REQ_SYNC;
-
-       trace_block_getrq(q, bio, op);
-       blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
-       rq = __blk_mq_alloc_request(&alloc_data, op, op_flags);
+       trace_block_getrq(q, bio, bio->bi_opf);
+       blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
+       rq = __blk_mq_alloc_request(data, bio->bi_opf);
 
-       data->hctx = alloc_data.hctx;
-       data->ctx = alloc_data.ctx;
        data->hctx->queued++;
        return rq;
 }
 
-static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
+static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 {
        int ret;
        struct request_queue *q = rq->q;
@@ -1235,6 +1318,9 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
        };
        blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
 
+       if (blk_mq_hctx_stopped(hctx))
+               goto insert;
+
        /*
         * For OK queue, we are done. For error, kill it. Any other
         * error (busy), just add it to our list as we previously
@@ -1243,7 +1329,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
        ret = q->mq_ops->queue_rq(hctx, &bd);
        if (ret == BLK_MQ_RQ_QUEUE_OK) {
                *cookie = new_cookie;
-               return 0;
+               return;
        }
 
        __blk_mq_requeue_request(rq);
@@ -1252,10 +1338,11 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
                *cookie = BLK_QC_T_NONE;
                rq->errors = -EIO;
                blk_mq_end_request(rq, rq->errors);
-               return 0;
+               return;
        }
 
-       return -1;
+insert:
+       blk_mq_insert_request(rq, false, true, true);
 }
 
 /*
@@ -1265,14 +1352,15 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
  */
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
-       const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf);
+       const int is_sync = op_is_sync(bio->bi_opf);
        const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
-       struct blk_map_ctx data;
+       struct blk_mq_alloc_data data;
        struct request *rq;
-       unsigned int request_count = 0;
+       unsigned int request_count = 0, srcu_idx;
        struct blk_plug *plug;
        struct request *same_queue_rq = NULL;
        blk_qc_t cookie;
+       unsigned int wb_acct;
 
        blk_queue_bounce(q, &bio);
 
@@ -1287,9 +1375,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
            blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
                return BLK_QC_T_NONE;
 
+       wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+
        rq = blk_mq_map_request(q, bio, &data);
-       if (unlikely(!rq))
+       if (unlikely(!rq)) {
+               __wbt_done(q->rq_wb, wb_acct);
                return BLK_QC_T_NONE;
+       }
+
+       wbt_track(&rq->issue_stat, wb_acct);
 
        cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
 
@@ -1312,7 +1406,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                blk_mq_bio_to_request(rq, bio);
 
                /*
-                * We do limited pluging. If the bio can be merged, do that.
+                * We do limited plugging. If the bio can be merged, do that.
                 * Otherwise the existing request in the plug list will be
                 * issued. So the plug list will have one request at most
                 */
@@ -1332,9 +1426,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                blk_mq_put_ctx(data.ctx);
                if (!old_rq)
                        goto done;
-               if (!blk_mq_direct_issue_request(old_rq, &cookie))
-                       goto done;
-               blk_mq_insert_request(old_rq, false, true, true);
+
+               if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
+                       rcu_read_lock();
+                       blk_mq_try_issue_directly(old_rq, &cookie);
+                       rcu_read_unlock();
+               } else {
+                       srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
+                       blk_mq_try_issue_directly(old_rq, &cookie);
+                       srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
+               }
                goto done;
        }
 
@@ -1359,13 +1460,14 @@ done:
  */
 static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 {
-       const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf);
+       const int is_sync = op_is_sync(bio->bi_opf);
        const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
        struct blk_plug *plug;
        unsigned int request_count = 0;
-       struct blk_map_ctx data;
+       struct blk_mq_alloc_data data;
        struct request *rq;
        blk_qc_t cookie;
+       unsigned int wb_acct;
 
        blk_queue_bounce(q, &bio);
 
@@ -1382,9 +1484,15 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
        } else
                request_count = blk_plug_queued_count(q);
 
+       wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+
        rq = blk_mq_map_request(q, bio, &data);
-       if (unlikely(!rq))
+       if (unlikely(!rq)) {
+               __wbt_done(q->rq_wb, wb_acct);
                return BLK_QC_T_NONE;
+       }
+
+       wbt_track(&rq->issue_stat, wb_acct);
 
        cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
 
@@ -1401,13 +1509,25 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
         */
        plug = current->plug;
        if (plug) {
+               struct request *last = NULL;
+
                blk_mq_bio_to_request(rq, bio);
+
+               /*
+                * @request_count may become stale if the task has been
+                * scheduled out, so check the list again.
+                */
+               if (list_empty(&plug->mq_list))
+                       request_count = 0;
                if (!request_count)
                        trace_block_plug(q);
+               else
+                       last = list_entry_rq(plug->mq_list.prev);
 
                blk_mq_put_ctx(data.ctx);
 
-               if (request_count >= BLK_MAX_REQUEST_COUNT) {
+               if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
+                   blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
                        blk_flush_plug_list(plug, false);
                        trace_block_plug(q);
                }
@@ -1485,7 +1605,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
        INIT_LIST_HEAD(&tags->page_list);
 
        tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
-                                GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
+                                GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                                 set->numa_node);
        if (!tags->rqs) {
                blk_mq_free_tags(tags);
@@ -1511,7 +1631,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 
                do {
                        page = alloc_pages_node(set->numa_node,
-                               GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
+                               GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
                                this_order);
                        if (page)
                                break;
@@ -1532,7 +1652,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
                 * Allow kmemleak to scan these pages as they contain pointers
                 * to additional allocations like via ops->init_request().
                 */
-               kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL);
+               kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
                entries_per_page = order_to_size(this_order) / rq_size;
                to_do = min(entries_per_page, set->queue_depth - i);
                left -= to_do * rq_size;
@@ -1613,6 +1733,9 @@ static void blk_mq_exit_hctx(struct request_queue *q,
        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);
 
+       if (hctx->flags & BLK_MQ_F_BLOCKING)
+               cleanup_srcu_struct(&hctx->queue_rq_srcu);
+
        blk_mq_remove_cpuhp(hctx);
        blk_free_flush_queue(hctx->fq);
        sbitmap_free(&hctx->ctx_map);
@@ -1693,6 +1816,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
                                   flush_start_tag + hctx_idx, node))
                goto free_fq;
 
+       if (hctx->flags & BLK_MQ_F_BLOCKING)
+               init_srcu_struct(&hctx->queue_rq_srcu);
+
        return 0;
 
  free_fq:
@@ -1723,6 +1849,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
                spin_lock_init(&__ctx->lock);
                INIT_LIST_HEAD(&__ctx->rq_list);
                __ctx->queue = q;
+               blk_stat_init(&__ctx->stat[BLK_STAT_READ]);
+               blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]);
 
                /* If the cpu isn't online, the cpu is mapped to first hctx */
                if (!cpu_online(i))
@@ -1742,7 +1870,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 static void blk_mq_map_swqueue(struct request_queue *q,
                               const struct cpumask *online_mask)
 {
-       unsigned int i;
+       unsigned int i, hctx_idx;
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        struct blk_mq_tag_set *set = q->tag_set;
@@ -1765,6 +1893,21 @@ static void blk_mq_map_swqueue(struct request_queue *q,
                if (!cpumask_test_cpu(i, online_mask))
                        continue;
 
+               hctx_idx = q->mq_map[i];
+               /* unmapped hw queue can be remapped after CPU topo changed */
+               if (!set->tags[hctx_idx]) {
+                       set->tags[hctx_idx] = blk_mq_init_rq_map(set, hctx_idx);
+
+                       /*
+                        * If tag initialization fails for some hctx,
+                        * that hctx won't be brought online.  In this
+                        * case, remap the current ctx to hctx[0], which
+                        * is guaranteed to always have tags allocated.
+                        */
+                       if (!set->tags[hctx_idx])
+                               q->mq_map[i] = 0;
+               }
+
                ctx = per_cpu_ptr(q->queue_ctx, i);
                hctx = blk_mq_map_queue(q, i);
 
@@ -1781,7 +1924,11 @@ static void blk_mq_map_swqueue(struct request_queue *q,
                 * disable it and free the request entries.
                 */
                if (!hctx->nr_ctx) {
-                       if (set->tags[i]) {
+                       /* Never unmap queue 0.  We need it as a
+                        * fallback in case tag allocation fails for
+                        * a newly remapped hw queue.
+                        */
+                       if (i && set->tags[i]) {
                                blk_mq_free_rq_map(set, set->tags[i], i);
                                set->tags[i] = NULL;
                        }
@@ -1789,9 +1936,6 @@ static void blk_mq_map_swqueue(struct request_queue *q,
                        continue;
                }
 
-               /* unmapped hw queue can be remapped after CPU topo changed */
-               if (!set->tags[i])
-                       set->tags[i] = blk_mq_init_rq_map(set, i);
                hctx->tags = set->tags[i];
                WARN_ON(!hctx->tags);
 
@@ -2018,6 +2162,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
         */
        q->nr_requests = set->queue_depth;
 
+       /*
+        * Default to classic polling
+        */
+       q->poll_nsec = -1;
+
        if (set->ops->complete)
                blk_queue_softirq_done(q, set->ops->complete);
 
@@ -2053,6 +2202,8 @@ void blk_mq_free_queue(struct request_queue *q)
        list_del_init(&q->all_q_node);
        mutex_unlock(&all_q_mutex);
 
+       wbt_exit(q);
+
        blk_mq_del_queue_tag_set(q);
 
        blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
@@ -2099,16 +2250,9 @@ static void blk_mq_queue_reinit_work(void)
         */
        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_freeze_queue_start(q);
-       list_for_each_entry(q, &all_q_list, all_q_node) {
+       list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_freeze_queue_wait(q);
 
-               /*
-                * timeout handler can't touch hw queue during the
-                * reinitialization
-                */
-               del_timer_sync(&q->timeout);
-       }
-
        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_queue_reinit(q, &cpuhp_online_new);
 
@@ -2353,6 +2497,165 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
+static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
+                                      struct blk_mq_hw_ctx *hctx,
+                                      struct request *rq)
+{
+       struct blk_rq_stat stat[2];
+       unsigned long ret = 0;
+
+       /*
+        * If stats collection isn't on, don't sleep but turn it on for
+        * future users
+        */
+       if (!blk_stat_enable(q))
+               return 0;
+
+       /*
+        * We don't have to do this once per IO; this should be optimized
+        * to just use the current window of stats until it changes.
+        */
+       memset(&stat, 0, sizeof(stat));
+       blk_hctx_stat_get(hctx, stat);
+
+       /*
+        * As an optimistic guess, use half of the mean service time
+        * for this type of request. We can (and should) make this smarter.
+        * For instance, if the completion latencies are tight, we can
+        * get closer than just half the mean. This is especially
+        * important on devices where the completion latencies are longer
+        * than ~10 usec.
+        */
+       if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples)
+               ret = (stat[BLK_STAT_READ].mean + 1) / 2;
+       else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples)
+               ret = (stat[BLK_STAT_WRITE].mean + 1) / 2;
+
+       return ret;
+}
+
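
Worked example of the heuristic above, with illustrative numbers: if the current stats window reports a mean read completion time of 8000 ns, blk_mq_poll_nsecs() returns (8000 + 1) / 2 = 4000 ns, so hybrid polling sleeps roughly 4 usec before falling back to busy polling.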
+static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
+                                    struct blk_mq_hw_ctx *hctx,
+                                    struct request *rq)
+{
+       struct hrtimer_sleeper hs;
+       enum hrtimer_mode mode;
+       unsigned int nsecs;
+       ktime_t kt;
+
+       if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+               return false;
+
+       /*
+        * poll_nsec can be:
+        *
+        * -1:  don't ever hybrid sleep
+        *  0:  use half of prev avg
+        * >0:  use this specific value
+        */
+       if (q->poll_nsec == -1)
+               return false;
+       else if (q->poll_nsec > 0)
+               nsecs = q->poll_nsec;
+       else
+               nsecs = blk_mq_poll_nsecs(q, hctx, rq);
+
+       if (!nsecs)
+               return false;
+
+       set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+
+       /*
+        * This will be replaced with the stats tracking code, using
+        * 'avg_completion_time / 2' as the pre-sleep target.
+        */
+       kt = ktime_set(0, nsecs);
+
+       mode = HRTIMER_MODE_REL;
+       hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
+       hrtimer_set_expires(&hs.timer, kt);
+
+       hrtimer_init_sleeper(&hs, current);
+       do {
+               if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
+                       break;
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               hrtimer_start_expires(&hs.timer, mode);
+               if (hs.task)
+                       io_schedule();
+               hrtimer_cancel(&hs.timer);
+               mode = HRTIMER_MODE_ABS;
+       } while (hs.task && !signal_pending(current));
+
+       __set_current_state(TASK_RUNNING);
+       destroy_hrtimer_on_stack(&hs.timer);
+       return true;
+}
+
+static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+       struct request_queue *q = hctx->queue;
+       long state;
+
+       /*
+        * If we sleep, have the caller restart the poll loop to reset
+        * the state. Like for the other success return cases, the
+        * caller is responsible for checking if the IO completed. If
+        * the IO isn't complete, we'll get called again and will go
+        * straight to the busy poll loop.
+        */
+       if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
+               return true;
+
+       hctx->poll_considered++;
+
+       state = current->state;
+       while (!need_resched()) {
+               int ret;
+
+               hctx->poll_invoked++;
+
+               ret = q->mq_ops->poll(hctx, rq->tag);
+               if (ret > 0) {
+                       hctx->poll_success++;
+                       set_current_state(TASK_RUNNING);
+                       return true;
+               }
+
+               if (signal_pending_state(state, current))
+                       set_current_state(TASK_RUNNING);
+
+               if (current->state == TASK_RUNNING)
+                       return true;
+               if (ret < 0)
+                       break;
+               cpu_relax();
+       }
+
+       return false;
+}
+
+bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
+{
+       struct blk_mq_hw_ctx *hctx;
+       struct blk_plug *plug;
+       struct request *rq;
+
+       if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
+           !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+               return false;
+
+       plug = current->plug;
+       if (plug)
+               blk_flush_plug_list(plug, false);
+
+       hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
+       rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+
+       return __blk_mq_poll(hctx, rq);
+}
+EXPORT_SYMBOL_GPL(blk_mq_poll);
+
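
For context on what blk_mq_poll() expects from a driver, a hedged sketch with hypothetical names (only the .poll hook is shown; the queue must also have QUEUE_FLAG_POLL set for blk_mq_poll() to do anything):

static int example_mq_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
{
	/*
	 * Reap the device's completion queue here: return > 0 when
	 * completions were found (ends the poll loop), 0 to keep
	 * polling, or < 0 to give up and let the caller sleep.
	 */
	return 0;
}

static struct blk_mq_ops example_mq_ops = {
	/* .queue_rq etc. omitted for brevity */
	.poll	= example_mq_poll,
};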
 void blk_mq_disable_hotplug(void)
 {
        mutex_lock(&all_q_mutex);