block: deal with stale req count of plug list
diff --git a/block/blk-mq.c b/block/blk-mq.c
index dc5f47f6093166f09d9c875aae79db2031e41acf..f39e69c732cc628c7fa54802160a2c495b28e87d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -30,6 +30,8 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
+#include "blk-stat.h"
+#include "blk-wbt.h"
 
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
@@ -115,6 +117,33 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 
+/**
+ * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished
+ * @q: request queue.
+ *
+ * Note: this function does not prevent the struct request end_io()
+ * callback from being invoked. Additionally, new queue_rq() calls are not
+ * prevented unless the queue has been stopped first.
+ */
+void blk_mq_quiesce_queue(struct request_queue *q)
+{
+       struct blk_mq_hw_ctx *hctx;
+       unsigned int i;
+       bool rcu = false;
+
+       blk_mq_stop_hw_queues(q);
+
+       queue_for_each_hw_ctx(q, hctx, i) {
+               if (hctx->flags & BLK_MQ_F_BLOCKING)
+                       synchronize_srcu(&hctx->queue_rq_srcu);
+               else
+                       rcu = true;
+       }
+       if (rcu)
+               synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
+
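A hedged usage sketch, not part of the patch (my_driver_pause_and_update is a hypothetical caller): a driver that must not race with in-flight ->queue_rq() calls could pair the new helper with blk_mq_start_stopped_hw_queues():

	static void my_driver_pause_and_update(struct request_queue *q)
	{
		/* stops the hw queues and waits for ongoing ->queue_rq() calls */
		blk_mq_quiesce_queue(q);

		/* ... safely update state that ->queue_rq() dereferences ... */

		/* restart the hw queues that blk_mq_quiesce_queue() stopped */
		blk_mq_start_stopped_hw_queues(q, true);
	}
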
 void blk_mq_wake_waiters(struct request_queue *q)
 {
        struct blk_mq_hw_ctx *hctx;
@@ -139,17 +168,15 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 EXPORT_SYMBOL(blk_mq_can_queue);
 
 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
-                              struct request *rq, int op,
-                              unsigned int op_flags)
+                              struct request *rq, unsigned int op)
 {
-       if (blk_queue_io_stat(q))
-               op_flags |= REQ_IO_STAT;
-
        INIT_LIST_HEAD(&rq->queuelist);
        /* csd/requeue_work/fifo_time is initialized before use */
        rq->q = q;
        rq->mq_ctx = ctx;
-       req_set_op_attrs(rq, op, op_flags);
+       rq->cmd_flags = op;
+       if (blk_queue_io_stat(q))
+               rq->rq_flags |= RQF_IO_STAT;
        /* do not touch atomic flags, it needs atomic ops against the timer */
        rq->cpu = -1;
        INIT_HLIST_NODE(&rq->hash);
@@ -184,11 +211,11 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
        rq->end_io_data = NULL;
        rq->next_rq = NULL;
 
-       ctx->rq_dispatched[rw_is_sync(op, op_flags)]++;
+       ctx->rq_dispatched[op_is_sync(op)]++;
 }
 
 static struct request *
-__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags)
+__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op)
 {
        struct request *rq;
        unsigned int tag;
@@ -198,12 +225,12 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags)
                rq = data->hctx->tags->rqs[tag];
 
                if (blk_mq_tag_busy(data->hctx)) {
-                       rq->cmd_flags = REQ_MQ_INFLIGHT;
+                       rq->rq_flags = RQF_MQ_INFLIGHT;
                        atomic_inc(&data->hctx->nr_active);
                }
 
                rq->tag = tag;
-               blk_mq_rq_ctx_init(data->q, data->ctx, rq, op, op_flags);
+               blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
                return rq;
        }
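For orientation, a summary of the flag split these hunks implement (no new behaviour; all flag names appear in this diff):

	/*
	 * rq->cmd_flags : the single "op" word passed in above, i.e. the
	 *                 REQ_OP_* opcode plus REQ_* modifiers such as REQ_SYNC.
	 * rq->rq_flags  : block-layer-internal RQF_* state such as RQF_IO_STAT,
	 *                 RQF_MQ_INFLIGHT and RQF_SOFTBARRIER.
	 */
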
 
@@ -224,9 +251,9 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
                return ERR_PTR(ret);
 
        ctx = blk_mq_get_ctx(q);
-       hctx = q->mq_ops->map_queue(q, ctx->cpu);
+       hctx = blk_mq_map_queue(q, ctx->cpu);
        blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-       rq = __blk_mq_alloc_request(&alloc_data, rw, 0);
+       rq = __blk_mq_alloc_request(&alloc_data, rw);
        blk_mq_put_ctx(ctx);
 
        if (!rq) {
@@ -278,7 +305,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
        ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
 
        blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-       rq = __blk_mq_alloc_request(&alloc_data, rw, 0);
+       rq = __blk_mq_alloc_request(&alloc_data, rw);
        if (!rq) {
                ret = -EWOULDBLOCK;
                goto out_queue_exit;
@@ -298,9 +325,11 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
        const int tag = rq->tag;
        struct request_queue *q = rq->q;
 
-       if (rq->cmd_flags & REQ_MQ_INFLIGHT)
+       if (rq->rq_flags & RQF_MQ_INFLIGHT)
                atomic_dec(&hctx->nr_active);
-       rq->cmd_flags = 0;
+
+       wbt_done(q->rq_wb, &rq->issue_stat);
+       rq->rq_flags = 0;
 
        clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
        blk_mq_put_tag(hctx, ctx, tag);
@@ -319,11 +348,7 @@ EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
 
 void blk_mq_free_request(struct request *rq)
 {
-       struct blk_mq_hw_ctx *hctx;
-       struct request_queue *q = rq->q;
-
-       hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
-       blk_mq_free_hctx_request(hctx, rq);
+       blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
@@ -332,6 +357,7 @@ inline void __blk_mq_end_request(struct request *rq, int error)
        blk_account_io_done(rq);
 
        if (rq->end_io) {
+               wbt_done(rq->q->rq_wb, &rq->issue_stat);
                rq->end_io(rq, error);
        } else {
                if (unlikely(blk_bidi_rq(rq)))
@@ -382,10 +408,27 @@ static void blk_mq_ipi_complete_request(struct request *rq)
        put_cpu();
 }
 
+static void blk_mq_stat_add(struct request *rq)
+{
+       if (rq->rq_flags & RQF_STATS) {
+               /*
+                * We could use rq->mq_ctx here, but there's less of a risk
+                * of races if we have the completion event add the stats
+                * to the local software queue.
+                */
+               struct blk_mq_ctx *ctx;
+
+               ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id());
+               blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq);
+       }
+}
+
 static void __blk_mq_complete_request(struct request *rq)
 {
        struct request_queue *q = rq->q;
 
+       blk_mq_stat_add(rq);
+
        if (!q->softirq_done_fn)
                blk_mq_end_request(rq, rq->errors);
        else
@@ -429,6 +472,12 @@ void blk_mq_start_request(struct request *rq)
        if (unlikely(blk_bidi_rq(rq)))
                rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
 
+       if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
+               blk_stat_set_issue_time(&rq->issue_stat);
+               rq->rq_flags |= RQF_STATS;
+               wbt_issue(q->rq_wb, &rq->issue_stat);
+       }
+
        blk_add_timer(rq);
 
        /*
@@ -464,6 +513,7 @@ static void __blk_mq_requeue_request(struct request *rq)
        struct request_queue *q = rq->q;
 
        trace_block_rq_requeue(q, rq);
+       wbt_requeue(q->rq_wb, &rq->issue_stat);
 
        if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
                if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -471,12 +521,12 @@ static void __blk_mq_requeue_request(struct request *rq)
        }
 }
 
-void blk_mq_requeue_request(struct request *rq)
+void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 {
        __blk_mq_requeue_request(rq);
 
        BUG_ON(blk_queued_rq(rq));
-       blk_mq_add_to_requeue_list(rq, true);
+       blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
 }
 EXPORT_SYMBOL(blk_mq_requeue_request);
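A hedged caller-side sketch (not from this patch): the new kick_requeue_list argument lets a driver batch several requeues and kick the list once at the end:

	blk_mq_requeue_request(rq, false);	/* queue it, do not kick yet */
	/* ... requeue further requests the same way ... */
	blk_mq_kick_requeue_list(q);		/* run the requeue work once */
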
 
@@ -493,10 +543,10 @@ static void blk_mq_requeue_work(struct work_struct *work)
        spin_unlock_irqrestore(&q->requeue_lock, flags);
 
        list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
-               if (!(rq->cmd_flags & REQ_SOFTBARRIER))
+               if (!(rq->rq_flags & RQF_SOFTBARRIER))
                        continue;
 
-               rq->cmd_flags &= ~REQ_SOFTBARRIER;
+               rq->rq_flags &= ~RQF_SOFTBARRIER;
                list_del_init(&rq->queuelist);
                blk_mq_insert_request(rq, true, false, false);
        }
@@ -507,14 +557,11 @@ static void blk_mq_requeue_work(struct work_struct *work)
                blk_mq_insert_request(rq, false, false, false);
        }
 
-       /*
-        * Use the start variant of queue running here, so that running
-        * the requeue work will kick stopped queues.
-        */
-       blk_mq_start_hw_queues(q);
+       blk_mq_run_hw_queues(q, false);
 }
 
-void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
+void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
+                               bool kick_requeue_list)
 {
        struct request_queue *q = rq->q;
        unsigned long flags;
@@ -523,24 +570,21 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
         * We abuse this flag that is otherwise used by the I/O scheduler to
         * request head insertion from the workqueue.
         */
-       BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
+       BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
 
        spin_lock_irqsave(&q->requeue_lock, flags);
        if (at_head) {
-               rq->cmd_flags |= REQ_SOFTBARRIER;
+               rq->rq_flags |= RQF_SOFTBARRIER;
                list_add(&rq->queuelist, &q->requeue_list);
        } else {
                list_add_tail(&rq->queuelist, &q->requeue_list);
        }
        spin_unlock_irqrestore(&q->requeue_lock, flags);
-}
-EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 
-void blk_mq_cancel_requeue_work(struct request_queue *q)
-{
-       cancel_delayed_work_sync(&q->requeue_work);
+       if (kick_requeue_list)
+               blk_mq_kick_requeue_list(q);
 }
-EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work);
+EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 
 void blk_mq_kick_requeue_list(struct request_queue *q)
 {
@@ -782,7 +826,7 @@ static inline unsigned int queued_to_index(unsigned int queued)
  * of IO. In particular, we'd like FIFO behaviour on handling existing
  * items on the hctx->dispatch list. Ignore that for now.
  */
-static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
+static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
 {
        struct request_queue *q = hctx->queue;
        struct request *rq;
@@ -791,12 +835,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
        struct list_head *dptr;
        int queued;
 
-       if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
+       if (unlikely(blk_mq_hctx_stopped(hctx)))
                return;
 
-       WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
-               cpu_online(hctx->next_cpu));
-
        hctx->run++;
 
        /*
@@ -887,6 +928,24 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
        }
 }
 
+static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
+{
+       int srcu_idx;
+
+       WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
+               cpu_online(hctx->next_cpu));
+
+       if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
+               rcu_read_lock();
+               blk_mq_process_rq_list(hctx);
+               rcu_read_unlock();
+       } else {
+               srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
+               blk_mq_process_rq_list(hctx);
+               srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
+       }
+}
+
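The read-side protection above pairs with the new blk_mq_quiesce_queue(); a sketch of the pattern, summarising code already in this diff:

	/*
	 * dispatch (reader)                        blk_mq_quiesce_queue() (writer)
	 *
	 * rcu_read_lock() or                       blk_mq_stop_hw_queues(q);
	 *   srcu_read_lock(&hctx->queue_rq_srcu)
	 * ->queue_rq()                             synchronize_rcu() or
	 * rcu_read_unlock() or                       synchronize_srcu(&hctx->queue_rq_srcu)
	 *   srcu_read_unlock(...)
	 *
	 * BLK_MQ_F_BLOCKING selects SRCU because ->queue_rq() may sleep there.
	 * Once the synchronize call returns, no ->queue_rq() that started before
	 * the quiesce can still be running.
	 */
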
 /*
  * It'd be great if the workqueue API had a way to pass
  * in a mask and had some smarts for more clever placement.
@@ -899,7 +958,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
                return WORK_CPU_UNBOUND;
 
        if (--hctx->next_cpu_batch <= 0) {
-               int cpu = hctx->next_cpu, next_cpu;
+               int next_cpu;
 
                next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
                if (next_cpu >= nr_cpu_ids)
@@ -907,8 +966,6 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 
                hctx->next_cpu = next_cpu;
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
-
-               return cpu;
        }
 
        return hctx->next_cpu;
@@ -916,8 +973,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
-       if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
-           !blk_mq_hw_queue_mapped(hctx)))
+       if (unlikely(blk_mq_hctx_stopped(hctx) ||
+                    !blk_mq_hw_queue_mapped(hctx)))
                return;
 
        if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
@@ -942,7 +999,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
        queue_for_each_hw_ctx(q, hctx, i) {
                if ((!blk_mq_hctx_has_pending(hctx) &&
                    list_empty_careful(&hctx->dispatch)) ||
-                   test_bit(BLK_MQ_S_STOPPED, &hctx->state))
+                   blk_mq_hctx_stopped(hctx))
                        continue;
 
                blk_mq_run_hw_queue(hctx, async);
@@ -950,6 +1007,26 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 }
 EXPORT_SYMBOL(blk_mq_run_hw_queues);
 
+/**
+ * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
+ * @q: request queue.
+ *
+ * The caller is responsible for serializing this function against
+ * blk_mq_{start,stop}_hw_queue().
+ */
+bool blk_mq_queue_stopped(struct request_queue *q)
+{
+       struct blk_mq_hw_ctx *hctx;
+       int i;
+
+       queue_for_each_hw_ctx(q, hctx, i)
+               if (blk_mq_hctx_stopped(hctx))
+                       return true;
+
+       return false;
+}
+EXPORT_SYMBOL(blk_mq_queue_stopped);
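A minimal, hedged usage sketch (the surrounding driver logic is hypothetical):

	/* only restart if something was actually stopped */
	if (blk_mq_queue_stopped(q))
		blk_mq_start_stopped_hw_queues(q, true);
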
+
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
        cancel_work(&hctx->run_work);
@@ -992,7 +1069,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
        int i;
 
        queue_for_each_hw_ctx(q, hctx, i) {
-               if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
+               if (!blk_mq_hctx_stopped(hctx))
                        continue;
 
                clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
@@ -1058,9 +1135,7 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
 {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct request_queue *q = rq->q;
-       struct blk_mq_hw_ctx *hctx;
-
-       hctx = q->mq_ops->map_queue(q, ctx->cpu);
+       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 
        spin_lock(&ctx->lock);
        __blk_mq_insert_request(hctx, rq, at_head);
@@ -1077,12 +1152,10 @@ static void blk_mq_insert_requests(struct request_queue *q,
                                     bool from_schedule)
 
 {
-       struct blk_mq_hw_ctx *hctx;
+       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 
        trace_block_unplug(q, depth, !from_schedule);
 
-       hctx = q->mq_ops->map_queue(q, ctx->cpu);
-
        /*
         * preemption doesn't flush plug list, so it's possible ctx->cpu is
         * offline now
@@ -1198,45 +1271,31 @@ insert_rq:
        }
 }
 
-struct blk_map_ctx {
-       struct blk_mq_hw_ctx *hctx;
-       struct blk_mq_ctx *ctx;
-};
-
 static struct request *blk_mq_map_request(struct request_queue *q,
                                          struct bio *bio,
-                                         struct blk_map_ctx *data)
+                                         struct blk_mq_alloc_data *data)
 {
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        struct request *rq;
-       int op = bio_data_dir(bio);
-       int op_flags = 0;
-       struct blk_mq_alloc_data alloc_data;
 
        blk_queue_enter_live(q);
        ctx = blk_mq_get_ctx(q);
-       hctx = q->mq_ops->map_queue(q, ctx->cpu);
+       hctx = blk_mq_map_queue(q, ctx->cpu);
 
-       if (rw_is_sync(bio_op(bio), bio->bi_opf))
-               op_flags |= REQ_SYNC;
+       trace_block_getrq(q, bio, bio->bi_opf);
+       blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
+       rq = __blk_mq_alloc_request(data, bio->bi_opf);
 
-       trace_block_getrq(q, bio, op);
-       blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
-       rq = __blk_mq_alloc_request(&alloc_data, op, op_flags);
-
-       hctx->queued++;
-       data->hctx = hctx;
-       data->ctx = ctx;
+       data->hctx->queued++;
        return rq;
 }
 
-static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
+static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 {
        int ret;
        struct request_queue *q = rq->q;
-       struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q,
-                       rq->mq_ctx->cpu);
+       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
        struct blk_mq_queue_data bd = {
                .rq = rq,
                .list = NULL,
@@ -1244,6 +1303,9 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
        };
        blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
 
+       if (blk_mq_hctx_stopped(hctx))
+               goto insert;
+
        /*
         * For OK queue, we are done. For error, kill it. Any other
         * error (busy), just add it to our list as we previously
@@ -1252,7 +1314,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
        ret = q->mq_ops->queue_rq(hctx, &bd);
        if (ret == BLK_MQ_RQ_QUEUE_OK) {
                *cookie = new_cookie;
-               return 0;
+               return;
        }
 
        __blk_mq_requeue_request(rq);
@@ -1261,10 +1323,11 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
                *cookie = BLK_QC_T_NONE;
                rq->errors = -EIO;
                blk_mq_end_request(rq, rq->errors);
-               return 0;
+               return;
        }
 
-       return -1;
+insert:
+       blk_mq_insert_request(rq, false, true, true);
 }
 
 /*
@@ -1274,14 +1337,15 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
  */
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
-       const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf);
+       const int is_sync = op_is_sync(bio->bi_opf);
        const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
-       struct blk_map_ctx data;
+       struct blk_mq_alloc_data data;
        struct request *rq;
-       unsigned int request_count = 0;
+       unsigned int request_count = 0, srcu_idx;
        struct blk_plug *plug;
        struct request *same_queue_rq = NULL;
        blk_qc_t cookie;
+       unsigned int wb_acct;
 
        blk_queue_bounce(q, &bio);
 
@@ -1296,9 +1360,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
            blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
                return BLK_QC_T_NONE;
 
+       wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+
        rq = blk_mq_map_request(q, bio, &data);
-       if (unlikely(!rq))
+       if (unlikely(!rq)) {
+               __wbt_done(q->rq_wb, wb_acct);
                return BLK_QC_T_NONE;
+       }
+
+       wbt_track(&rq->issue_stat, wb_acct);
 
        cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
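For orientation, the writeback-throttling (wbt) hooks added across this diff pair up as follows; this is a lifecycle sketch, not a literal code path:

	wb_acct = wbt_wait(q->rq_wb, bio, NULL);   /* may throttle the submitter   */
	rq = blk_mq_map_request(q, bio, &data);
	wbt_track(&rq->issue_stat, wb_acct);       /* attach accounting to the rq  */
	                                           /* (__wbt_done() if no rq)      */
	wbt_issue(q->rq_wb, &rq->issue_stat);      /* blk_mq_start_request()       */
	wbt_requeue(q->rq_wb, &rq->issue_stat);    /* __blk_mq_requeue_request()   */
	wbt_done(q->rq_wb, &rq->issue_stat);       /* request free / end_io path   */
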
 
@@ -1321,7 +1391,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                blk_mq_bio_to_request(rq, bio);
 
                /*
-                * We do limited pluging. If the bio can be merged, do that.
+                * We do limited plugging. If the bio can be merged, do that.
                 * Otherwise the existing request in the plug list will be
                 * issued. So the plug list will have one request at most
                 */
@@ -1341,9 +1411,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                blk_mq_put_ctx(data.ctx);
                if (!old_rq)
                        goto done;
-               if (!blk_mq_direct_issue_request(old_rq, &cookie))
-                       goto done;
-               blk_mq_insert_request(old_rq, false, true, true);
+
+               if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
+                       rcu_read_lock();
+                       blk_mq_try_issue_directly(old_rq, &cookie);
+                       rcu_read_unlock();
+               } else {
+                       srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
+                       blk_mq_try_issue_directly(old_rq, &cookie);
+                       srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
+               }
                goto done;
        }
 
@@ -1368,13 +1445,14 @@ done:
  */
 static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 {
-       const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf);
+       const int is_sync = op_is_sync(bio->bi_opf);
        const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
        struct blk_plug *plug;
        unsigned int request_count = 0;
-       struct blk_map_ctx data;
+       struct blk_mq_alloc_data data;
        struct request *rq;
        blk_qc_t cookie;
+       unsigned int wb_acct;
 
        blk_queue_bounce(q, &bio);
 
@@ -1391,9 +1469,15 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
        } else
                request_count = blk_plug_queued_count(q);
 
+       wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+
        rq = blk_mq_map_request(q, bio, &data);
-       if (unlikely(!rq))
+       if (unlikely(!rq)) {
+               __wbt_done(q->rq_wb, wb_acct);
                return BLK_QC_T_NONE;
+       }
+
+       wbt_track(&rq->issue_stat, wb_acct);
 
        cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
 
@@ -1410,13 +1494,25 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
         */
        plug = current->plug;
        if (plug) {
+               struct request *last = NULL;
+
                blk_mq_bio_to_request(rq, bio);
+
+               /*
+                * @request_count may become stale because of schedule
+                * out, so check the list again.
+                */
+               if (list_empty(&plug->mq_list))
+                       request_count = 0;
                if (!request_count)
                        trace_block_plug(q);
+               else
+                       last = list_entry_rq(plug->mq_list.prev);
 
                blk_mq_put_ctx(data.ctx);
 
-               if (request_count >= BLK_MAX_REQUEST_COUNT) {
+               if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
+                   blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
                        blk_flush_plug_list(plug, false);
                        trace_block_plug(q);
                }
@@ -1440,15 +1536,6 @@ run_queue:
        return cookie;
 }
 
-/*
- * Default mapping to a software queue, since we use one per CPU.
- */
-struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
-{
-       return q->queue_hw_ctx[q->mq_map[cpu]];
-}
-EXPORT_SYMBOL(blk_mq_map_queue);
-
 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
                struct blk_mq_tags *tags, unsigned int hctx_idx)
 {
@@ -1581,11 +1668,13 @@ fail:
  * software queue to the hw queue dispatch list, and ensure that it
  * gets run.
  */
-static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
+static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 {
+       struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        LIST_HEAD(tmp);
 
+       hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
        ctx = __blk_mq_get_ctx(hctx->queue, cpu);
 
        spin_lock(&ctx->lock);
@@ -1596,30 +1685,20 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
        spin_unlock(&ctx->lock);
 
        if (list_empty(&tmp))
-               return NOTIFY_OK;
+               return 0;
 
        spin_lock(&hctx->lock);
        list_splice_tail_init(&tmp, &hctx->dispatch);
        spin_unlock(&hctx->lock);
 
        blk_mq_run_hw_queue(hctx, true);
-       return NOTIFY_OK;
+       return 0;
 }
 
-static int blk_mq_hctx_notify(void *data, unsigned long action,
-                             unsigned int cpu)
+static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
 {
-       struct blk_mq_hw_ctx *hctx = data;
-
-       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
-               return blk_mq_hctx_cpu_offline(hctx, cpu);
-
-       /*
-        * In case of CPU online, tags may be reallocated
-        * in blk_mq_map_swqueue() after mapping is updated.
-        */
-
-       return NOTIFY_OK;
+       cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
+                                           &hctx->cpuhp_dead);
 }
 
 /* hctx->ctxs will be freed in queue's release handler */
@@ -1639,7 +1718,10 @@ static void blk_mq_exit_hctx(struct request_queue *q,
        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);
 
-       blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
+       if (hctx->flags & BLK_MQ_F_BLOCKING)
+               cleanup_srcu_struct(&hctx->queue_rq_srcu);
+
+       blk_mq_remove_cpuhp(hctx);
        blk_free_flush_queue(hctx->fq);
        sbitmap_free(&hctx->ctx_map);
 }
@@ -1686,9 +1768,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
        hctx->queue_num = hctx_idx;
        hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
 
-       blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
-                                       blk_mq_hctx_notify, hctx);
-       blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
+       cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
 
        hctx->tags = set->tags[hctx_idx];
 
@@ -1721,6 +1801,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
                                   flush_start_tag + hctx_idx, node))
                goto free_fq;
 
+       if (hctx->flags & BLK_MQ_F_BLOCKING)
+               init_srcu_struct(&hctx->queue_rq_srcu);
+
        return 0;
 
  free_fq:
@@ -1733,8 +1816,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
  free_ctxs:
        kfree(hctx->ctxs);
  unregister_cpu_notifier:
-       blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
-
+       blk_mq_remove_cpuhp(hctx);
        return -1;
 }
 
@@ -1752,12 +1834,14 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
                spin_lock_init(&__ctx->lock);
                INIT_LIST_HEAD(&__ctx->rq_list);
                __ctx->queue = q;
+               blk_stat_init(&__ctx->stat[BLK_STAT_READ]);
+               blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]);
 
                /* If the cpu isn't online, the cpu is mapped to first hctx */
                if (!cpu_online(i))
                        continue;
 
-               hctx = q->mq_ops->map_queue(q, i);
+               hctx = blk_mq_map_queue(q, i);
 
                /*
                 * Set local node, IFF we have more than one hw queue. If
@@ -1795,7 +1879,7 @@ static void blk_mq_map_swqueue(struct request_queue *q,
                        continue;
 
                ctx = per_cpu_ptr(q->queue_ctx, i);
-               hctx = q->mq_ops->map_queue(q, i);
+               hctx = blk_mq_map_queue(q, i);
 
                cpumask_set_cpu(i, hctx->cpumask);
                ctx->index_hw = hctx->nr_ctx;
@@ -1824,7 +1908,6 @@ static void blk_mq_map_swqueue(struct request_queue *q,
                hctx->tags = set->tags[i];
                WARN_ON(!hctx->tags);
 
-               cpumask_copy(hctx->tags->cpumask, hctx->cpumask);
                /*
                 * Set the map size to the number of mapped software queues.
                 * This is more accurate and more efficient than looping
@@ -1918,7 +2001,6 @@ void blk_mq_release(struct request_queue *q)
                kfree(hctx);
        }
 
-       kfree(q->mq_map);
        q->mq_map = NULL;
 
        kfree(q->queue_hw_ctx);
@@ -2017,9 +2099,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        if (!q->queue_hw_ctx)
                goto err_percpu;
 
-       q->mq_map = blk_mq_make_queue_map(set);
-       if (!q->mq_map)
-               goto err_map;
+       q->mq_map = set->mq_map;
 
        blk_mq_realloc_hw_ctxs(set, q);
        if (!q->nr_hw_queues)
@@ -2069,8 +2149,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        return q;
 
 err_hctxs:
-       kfree(q->mq_map);
-err_map:
        kfree(q->queue_hw_ctx);
 err_percpu:
        free_percpu(q->queue_ctx);
@@ -2088,6 +2166,8 @@ void blk_mq_free_queue(struct request_queue *q)
        list_del_init(&q->all_q_node);
        mutex_unlock(&all_q_mutex);
 
+       wbt_exit(q);
+
        blk_mq_del_queue_tag_set(q);
 
        blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
@@ -2102,8 +2182,6 @@ static void blk_mq_queue_reinit(struct request_queue *q,
 
        blk_mq_sysfs_unregister(q);
 
-       blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
-
        /*
         * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
         * we should change hctx numa_node according to new topology (this
@@ -2115,50 +2193,18 @@ static void blk_mq_queue_reinit(struct request_queue *q,
        blk_mq_sysfs_register(q);
 }
 
-static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
-                                     unsigned long action, void *hcpu)
+/*
+ * New online cpumask which is going to be set in this hotplug event.
+ * Declare this cpumask as global because cpu-hotplug operations are
+ * invoked one by one and dynamically allocating it could result in a failure.
+ */
+static struct cpumask cpuhp_online_new;
+
+static void blk_mq_queue_reinit_work(void)
 {
        struct request_queue *q;
-       int cpu = (unsigned long)hcpu;
-       /*
-        * New online cpumask which is going to be set in this hotplug event.
-        * Declare this cpumasks as global as cpu-hotplug operation is invoked
-        * one-by-one and dynamically allocating this could result in a failure.
-        */
-       static struct cpumask online_new;
-
-       /*
-        * Before hotadded cpu starts handling requests, new mappings must
-        * be established.  Otherwise, these requests in hw queue might
-        * never be dispatched.
-        *
-        * For example, there is a single hw queue (hctx) and two CPU queues
-        * (ctx0 for CPU0, and ctx1 for CPU1).
-        *
-        * Now CPU1 is just onlined and a request is inserted into
-        * ctx1->rq_list and set bit0 in pending bitmap as ctx1->index_hw is
-        * still zero.
-        *
-        * And then while running hw queue, flush_busy_ctxs() finds bit0 is
-        * set in pending bitmap and tries to retrieve requests in
-        * hctx->ctxs[0]->rq_list.  But htx->ctxs[0] is a pointer to ctx0,
-        * so the request in ctx1->rq_list is ignored.
-        */
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_DEAD:
-       case CPU_UP_CANCELED:
-               cpumask_copy(&online_new, cpu_online_mask);
-               break;
-       case CPU_UP_PREPARE:
-               cpumask_copy(&online_new, cpu_online_mask);
-               cpumask_set_cpu(cpu, &online_new);
-               break;
-       default:
-               return NOTIFY_OK;
-       }
 
        mutex_lock(&all_q_mutex);
-
        /*
         * We need to freeze and reinit all existing queues.  Freezing
         * involves synchronous wait for an RCU grace period and doing it
@@ -2179,13 +2225,43 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
        }
 
        list_for_each_entry(q, &all_q_list, all_q_node)
-               blk_mq_queue_reinit(q, &online_new);
+               blk_mq_queue_reinit(q, &cpuhp_online_new);
 
        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_unfreeze_queue(q);
 
        mutex_unlock(&all_q_mutex);
-       return NOTIFY_OK;
+}
+
+static int blk_mq_queue_reinit_dead(unsigned int cpu)
+{
+       cpumask_copy(&cpuhp_online_new, cpu_online_mask);
+       blk_mq_queue_reinit_work();
+       return 0;
+}
+
+/*
+ * Before hotadded cpu starts handling requests, new mappings must be
+ * established.  Otherwise, these requests in hw queue might never be
+ * dispatched.
+ *
+ * For example, there is a single hw queue (hctx) and two CPU queues (ctx0
+ * for CPU0, and ctx1 for CPU1).
+ *
+ * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list
+ * and bit0 is set in the pending bitmap, as ctx1->index_hw is still zero.
+ *
+ * And then while running hw queue, flush_busy_ctxs() finds bit0 is set in
+ * pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list.
+ * But hctx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list
+ * is ignored.
+ */
+static int blk_mq_queue_reinit_prepare(unsigned int cpu)
+{
+       cpumask_copy(&cpuhp_online_new, cpu_online_mask);
+       cpumask_set_cpu(cpu, &cpuhp_online_new);
+       blk_mq_queue_reinit_work();
+       return 0;
 }
 
 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
@@ -2242,12 +2318,6 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
        return 0;
 }
 
-struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
-{
-       return tags->cpumask;
-}
-EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
-
 /*
  * Alloc a tag set to be associated with one or more request queues.
  * May fail with EINVAL for various error conditions. May adjust the
@@ -2256,6 +2326,8 @@ EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
  */
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 {
+       int ret;
+
        BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
 
        if (!set->nr_hw_queues)
@@ -2265,7 +2337,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
        if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
                return -EINVAL;
 
-       if (!set->ops->queue_rq || !set->ops->map_queue)
+       if (!set->ops->queue_rq)
                return -EINVAL;
 
        if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
@@ -2294,17 +2366,35 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
        if (!set->tags)
                return -ENOMEM;
 
-       if (blk_mq_alloc_rq_maps(set))
-               goto enomem;
+       ret = -ENOMEM;
+       set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
+                       GFP_KERNEL, set->numa_node);
+       if (!set->mq_map)
+               goto out_free_tags;
+
+       if (set->ops->map_queues)
+               ret = set->ops->map_queues(set);
+       else
+               ret = blk_mq_map_queues(set);
+       if (ret)
+               goto out_free_mq_map;
+
+       ret = blk_mq_alloc_rq_maps(set);
+       if (ret)
+               goto out_free_mq_map;
 
        mutex_init(&set->tag_list_lock);
        INIT_LIST_HEAD(&set->tag_list);
 
        return 0;
-enomem:
+
+out_free_mq_map:
+       kfree(set->mq_map);
+       set->mq_map = NULL;
+out_free_tags:
        kfree(set->tags);
        set->tags = NULL;
-       return -ENOMEM;
+       return ret;
 }
 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
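A hedged driver-side sketch (my_queue_rq is hypothetical): with this change a driver no longer provides .map_queue; the cpu-to-queue mapping either comes from the optional .map_queues callback or defaults to blk_mq_map_queues():

	static struct blk_mq_ops my_mq_ops = {
		.queue_rq	= my_queue_rq,
		/* .map_queues is optional; blk_mq_map_queues() is used otherwise */
	};
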
 
@@ -2317,6 +2407,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
                        blk_mq_free_rq_map(set, set->tags[i], i);
        }
 
+       kfree(set->mq_map);
+       set->mq_map = NULL;
+
        kfree(set->tags);
        set->tags = NULL;
 }
@@ -2375,6 +2468,60 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
+static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+       struct request_queue *q = hctx->queue;
+       long state;
+
+       hctx->poll_considered++;
+
+       state = current->state;
+       while (!need_resched()) {
+               int ret;
+
+               hctx->poll_invoked++;
+
+               ret = q->mq_ops->poll(hctx, rq->tag);
+               if (ret > 0) {
+                       hctx->poll_success++;
+                       set_current_state(TASK_RUNNING);
+                       return true;
+               }
+
+               if (signal_pending_state(state, current))
+                       set_current_state(TASK_RUNNING);
+
+               if (current->state == TASK_RUNNING)
+                       return true;
+               if (ret < 0)
+                       break;
+               cpu_relax();
+       }
+
+       return false;
+}
+
+bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
+{
+       struct blk_mq_hw_ctx *hctx;
+       struct blk_plug *plug;
+       struct request *rq;
+
+       if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
+           !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+               return false;
+
+       plug = current->plug;
+       if (plug)
+               blk_flush_plug_list(plug, false);
+
+       hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
+       rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+
+       return __blk_mq_poll(hctx, rq);
+}
+EXPORT_SYMBOL_GPL(blk_mq_poll);
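A hedged usage sketch modelled on the synchronous direct-I/O wait loop (the helper and the done flag are hypothetical; done would be set by the bio's ->bi_end_io):

	static void my_poll_for_completion(struct request_queue *q, blk_qc_t cookie,
					   bool *done)
	{
		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (READ_ONCE(*done))
				break;
			/* falls back to sleeping when polling is unsupported */
			if (!blk_mq_poll(q, cookie))
				io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}
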
+
 void blk_mq_disable_hotplug(void)
 {
        mutex_lock(&all_q_mutex);
@@ -2387,10 +2534,12 @@ void blk_mq_enable_hotplug(void)
 
 static int __init blk_mq_init(void)
 {
-       blk_mq_cpu_init();
-
-       hotcpu_notifier(blk_mq_queue_reinit_notify, 0);
+       cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
+                               blk_mq_hctx_notify_dead);
 
+       cpuhp_setup_state_nocalls(CPUHP_BLK_MQ_PREPARE, "block/mq:prepare",
+                                 blk_mq_queue_reinit_prepare,
+                                 blk_mq_queue_reinit_dead);
        return 0;
 }
 subsys_initcall(blk_mq_init);
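
For orientation, a sketch of how the new hotplug wiring fits together (all calls appear in this diff):

	/*
	 * CPUHP_BLK_MQ_DEAD is a multi-instance state:
	 *   blk_mq_init_hctx():  cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD,
	 *                                                         &hctx->cpuhp_dead);
	 *   CPU goes offline:    blk_mq_hctx_notify_dead() runs once per registered
	 *                        hctx, splices that CPU's ctx->rq_list onto the hctx
	 *                        dispatch list and reruns the queue.
	 *   blk_mq_exit_hctx():  blk_mq_remove_cpuhp() removes the instance again.
	 *
	 * CPUHP_BLK_MQ_PREPARE replaces the old hotcpu notifier: its prepare/dead
	 * callbacks recompute the ctx-to-hctx mapping via blk_mq_queue_reinit_work().
	 */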