blk-wbt: fix has-sleeper queueing check

[mirror_ubuntu-bionic-kernel.git] / block / blk-mq.c
diff --git a/block/blk-mq.c b/block/blk-mq.c

index 1a80d8c4f3ecffc713741f640b4e0bb3ba44865d..2f8436314985574cfd0c237b31e5290c86b53fa1 100644 (file)
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -119,6 +119,25 @@ void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
         blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
  }
  
+static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
+                                    struct request *rq, void *priv,
+                                    bool reserved)
+{
+       struct mq_inflight *mi = priv;
+
+       if (rq->part == mi->part)
+               mi->inflight[rq_data_dir(rq)]++;
+}
+
+void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
+                        unsigned int inflight[2])
+{
+       struct mq_inflight mi = { .part = part, .inflight = inflight, };
+
+       inflight[0] = inflight[1] = 0;
+       blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
+}
+
  void blk_freeze_queue_start(struct request_queue *q)
  {
         int freeze_depth;
@@ -443,7 +462,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
                 blk_queue_exit(q);
                 return ERR_PTR(-EXDEV);
         }
-       cpu = cpumask_first(alloc_data.hctx->cpumask);
+       cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
         alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
  
         rq = blk_mq_get_request(q, NULL, op, &alloc_data);
@@ -731,7 +750,7 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
  
  void blk_mq_kick_requeue_list(struct request_queue *q)
  {
-       kblockd_schedule_delayed_work(&q->requeue_work, 0);
+       kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
  }
  EXPORT_SYMBOL(blk_mq_kick_requeue_list);
  
@@ -1096,7 +1115,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                 blk_status_t ret;
  
                 rq = list_first_entry(list, struct request, queuelist);
-               if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
+
+               hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
+               if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
+                       break;
+
+               if (!blk_mq_get_driver_tag(rq, NULL, false)) {
                         /*
                          * The initial allocation attempt failed, so we need to
                          * rerun the hardware queue when a tag is freed. The
@@ -1105,8 +1129,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                          * we'll re-run it below.
                          */
                         if (!blk_mq_mark_tag_wait(&hctx, rq)) {
-                               if (got_budget)
-                                       blk_mq_put_dispatch_budget(hctx);
+                               blk_mq_put_dispatch_budget(hctx);
                                 /*
                                  * For non-shared tags, the RESTART check
                                  * will suffice.
@@ -1117,11 +1140,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                         }
                 }
  
-               if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
-                       blk_mq_put_driver_tag(rq);
-                       break;
-               }
-
                 list_del_init(&rq->queuelist);
  
                 bd.rq = rq;
@@ -1249,6 +1267,15 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
         }
  }
  
+static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
+{
+       int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
+
+       if (cpu >= nr_cpu_ids)
+               cpu = cpumask_first(hctx->cpumask);
+       return cpu;
+}
+
  /*
   * It'd be great if the workqueue API had a way to pass
   * in a mask and had some smarts for more clever placement.
@@ -1257,29 +1284,47 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
   */
  static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
  {
+       bool tried = false;
+       int next_cpu = hctx->next_cpu;
+
         if (hctx->queue->nr_hw_queues == 1)
                 return WORK_CPU_UNBOUND;
  
         if (--hctx->next_cpu_batch <= 0) {
-               int next_cpu;
-
-               next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
+select_cpu:
+               next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
+                               cpu_online_mask);
                 if (next_cpu >= nr_cpu_ids)
-                       next_cpu = cpumask_first(hctx->cpumask);
+                       next_cpu = blk_mq_first_mapped_cpu(hctx);
+               hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+       }
+
+       /*
+        * Do an unbound schedule if we can't find an online CPU for this hctx;
+        * this should only happen in the path of handling CPU DEAD.
+        */
+       if (!cpu_online(next_cpu)) {
+               if (!tried) {
+                       tried = true;
+                       goto select_cpu;
+               }
  
+               /*
+                * Make sure a CPU is re-selected next time, once the CPUs
+                * in hctx->cpumask have come online again.
+                */
                 hctx->next_cpu = next_cpu;
-               hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+               hctx->next_cpu_batch = 1;
+               return WORK_CPU_UNBOUND;
         }
  
-       return hctx->next_cpu;
+       hctx->next_cpu = next_cpu;
+       return next_cpu;
  }
  
  static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
                                         unsigned long msecs)
  {
-       if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
-               return;
-
         if (unlikely(blk_mq_hctx_stopped(hctx)))
                 return;
  
@@ -1294,9 +1339,8 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
                 put_cpu();
         }
  
-       kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
-                                        &hctx->run_work,
-                                        msecs_to_jiffies(msecs));
+       kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
+                                   msecs_to_jiffies(msecs));
  }
  
  void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
@@ -1637,11 +1681,11 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
         if (q->elevator)
                 goto insert;
  
-       if (!blk_mq_get_driver_tag(rq, NULL, false))
+       if (!blk_mq_get_dispatch_budget(hctx))
                 goto insert;
  
-       if (!blk_mq_get_dispatch_budget(hctx)) {
-               blk_mq_put_driver_tag(rq);
+       if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+               blk_mq_put_dispatch_budget(hctx);
                 goto insert;
         }
  
@@ -2137,16 +2181,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
                 INIT_LIST_HEAD(&__ctx->rq_list);
                 __ctx->queue = q;
  
-               /* If the cpu isn't present, the cpu is mapped to first hctx */
-               if (!cpu_present(i))
-                       continue;
-
-               hctx = blk_mq_map_queue(q, i);
-
                 /*
                  * Set local node, IFF we have more than one hw queue. If
                  * not, we remain on the home node of the device
                  */
+               hctx = blk_mq_map_queue(q, i);
                 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
                         hctx->numa_node = local_memory_node(cpu_to_node(i));
         }
@@ -2203,7 +2242,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
          *
          * If the cpu isn't present, the cpu is mapped to first hctx.
          */
-       for_each_present_cpu(i) {
+       for_each_possible_cpu(i) {
                 hctx_idx = q->mq_map[i];
                 /* unmapped hw queue can be remapped after CPU topo changed */
                 if (!set->tags[hctx_idx] &&
@@ -2257,7 +2296,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                 /*
                  * Initialize batch roundrobin counts
                  */
-               hctx->next_cpu = cpumask_first(hctx->cpumask);
+               hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
         }
  }
@@ -2304,7 +2343,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
  
         mutex_lock(&set->tag_list_lock);
         list_del_rcu(&q->tag_set_list);
-       INIT_LIST_HEAD(&q->tag_set_list);
         if (list_is_singular(&set->tag_list)) {
                 /* just transitioned to unshared */
                 set->flags &= ~BLK_MQ_F_TAG_SHARED;
@@ -2312,8 +2350,8 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
                 blk_mq_update_tag_set_depth(set, false);
         }
         mutex_unlock(&set->tag_list_lock);
-
         synchronize_rcu();
+       INIT_LIST_HEAD(&q->tag_set_list);
  }
  
  static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,