]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blobdiff - block/blk-mq.c
blk-mq: move hctx lock/unlock into a helper
[mirror_ubuntu-bionic-kernel.git] / block / blk-mq.c
index 028a19864696a28ae9f077019633d40f606a1645..da42485cb39d954c475a078db62967e9a47772e7 100644 (file)
@@ -119,6 +119,25 @@ void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
        blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
 }
 
+static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
+                                    struct request *rq, void *priv,
+                                    bool reserved)
+{
+       struct mq_inflight *mi = priv;
+
+       if (rq->part == mi->part)
+               mi->inflight[rq_data_dir(rq)]++;
+}
+
+void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
+                        unsigned int inflight[2])
+{
+       struct mq_inflight mi = { .part = part, .inflight = inflight, };
+
+       inflight[0] = inflight[1] = 0;
+       blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
+}
+
 void blk_freeze_queue_start(struct request_queue *q)
 {
        int freeze_depth;
@@ -279,7 +298,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
                rq->tag = -1;
                rq->internal_tag = tag;
        } else {
-               if (blk_mq_tag_busy(data->hctx)) {
+               if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
                        rq->rq_flags = RQF_MQ_INFLIGHT;
                        atomic_inc(&data->hctx->nr_active);
                }
@@ -357,6 +376,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
                 */
                if (!op_is_flush(op) && e->type->ops.mq.limit_depth)
                        e->type->ops.mq.limit_depth(op, data);
+       } else {
+               blk_mq_tag_busy(data->hctx);
        }
 
        tag = blk_mq_get_tag(data);
@@ -559,6 +580,22 @@ static void __blk_mq_complete_request(struct request *rq)
        put_cpu();
 }
 
+static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
+{
+       if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+               rcu_read_unlock();
+       else
+               srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
+}
+
+static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
+{
+       if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+               rcu_read_lock();
+       else
+               *srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
+}
+
 /**
  * blk_mq_complete_request - end I/O on a request
  * @rq:                the request being processed
@@ -686,12 +723,20 @@ static void blk_mq_requeue_work(struct work_struct *work)
        spin_unlock_irq(&q->requeue_lock);
 
        list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
-               if (!(rq->rq_flags & RQF_SOFTBARRIER))
+               if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
                        continue;
 
                rq->rq_flags &= ~RQF_SOFTBARRIER;
                list_del_init(&rq->queuelist);
-               blk_mq_sched_insert_request(rq, true, false, false, true);
+               /*
+                * If RQF_DONTPREP, rq has contained some driver specific
+                * data, so insert it to hctx dispatch list to avoid any
+                * merge.
+                */
+               if (rq->rq_flags & RQF_DONTPREP)
+                       blk_mq_request_bypass_insert(rq, false);
+               else
+                       blk_mq_sched_insert_request(rq, true, false, false, true);
        }
 
        while (!list_empty(&rq_list)) {
@@ -731,7 +776,7 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 
 void blk_mq_kick_requeue_list(struct request_queue *q)
 {
-       kblockd_schedule_delayed_work(&q->requeue_work, 0);
+       kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 
@@ -974,6 +1019,7 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
                .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
                .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
        };
+       bool shared;
 
        might_sleep_if(wait);
 
@@ -983,9 +1029,10 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
        if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
                data.flags |= BLK_MQ_REQ_RESERVED;
 
+       shared = blk_mq_tag_busy(data.hctx);
        rq->tag = blk_mq_get_tag(&data);
        if (rq->tag >= 0) {
-               if (blk_mq_tag_busy(data.hctx)) {
+               if (shared) {
                        rq->rq_flags |= RQF_MQ_INFLIGHT;
                        atomic_inc(&data.hctx->nr_active);
                }
@@ -1096,7 +1143,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                blk_status_t ret;
 
                rq = list_first_entry(list, struct request, queuelist);
-               if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
+
+               hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
+               if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
+                       break;
+
+               if (!blk_mq_get_driver_tag(rq, NULL, false)) {
                        /*
                         * The initial allocation attempt failed, so we need to
                         * rerun the hardware queue when a tag is freed. The
@@ -1105,8 +1157,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                         * we'll re-run it below.
                         */
                        if (!blk_mq_mark_tag_wait(&hctx, rq)) {
-                               if (got_budget)
-                                       blk_mq_put_dispatch_budget(hctx);
+                               blk_mq_put_dispatch_budget(hctx);
                                /*
                                 * For non-shared tags, the RESTART check
                                 * will suffice.
@@ -1117,11 +1168,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                        }
                }
 
-               if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
-                       blk_mq_put_driver_tag(rq);
-                       break;
-               }
-
                list_del_init(&rq->queuelist);
 
                bd.rq = rq;
@@ -1236,17 +1282,20 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
         */
        WARN_ON_ONCE(in_interrupt());
 
-       if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
-               rcu_read_lock();
-               blk_mq_sched_dispatch_requests(hctx);
-               rcu_read_unlock();
-       } else {
-               might_sleep();
+       might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
 
-               srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
-               blk_mq_sched_dispatch_requests(hctx);
-               srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
-       }
+       hctx_lock(hctx, &srcu_idx);
+       blk_mq_sched_dispatch_requests(hctx);
+       hctx_unlock(hctx, srcu_idx);
+}
+
+static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
+{
+       int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
+
+       if (cpu >= nr_cpu_ids)
+               cpu = cpumask_first(hctx->cpumask);
+       return cpu;
 }
 
 /*
@@ -1257,30 +1306,47 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
  */
 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 {
+       bool tried = false;
+       int next_cpu = hctx->next_cpu;
+
        if (hctx->queue->nr_hw_queues == 1)
                return WORK_CPU_UNBOUND;
 
        if (--hctx->next_cpu_batch <= 0) {
-               int next_cpu;
-
-               next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
+select_cpu:
+               next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
                                cpu_online_mask);
                if (next_cpu >= nr_cpu_ids)
-                       next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
+                       next_cpu = blk_mq_first_mapped_cpu(hctx);
+               hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+       }
 
+       /*
+        * Do unbound schedule if we can't find a online CPU for this hctx,
+        * and it should only happen in the path of handling CPU DEAD.
+        */
+       if (!cpu_online(next_cpu)) {
+               if (!tried) {
+                       tried = true;
+                       goto select_cpu;
+               }
+
+               /*
+                * Make sure to re-select CPU next time once after CPUs
+                * in hctx->cpumask become online again.
+                */
                hctx->next_cpu = next_cpu;
-               hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+               hctx->next_cpu_batch = 1;
+               return WORK_CPU_UNBOUND;
        }
 
-       return hctx->next_cpu;
+       hctx->next_cpu = next_cpu;
+       return next_cpu;
 }
 
 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
                                        unsigned long msecs)
 {
-       if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
-               return;
-
        if (unlikely(blk_mq_hctx_stopped(hctx)))
                return;
 
@@ -1295,9 +1361,8 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
                put_cpu();
        }
 
-       kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
-                                        &hctx->run_work,
-                                        msecs_to_jiffies(msecs));
+       kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
+                                   msecs_to_jiffies(msecs));
 }
 
 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
@@ -1308,7 +1373,23 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
 
 bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
-       if (blk_mq_hctx_has_pending(hctx)) {
+       int srcu_idx;
+       bool need_run;
+
+       /*
+        * When queue is quiesced, we may be switching io scheduler, or
+        * updating nr_hw_queues, or other things, and we can't run queue
+        * any more, even __blk_mq_hctx_has_pending() can't be called safely.
+        *
+        * And queue will be rerun in blk_mq_unquiesce_queue() if it is
+        * quiesced.
+        */
+       hctx_lock(hctx, &srcu_idx);
+       need_run = !blk_queue_quiesced(hctx->queue) &&
+               blk_mq_hctx_has_pending(hctx);
+       hctx_unlock(hctx, srcu_idx);
+
+       if (need_run) {
                __blk_mq_delay_run_hw_queue(hctx, async, 0);
                return true;
        }
@@ -1564,7 +1645,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
                BUG_ON(!rq->q);
                if (rq->mq_ctx != this_ctx) {
                        if (this_ctx) {
-                               trace_block_unplug(this_q, depth, from_schedule);
+                               trace_block_unplug(this_q, depth, !from_schedule);
                                blk_mq_sched_insert_requests(this_q, this_ctx,
                                                                &ctx_list,
                                                                from_schedule);
@@ -1584,7 +1665,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
         * on 'ctx_list'. Do those.
         */
        if (this_ctx) {
-               trace_block_unplug(this_q, depth, from_schedule);
+               trace_block_unplug(this_q, depth, !from_schedule);
                blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
                                                from_schedule);
        }
@@ -1618,7 +1699,7 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
 
 static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                                        struct request *rq,
-                                       blk_qc_t *cookie, bool may_sleep)
+                                       blk_qc_t *cookie)
 {
        struct request_queue *q = rq->q;
        struct blk_mq_queue_data bd = {
@@ -1638,11 +1719,11 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
        if (q->elevator)
                goto insert;
 
-       if (!blk_mq_get_driver_tag(rq, NULL, false))
+       if (!blk_mq_get_dispatch_budget(hctx))
                goto insert;
 
-       if (!blk_mq_get_dispatch_budget(hctx)) {
-               blk_mq_put_driver_tag(rq);
+       if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+               blk_mq_put_dispatch_budget(hctx);
                goto insert;
        }
 
@@ -1668,25 +1749,20 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
        }
 
 insert:
-       blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep);
+       blk_mq_sched_insert_request(rq, false, run_queue, false,
+                                       hctx->flags & BLK_MQ_F_BLOCKING);
 }
 
 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                struct request *rq, blk_qc_t *cookie)
 {
-       if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
-               rcu_read_lock();
-               __blk_mq_try_issue_directly(hctx, rq, cookie, false);
-               rcu_read_unlock();
-       } else {
-               unsigned int srcu_idx;
+       int srcu_idx;
 
-               might_sleep();
+       might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
 
-               srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
-               __blk_mq_try_issue_directly(hctx, rq, cookie, true);
-               srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
-       }
+       hctx_lock(hctx, &srcu_idx);
+       __blk_mq_try_issue_directly(hctx, rq, cookie);
+       hctx_unlock(hctx, srcu_idx);
 }
 
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
@@ -2015,7 +2091,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 {
        blk_mq_debugfs_unregister_hctx(hctx);
 
-       blk_mq_tag_idle(hctx);
+       if (blk_mq_hw_queue_mapped(hctx))
+               blk_mq_tag_idle(hctx);
 
        if (set->ops->exit_request)
                set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
@@ -2025,12 +2102,7 @@ static void blk_mq_exit_hctx(struct request_queue *q,
        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);
 
-       if (hctx->flags & BLK_MQ_F_BLOCKING)
-               cleanup_srcu_struct(hctx->queue_rq_srcu);
-
        blk_mq_remove_cpuhp(hctx);
-       blk_free_flush_queue(hctx->fq);
-       sbitmap_free(&hctx->ctx_map);
 }
 
 static void blk_mq_exit_hw_queues(struct request_queue *q,
@@ -2071,12 +2143,12 @@ static int blk_mq_init_hctx(struct request_queue *q,
         * runtime
         */
        hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
-                                       GFP_KERNEL, node);
+                       GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
        if (!hctx->ctxs)
                goto unregister_cpu_notifier;
 
-       if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL,
-                             node))
+       if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
+                               GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node))
                goto free_ctxs;
 
        hctx->nr_ctx = 0;
@@ -2091,7 +2163,8 @@ static int blk_mq_init_hctx(struct request_queue *q,
        if (blk_mq_sched_init_hctx(q, hctx, hctx_idx))
                goto exit_hctx;
 
-       hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
+       hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
+                       GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
        if (!hctx->fq)
                goto sched_exit_hctx;
 
@@ -2108,7 +2181,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
        return 0;
 
  free_fq:
-       kfree(hctx->fq);
+       blk_free_flush_queue(hctx->fq);
  sched_exit_hctx:
        blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
  exit_hctx:
@@ -2252,8 +2325,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                /*
                 * Initialize batch roundrobin counts
                 */
-               hctx->next_cpu = cpumask_first_and(hctx->cpumask,
-                               cpu_online_mask);
+               hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
        }
 }
@@ -2300,7 +2372,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
 
        mutex_lock(&set->tag_list_lock);
        list_del_rcu(&q->tag_set_list);
-       INIT_LIST_HEAD(&q->tag_set_list);
        if (list_is_singular(&set->tag_list)) {
                /* just transitioned to unshared */
                set->flags &= ~BLK_MQ_F_TAG_SHARED;
@@ -2308,8 +2379,8 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
                blk_mq_update_tag_set_depth(set, false);
        }
        mutex_unlock(&set->tag_list_lock);
-
        synchronize_rcu();
+       INIT_LIST_HEAD(&q->tag_set_list);
 }
 
 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
@@ -2414,12 +2485,14 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 
                node = blk_mq_hw_queue_to_node(q->mq_map, i);
                hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
-                                       GFP_KERNEL, node);
+                               GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
+                               node);
                if (!hctxs[i])
                        break;
 
-               if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
-                                               node)) {
+               if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask,
+                                       GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
+                                       node)) {
                        kfree(hctxs[i]);
                        hctxs[i] = NULL;
                        break;
@@ -2541,7 +2614,8 @@ err_exit:
 }
 EXPORT_SYMBOL(blk_mq_init_allocated_queue);
 
-void blk_mq_free_queue(struct request_queue *q)
+/* tags can _not_ be used after returning from blk_mq_exit_queue */
+void blk_mq_exit_queue(struct request_queue *q)
 {
        struct blk_mq_tag_set   *set = q->tag_set;
 
@@ -2750,7 +2824,11 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
        if (!set)
                return -EINVAL;
 
+       if (q->nr_requests == nr)
+               return 0;
+
        blk_mq_freeze_queue(q);
+       blk_mq_quiesce_queue(q);
 
        ret = 0;
        queue_for_each_hw_ctx(q, hctx, i) {
@@ -2774,6 +2852,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
        if (!ret)
                q->nr_requests = nr;
 
+       blk_mq_unquiesce_queue(q);
        blk_mq_unfreeze_queue(q);
 
        return ret;