X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=block%2Fblk-mq.c;h=5336fef6bbb989b96cb0a5c70caee8f4a65b8fb4;hb=61fcd9bcfcd88a55396bb75bdb106b71a111900e;hp=8939e16b854628bc520be23254604866c9977d52;hpb=8a591698099390c6d5336396ccbd3ac506a492f3;p=mirror_ubuntu-bionic-kernel.git diff --git a/block/blk-mq.c b/block/blk-mq.c index 8939e16b8546..5336fef6bbb9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -119,6 +119,25 @@ void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); } +static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, + bool reserved) +{ + struct mq_inflight *mi = priv; + + if (rq->part == mi->part) + mi->inflight[rq_data_dir(rq)]++; +} + +void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]) +{ + struct mq_inflight mi = { .part = part, .inflight = inflight, }; + + inflight[0] = inflight[1] = 0; + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi); +} + void blk_freeze_queue_start(struct request_queue *q) { int freeze_depth; @@ -279,7 +298,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->tag = -1; rq->internal_tag = tag; } else { - if (blk_mq_tag_busy(data->hctx)) { + if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { rq->rq_flags = RQF_MQ_INFLIGHT; atomic_inc(&data->hctx->nr_active); } @@ -357,6 +376,8 @@ static struct request *blk_mq_get_request(struct request_queue *q, */ if (!op_is_flush(op) && e->type->ops.mq.limit_depth) e->type->ops.mq.limit_depth(op, data); + } else { + blk_mq_tag_busy(data->hctx); } tag = blk_mq_get_tag(data); @@ -686,12 +707,20 @@ static void blk_mq_requeue_work(struct work_struct *work) spin_unlock_irq(&q->requeue_lock); list_for_each_entry_safe(rq, next, &rq_list, queuelist) { - if (!(rq->rq_flags & RQF_SOFTBARRIER)) + if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP))) continue; rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, true, false, false, true); + /* + * If RQF_DONTPREP, rq has contained some driver specific + * data, so insert it to hctx dispatch list to avoid any + * merge. + */ + if (rq->rq_flags & RQF_DONTPREP) + blk_mq_request_bypass_insert(rq, false); + else + blk_mq_sched_insert_request(rq, true, false, false, true); } while (!list_empty(&rq_list)) { @@ -731,7 +760,7 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q) { - kblockd_schedule_delayed_work(&q->requeue_work, 0); + kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); @@ -974,6 +1003,7 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, }; + bool shared; might_sleep_if(wait); @@ -983,9 +1013,10 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) data.flags |= BLK_MQ_REQ_RESERVED; + shared = blk_mq_tag_busy(data.hctx); rq->tag = blk_mq_get_tag(&data); if (rq->tag >= 0) { - if (blk_mq_tag_busy(data.hctx)) { + if (shared) { rq->rq_flags |= RQF_MQ_INFLIGHT; atomic_inc(&data.hctx->nr_active); } @@ -1248,6 +1279,15 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) } } +static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) +{ + int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); + + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(hctx->cpumask); + return cpu; +} + /* * It'd be great if the workqueue API had a way to pass * in a mask and had some smarts for more clever placement. @@ -1256,30 +1296,47 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) */ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { + bool tried = false; + int next_cpu = hctx->next_cpu; + if (hctx->queue->nr_hw_queues == 1) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { - int next_cpu; - - next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask, +select_cpu: + next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, cpu_online_mask); if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask); + next_cpu = blk_mq_first_mapped_cpu(hctx); + hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + } + /* + * Do unbound schedule if we can't find a online CPU for this hctx, + * and it should only happen in the path of handling CPU DEAD. + */ + if (!cpu_online(next_cpu)) { + if (!tried) { + tried = true; + goto select_cpu; + } + + /* + * Make sure to re-select CPU next time once after CPUs + * in hctx->cpumask become online again. + */ hctx->next_cpu = next_cpu; - hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + hctx->next_cpu_batch = 1; + return WORK_CPU_UNBOUND; } - return hctx->next_cpu; + hctx->next_cpu = next_cpu; + return next_cpu; } static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, unsigned long msecs) { - if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx))) - return; - if (unlikely(blk_mq_hctx_stopped(hctx))) return; @@ -1294,9 +1351,8 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, put_cpu(); } - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->run_work, - msecs_to_jiffies(msecs)); + kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, + msecs_to_jiffies(msecs)); } void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) @@ -1563,7 +1619,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) BUG_ON(!rq->q); if (rq->mq_ctx != this_ctx) { if (this_ctx) { - trace_block_unplug(this_q, depth, from_schedule); + trace_block_unplug(this_q, depth, !from_schedule); blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, from_schedule); @@ -1583,7 +1639,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) * on 'ctx_list'. Do those. */ if (this_ctx) { - trace_block_unplug(this_q, depth, from_schedule); + trace_block_unplug(this_q, depth, !from_schedule); blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, from_schedule); } @@ -2025,12 +2081,7 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); - if (hctx->flags & BLK_MQ_F_BLOCKING) - cleanup_srcu_struct(hctx->queue_rq_srcu); - blk_mq_remove_cpuhp(hctx); - blk_free_flush_queue(hctx->fq); - sbitmap_free(&hctx->ctx_map); } static void blk_mq_exit_hw_queues(struct request_queue *q, @@ -2071,12 +2122,12 @@ static int blk_mq_init_hctx(struct request_queue *q, * runtime */ hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), - GFP_KERNEL, node); + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!hctx->ctxs) goto unregister_cpu_notifier; - if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL, - node)) + if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node)) goto free_ctxs; hctx->nr_ctx = 0; @@ -2091,7 +2142,8 @@ static int blk_mq_init_hctx(struct request_queue *q, if (blk_mq_sched_init_hctx(q, hctx, hctx_idx)) goto exit_hctx; - hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); + hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size, + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (!hctx->fq) goto sched_exit_hctx; @@ -2108,7 +2160,7 @@ static int blk_mq_init_hctx(struct request_queue *q, return 0; free_fq: - kfree(hctx->fq); + blk_free_flush_queue(hctx->fq); sched_exit_hctx: blk_mq_sched_exit_hctx(q, hctx, hctx_idx); exit_hctx: @@ -2252,8 +2304,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) /* * Initialize batch roundrobin counts */ - hctx->next_cpu = cpumask_first_and(hctx->cpumask, - cpu_online_mask); + hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } } @@ -2300,7 +2351,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) mutex_lock(&set->tag_list_lock); list_del_rcu(&q->tag_set_list); - INIT_LIST_HEAD(&q->tag_set_list); if (list_is_singular(&set->tag_list)) { /* just transitioned to unshared */ set->flags &= ~BLK_MQ_F_TAG_SHARED; @@ -2308,8 +2358,8 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) blk_mq_update_tag_set_depth(set, false); } mutex_unlock(&set->tag_list_lock); - synchronize_rcu(); + INIT_LIST_HEAD(&q->tag_set_list); } static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, @@ -2414,12 +2464,14 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, node = blk_mq_hw_queue_to_node(q->mq_map, i); hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set), - GFP_KERNEL, node); + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, + node); if (!hctxs[i]) break; - if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, - node)) { + if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, + node)) { kfree(hctxs[i]); hctxs[i] = NULL; break; @@ -2541,7 +2593,8 @@ err_exit: } EXPORT_SYMBOL(blk_mq_init_allocated_queue); -void blk_mq_free_queue(struct request_queue *q) +/* tags can _not_ be used after returning from blk_mq_exit_queue */ +void blk_mq_exit_queue(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; @@ -2750,6 +2803,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (!set) return -EINVAL; + if (q->nr_requests == nr) + return 0; + blk_mq_freeze_queue(q); ret = 0;