75bb4625
JA
1/*
2 * Block multiqueue core code
3 *
4 * Copyright (C) 2013-2014 Jens Axboe
5 * Copyright (C) 2013-2014 Christoph Hellwig
6 */
320ae51f
JA
7#include <linux/kernel.h>
8#include <linux/module.h>
9#include <linux/backing-dev.h>
10#include <linux/bio.h>
11#include <linux/blkdev.h>
f75782e4 12#include <linux/kmemleak.h>
320ae51f
JA
13#include <linux/mm.h>
14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/workqueue.h>
17#include <linux/smp.h>
18#include <linux/llist.h>
19#include <linux/list_sort.h>
20#include <linux/cpu.h>
21#include <linux/cache.h>
22#include <linux/sched/sysctl.h>
23#include <linux/delay.h>
aedcd72f 24#include <linux/crash_dump.h>
88c7b2b7 25#include <linux/prefetch.h>
320ae51f
JA
26
27#include <trace/events/block.h>
28
29#include <linux/blk-mq.h>
30#include "blk.h"
31#include "blk-mq.h"
32#include "blk-mq-tag.h"
cf43e6be 33#include "blk-stat.h"
320ae51f
JA
34
35static DEFINE_MUTEX(all_q_mutex);
36static LIST_HEAD(all_q_list);
37
320ae51f
JA
38/*
39 * Check if any of the ctx's have pending work in this hardware queue
40 */
41static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
42{
88459642 43 return sbitmap_any_bit_set(&hctx->ctx_map);
1429d7c9
JA
44}
45
320ae51f
JA
46/*
47 * Mark this ctx as having pending work in this hardware queue
48 */
49static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
50 struct blk_mq_ctx *ctx)
51{
88459642
OS
52 if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
53 sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
1429d7c9
JA
54}
55
56static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
57 struct blk_mq_ctx *ctx)
58{
88459642 59 sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
320ae51f
JA
60}
61
b4c6a028 62void blk_mq_freeze_queue_start(struct request_queue *q)
43a5e4e2 63{
4ecd4fef 64 int freeze_depth;
cddd5d17 65
4ecd4fef
CH
66 freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
67 if (freeze_depth == 1) {
3ef28e83 68 percpu_ref_kill(&q->q_usage_counter);
b94ec296 69 blk_mq_run_hw_queues(q, false);
cddd5d17 70 }
f3af020b 71}
b4c6a028 72EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
f3af020b
TH
73
74static void blk_mq_freeze_queue_wait(struct request_queue *q)
75{
3ef28e83 76 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
43a5e4e2
ML
77}
78
f3af020b
TH
79/*
80 * Guarantee no request is in use, so we can change any data structure of
81 * the queue afterward.
82 */
3ef28e83 83void blk_freeze_queue(struct request_queue *q)
f3af020b 84{
3ef28e83
DW
85 /*
86 * In the !blk_mq case we are only calling this to kill the
87 * q_usage_counter, otherwise this increases the freeze depth
88 * and waits for it to return to zero. For this reason there is
89 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
90 * exported to drivers as the only user for unfreeze is blk_mq.
91 */
f3af020b
TH
92 blk_mq_freeze_queue_start(q);
93 blk_mq_freeze_queue_wait(q);
94}
3ef28e83
DW
95
96void blk_mq_freeze_queue(struct request_queue *q)
97{
98 /*
99 * ...just an alias to keep freeze and unfreeze actions balanced
100 * in the blk_mq_* namespace
101 */
102 blk_freeze_queue(q);
103}
c761d96b 104EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
f3af020b 105
b4c6a028 106void blk_mq_unfreeze_queue(struct request_queue *q)
320ae51f 107{
4ecd4fef 108 int freeze_depth;
320ae51f 109
4ecd4fef
CH
110 freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
111 WARN_ON_ONCE(freeze_depth < 0);
112 if (!freeze_depth) {
3ef28e83 113 percpu_ref_reinit(&q->q_usage_counter);
320ae51f 114 wake_up_all(&q->mq_freeze_wq);
add703fd 115 }
320ae51f 116}
b4c6a028 117EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
320ae51f 118
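
/*
 * Illustrative user-space sketch (not part of blk-mq.c, all names below are
 * hypothetical): the freeze/unfreeze pair above keeps a nesting depth so
 * that only the 0 -> 1 transition kills q_usage_counter and only the
 * 1 -> 0 transition revives it.  C11 atomics stand in for atomic_inc_return()
 * and atomic_dec_return().
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_queue {
	atomic_int freeze_depth;
	bool frozen;		/* stands in for killing q_usage_counter */
};

static void demo_freeze(struct demo_queue *q)
{
	if (atomic_fetch_add(&q->freeze_depth, 1) + 1 == 1)
		q->frozen = true;	/* first freezer does the real work */
}

static void demo_unfreeze(struct demo_queue *q)
{
	int depth = atomic_fetch_sub(&q->freeze_depth, 1) - 1;

	if (depth < 0)
		fprintf(stderr, "unbalanced unfreeze\n");
	else if (depth == 0)
		q->frozen = false;	/* last unfreezer revives the queue */
}

int main(void)
{
	struct demo_queue q = { .freeze_depth = 0, .frozen = false };

	demo_freeze(&q);	/* outer freeze: queue becomes frozen */
	demo_freeze(&q);	/* nested freeze: no extra work */
	demo_unfreeze(&q);	/* still frozen, depth is 1 */
	demo_unfreeze(&q);	/* depth hits 0: queue usable again */
	printf("frozen=%d\n", q.frozen);
	return 0;
}
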
6a83e74d
BVA
119/**
120 * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished
121 * @q: request queue.
122 *
123 * Note: this function does not prevent that the struct request end_io()
124 * callback function is invoked. Additionally, it is not prevented that
125 * new queue_rq() calls occur unless the queue has been stopped first.
126 */
127void blk_mq_quiesce_queue(struct request_queue *q)
128{
129 struct blk_mq_hw_ctx *hctx;
130 unsigned int i;
131 bool rcu = false;
132
133 blk_mq_stop_hw_queues(q);
134
135 queue_for_each_hw_ctx(q, hctx, i) {
136 if (hctx->flags & BLK_MQ_F_BLOCKING)
137 synchronize_srcu(&hctx->queue_rq_srcu);
138 else
139 rcu = true;
140 }
141 if (rcu)
142 synchronize_rcu();
143}
144EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
145
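
/*
 * Illustrative user-space sketch (not part of blk-mq.c): the point of
 * blk_mq_quiesce_queue() is "wait until every queue_rq() call that has
 * already started has returned".  The kernel does this with RCU/SRCU grace
 * periods; this hypothetical stand-in just drains an in-flight counter,
 * which conveys the idea but not the scalability of the real mechanism.
 */
#include <sched.h>
#include <stdatomic.h>

static atomic_int inflight;	/* number of queue_rq() calls in progress */

static void demo_queue_rq(void)
{
	atomic_fetch_add(&inflight, 1);
	/* ... hand the request to some hypothetical hardware ... */
	atomic_fetch_sub(&inflight, 1);
}

static void demo_quiesce(void)
{
	/*
	 * Like the synchronize_rcu()/synchronize_srcu() calls above, this
	 * only waits for calls that already began; it does not stop new
	 * ones, which is why the hardware queues are stopped first.
	 */
	while (atomic_load(&inflight) != 0)
		sched_yield();
}

int main(void)
{
	demo_queue_rq();	/* a call that has come and gone */
	demo_quiesce();		/* afterwards: nothing is in flight */
	return 0;
}
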
aed3ea94
JA
146void blk_mq_wake_waiters(struct request_queue *q)
147{
148 struct blk_mq_hw_ctx *hctx;
149 unsigned int i;
150
151 queue_for_each_hw_ctx(q, hctx, i)
152 if (blk_mq_hw_queue_mapped(hctx))
153 blk_mq_tag_wakeup_all(hctx->tags, true);
3fd5940c
KB
154
155 /*
156 * If we are called because the queue has now been marked as
157 * dying, we need to ensure that processes currently waiting on
158 * the queue are notified as well.
159 */
160 wake_up_all(&q->mq_freeze_wq);
aed3ea94
JA
161}
162
320ae51f
JA
163bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
164{
165 return blk_mq_has_free_tags(hctx->tags);
166}
167EXPORT_SYMBOL(blk_mq_can_queue);
168
94eddfbe 169static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
ef295ecf 170 struct request *rq, unsigned int op)
320ae51f 171{
af76e555
CH
172 INIT_LIST_HEAD(&rq->queuelist);
173 /* csd/requeue_work/fifo_time is initialized before use */
174 rq->q = q;
320ae51f 175 rq->mq_ctx = ctx;
ef295ecf 176 rq->cmd_flags = op;
e8064021
CH
177 if (blk_queue_io_stat(q))
178 rq->rq_flags |= RQF_IO_STAT;
af76e555
CH
179 /* do not touch atomic flags, it needs atomic ops against the timer */
180 rq->cpu = -1;
af76e555
CH
181 INIT_HLIST_NODE(&rq->hash);
182 RB_CLEAR_NODE(&rq->rb_node);
af76e555
CH
183 rq->rq_disk = NULL;
184 rq->part = NULL;
3ee32372 185 rq->start_time = jiffies;
af76e555
CH
186#ifdef CONFIG_BLK_CGROUP
187 rq->rl = NULL;
0fec08b4 188 set_start_time_ns(rq);
af76e555
CH
189 rq->io_start_time_ns = 0;
190#endif
191 rq->nr_phys_segments = 0;
192#if defined(CONFIG_BLK_DEV_INTEGRITY)
193 rq->nr_integrity_segments = 0;
194#endif
af76e555
CH
195 rq->special = NULL;
196 /* tag was already set */
197 rq->errors = 0;
af76e555 198
6f4a1626
TB
199 rq->cmd = rq->__cmd;
200
af76e555
CH
201 rq->extra_len = 0;
202 rq->sense_len = 0;
203 rq->resid_len = 0;
204 rq->sense = NULL;
205
af76e555 206 INIT_LIST_HEAD(&rq->timeout_list);
f6be4fb4
JA
207 rq->timeout = 0;
208
af76e555
CH
209 rq->end_io = NULL;
210 rq->end_io_data = NULL;
211 rq->next_rq = NULL;
212
ef295ecf 213 ctx->rq_dispatched[op_is_sync(op)]++;
320ae51f
JA
214}
215
5dee8577 216static struct request *
ef295ecf 217__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op)
5dee8577
CH
218{
219 struct request *rq;
220 unsigned int tag;
221
cb96a42c 222 tag = blk_mq_get_tag(data);
5dee8577 223 if (tag != BLK_MQ_TAG_FAIL) {
cb96a42c 224 rq = data->hctx->tags->rqs[tag];
5dee8577 225
cb96a42c 226 if (blk_mq_tag_busy(data->hctx)) {
e8064021 227 rq->rq_flags = RQF_MQ_INFLIGHT;
cb96a42c 228 atomic_inc(&data->hctx->nr_active);
5dee8577
CH
229 }
230
231 rq->tag = tag;
ef295ecf 232 blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
5dee8577
CH
233 return rq;
234 }
235
236 return NULL;
237}
238
6f3b0e8b
CH
239struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
240 unsigned int flags)
320ae51f 241{
d852564f
CH
242 struct blk_mq_ctx *ctx;
243 struct blk_mq_hw_ctx *hctx;
320ae51f 244 struct request *rq;
cb96a42c 245 struct blk_mq_alloc_data alloc_data;
a492f075 246 int ret;
320ae51f 247
6f3b0e8b 248 ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
a492f075
JL
249 if (ret)
250 return ERR_PTR(ret);
320ae51f 251
d852564f 252 ctx = blk_mq_get_ctx(q);
7d7e0f90 253 hctx = blk_mq_map_queue(q, ctx->cpu);
6f3b0e8b 254 blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
ef295ecf 255 rq = __blk_mq_alloc_request(&alloc_data, rw);
d852564f 256 blk_mq_put_ctx(ctx);
841bac2c 257
c76541a9 258 if (!rq) {
3ef28e83 259 blk_queue_exit(q);
a492f075 260 return ERR_PTR(-EWOULDBLOCK);
c76541a9 261 }
0c4de0f3
CH
262
263 rq->__data_len = 0;
264 rq->__sector = (sector_t) -1;
265 rq->bio = rq->biotail = NULL;
320ae51f
JA
266 return rq;
267}
4bb659b1 268EXPORT_SYMBOL(blk_mq_alloc_request);
320ae51f 269
1f5bd336
ML
270struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
271 unsigned int flags, unsigned int hctx_idx)
272{
273 struct blk_mq_hw_ctx *hctx;
274 struct blk_mq_ctx *ctx;
275 struct request *rq;
276 struct blk_mq_alloc_data alloc_data;
277 int ret;
278
279 /*
280 * If the tag allocator sleeps we could get an allocation for a
281 * different hardware context. No need to complicate the low level
282 * allocator for this for the rare use case of a command tied to
283 * a specific queue.
284 */
285 if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
286 return ERR_PTR(-EINVAL);
287
288 if (hctx_idx >= q->nr_hw_queues)
289 return ERR_PTR(-EIO);
290
291 ret = blk_queue_enter(q, true);
292 if (ret)
293 return ERR_PTR(ret);
294
c8712c6a
CH
295 /*
296 * Check if the hardware context is actually mapped to anything.
297 * If not tell the caller that it should skip this queue.
298 */
1f5bd336 299 hctx = q->queue_hw_ctx[hctx_idx];
c8712c6a
CH
300 if (!blk_mq_hw_queue_mapped(hctx)) {
301 ret = -EXDEV;
302 goto out_queue_exit;
303 }
1f5bd336
ML
304 ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
305
306 blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
ef295ecf 307 rq = __blk_mq_alloc_request(&alloc_data, rw);
1f5bd336 308 if (!rq) {
c8712c6a
CH
309 ret = -EWOULDBLOCK;
310 goto out_queue_exit;
1f5bd336
ML
311 }
312
313 return rq;
c8712c6a
CH
314
315out_queue_exit:
316 blk_queue_exit(q);
317 return ERR_PTR(ret);
1f5bd336
ML
318}
319EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
320
320ae51f
JA
321static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
322 struct blk_mq_ctx *ctx, struct request *rq)
323{
324 const int tag = rq->tag;
325 struct request_queue *q = rq->q;
326
e8064021 327 if (rq->rq_flags & RQF_MQ_INFLIGHT)
0d2602ca 328 atomic_dec(&hctx->nr_active);
e8064021 329 rq->rq_flags = 0;
0d2602ca 330
af76e555 331 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
40aabb67 332 blk_mq_put_tag(hctx, ctx, tag);
3ef28e83 333 blk_queue_exit(q);
320ae51f
JA
334}
335
7c7f2f2b 336void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
320ae51f
JA
337{
338 struct blk_mq_ctx *ctx = rq->mq_ctx;
320ae51f
JA
339
340 ctx->rq_completed[rq_is_sync(rq)]++;
320ae51f 341 __blk_mq_free_request(hctx, ctx, rq);
7c7f2f2b
JA
342
343}
344EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
345
346void blk_mq_free_request(struct request *rq)
347{
7d7e0f90 348 blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
320ae51f 349}
1a3b595a 350EXPORT_SYMBOL_GPL(blk_mq_free_request);
320ae51f 351
c8a446ad 352inline void __blk_mq_end_request(struct request *rq, int error)
320ae51f 353{
0d11e6ac
ML
354 blk_account_io_done(rq);
355
91b63639 356 if (rq->end_io) {
320ae51f 357 rq->end_io(rq, error);
91b63639
CH
358 } else {
359 if (unlikely(blk_bidi_rq(rq)))
360 blk_mq_free_request(rq->next_rq);
320ae51f 361 blk_mq_free_request(rq);
91b63639 362 }
320ae51f 363}
c8a446ad 364EXPORT_SYMBOL(__blk_mq_end_request);
63151a44 365
c8a446ad 366void blk_mq_end_request(struct request *rq, int error)
63151a44
CH
367{
368 if (blk_update_request(rq, error, blk_rq_bytes(rq)))
369 BUG();
c8a446ad 370 __blk_mq_end_request(rq, error);
63151a44 371}
c8a446ad 372EXPORT_SYMBOL(blk_mq_end_request);
320ae51f 373
30a91cb4 374static void __blk_mq_complete_request_remote(void *data)
320ae51f 375{
3d6efbf6 376 struct request *rq = data;
320ae51f 377
30a91cb4 378 rq->q->softirq_done_fn(rq);
320ae51f 379}
320ae51f 380
ed851860 381static void blk_mq_ipi_complete_request(struct request *rq)
320ae51f
JA
382{
383 struct blk_mq_ctx *ctx = rq->mq_ctx;
38535201 384 bool shared = false;
320ae51f
JA
385 int cpu;
386
38535201 387 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
30a91cb4
CH
388 rq->q->softirq_done_fn(rq);
389 return;
390 }
320ae51f
JA
391
392 cpu = get_cpu();
38535201
CH
393 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
394 shared = cpus_share_cache(cpu, ctx->cpu);
395
396 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
30a91cb4 397 rq->csd.func = __blk_mq_complete_request_remote;
3d6efbf6
CH
398 rq->csd.info = rq;
399 rq->csd.flags = 0;
c46fff2a 400 smp_call_function_single_async(ctx->cpu, &rq->csd);
3d6efbf6 401 } else {
30a91cb4 402 rq->q->softirq_done_fn(rq);
3d6efbf6 403 }
320ae51f
JA
404 put_cpu();
405}
30a91cb4 406
cf43e6be
JA
407static void blk_mq_stat_add(struct request *rq)
408{
409 if (rq->rq_flags & RQF_STATS) {
410 /*
411 * We could use rq->mq_ctx here, but there's less of a risk
412 * of races if we have the completion event add the stats
413 * to the local software queue.
414 */
415 struct blk_mq_ctx *ctx;
416
417 ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id());
418 blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq);
419 }
420}
421
1fa8cc52 422static void __blk_mq_complete_request(struct request *rq)
ed851860
JA
423{
424 struct request_queue *q = rq->q;
425
cf43e6be
JA
426 blk_mq_stat_add(rq);
427
ed851860 428 if (!q->softirq_done_fn)
c8a446ad 429 blk_mq_end_request(rq, rq->errors);
ed851860
JA
430 else
431 blk_mq_ipi_complete_request(rq);
432}
433
30a91cb4
CH
434/**
435 * blk_mq_complete_request - end I/O on a request
436 * @rq: the request being processed
437 *
438 * Description:
439 * Ends all I/O on a request. It does not handle partial completions.
440 * The actual completion happens out-of-order, through an IPI handler.
441 **/
f4829a9b 442void blk_mq_complete_request(struct request *rq, int error)
30a91cb4 443{
95f09684
JA
444 struct request_queue *q = rq->q;
445
446 if (unlikely(blk_should_fake_timeout(q)))
30a91cb4 447 return;
f4829a9b
CH
448 if (!blk_mark_rq_complete(rq)) {
449 rq->errors = error;
ed851860 450 __blk_mq_complete_request(rq);
f4829a9b 451 }
30a91cb4
CH
452}
453EXPORT_SYMBOL(blk_mq_complete_request);
320ae51f 454
973c0191
KB
455int blk_mq_request_started(struct request *rq)
456{
457 return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
458}
459EXPORT_SYMBOL_GPL(blk_mq_request_started);
460
e2490073 461void blk_mq_start_request(struct request *rq)
320ae51f
JA
462{
463 struct request_queue *q = rq->q;
464
465 trace_block_rq_issue(q, rq);
466
742ee69b 467 rq->resid_len = blk_rq_bytes(rq);
91b63639
CH
468 if (unlikely(blk_bidi_rq(rq)))
469 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
742ee69b 470
cf43e6be
JA
471 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
472 blk_stat_set_issue_time(&rq->issue_stat);
473 rq->rq_flags |= RQF_STATS;
474 }
475
2b8393b4 476 blk_add_timer(rq);
87ee7b11 477
538b7534
JA
478 /*
479 * Ensure that ->deadline is visible before setting the started
480 * flag and clearing the completed flag.
481 */
482 smp_mb__before_atomic();
483
87ee7b11
JA
484 /*
485 * Mark us as started and clear complete. Complete might have been
486 * set if requeue raced with timeout, which then marked it as
487 * complete. So be sure to clear complete again when we start
488 * the request, otherwise we'll ignore the completion event.
489 */
4b570521
JA
490 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
491 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
492 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
493 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
49f5baa5
CH
494
495 if (q->dma_drain_size && blk_rq_bytes(rq)) {
496 /*
497 * Make sure space for the drain appears. We know we can do
498 * this because max_hw_segments has been adjusted to be one
499 * fewer than the device can handle.
500 */
501 rq->nr_phys_segments++;
502 }
320ae51f 503}
e2490073 504EXPORT_SYMBOL(blk_mq_start_request);
320ae51f 505
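
/*
 * Illustrative user-space sketch (not part of blk-mq.c): the barrier in
 * blk_mq_start_request() above orders the ->deadline store before the
 * STARTED bit becomes visible.  The same publish/observe pairing is shown
 * here with C11 fences; the names and the pthread scaffolding are
 * hypothetical, and the fences are only analogues of the kernel barriers.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static unsigned long deadline;	/* plain data, published by the writer */
static atomic_int started;	/* stands in for REQ_ATOM_STARTED */

static void *writer(void *arg)
{
	(void)arg;
	deadline = 12345;				/* set ->deadline */
	atomic_thread_fence(memory_order_release);	/* smp_mb__before_atomic() analogue */
	atomic_store_explicit(&started, 1, memory_order_relaxed);
	return NULL;
}

static void *reader(void *arg)
{
	(void)arg;
	while (!atomic_load_explicit(&started, memory_order_relaxed))
		;					/* spin until STARTED is seen */
	atomic_thread_fence(memory_order_acquire);	/* pairs with the release fence */
	printf("deadline=%lu\n", deadline);		/* guaranteed to print 12345 */
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&r, NULL, reader, NULL);
	pthread_create(&w, NULL, writer, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}
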
ed0791b2 506static void __blk_mq_requeue_request(struct request *rq)
320ae51f
JA
507{
508 struct request_queue *q = rq->q;
509
510 trace_block_rq_requeue(q, rq);
49f5baa5 511
e2490073
CH
512 if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
513 if (q->dma_drain_size && blk_rq_bytes(rq))
514 rq->nr_phys_segments--;
515 }
320ae51f
JA
516}
517
2b053aca 518void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
ed0791b2 519{
ed0791b2 520 __blk_mq_requeue_request(rq);
ed0791b2 521
ed0791b2 522 BUG_ON(blk_queued_rq(rq));
2b053aca 523 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
ed0791b2
CH
524}
525EXPORT_SYMBOL(blk_mq_requeue_request);
526
6fca6a61
CH
527static void blk_mq_requeue_work(struct work_struct *work)
528{
529 struct request_queue *q =
2849450a 530 container_of(work, struct request_queue, requeue_work.work);
6fca6a61
CH
531 LIST_HEAD(rq_list);
532 struct request *rq, *next;
533 unsigned long flags;
534
535 spin_lock_irqsave(&q->requeue_lock, flags);
536 list_splice_init(&q->requeue_list, &rq_list);
537 spin_unlock_irqrestore(&q->requeue_lock, flags);
538
539 list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
e8064021 540 if (!(rq->rq_flags & RQF_SOFTBARRIER))
6fca6a61
CH
541 continue;
542
e8064021 543 rq->rq_flags &= ~RQF_SOFTBARRIER;
6fca6a61
CH
544 list_del_init(&rq->queuelist);
545 blk_mq_insert_request(rq, true, false, false);
546 }
547
548 while (!list_empty(&rq_list)) {
549 rq = list_entry(rq_list.next, struct request, queuelist);
550 list_del_init(&rq->queuelist);
551 blk_mq_insert_request(rq, false, false, false);
552 }
553
52d7f1b5 554 blk_mq_run_hw_queues(q, false);
6fca6a61
CH
555}
556
2b053aca
BVA
557void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
558 bool kick_requeue_list)
6fca6a61
CH
559{
560 struct request_queue *q = rq->q;
561 unsigned long flags;
562
563 /*
564 * We abuse this flag that is otherwise used by the I/O scheduler to
565 * request head insertion from the workqueue.
566 */
e8064021 567 BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
6fca6a61
CH
568
569 spin_lock_irqsave(&q->requeue_lock, flags);
570 if (at_head) {
e8064021 571 rq->rq_flags |= RQF_SOFTBARRIER;
6fca6a61
CH
572 list_add(&rq->queuelist, &q->requeue_list);
573 } else {
574 list_add_tail(&rq->queuelist, &q->requeue_list);
575 }
576 spin_unlock_irqrestore(&q->requeue_lock, flags);
2b053aca
BVA
577
578 if (kick_requeue_list)
579 blk_mq_kick_requeue_list(q);
6fca6a61
CH
580}
581EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
582
583void blk_mq_kick_requeue_list(struct request_queue *q)
584{
2849450a 585 kblockd_schedule_delayed_work(&q->requeue_work, 0);
6fca6a61
CH
586}
587EXPORT_SYMBOL(blk_mq_kick_requeue_list);
588
2849450a
MS
589void blk_mq_delay_kick_requeue_list(struct request_queue *q,
590 unsigned long msecs)
591{
592 kblockd_schedule_delayed_work(&q->requeue_work,
593 msecs_to_jiffies(msecs));
594}
595EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
596
1885b24d
JA
597void blk_mq_abort_requeue_list(struct request_queue *q)
598{
599 unsigned long flags;
600 LIST_HEAD(rq_list);
601
602 spin_lock_irqsave(&q->requeue_lock, flags);
603 list_splice_init(&q->requeue_list, &rq_list);
604 spin_unlock_irqrestore(&q->requeue_lock, flags);
605
606 while (!list_empty(&rq_list)) {
607 struct request *rq;
608
609 rq = list_first_entry(&rq_list, struct request, queuelist);
610 list_del_init(&rq->queuelist);
611 rq->errors = -EIO;
612 blk_mq_end_request(rq, rq->errors);
613 }
614}
615EXPORT_SYMBOL(blk_mq_abort_requeue_list);
616
0e62f51f
JA
617struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
618{
88c7b2b7
JA
619 if (tag < tags->nr_tags) {
620 prefetch(tags->rqs[tag]);
4ee86bab 621 return tags->rqs[tag];
88c7b2b7 622 }
4ee86bab
HR
623
624 return NULL;
24d2f903
CH
625}
626EXPORT_SYMBOL(blk_mq_tag_to_rq);
627
320ae51f 628struct blk_mq_timeout_data {
46f92d42
CH
629 unsigned long next;
630 unsigned int next_set;
320ae51f
JA
631};
632
90415837 633void blk_mq_rq_timed_out(struct request *req, bool reserved)
320ae51f 634{
46f92d42
CH
635 struct blk_mq_ops *ops = req->q->mq_ops;
636 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
87ee7b11
JA
637
638 /*
639 * We know that complete is set at this point. If STARTED isn't set
640 * anymore, then the request isn't active and the "timeout" should
641 * just be ignored. This can happen due to the bitflag ordering.
642 * Timeout first checks if STARTED is set, and if it is, assumes
643 * the request is active. But if we race with completion, then
644 * both flags will get cleared. So check here again, and ignore
645 * a timeout event with a request that isn't active.
646 */
46f92d42
CH
647 if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
648 return;
87ee7b11 649
46f92d42 650 if (ops->timeout)
0152fb6b 651 ret = ops->timeout(req, reserved);
46f92d42
CH
652
653 switch (ret) {
654 case BLK_EH_HANDLED:
655 __blk_mq_complete_request(req);
656 break;
657 case BLK_EH_RESET_TIMER:
658 blk_add_timer(req);
659 blk_clear_rq_complete(req);
660 break;
661 case BLK_EH_NOT_HANDLED:
662 break;
663 default:
664 printk(KERN_ERR "block: bad eh return: %d\n", ret);
665 break;
666 }
87ee7b11 667}
5b3f25fc 668
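
/*
 * Illustrative user-space sketch (not part of blk-mq.c): the race described
 * in the comment above is resolved by having both the completion path and
 * the timeout path "claim" the request first (blk_mark_rq_complete() is a
 * test-and-set on REQ_ATOM_COMPLETE), so exactly one of them finishes it.
 * This hypothetical version uses atomic_exchange() as the claim.
 */
#include <stdatomic.h>
#include <stdio.h>

struct demo_rq {
	atomic_int complete;	/* stands in for REQ_ATOM_COMPLETE */
};

/* Returns nonzero only for the caller that wins and may finish the request. */
static int demo_mark_rq_complete(struct demo_rq *rq)
{
	return atomic_exchange(&rq->complete, 1) == 0;
}

int main(void)
{
	struct demo_rq rq = { .complete = 0 };

	if (demo_mark_rq_complete(&rq))
		printf("completion path handles the request\n");
	if (demo_mark_rq_complete(&rq))
		printf("timeout path handles the request\n");	/* never both */
	return 0;
}
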
81481eb4
CH
669static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
670 struct request *rq, void *priv, bool reserved)
671{
672 struct blk_mq_timeout_data *data = priv;
87ee7b11 673
eb130dbf
KB
674 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
675 /*
676 * If a request wasn't started before the queue was
677 * marked dying, kill it here or it'll go unnoticed.
678 */
a59e0f57
KB
679 if (unlikely(blk_queue_dying(rq->q))) {
680 rq->errors = -EIO;
681 blk_mq_end_request(rq, rq->errors);
682 }
46f92d42 683 return;
eb130dbf 684 }
87ee7b11 685
46f92d42
CH
686 if (time_after_eq(jiffies, rq->deadline)) {
687 if (!blk_mark_rq_complete(rq))
0152fb6b 688 blk_mq_rq_timed_out(rq, reserved);
46f92d42
CH
689 } else if (!data->next_set || time_after(data->next, rq->deadline)) {
690 data->next = rq->deadline;
691 data->next_set = 1;
692 }
87ee7b11
JA
693}
694
287922eb 695static void blk_mq_timeout_work(struct work_struct *work)
320ae51f 696{
287922eb
CH
697 struct request_queue *q =
698 container_of(work, struct request_queue, timeout_work);
81481eb4
CH
699 struct blk_mq_timeout_data data = {
700 .next = 0,
701 .next_set = 0,
702 };
81481eb4 703 int i;
320ae51f 704
71f79fb3
GKB
705 /* A deadlock might occur if a request is stuck requiring a
706 * timeout at the same time a queue freeze is waiting for
707 * completion, since the timeout code would not be able to
708 * acquire the queue reference here.
709 *
710 * That's why we don't use blk_queue_enter here; instead, we use
711 * percpu_ref_tryget directly, because we need to be able to
712 * obtain a reference even in the short window between the queue
713 * starting to freeze, by dropping the first reference in
714 * blk_mq_freeze_queue_start, and the moment the last request is
715 * consumed, marked by the instant q_usage_counter reaches
716 * zero.
717 */
718 if (!percpu_ref_tryget(&q->q_usage_counter))
287922eb
CH
719 return;
720
0bf6cd5b 721 blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
320ae51f 722
81481eb4
CH
723 if (data.next_set) {
724 data.next = blk_rq_timeout(round_jiffies_up(data.next));
725 mod_timer(&q->timeout, data.next);
0d2602ca 726 } else {
0bf6cd5b
CH
727 struct blk_mq_hw_ctx *hctx;
728
f054b56c
ML
729 queue_for_each_hw_ctx(q, hctx, i) {
730 /* the hctx may be unmapped, so check it here */
731 if (blk_mq_hw_queue_mapped(hctx))
732 blk_mq_tag_idle(hctx);
733 }
0d2602ca 734 }
287922eb 735 blk_queue_exit(q);
320ae51f
JA
736}
737
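
/*
 * Illustrative user-space sketch (not part of blk-mq.c): the timeout work
 * above uses percpu_ref_tryget() so it only grabs a queue reference while
 * the queue is still alive, instead of blocking like blk_queue_enter()
 * would.  percpu_ref spreads the count across CPUs; this hypothetical
 * version collapses it into one atomic, where zero means "gone".
 */
#include <stdatomic.h>
#include <stdbool.h>

struct demo_ref {
	atomic_long count;	/* 0 plays the role of a drained q_usage_counter */
};

static bool demo_ref_tryget(struct demo_ref *ref)
{
	long old = atomic_load(&ref->count);

	while (old > 0) {
		/* Only bump the count if it is still non-zero. */
		if (atomic_compare_exchange_weak(&ref->count, &old, old + 1))
			return true;
	}
	return false;		/* already dead: the caller must bail out */
}

static void demo_ref_put(struct demo_ref *ref)
{
	atomic_fetch_sub(&ref->count, 1);
}

int main(void)
{
	struct demo_ref ref = { .count = 1 };

	if (demo_ref_tryget(&ref)) {	/* succeeds while the queue is alive */
		/* ... scan for expired requests ... */
		demo_ref_put(&ref);
	}
	return 0;
}
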
738/*
739 * Reverse check our software queue for entries that we could potentially
740 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
741 * too much time checking for merges.
742 */
743static bool blk_mq_attempt_merge(struct request_queue *q,
744 struct blk_mq_ctx *ctx, struct bio *bio)
745{
746 struct request *rq;
747 int checked = 8;
748
749 list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
750 int el_ret;
751
752 if (!checked--)
753 break;
754
755 if (!blk_rq_merge_ok(rq, bio))
756 continue;
757
758 el_ret = blk_try_merge(rq, bio);
759 if (el_ret == ELEVATOR_BACK_MERGE) {
760 if (bio_attempt_back_merge(q, rq, bio)) {
761 ctx->rq_merged++;
762 return true;
763 }
764 break;
765 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
766 if (bio_attempt_front_merge(q, rq, bio)) {
767 ctx->rq_merged++;
768 return true;
769 }
770 break;
771 }
772 }
773
774 return false;
775}
776
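
/*
 * Illustrative user-space sketch (not part of blk-mq.c): the reverse scan
 * above walks the per-ctx list newest-first and gives up after 8 entries so
 * merging never gets expensive.  Everything below is hypothetical; a plain
 * array stands in for ctx->rq_list and only back merges are modelled.
 */
#include <stdbool.h>
#include <stddef.h>

struct demo_req {
	unsigned long sector;
	unsigned long nr_sectors;
};

/* Would a bio starting at 'sector' attach to the back of 'rq'? */
static bool demo_back_mergeable(const struct demo_req *rq, unsigned long sector)
{
	return rq->sector + rq->nr_sectors == sector;
}

static struct demo_req *demo_attempt_merge(struct demo_req *reqs, size_t nr,
					   unsigned long bio_sector)
{
	int checked = 8;	/* same hand-wavy stop count as above */
	size_t i;

	for (i = nr; i-- > 0; ) {		/* newest entries first */
		if (!checked--)
			break;
		if (demo_back_mergeable(&reqs[i], bio_sector))
			return &reqs[i];
	}
	return NULL;				/* no merge candidate found */
}

int main(void)
{
	struct demo_req reqs[] = { { 0, 8 }, { 100, 8 }, { 200, 8 } };

	/* A bio at sector 108 merges behind reqs[1] (100 + 8). */
	return demo_attempt_merge(reqs, 3, 108) == &reqs[1] ? 0 : 1;
}
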
88459642
OS
777struct flush_busy_ctx_data {
778 struct blk_mq_hw_ctx *hctx;
779 struct list_head *list;
780};
781
782static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
783{
784 struct flush_busy_ctx_data *flush_data = data;
785 struct blk_mq_hw_ctx *hctx = flush_data->hctx;
786 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
787
788 sbitmap_clear_bit(sb, bitnr);
789 spin_lock(&ctx->lock);
790 list_splice_tail_init(&ctx->rq_list, flush_data->list);
791 spin_unlock(&ctx->lock);
792 return true;
793}
794
1429d7c9
JA
795/*
796 * Process software queues that have been marked busy, splicing them
797 * to the for-dispatch list.
798 */
799static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
800{
88459642
OS
801 struct flush_busy_ctx_data data = {
802 .hctx = hctx,
803 .list = list,
804 };
1429d7c9 805
88459642 806 sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
1429d7c9 807}
1429d7c9 808
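
/*
 * Illustrative user-space sketch (not part of blk-mq.c): flush_busy_ctxs()
 * visits only the ctx_map bits that are set and runs a callback for each
 * one, the way sbitmap_for_each_set() does.  This hypothetical version does
 * the same over a single machine word using a compiler builtin.
 */
#include <stdbool.h>
#include <stdio.h>

typedef bool (*bit_fn)(unsigned int bitnr, void *data);

static void demo_for_each_set_bit(unsigned long map, bit_fn fn, void *data)
{
	while (map) {
		unsigned int bitnr = __builtin_ctzl(map);	/* lowest set bit */

		if (!fn(bitnr, data))
			break;					/* callback may stop early */
		map &= map - 1;					/* clear that bit */
	}
}

static bool demo_flush_ctx(unsigned int bitnr, void *data)
{
	(void)data;
	printf("software queue %u has pending work\n", bitnr);
	return true;		/* keep going, like flush_busy_ctx() returning true */
}

int main(void)
{
	demo_for_each_set_bit(0x29UL, demo_flush_ctx, NULL);	/* bits 0, 3, 5 */
	return 0;
}
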
703fd1c0
JA
809static inline unsigned int queued_to_index(unsigned int queued)
810{
811 if (!queued)
812 return 0;
1429d7c9 813
703fd1c0 814 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
1429d7c9
JA
815}
816
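
/*
 * Illustrative user-space sketch (not part of blk-mq.c): queued_to_index()
 * turns "how many requests one run dispatched" into a logarithmic histogram
 * bucket (nothing, 1, 2-3, 4-7, 8-15, ...), clamped to the last slot.  The
 * constant and helpers below are hypothetical stand-ins.
 */
#include <stdio.h>

#define DEMO_MAX_DISPATCH_ORDER 7	/* plays the role of BLK_MQ_MAX_DISPATCH_ORDER */

static unsigned int demo_ilog2(unsigned int v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

static unsigned int demo_queued_to_index(unsigned int queued)
{
	unsigned int idx;

	if (!queued)
		return 0;
	/* bucket = floor(log2(queued)) + 1, clamped to the last slot */
	idx = demo_ilog2(queued) + 1;
	return idx < DEMO_MAX_DISPATCH_ORDER - 1 ? idx : DEMO_MAX_DISPATCH_ORDER - 1;
}

int main(void)
{
	unsigned int samples[] = { 0, 1, 2, 4, 7, 64, 1000 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("queued=%4u -> bucket %u\n", samples[i],
		       demo_queued_to_index(samples[i]));
	return 0;
}
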
320ae51f
JA
817/*
818 * Run this hardware queue, pulling any software queues mapped to it in.
819 * Note that this function currently has various problems around ordering
820 * of IO. In particular, we'd like FIFO behaviour on handling existing
821 * items on the hctx->dispatch list. Ignore that for now.
822 */
6a83e74d 823static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
320ae51f
JA
824{
825 struct request_queue *q = hctx->queue;
320ae51f
JA
826 struct request *rq;
827 LIST_HEAD(rq_list);
74c45052
JA
828 LIST_HEAD(driver_list);
829 struct list_head *dptr;
1429d7c9 830 int queued;
320ae51f 831
5d1b25c1 832 if (unlikely(blk_mq_hctx_stopped(hctx)))
320ae51f
JA
833 return;
834
835 hctx->run++;
836
837 /*
838 * Touch any software queue that has pending entries.
839 */
1429d7c9 840 flush_busy_ctxs(hctx, &rq_list);
320ae51f
JA
841
842 /*
843 * If we have previous entries on our dispatch list, grab them
844 * and stuff them at the front for more fair dispatch.
845 */
846 if (!list_empty_careful(&hctx->dispatch)) {
847 spin_lock(&hctx->lock);
848 if (!list_empty(&hctx->dispatch))
849 list_splice_init(&hctx->dispatch, &rq_list);
850 spin_unlock(&hctx->lock);
851 }
852
74c45052
JA
853 /*
854 * Start off with dptr being NULL, so we start the first request
855 * immediately, even if we have more pending.
856 */
857 dptr = NULL;
858
320ae51f
JA
859 /*
860 * Now process all the entries, sending them to the driver.
861 */
1429d7c9 862 queued = 0;
320ae51f 863 while (!list_empty(&rq_list)) {
74c45052 864 struct blk_mq_queue_data bd;
320ae51f
JA
865 int ret;
866
867 rq = list_first_entry(&rq_list, struct request, queuelist);
868 list_del_init(&rq->queuelist);
320ae51f 869
74c45052
JA
870 bd.rq = rq;
871 bd.list = dptr;
872 bd.last = list_empty(&rq_list);
873
874 ret = q->mq_ops->queue_rq(hctx, &bd);
320ae51f
JA
875 switch (ret) {
876 case BLK_MQ_RQ_QUEUE_OK:
877 queued++;
52b9c330 878 break;
320ae51f 879 case BLK_MQ_RQ_QUEUE_BUSY:
320ae51f 880 list_add(&rq->queuelist, &rq_list);
ed0791b2 881 __blk_mq_requeue_request(rq);
320ae51f
JA
882 break;
883 default:
884 pr_err("blk-mq: bad return on queue: %d\n", ret);
320ae51f 885 case BLK_MQ_RQ_QUEUE_ERROR:
1e93b8c2 886 rq->errors = -EIO;
c8a446ad 887 blk_mq_end_request(rq, rq->errors);
320ae51f
JA
888 break;
889 }
890
891 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
892 break;
74c45052
JA
893
894 /*
895 * We've done the first request. If we have more than 1
896 * left in the list, set dptr to defer issue.
897 */
898 if (!dptr && rq_list.next != rq_list.prev)
899 dptr = &driver_list;
320ae51f
JA
900 }
901
703fd1c0 902 hctx->dispatched[queued_to_index(queued)]++;
320ae51f
JA
903
904 /*
905 * Any items that need requeuing? Stuff them into hctx->dispatch,
906 * that is where we will continue on next queue run.
907 */
908 if (!list_empty(&rq_list)) {
909 spin_lock(&hctx->lock);
910 list_splice(&rq_list, &hctx->dispatch);
911 spin_unlock(&hctx->lock);
9ba52e58
SL
912 /*
913 * the queue is expected to be stopped with BLK_MQ_RQ_QUEUE_BUSY, but
914 * it's possible the queue is stopped and restarted again
915 * before this. Queue restart will dispatch requests. And since
916 * requests in rq_list aren't added into hctx->dispatch yet,
917 * the requests in rq_list might get lost.
918 *
919 * blk_mq_run_hw_queue() already checks the STOPPED bit
920 **/
921 blk_mq_run_hw_queue(hctx, true);
320ae51f
JA
922 }
923}
924
6a83e74d
BVA
925static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
926{
927 int srcu_idx;
928
929 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
930 cpu_online(hctx->next_cpu));
931
932 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
933 rcu_read_lock();
934 blk_mq_process_rq_list(hctx);
935 rcu_read_unlock();
936 } else {
937 srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
938 blk_mq_process_rq_list(hctx);
939 srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
940 }
941}
942
506e931f
JA
943/*
944 * It'd be great if the workqueue API had a way to pass
945 * in a mask and had some smarts for more clever placement.
946 * For now we just round-robin here, switching for every
947 * BLK_MQ_CPU_WORK_BATCH queued items.
948 */
949static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
950{
b657d7e6
CH
951 if (hctx->queue->nr_hw_queues == 1)
952 return WORK_CPU_UNBOUND;
506e931f
JA
953
954 if (--hctx->next_cpu_batch <= 0) {
c02ebfdd 955 int next_cpu;
506e931f
JA
956
957 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
958 if (next_cpu >= nr_cpu_ids)
959 next_cpu = cpumask_first(hctx->cpumask);
960
961 hctx->next_cpu = next_cpu;
962 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
963 }
964
b657d7e6 965 return hctx->next_cpu;
506e931f
JA
966}
967
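
/*
 * Illustrative user-space sketch (not part of blk-mq.c): the helper above
 * round-robins work placement over the CPUs in hctx->cpumask, but only
 * advances once every BLK_MQ_CPU_WORK_BATCH picks so consecutive runs stay
 * on one CPU for a while.  The cpumask becomes a hypothetical plain array.
 */
#include <stdio.h>

#define DEMO_WORK_BATCH 8	/* plays the role of BLK_MQ_CPU_WORK_BATCH */

struct demo_hctx {
	const int *cpus;	/* CPUs this hardware queue may run on */
	int nr_cpus;
	int next_idx;		/* index of the currently preferred CPU */
	int batch_left;		/* picks left before we rotate */
};

static int demo_next_cpu(struct demo_hctx *hctx)
{
	if (--hctx->batch_left <= 0) {
		hctx->next_idx = (hctx->next_idx + 1) % hctx->nr_cpus;
		hctx->batch_left = DEMO_WORK_BATCH;
	}
	return hctx->cpus[hctx->next_idx];
}

int main(void)
{
	static const int cpus[] = { 0, 2, 4, 6 };
	struct demo_hctx hctx = { cpus, 4, 0, 1 };	/* first pick starts a fresh batch */
	int i;

	for (i = 0; i < 20; i++)
		printf("%d ", demo_next_cpu(&hctx));	/* 8 picks per CPU, then rotate */
	printf("\n");
	return 0;
}
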
320ae51f
JA
968void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
969{
5d1b25c1
BVA
970 if (unlikely(blk_mq_hctx_stopped(hctx) ||
971 !blk_mq_hw_queue_mapped(hctx)))
320ae51f
JA
972 return;
973
1b792f2f 974 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
2a90d4aa
PB
975 int cpu = get_cpu();
976 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
398205b8 977 __blk_mq_run_hw_queue(hctx);
2a90d4aa 978 put_cpu();
398205b8
PB
979 return;
980 }
e4043dcf 981
2a90d4aa 982 put_cpu();
e4043dcf 983 }
398205b8 984
27489a3c 985 kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
320ae51f
JA
986}
987
b94ec296 988void blk_mq_run_hw_queues(struct request_queue *q, bool async)
320ae51f
JA
989{
990 struct blk_mq_hw_ctx *hctx;
991 int i;
992
993 queue_for_each_hw_ctx(q, hctx, i) {
994 if ((!blk_mq_hctx_has_pending(hctx) &&
995 list_empty_careful(&hctx->dispatch)) ||
5d1b25c1 996 blk_mq_hctx_stopped(hctx))
320ae51f
JA
997 continue;
998
b94ec296 999 blk_mq_run_hw_queue(hctx, async);
320ae51f
JA
1000 }
1001}
b94ec296 1002EXPORT_SYMBOL(blk_mq_run_hw_queues);
320ae51f 1003
fd001443
BVA
1004/**
1005 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
1006 * @q: request queue.
1007 *
1008 * The caller is responsible for serializing this function against
1009 * blk_mq_{start,stop}_hw_queue().
1010 */
1011bool blk_mq_queue_stopped(struct request_queue *q)
1012{
1013 struct blk_mq_hw_ctx *hctx;
1014 int i;
1015
1016 queue_for_each_hw_ctx(q, hctx, i)
1017 if (blk_mq_hctx_stopped(hctx))
1018 return true;
1019
1020 return false;
1021}
1022EXPORT_SYMBOL(blk_mq_queue_stopped);
1023
320ae51f
JA
1024void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
1025{
27489a3c 1026 cancel_work(&hctx->run_work);
70f4db63 1027 cancel_delayed_work(&hctx->delay_work);
320ae51f
JA
1028 set_bit(BLK_MQ_S_STOPPED, &hctx->state);
1029}
1030EXPORT_SYMBOL(blk_mq_stop_hw_queue);
1031
280d45f6
CH
1032void blk_mq_stop_hw_queues(struct request_queue *q)
1033{
1034 struct blk_mq_hw_ctx *hctx;
1035 int i;
1036
1037 queue_for_each_hw_ctx(q, hctx, i)
1038 blk_mq_stop_hw_queue(hctx);
1039}
1040EXPORT_SYMBOL(blk_mq_stop_hw_queues);
1041
320ae51f
JA
1042void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
1043{
1044 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
e4043dcf 1045
0ffbce80 1046 blk_mq_run_hw_queue(hctx, false);
320ae51f
JA
1047}
1048EXPORT_SYMBOL(blk_mq_start_hw_queue);
1049
2f268556
CH
1050void blk_mq_start_hw_queues(struct request_queue *q)
1051{
1052 struct blk_mq_hw_ctx *hctx;
1053 int i;
1054
1055 queue_for_each_hw_ctx(q, hctx, i)
1056 blk_mq_start_hw_queue(hctx);
1057}
1058EXPORT_SYMBOL(blk_mq_start_hw_queues);
1059
1b4a3258 1060void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
320ae51f
JA
1061{
1062 struct blk_mq_hw_ctx *hctx;
1063 int i;
1064
1065 queue_for_each_hw_ctx(q, hctx, i) {
5d1b25c1 1066 if (!blk_mq_hctx_stopped(hctx))
320ae51f
JA
1067 continue;
1068
1069 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1b4a3258 1070 blk_mq_run_hw_queue(hctx, async);
320ae51f
JA
1071 }
1072}
1073EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
1074
70f4db63 1075static void blk_mq_run_work_fn(struct work_struct *work)
320ae51f
JA
1076{
1077 struct blk_mq_hw_ctx *hctx;
1078
27489a3c 1079 hctx = container_of(work, struct blk_mq_hw_ctx, run_work);
e4043dcf 1080
320ae51f
JA
1081 __blk_mq_run_hw_queue(hctx);
1082}
1083
70f4db63
CH
1084static void blk_mq_delay_work_fn(struct work_struct *work)
1085{
1086 struct blk_mq_hw_ctx *hctx;
1087
1088 hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
1089
1090 if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
1091 __blk_mq_run_hw_queue(hctx);
1092}
1093
1094void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1095{
19c66e59
ML
1096 if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
1097 return;
70f4db63 1098
b657d7e6
CH
1099 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
1100 &hctx->delay_work, msecs_to_jiffies(msecs));
70f4db63
CH
1101}
1102EXPORT_SYMBOL(blk_mq_delay_queue);
1103
cfd0c552 1104static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
cfd0c552
ML
1105 struct request *rq,
1106 bool at_head)
320ae51f 1107{
e57690fe
JA
1108 struct blk_mq_ctx *ctx = rq->mq_ctx;
1109
01b983c9
JA
1110 trace_block_rq_insert(hctx->queue, rq);
1111
72a0a36e
CH
1112 if (at_head)
1113 list_add(&rq->queuelist, &ctx->rq_list);
1114 else
1115 list_add_tail(&rq->queuelist, &ctx->rq_list);
cfd0c552 1116}
4bb659b1 1117
cfd0c552
ML
1118static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
1119 struct request *rq, bool at_head)
1120{
1121 struct blk_mq_ctx *ctx = rq->mq_ctx;
1122
e57690fe 1123 __blk_mq_insert_req_list(hctx, rq, at_head);
320ae51f 1124 blk_mq_hctx_mark_pending(hctx, ctx);
320ae51f
JA
1125}
1126
eeabc850 1127void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
e57690fe 1128 bool async)
320ae51f 1129{
e57690fe 1130 struct blk_mq_ctx *ctx = rq->mq_ctx;
eeabc850 1131 struct request_queue *q = rq->q;
7d7e0f90 1132 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
320ae51f 1133
a57a178a
CH
1134 spin_lock(&ctx->lock);
1135 __blk_mq_insert_request(hctx, rq, at_head);
1136 spin_unlock(&ctx->lock);
320ae51f 1137
320ae51f
JA
1138 if (run_queue)
1139 blk_mq_run_hw_queue(hctx, async);
1140}
1141
1142static void blk_mq_insert_requests(struct request_queue *q,
1143 struct blk_mq_ctx *ctx,
1144 struct list_head *list,
1145 int depth,
1146 bool from_schedule)
1147
1148{
7d7e0f90 1149 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
320ae51f
JA
1150
1151 trace_block_unplug(q, depth, !from_schedule);
1152
320ae51f
JA
1153 /*
1154 * preemption doesn't flush the plug list, so it's possible ctx->cpu is
1155 * offline now
1156 */
1157 spin_lock(&ctx->lock);
1158 while (!list_empty(list)) {
1159 struct request *rq;
1160
1161 rq = list_first_entry(list, struct request, queuelist);
e57690fe 1162 BUG_ON(rq->mq_ctx != ctx);
320ae51f 1163 list_del_init(&rq->queuelist);
e57690fe 1164 __blk_mq_insert_req_list(hctx, rq, false);
320ae51f 1165 }
cfd0c552 1166 blk_mq_hctx_mark_pending(hctx, ctx);
320ae51f
JA
1167 spin_unlock(&ctx->lock);
1168
320ae51f
JA
1169 blk_mq_run_hw_queue(hctx, from_schedule);
1170}
1171
1172static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1173{
1174 struct request *rqa = container_of(a, struct request, queuelist);
1175 struct request *rqb = container_of(b, struct request, queuelist);
1176
1177 return !(rqa->mq_ctx < rqb->mq_ctx ||
1178 (rqa->mq_ctx == rqb->mq_ctx &&
1179 blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1180}
1181
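
/*
 * Illustrative user-space sketch (not part of blk-mq.c): plug_ctx_cmp()
 * sorts the plugged requests by software queue first and start sector
 * second, so blk_mq_flush_plug_list() can hand them over in contiguous
 * per-ctx batches.  The qsort() comparator below is a hypothetical
 * stand-in with the same two-key ordering.
 */
#include <stdio.h>
#include <stdlib.h>

struct demo_plugged_rq {
	int ctx;		/* stands in for rq->mq_ctx */
	unsigned long pos;	/* stands in for blk_rq_pos(rq) */
};

static int demo_plug_cmp(const void *a, const void *b)
{
	const struct demo_plugged_rq *ra = a, *rb = b;

	if (ra->ctx != rb->ctx)
		return ra->ctx < rb->ctx ? -1 : 1;	/* group by ctx first */
	if (ra->pos != rb->pos)
		return ra->pos < rb->pos ? -1 : 1;	/* then by start sector */
	return 0;
}

int main(void)
{
	struct demo_plugged_rq rqs[] = {
		{ 1, 300 }, { 0, 800 }, { 1, 100 }, { 0, 200 },
	};
	int i;

	qsort(rqs, 4, sizeof(rqs[0]), demo_plug_cmp);
	for (i = 0; i < 4; i++)
		printf("ctx=%d pos=%lu\n", rqs[i].ctx, rqs[i].pos);
	return 0;
}
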
1182void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1183{
1184 struct blk_mq_ctx *this_ctx;
1185 struct request_queue *this_q;
1186 struct request *rq;
1187 LIST_HEAD(list);
1188 LIST_HEAD(ctx_list);
1189 unsigned int depth;
1190
1191 list_splice_init(&plug->mq_list, &list);
1192
1193 list_sort(NULL, &list, plug_ctx_cmp);
1194
1195 this_q = NULL;
1196 this_ctx = NULL;
1197 depth = 0;
1198
1199 while (!list_empty(&list)) {
1200 rq = list_entry_rq(list.next);
1201 list_del_init(&rq->queuelist);
1202 BUG_ON(!rq->q);
1203 if (rq->mq_ctx != this_ctx) {
1204 if (this_ctx) {
1205 blk_mq_insert_requests(this_q, this_ctx,
1206 &ctx_list, depth,
1207 from_schedule);
1208 }
1209
1210 this_ctx = rq->mq_ctx;
1211 this_q = rq->q;
1212 depth = 0;
1213 }
1214
1215 depth++;
1216 list_add_tail(&rq->queuelist, &ctx_list);
1217 }
1218
1219 /*
1220 * If 'this_ctx' is set, we know we have entries to complete
1221 * on 'ctx_list'. Do those.
1222 */
1223 if (this_ctx) {
1224 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
1225 from_schedule);
1226 }
1227}
1228
1229static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1230{
1231 init_request_from_bio(rq, bio);
4b570521 1232
a21f2a3e 1233 blk_account_io_start(rq, 1);
320ae51f
JA
1234}
1235
274a5843
JA
1236static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
1237{
1238 return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
1239 !blk_queue_nomerges(hctx->queue);
1240}
1241
07068d5b
JA
1242static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
1243 struct blk_mq_ctx *ctx,
1244 struct request *rq, struct bio *bio)
320ae51f 1245{
e18378a6 1246 if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) {
07068d5b
JA
1247 blk_mq_bio_to_request(rq, bio);
1248 spin_lock(&ctx->lock);
1249insert_rq:
1250 __blk_mq_insert_request(hctx, rq, false);
1251 spin_unlock(&ctx->lock);
1252 return false;
1253 } else {
274a5843
JA
1254 struct request_queue *q = hctx->queue;
1255
07068d5b
JA
1256 spin_lock(&ctx->lock);
1257 if (!blk_mq_attempt_merge(q, ctx, bio)) {
1258 blk_mq_bio_to_request(rq, bio);
1259 goto insert_rq;
1260 }
320ae51f 1261
07068d5b
JA
1262 spin_unlock(&ctx->lock);
1263 __blk_mq_free_request(hctx, ctx, rq);
1264 return true;
14ec77f3 1265 }
07068d5b 1266}
14ec77f3 1267
07068d5b
JA
1268static struct request *blk_mq_map_request(struct request_queue *q,
1269 struct bio *bio,
2552e3f8 1270 struct blk_mq_alloc_data *data)
07068d5b
JA
1271{
1272 struct blk_mq_hw_ctx *hctx;
1273 struct blk_mq_ctx *ctx;
1274 struct request *rq;
320ae51f 1275
3ef28e83 1276 blk_queue_enter_live(q);
320ae51f 1277 ctx = blk_mq_get_ctx(q);
7d7e0f90 1278 hctx = blk_mq_map_queue(q, ctx->cpu);
320ae51f 1279
ef295ecf 1280 trace_block_getrq(q, bio, bio->bi_opf);
2552e3f8 1281 blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
ef295ecf 1282 rq = __blk_mq_alloc_request(data, bio->bi_opf);
320ae51f 1283
7dd2fb68 1284 data->hctx->queued++;
07068d5b
JA
1285 return rq;
1286}
1287
2253efc8
BVA
1288static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1289 struct request *rq, blk_qc_t *cookie)
f984df1f
SL
1290{
1291 int ret;
1292 struct request_queue *q = rq->q;
f984df1f
SL
1293 struct blk_mq_queue_data bd = {
1294 .rq = rq,
1295 .list = NULL,
1296 .last = 1
1297 };
7b371636 1298 blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
f984df1f 1299
2253efc8
BVA
1300 if (blk_mq_hctx_stopped(hctx))
1301 goto insert;
1302
f984df1f
SL
1303 /*
1304 * For an OK return, we are done. For an error, kill it. Any other
1305 * return (busy) just means we add it to our list, as we previously
1306 * would have done.
1307 */
1308 ret = q->mq_ops->queue_rq(hctx, &bd);
7b371636
JA
1309 if (ret == BLK_MQ_RQ_QUEUE_OK) {
1310 *cookie = new_cookie;
2253efc8 1311 return;
7b371636 1312 }
f984df1f 1313
7b371636
JA
1314 __blk_mq_requeue_request(rq);
1315
1316 if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1317 *cookie = BLK_QC_T_NONE;
1318 rq->errors = -EIO;
1319 blk_mq_end_request(rq, rq->errors);
2253efc8 1320 return;
f984df1f 1321 }
7b371636 1322
2253efc8
BVA
1323insert:
1324 blk_mq_insert_request(rq, false, true, true);
f984df1f
SL
1325}
1326
07068d5b
JA
1327/*
1328 * Multiple hardware queue variant. This will not use per-process plugs,
1329 * but will attempt to bypass the hctx queueing if we can go straight to
1330 * hardware for SYNC IO.
1331 */
dece1635 1332static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
07068d5b 1333{
ef295ecf 1334 const int is_sync = op_is_sync(bio->bi_opf);
1eff9d32 1335 const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
2552e3f8 1336 struct blk_mq_alloc_data data;
07068d5b 1337 struct request *rq;
6a83e74d 1338 unsigned int request_count = 0, srcu_idx;
f984df1f 1339 struct blk_plug *plug;
5b3f341f 1340 struct request *same_queue_rq = NULL;
7b371636 1341 blk_qc_t cookie;
07068d5b
JA
1342
1343 blk_queue_bounce(q, &bio);
1344
1345 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
4246a0b6 1346 bio_io_error(bio);
dece1635 1347 return BLK_QC_T_NONE;
07068d5b
JA
1348 }
1349
54efd50b
KO
1350 blk_queue_split(q, &bio, q->bio_split);
1351
87c279e6
OS
1352 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1353 blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1354 return BLK_QC_T_NONE;
f984df1f 1355
07068d5b
JA
1356 rq = blk_mq_map_request(q, bio, &data);
1357 if (unlikely(!rq))
dece1635 1358 return BLK_QC_T_NONE;
07068d5b 1359
7b371636 1360 cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
07068d5b
JA
1361
1362 if (unlikely(is_flush_fua)) {
1363 blk_mq_bio_to_request(rq, bio);
1364 blk_insert_flush(rq);
1365 goto run_queue;
1366 }
1367
f984df1f 1368 plug = current->plug;
e167dfb5
JA
1369 /*
1370 * If the driver supports defer issued based on 'last', then
1371 * queue it up like normal since we can potentially save some
1372 * CPU this way.
1373 */
f984df1f
SL
1374 if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
1375 !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
1376 struct request *old_rq = NULL;
07068d5b
JA
1377
1378 blk_mq_bio_to_request(rq, bio);
07068d5b
JA
1379
1380 /*
6a83e74d 1381 * We do limited plugging. If the bio can be merged, do that.
f984df1f
SL
1382 * Otherwise the existing request in the plug list will be
1383 * issued. So the plug list will have one request at most
07068d5b 1384 */
f984df1f 1385 if (plug) {
5b3f341f
SL
1386 /*
1387 * The plug list might get flushed before this. If that
b094f89c
JA
1388 * happens, same_queue_rq is invalid and the plug list is
1389 * empty.
1390 */
5b3f341f
SL
1391 if (same_queue_rq && !list_empty(&plug->mq_list)) {
1392 old_rq = same_queue_rq;
f984df1f 1393 list_del_init(&old_rq->queuelist);
07068d5b 1394 }
f984df1f
SL
1395 list_add_tail(&rq->queuelist, &plug->mq_list);
1396 } else /* is_sync */
1397 old_rq = rq;
1398 blk_mq_put_ctx(data.ctx);
1399 if (!old_rq)
7b371636 1400 goto done;
6a83e74d
BVA
1401
1402 if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
1403 rcu_read_lock();
1404 blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
1405 rcu_read_unlock();
1406 } else {
1407 srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
1408 blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
1409 srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
1410 }
7b371636 1411 goto done;
07068d5b
JA
1412 }
1413
1414 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1415 /*
1416 * For a SYNC request, send it to the hardware immediately. For
1417 * an ASYNC request, just ensure that we run it later on. The
1418 * latter allows for merging opportunities and more efficient
1419 * dispatching.
1420 */
1421run_queue:
1422 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1423 }
07068d5b 1424 blk_mq_put_ctx(data.ctx);
7b371636
JA
1425done:
1426 return cookie;
07068d5b
JA
1427}
1428
1429/*
1430 * Single hardware queue variant. This will attempt to use any per-process
1431 * plug for merging and IO deferral.
1432 */
dece1635 1433static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
07068d5b 1434{
ef295ecf 1435 const int is_sync = op_is_sync(bio->bi_opf);
1eff9d32 1436 const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
e6c4438b
JM
1437 struct blk_plug *plug;
1438 unsigned int request_count = 0;
2552e3f8 1439 struct blk_mq_alloc_data data;
07068d5b 1440 struct request *rq;
7b371636 1441 blk_qc_t cookie;
07068d5b 1442
07068d5b
JA
1443 blk_queue_bounce(q, &bio);
1444
1445 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
4246a0b6 1446 bio_io_error(bio);
dece1635 1447 return BLK_QC_T_NONE;
07068d5b
JA
1448 }
1449
54efd50b
KO
1450 blk_queue_split(q, &bio, q->bio_split);
1451
87c279e6
OS
1452 if (!is_flush_fua && !blk_queue_nomerges(q)) {
1453 if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
1454 return BLK_QC_T_NONE;
1455 } else
1456 request_count = blk_plug_queued_count(q);
07068d5b
JA
1457
1458 rq = blk_mq_map_request(q, bio, &data);
ff87bcec 1459 if (unlikely(!rq))
dece1635 1460 return BLK_QC_T_NONE;
320ae51f 1461
7b371636 1462 cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
320ae51f
JA
1463
1464 if (unlikely(is_flush_fua)) {
1465 blk_mq_bio_to_request(rq, bio);
320ae51f
JA
1466 blk_insert_flush(rq);
1467 goto run_queue;
1468 }
1469
1470 /*
1471 * A task plug currently exists. Since this is completely lockless,
1472 * utilize that to temporarily store requests until the task is
1473 * either done or scheduled away.
1474 */
e6c4438b
JM
1475 plug = current->plug;
1476 if (plug) {
600271d9
SL
1477 struct request *last = NULL;
1478
e6c4438b 1479 blk_mq_bio_to_request(rq, bio);
676d0607 1480 if (!request_count)
e6c4438b 1481 trace_block_plug(q);
600271d9
SL
1482 else
1483 last = list_entry_rq(plug->mq_list.prev);
b094f89c
JA
1484
1485 blk_mq_put_ctx(data.ctx);
1486
600271d9
SL
1487 if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
1488 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
e6c4438b
JM
1489 blk_flush_plug_list(plug, false);
1490 trace_block_plug(q);
320ae51f 1491 }
b094f89c 1492
e6c4438b 1493 list_add_tail(&rq->queuelist, &plug->mq_list);
7b371636 1494 return cookie;
320ae51f
JA
1495 }
1496
07068d5b
JA
1497 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1498 /*
1499 * For a SYNC request, send it to the hardware immediately. For
1500 * an ASYNC request, just ensure that we run it later on. The
1501 * latter allows for merging opportunities and more efficient
1502 * dispatching.
1503 */
1504run_queue:
1505 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
320ae51f
JA
1506 }
1507
07068d5b 1508 blk_mq_put_ctx(data.ctx);
7b371636 1509 return cookie;
320ae51f
JA
1510}
1511
24d2f903
CH
1512static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
1513 struct blk_mq_tags *tags, unsigned int hctx_idx)
95363efd 1514{
e9b267d9 1515 struct page *page;
320ae51f 1516
24d2f903 1517 if (tags->rqs && set->ops->exit_request) {
e9b267d9 1518 int i;
320ae51f 1519
24d2f903
CH
1520 for (i = 0; i < tags->nr_tags; i++) {
1521 if (!tags->rqs[i])
e9b267d9 1522 continue;
24d2f903
CH
1523 set->ops->exit_request(set->driver_data, tags->rqs[i],
1524 hctx_idx, i);
a5164405 1525 tags->rqs[i] = NULL;
e9b267d9 1526 }
320ae51f 1527 }
320ae51f 1528
24d2f903
CH
1529 while (!list_empty(&tags->page_list)) {
1530 page = list_first_entry(&tags->page_list, struct page, lru);
6753471c 1531 list_del_init(&page->lru);
f75782e4
CM
1532 /*
1533 * Remove kmemleak object previously allocated in
1534 * blk_mq_init_rq_map().
1535 */
1536 kmemleak_free(page_address(page));
320ae51f
JA
1537 __free_pages(page, page->private);
1538 }
1539
24d2f903 1540 kfree(tags->rqs);
320ae51f 1541
24d2f903 1542 blk_mq_free_tags(tags);
320ae51f
JA
1543}
1544
1545static size_t order_to_size(unsigned int order)
1546{
4ca08500 1547 return (size_t)PAGE_SIZE << order;
320ae51f
JA
1548}
1549
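
/*
 * Illustrative user-space sketch (not part of blk-mq.c): the allocation
 * loop below carves preallocated requests out of multi-page chunks.  This
 * hypothetical helper repeats the arithmetic: round the per-request size up
 * to a cache line, then see how many requests fit in a chunk of each order.
 * The sizes are made-up example values.
 */
#include <stdio.h>

#define DEMO_PAGE_SIZE	4096UL
#define DEMO_CACHE_LINE	64UL

static unsigned long demo_order_to_size(unsigned int order)
{
	return DEMO_PAGE_SIZE << order;		/* 2^order contiguous pages */
}

static unsigned long demo_round_up(unsigned long v, unsigned long to)
{
	return (v + to - 1) / to * to;
}

int main(void)
{
	unsigned long rq_struct_size = 384;	/* pretend sizeof(struct request) */
	unsigned long cmd_size = 192;		/* pretend driver payload */
	unsigned long rq_size = demo_round_up(rq_struct_size + cmd_size,
					      DEMO_CACHE_LINE);
	unsigned int order;

	for (order = 0; order <= 4; order++)
		printf("order %u: %5lu bytes, %3lu requests per chunk\n",
		       order, demo_order_to_size(order),
		       demo_order_to_size(order) / rq_size);
	return 0;
}
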
24d2f903
CH
1550static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1551 unsigned int hctx_idx)
320ae51f 1552{
24d2f903 1553 struct blk_mq_tags *tags;
320ae51f
JA
1554 unsigned int i, j, entries_per_page, max_order = 4;
1555 size_t rq_size, left;
1556
24d2f903 1557 tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
24391c0d
SL
1558 set->numa_node,
1559 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
24d2f903
CH
1560 if (!tags)
1561 return NULL;
320ae51f 1562
24d2f903
CH
1563 INIT_LIST_HEAD(&tags->page_list);
1564
a5164405
JA
1565 tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
1566 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
1567 set->numa_node);
24d2f903
CH
1568 if (!tags->rqs) {
1569 blk_mq_free_tags(tags);
1570 return NULL;
1571 }
320ae51f
JA
1572
1573 /*
1574 * rq_size is the size of the request plus driver payload, rounded
1575 * to the cacheline size
1576 */
24d2f903 1577 rq_size = round_up(sizeof(struct request) + set->cmd_size,
320ae51f 1578 cache_line_size());
24d2f903 1579 left = rq_size * set->queue_depth;
320ae51f 1580
24d2f903 1581 for (i = 0; i < set->queue_depth; ) {
320ae51f
JA
1582 int this_order = max_order;
1583 struct page *page;
1584 int to_do;
1585 void *p;
1586
b3a834b1 1587 while (this_order && left < order_to_size(this_order - 1))
320ae51f
JA
1588 this_order--;
1589
1590 do {
a5164405 1591 page = alloc_pages_node(set->numa_node,
ac211175 1592 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
a5164405 1593 this_order);
320ae51f
JA
1594 if (page)
1595 break;
1596 if (!this_order--)
1597 break;
1598 if (order_to_size(this_order) < rq_size)
1599 break;
1600 } while (1);
1601
1602 if (!page)
24d2f903 1603 goto fail;
320ae51f
JA
1604
1605 page->private = this_order;
24d2f903 1606 list_add_tail(&page->lru, &tags->page_list);
320ae51f
JA
1607
1608 p = page_address(page);
f75782e4
CM
1609 /*
1610 * Allow kmemleak to scan these pages as they contain pointers
1611 * to additional allocations, such as those made via ops->init_request().
1612 */
1613 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL);
320ae51f 1614 entries_per_page = order_to_size(this_order) / rq_size;
24d2f903 1615 to_do = min(entries_per_page, set->queue_depth - i);
320ae51f
JA
1616 left -= to_do * rq_size;
1617 for (j = 0; j < to_do; j++) {
24d2f903
CH
1618 tags->rqs[i] = p;
1619 if (set->ops->init_request) {
1620 if (set->ops->init_request(set->driver_data,
1621 tags->rqs[i], hctx_idx, i,
a5164405
JA
1622 set->numa_node)) {
1623 tags->rqs[i] = NULL;
24d2f903 1624 goto fail;
a5164405 1625 }
e9b267d9
CH
1626 }
1627
320ae51f
JA
1628 p += rq_size;
1629 i++;
1630 }
1631 }
24d2f903 1632 return tags;
320ae51f 1633
24d2f903 1634fail:
24d2f903
CH
1635 blk_mq_free_rq_map(set, tags, hctx_idx);
1636 return NULL;
320ae51f
JA
1637}
1638
e57690fe
JA
1639/*
1640 * 'cpu' is going away. Splice any existing rq_list entries from this
1641 * software queue to the hw queue dispatch list, and ensure that it
1642 * gets run.
1643 */
9467f859 1644static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
484b4061 1645{
9467f859 1646 struct blk_mq_hw_ctx *hctx;
484b4061
JA
1647 struct blk_mq_ctx *ctx;
1648 LIST_HEAD(tmp);
1649
9467f859 1650 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
e57690fe 1651 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
484b4061
JA
1652
1653 spin_lock(&ctx->lock);
1654 if (!list_empty(&ctx->rq_list)) {
1655 list_splice_init(&ctx->rq_list, &tmp);
1656 blk_mq_hctx_clear_pending(hctx, ctx);
1657 }
1658 spin_unlock(&ctx->lock);
1659
1660 if (list_empty(&tmp))
9467f859 1661 return 0;
484b4061 1662
e57690fe
JA
1663 spin_lock(&hctx->lock);
1664 list_splice_tail_init(&tmp, &hctx->dispatch);
1665 spin_unlock(&hctx->lock);
484b4061
JA
1666
1667 blk_mq_run_hw_queue(hctx, true);
9467f859 1668 return 0;
484b4061
JA
1669}
1670
9467f859 1671static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
484b4061 1672{
9467f859
TG
1673 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
1674 &hctx->cpuhp_dead);
484b4061
JA
1675}
1676
c3b4afca 1677/* hctx->ctxs will be freed in queue's release handler */
08e98fc6
ML
1678static void blk_mq_exit_hctx(struct request_queue *q,
1679 struct blk_mq_tag_set *set,
1680 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
1681{
f70ced09
ML
1682 unsigned flush_start_tag = set->queue_depth;
1683
08e98fc6
ML
1684 blk_mq_tag_idle(hctx);
1685
f70ced09
ML
1686 if (set->ops->exit_request)
1687 set->ops->exit_request(set->driver_data,
1688 hctx->fq->flush_rq, hctx_idx,
1689 flush_start_tag + hctx_idx);
1690
08e98fc6
ML
1691 if (set->ops->exit_hctx)
1692 set->ops->exit_hctx(hctx, hctx_idx);
1693
6a83e74d
BVA
1694 if (hctx->flags & BLK_MQ_F_BLOCKING)
1695 cleanup_srcu_struct(&hctx->queue_rq_srcu);
1696
9467f859 1697 blk_mq_remove_cpuhp(hctx);
f70ced09 1698 blk_free_flush_queue(hctx->fq);
88459642 1699 sbitmap_free(&hctx->ctx_map);
08e98fc6
ML
1700}
1701
624dbe47
ML
1702static void blk_mq_exit_hw_queues(struct request_queue *q,
1703 struct blk_mq_tag_set *set, int nr_queue)
1704{
1705 struct blk_mq_hw_ctx *hctx;
1706 unsigned int i;
1707
1708 queue_for_each_hw_ctx(q, hctx, i) {
1709 if (i == nr_queue)
1710 break;
08e98fc6 1711 blk_mq_exit_hctx(q, set, hctx, i);
624dbe47 1712 }
624dbe47
ML
1713}
1714
1715static void blk_mq_free_hw_queues(struct request_queue *q,
1716 struct blk_mq_tag_set *set)
1717{
1718 struct blk_mq_hw_ctx *hctx;
1719 unsigned int i;
1720
e09aae7e 1721 queue_for_each_hw_ctx(q, hctx, i)
624dbe47 1722 free_cpumask_var(hctx->cpumask);
624dbe47
ML
1723}
1724
08e98fc6
ML
1725static int blk_mq_init_hctx(struct request_queue *q,
1726 struct blk_mq_tag_set *set,
1727 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
320ae51f 1728{
08e98fc6 1729 int node;
f70ced09 1730 unsigned flush_start_tag = set->queue_depth;
08e98fc6
ML
1731
1732 node = hctx->numa_node;
1733 if (node == NUMA_NO_NODE)
1734 node = hctx->numa_node = set->numa_node;
1735
27489a3c 1736 INIT_WORK(&hctx->run_work, blk_mq_run_work_fn);
08e98fc6
ML
1737 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1738 spin_lock_init(&hctx->lock);
1739 INIT_LIST_HEAD(&hctx->dispatch);
1740 hctx->queue = q;
1741 hctx->queue_num = hctx_idx;
2404e607 1742 hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
08e98fc6 1743
9467f859 1744 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
08e98fc6
ML
1745
1746 hctx->tags = set->tags[hctx_idx];
320ae51f
JA
1747
1748 /*
08e98fc6
ML
1749 * Allocate space for all possible cpus to avoid allocation at
1750 * runtime
320ae51f 1751 */
08e98fc6
ML
1752 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1753 GFP_KERNEL, node);
1754 if (!hctx->ctxs)
1755 goto unregister_cpu_notifier;
320ae51f 1756
88459642
OS
1757 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL,
1758 node))
08e98fc6 1759 goto free_ctxs;
320ae51f 1760
08e98fc6 1761 hctx->nr_ctx = 0;
320ae51f 1762
08e98fc6
ML
1763 if (set->ops->init_hctx &&
1764 set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
1765 goto free_bitmap;
320ae51f 1766
f70ced09
ML
1767 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
1768 if (!hctx->fq)
1769 goto exit_hctx;
320ae51f 1770
f70ced09
ML
1771 if (set->ops->init_request &&
1772 set->ops->init_request(set->driver_data,
1773 hctx->fq->flush_rq, hctx_idx,
1774 flush_start_tag + hctx_idx, node))
1775 goto free_fq;
320ae51f 1776
6a83e74d
BVA
1777 if (hctx->flags & BLK_MQ_F_BLOCKING)
1778 init_srcu_struct(&hctx->queue_rq_srcu);
1779
08e98fc6 1780 return 0;
320ae51f 1781
f70ced09
ML
1782 free_fq:
1783 kfree(hctx->fq);
1784 exit_hctx:
1785 if (set->ops->exit_hctx)
1786 set->ops->exit_hctx(hctx, hctx_idx);
08e98fc6 1787 free_bitmap:
88459642 1788 sbitmap_free(&hctx->ctx_map);
08e98fc6
ML
1789 free_ctxs:
1790 kfree(hctx->ctxs);
1791 unregister_cpu_notifier:
9467f859 1792 blk_mq_remove_cpuhp(hctx);
08e98fc6
ML
1793 return -1;
1794}
320ae51f 1795
320ae51f
JA
1796static void blk_mq_init_cpu_queues(struct request_queue *q,
1797 unsigned int nr_hw_queues)
1798{
1799 unsigned int i;
1800
1801 for_each_possible_cpu(i) {
1802 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1803 struct blk_mq_hw_ctx *hctx;
1804
1805 memset(__ctx, 0, sizeof(*__ctx));
1806 __ctx->cpu = i;
1807 spin_lock_init(&__ctx->lock);
1808 INIT_LIST_HEAD(&__ctx->rq_list);
1809 __ctx->queue = q;
cf43e6be
JA
1810 blk_stat_init(&__ctx->stat[BLK_STAT_READ]);
1811 blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]);
320ae51f
JA
1812
1813 /* If the cpu isn't online, the cpu is mapped to the first hctx */
320ae51f
JA
1814 if (!cpu_online(i))
1815 continue;
1816
7d7e0f90 1817 hctx = blk_mq_map_queue(q, i);
e4043dcf 1818
320ae51f
JA
1819 /*
1820 * Set local node, IFF we have more than one hw queue. If
1821 * not, we remain on the home node of the device
1822 */
1823 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
bffed457 1824 hctx->numa_node = local_memory_node(cpu_to_node(i));
320ae51f
JA
1825 }
1826}
1827
5778322e
AM
1828static void blk_mq_map_swqueue(struct request_queue *q,
1829 const struct cpumask *online_mask)
320ae51f
JA
1830{
1831 unsigned int i;
1832 struct blk_mq_hw_ctx *hctx;
1833 struct blk_mq_ctx *ctx;
2a34c087 1834 struct blk_mq_tag_set *set = q->tag_set;
320ae51f 1835
60de074b
AM
1836 /*
1837 * Avoid others reading an incomplete hctx->cpumask through sysfs
1838 */
1839 mutex_lock(&q->sysfs_lock);
1840
320ae51f 1841 queue_for_each_hw_ctx(q, hctx, i) {
e4043dcf 1842 cpumask_clear(hctx->cpumask);
320ae51f
JA
1843 hctx->nr_ctx = 0;
1844 }
1845
1846 /*
1847 * Map software to hardware queues
1848 */
897bb0c7 1849 for_each_possible_cpu(i) {
320ae51f 1850 /* If the cpu isn't online, the cpu is mapped to the first hctx */
5778322e 1851 if (!cpumask_test_cpu(i, online_mask))
e4043dcf
JA
1852 continue;
1853
897bb0c7 1854 ctx = per_cpu_ptr(q->queue_ctx, i);
7d7e0f90 1855 hctx = blk_mq_map_queue(q, i);
868f2f0b 1856
e4043dcf 1857 cpumask_set_cpu(i, hctx->cpumask);
320ae51f
JA
1858 ctx->index_hw = hctx->nr_ctx;
1859 hctx->ctxs[hctx->nr_ctx++] = ctx;
1860 }
506e931f 1861
60de074b
AM
1862 mutex_unlock(&q->sysfs_lock);
1863
506e931f 1864 queue_for_each_hw_ctx(q, hctx, i) {
484b4061 1865 /*
a68aafa5
JA
1866 * If no software queues are mapped to this hardware queue,
1867 * disable it and free the request entries.
484b4061
JA
1868 */
1869 if (!hctx->nr_ctx) {
484b4061
JA
1870 if (set->tags[i]) {
1871 blk_mq_free_rq_map(set, set->tags[i], i);
1872 set->tags[i] = NULL;
484b4061 1873 }
2a34c087 1874 hctx->tags = NULL;
484b4061
JA
1875 continue;
1876 }
1877
2a34c087
ML
1878 /* unmapped hw queue can be remapped after CPU topo changed */
1879 if (!set->tags[i])
1880 set->tags[i] = blk_mq_init_rq_map(set, i);
1881 hctx->tags = set->tags[i];
1882 WARN_ON(!hctx->tags);
1883
889fa31f
CY
1884 /*
1885 * Set the map size to the number of mapped software queues.
1886 * This is more accurate and more efficient than looping
1887 * over all possibly mapped software queues.
1888 */
88459642 1889 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
889fa31f 1890
484b4061
JA
1891 /*
1892 * Initialize batch roundrobin counts
1893 */
506e931f
JA
1894 hctx->next_cpu = cpumask_first(hctx->cpumask);
1895 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1896 }
320ae51f
JA
1897}
1898
2404e607 1899static void queue_set_hctx_shared(struct request_queue *q, bool shared)
0d2602ca
JA
1900{
1901 struct blk_mq_hw_ctx *hctx;
0d2602ca
JA
1902 int i;
1903
2404e607
JM
1904 queue_for_each_hw_ctx(q, hctx, i) {
1905 if (shared)
1906 hctx->flags |= BLK_MQ_F_TAG_SHARED;
1907 else
1908 hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
1909 }
1910}
1911
1912static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared)
1913{
1914 struct request_queue *q;
0d2602ca
JA
1915
1916 list_for_each_entry(q, &set->tag_list, tag_set_list) {
1917 blk_mq_freeze_queue(q);
2404e607 1918 queue_set_hctx_shared(q, shared);
0d2602ca
JA
1919 blk_mq_unfreeze_queue(q);
1920 }
1921}
1922
1923static void blk_mq_del_queue_tag_set(struct request_queue *q)
1924{
1925 struct blk_mq_tag_set *set = q->tag_set;
1926
0d2602ca
JA
1927 mutex_lock(&set->tag_list_lock);
1928 list_del_init(&q->tag_set_list);
2404e607
JM
1929 if (list_is_singular(&set->tag_list)) {
1930 /* just transitioned to unshared */
1931 set->flags &= ~BLK_MQ_F_TAG_SHARED;
1932 /* update existing queue */
1933 blk_mq_update_tag_set_depth(set, false);
1934 }
0d2602ca 1935 mutex_unlock(&set->tag_list_lock);
0d2602ca
JA
1936}
1937
1938static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
1939 struct request_queue *q)
1940{
1941 q->tag_set = set;
1942
1943 mutex_lock(&set->tag_list_lock);
2404e607
JM
1944
1945 /* Check to see if we're transitioning to shared (from 1 to 2 queues). */
1946 if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) {
1947 set->flags |= BLK_MQ_F_TAG_SHARED;
1948 /* update existing queue */
1949 blk_mq_update_tag_set_depth(set, true);
1950 }
1951 if (set->flags & BLK_MQ_F_TAG_SHARED)
1952 queue_set_hctx_shared(q, true);
0d2602ca 1953 list_add_tail(&q->tag_set_list, &set->tag_list);
2404e607 1954
0d2602ca
JA
1955 mutex_unlock(&set->tag_list_lock);
1956}
1957
e09aae7e
ML
1958/*
 1959 * It is the actual release handler for mq, but we do it from the
 1960 * request queue's release handler to avoid use-after-free and other
 1961 * headaches: q->mq_kobj shouldn't have been introduced, but we can't
 1962 * group the ctx/hctx kobjects without it.
1963 */
1964void blk_mq_release(struct request_queue *q)
1965{
1966 struct blk_mq_hw_ctx *hctx;
1967 unsigned int i;
1968
1969 /* hctx kobj stays in hctx */
c3b4afca
ML
1970 queue_for_each_hw_ctx(q, hctx, i) {
1971 if (!hctx)
1972 continue;
1973 kfree(hctx->ctxs);
e09aae7e 1974 kfree(hctx);
c3b4afca 1975 }
e09aae7e 1976
a723bab3
AM
1977 q->mq_map = NULL;
1978
e09aae7e
ML
1979 kfree(q->queue_hw_ctx);
1980
1981 /* ctx kobj stays in queue_ctx */
1982 free_percpu(q->queue_ctx);
1983}
1984
24d2f903 1985struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
b62c21b7
MS
1986{
1987 struct request_queue *uninit_q, *q;
1988
1989 uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
1990 if (!uninit_q)
1991 return ERR_PTR(-ENOMEM);
1992
1993 q = blk_mq_init_allocated_queue(set, uninit_q);
1994 if (IS_ERR(q))
1995 blk_cleanup_queue(uninit_q);
1996
1997 return q;
1998}
1999EXPORT_SYMBOL(blk_mq_init_queue);
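/*
 * Usage sketch (illustrative only; "my_dev", "my_driver_add_queue" and the
 * other my_* names are hypothetical, not part of blk-mq): a driver first
 * allocates its tag set, then creates the queue and checks for an ERR_PTR.
 */
#if 0	/* illustrative only, not compiled */
static int my_driver_add_queue(struct my_device *my_dev)
{
	struct request_queue *q;
	int ret;

	ret = blk_mq_alloc_tag_set(&my_dev->tag_set);
	if (ret)
		return ret;

	q = blk_mq_init_queue(&my_dev->tag_set);
	if (IS_ERR(q)) {
		/* the half-built queue was already cleaned up above */
		blk_mq_free_tag_set(&my_dev->tag_set);
		return PTR_ERR(q);
	}

	q->queuedata = my_dev;
	my_dev->queue = q;
	return 0;
}
#endif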
2000
868f2f0b
KB
2001static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2002 struct request_queue *q)
320ae51f 2003{
868f2f0b
KB
2004 int i, j;
2005 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
f14bbe77 2006
868f2f0b 2007 blk_mq_sysfs_unregister(q);
24d2f903 2008 for (i = 0; i < set->nr_hw_queues; i++) {
868f2f0b 2009 int node;
f14bbe77 2010
868f2f0b
KB
2011 if (hctxs[i])
2012 continue;
2013
2014 node = blk_mq_hw_queue_to_node(q->mq_map, i);
cdef54dd
CH
2015 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
2016 GFP_KERNEL, node);
320ae51f 2017 if (!hctxs[i])
868f2f0b 2018 break;
320ae51f 2019
a86073e4 2020 if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
868f2f0b
KB
2021 node)) {
2022 kfree(hctxs[i]);
2023 hctxs[i] = NULL;
2024 break;
2025 }
e4043dcf 2026
0d2602ca 2027 atomic_set(&hctxs[i]->nr_active, 0);
f14bbe77 2028 hctxs[i]->numa_node = node;
320ae51f 2029 hctxs[i]->queue_num = i;
868f2f0b
KB
2030
2031 if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
2032 free_cpumask_var(hctxs[i]->cpumask);
2033 kfree(hctxs[i]);
2034 hctxs[i] = NULL;
2035 break;
2036 }
2037 blk_mq_hctx_kobj_init(hctxs[i]);
320ae51f 2038 }
868f2f0b
KB
2039 for (j = i; j < q->nr_hw_queues; j++) {
2040 struct blk_mq_hw_ctx *hctx = hctxs[j];
2041
2042 if (hctx) {
2043 if (hctx->tags) {
2044 blk_mq_free_rq_map(set, hctx->tags, j);
2045 set->tags[j] = NULL;
2046 }
2047 blk_mq_exit_hctx(q, set, hctx, j);
2048 free_cpumask_var(hctx->cpumask);
2049 kobject_put(&hctx->kobj);
2050 kfree(hctx->ctxs);
2051 kfree(hctx);
2052 hctxs[j] = NULL;
2053
2054 }
2055 }
2056 q->nr_hw_queues = i;
2057 blk_mq_sysfs_register(q);
2058}
2059
2060struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2061 struct request_queue *q)
2062{
66841672
ML
2063 /* mark the queue as mq asap */
2064 q->mq_ops = set->ops;
2065
868f2f0b
KB
2066 q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2067 if (!q->queue_ctx)
c7de5726 2068 goto err_exit;
868f2f0b
KB
2069
2070 q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
2071 GFP_KERNEL, set->numa_node);
2072 if (!q->queue_hw_ctx)
2073 goto err_percpu;
2074
bdd17e75 2075 q->mq_map = set->mq_map;
868f2f0b
KB
2076
2077 blk_mq_realloc_hw_ctxs(set, q);
2078 if (!q->nr_hw_queues)
2079 goto err_hctxs;
320ae51f 2080
287922eb 2081 INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
e56f698b 2082 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
320ae51f
JA
2083
2084 q->nr_queues = nr_cpu_ids;
320ae51f 2085
94eddfbe 2086 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
320ae51f 2087
05f1dd53
JA
2088 if (!(set->flags & BLK_MQ_F_SG_MERGE))
2089 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
2090
1be036e9
CH
2091 q->sg_reserved_size = INT_MAX;
2092
2849450a 2093 INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
6fca6a61
CH
2094 INIT_LIST_HEAD(&q->requeue_list);
2095 spin_lock_init(&q->requeue_lock);
2096
07068d5b
JA
2097 if (q->nr_hw_queues > 1)
2098 blk_queue_make_request(q, blk_mq_make_request);
2099 else
2100 blk_queue_make_request(q, blk_sq_make_request);
2101
eba71768
JA
2102 /*
2103 * Do this after blk_queue_make_request() overrides it...
2104 */
2105 q->nr_requests = set->queue_depth;
2106
24d2f903
CH
2107 if (set->ops->complete)
2108 blk_queue_softirq_done(q, set->ops->complete);
30a91cb4 2109
24d2f903 2110 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
320ae51f 2111
5778322e 2112 get_online_cpus();
320ae51f 2113 mutex_lock(&all_q_mutex);
320ae51f 2114
4593fdbe 2115 list_add_tail(&q->all_q_node, &all_q_list);
0d2602ca 2116 blk_mq_add_queue_tag_set(set, q);
5778322e 2117 blk_mq_map_swqueue(q, cpu_online_mask);
484b4061 2118
4593fdbe 2119 mutex_unlock(&all_q_mutex);
5778322e 2120 put_online_cpus();
4593fdbe 2121
320ae51f 2122 return q;
18741986 2123
320ae51f 2124err_hctxs:
868f2f0b 2125 kfree(q->queue_hw_ctx);
320ae51f 2126err_percpu:
868f2f0b 2127 free_percpu(q->queue_ctx);
c7de5726
ML
2128err_exit:
2129 q->mq_ops = NULL;
320ae51f
JA
2130 return ERR_PTR(-ENOMEM);
2131}
b62c21b7 2132EXPORT_SYMBOL(blk_mq_init_allocated_queue);
320ae51f
JA
2133
2134void blk_mq_free_queue(struct request_queue *q)
2135{
624dbe47 2136 struct blk_mq_tag_set *set = q->tag_set;
320ae51f 2137
0e626368
AM
2138 mutex_lock(&all_q_mutex);
2139 list_del_init(&q->all_q_node);
2140 mutex_unlock(&all_q_mutex);
2141
0d2602ca
JA
2142 blk_mq_del_queue_tag_set(q);
2143
624dbe47
ML
2144 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
2145 blk_mq_free_hw_queues(q, set);
320ae51f 2146}
320ae51f
JA
2147
2148/* Basically redo blk_mq_init_queue with queue frozen */
5778322e
AM
2149static void blk_mq_queue_reinit(struct request_queue *q,
2150 const struct cpumask *online_mask)
320ae51f 2151{
4ecd4fef 2152 WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
320ae51f 2153
67aec14c
JA
2154 blk_mq_sysfs_unregister(q);
2155
320ae51f
JA
2156 /*
2157 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2158 * we should change hctx numa_node according to new topology (this
 2159 * involves freeing and re-allocating memory; is it worth doing?)
2160 */
2161
5778322e 2162 blk_mq_map_swqueue(q, online_mask);
320ae51f 2163
67aec14c 2164 blk_mq_sysfs_register(q);
320ae51f
JA
2165}
2166
65d5291e
SAS
2167/*
2168 * New online cpumask which is going to be set in this hotplug event.
 2169 * Declare this cpumask as global, as cpu-hotplug operations are invoked
 2170 * one by one and dynamically allocating it could result in a failure.
2171 */
2172static struct cpumask cpuhp_online_new;
2173
2174static void blk_mq_queue_reinit_work(void)
320ae51f
JA
2175{
2176 struct request_queue *q;
320ae51f
JA
2177
2178 mutex_lock(&all_q_mutex);
f3af020b
TH
2179 /*
2180 * We need to freeze and reinit all existing queues. Freezing
 2181 * involves a synchronous wait for an RCU grace period, and doing it
2182 * one by one may take a long time. Start freezing all queues in
2183 * one swoop and then wait for the completions so that freezing can
2184 * take place in parallel.
2185 */
2186 list_for_each_entry(q, &all_q_list, all_q_node)
2187 blk_mq_freeze_queue_start(q);
f054b56c 2188 list_for_each_entry(q, &all_q_list, all_q_node) {
f3af020b
TH
2189 blk_mq_freeze_queue_wait(q);
2190
f054b56c
ML
2191 /*
2192 * timeout handler can't touch hw queue during the
2193 * reinitialization
2194 */
2195 del_timer_sync(&q->timeout);
2196 }
2197
320ae51f 2198 list_for_each_entry(q, &all_q_list, all_q_node)
65d5291e 2199 blk_mq_queue_reinit(q, &cpuhp_online_new);
f3af020b
TH
2200
2201 list_for_each_entry(q, &all_q_list, all_q_node)
2202 blk_mq_unfreeze_queue(q);
2203
320ae51f 2204 mutex_unlock(&all_q_mutex);
65d5291e
SAS
2205}
2206
2207static int blk_mq_queue_reinit_dead(unsigned int cpu)
2208{
97a32864 2209 cpumask_copy(&cpuhp_online_new, cpu_online_mask);
65d5291e
SAS
2210 blk_mq_queue_reinit_work();
2211 return 0;
2212}
2213
2214/*
 2215 * Before a hot-added cpu starts handling requests, new mappings must be
 2216 * established. Otherwise, requests inserted from that cpu might never be
 2217 * dispatched.
2218 *
2219 * For example, there is a single hw queue (hctx) and two CPU queues (ctx0
2220 * for CPU0, and ctx1 for CPU1).
2221 *
2222 * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list
 2223 * and bit0 is set in the pending bitmap, as ctx1->index_hw is still zero.
2224 *
 2225 * Then, while running the hw queue, flush_busy_ctxs() finds bit0 set in the
 2226 * pending bitmap and tries to retrieve requests from hctx->ctxs[0]->rq_list.
 2227 * But hctx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list
 2228 * is ignored.
2229 */
2230static int blk_mq_queue_reinit_prepare(unsigned int cpu)
2231{
2232 cpumask_copy(&cpuhp_online_new, cpu_online_mask);
2233 cpumask_set_cpu(cpu, &cpuhp_online_new);
2234 blk_mq_queue_reinit_work();
2235 return 0;
320ae51f
JA
2236}
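/*
 * Illustrative sketch of the invariant the scenario above describes (the
 * helper is hypothetical, not part of blk-mq): once blk_mq_map_swqueue() has
 * run for the current online mask, each online cpu's ctx must be reachable
 * from its hw queue via ctx->index_hw.
 */
#if 0	/* illustrative only, not compiled */
static void my_check_ctx_mapping(struct request_queue *q, unsigned int cpu)
{
	struct blk_mq_ctx *ctx = per_cpu_ptr(q->queue_ctx, cpu);
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, cpu);

	/* flush_busy_ctxs() relies on exactly this relationship */
	WARN_ON(hctx->ctxs[ctx->index_hw] != ctx);
}
#endif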
2237
a5164405
JA
2238static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2239{
2240 int i;
2241
2242 for (i = 0; i < set->nr_hw_queues; i++) {
2243 set->tags[i] = blk_mq_init_rq_map(set, i);
2244 if (!set->tags[i])
2245 goto out_unwind;
2246 }
2247
2248 return 0;
2249
2250out_unwind:
2251 while (--i >= 0)
2252 blk_mq_free_rq_map(set, set->tags[i], i);
2253
a5164405
JA
2254 return -ENOMEM;
2255}
2256
2257/*
2258 * Allocate the request maps associated with this tag_set. Note that this
2259 * may reduce the depth asked for, if memory is tight. set->queue_depth
2260 * will be updated to reflect the allocated depth.
2261 */
2262static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2263{
2264 unsigned int depth;
2265 int err;
2266
2267 depth = set->queue_depth;
2268 do {
2269 err = __blk_mq_alloc_rq_maps(set);
2270 if (!err)
2271 break;
2272
2273 set->queue_depth >>= 1;
2274 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
2275 err = -ENOMEM;
2276 break;
2277 }
2278 } while (set->queue_depth);
2279
2280 if (!set->queue_depth || err) {
2281 pr_err("blk-mq: failed to allocate request map\n");
2282 return -ENOMEM;
2283 }
2284
2285 if (depth != set->queue_depth)
2286 pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
2287 depth, set->queue_depth);
2288
2289 return 0;
2290}
2291
a4391c64
JA
2292/*
2293 * Alloc a tag set to be associated with one or more request queues.
2294 * May fail with EINVAL for various error conditions. May adjust the
 2295 * requested depth down, if it is too large. In that case, the set
2296 * value will be stored in set->queue_depth.
2297 */
24d2f903
CH
2298int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2299{
da695ba2
CH
2300 int ret;
2301
205fb5f5
BVA
2302 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
2303
24d2f903
CH
2304 if (!set->nr_hw_queues)
2305 return -EINVAL;
a4391c64 2306 if (!set->queue_depth)
24d2f903
CH
2307 return -EINVAL;
2308 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
2309 return -EINVAL;
2310
7d7e0f90 2311 if (!set->ops->queue_rq)
24d2f903
CH
2312 return -EINVAL;
2313
a4391c64
JA
2314 if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
2315 pr_info("blk-mq: reduced tag depth to %u\n",
2316 BLK_MQ_MAX_DEPTH);
2317 set->queue_depth = BLK_MQ_MAX_DEPTH;
2318 }
24d2f903 2319
6637fadf
SL
2320 /*
2321 * If a crashdump is active, then we are potentially in a very
2322 * memory constrained environment. Limit us to 1 queue and
2323 * 64 tags to prevent using too much memory.
2324 */
2325 if (is_kdump_kernel()) {
2326 set->nr_hw_queues = 1;
2327 set->queue_depth = min(64U, set->queue_depth);
2328 }
868f2f0b
KB
2329 /*
2330 * There is no use for more h/w queues than cpus.
2331 */
2332 if (set->nr_hw_queues > nr_cpu_ids)
2333 set->nr_hw_queues = nr_cpu_ids;
6637fadf 2334
868f2f0b 2335 set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
24d2f903
CH
2336 GFP_KERNEL, set->numa_node);
2337 if (!set->tags)
a5164405 2338 return -ENOMEM;
24d2f903 2339
da695ba2
CH
2340 ret = -ENOMEM;
2341 set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
2342 GFP_KERNEL, set->numa_node);
bdd17e75
CH
2343 if (!set->mq_map)
2344 goto out_free_tags;
2345
da695ba2
CH
2346 if (set->ops->map_queues)
2347 ret = set->ops->map_queues(set);
2348 else
2349 ret = blk_mq_map_queues(set);
2350 if (ret)
2351 goto out_free_mq_map;
2352
2353 ret = blk_mq_alloc_rq_maps(set);
2354 if (ret)
bdd17e75 2355 goto out_free_mq_map;
24d2f903 2356
0d2602ca
JA
2357 mutex_init(&set->tag_list_lock);
2358 INIT_LIST_HEAD(&set->tag_list);
2359
24d2f903 2360 return 0;
bdd17e75
CH
2361
2362out_free_mq_map:
2363 kfree(set->mq_map);
2364 set->mq_map = NULL;
2365out_free_tags:
5676e7b6
RE
2366 kfree(set->tags);
2367 set->tags = NULL;
da695ba2 2368 return ret;
24d2f903
CH
2369}
2370EXPORT_SYMBOL(blk_mq_alloc_tag_set);
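/*
 * Typical initialization sketch (hypothetical names such as "my_mq_ops" and
 * "struct my_cmd"; the numeric values are driver policy, not requirements):
 * the caller fills in the set and should be prepared for queue_depth to come
 * back smaller if the request maps had to be shrunk to fit in memory.
 */
#if 0	/* illustrative only, not compiled */
static int my_driver_alloc_tag_set(struct my_device *my_dev)
{
	struct blk_mq_tag_set *set = &my_dev->tag_set;
	int ret;

	memset(set, 0, sizeof(*set));
	set->ops		= &my_mq_ops;		/* ->queue_rq is mandatory */
	set->nr_hw_queues	= 4;			/* clamped to nr_cpu_ids by blk_mq_alloc_tag_set() */
	set->queue_depth	= 128;
	set->cmd_size		= sizeof(struct my_cmd);	/* per-request driver payload */
	set->numa_node		= NUMA_NO_NODE;
	set->flags		= BLK_MQ_F_SHOULD_MERGE;
	set->driver_data	= my_dev;

	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ret;

	if (set->queue_depth < 128)
		pr_info("my_driver: tag depth reduced to %u\n", set->queue_depth);
	return 0;
}
#endif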
2371
2372void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2373{
2374 int i;
2375
868f2f0b 2376 for (i = 0; i < nr_cpu_ids; i++) {
f42d79ab 2377 if (set->tags[i])
484b4061
JA
2378 blk_mq_free_rq_map(set, set->tags[i], i);
2379 }
2380
bdd17e75
CH
2381 kfree(set->mq_map);
2382 set->mq_map = NULL;
2383
981bd189 2384 kfree(set->tags);
5676e7b6 2385 set->tags = NULL;
24d2f903
CH
2386}
2387EXPORT_SYMBOL(blk_mq_free_tag_set);
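/*
 * Teardown-order sketch (hypothetical "my_dev"): every request_queue created
 * from the set has to be cleaned up before the tag set itself is freed,
 * since the queues still reference set->tags and set->mq_map.
 */
#if 0	/* illustrative only, not compiled */
static void my_driver_remove(struct my_device *my_dev)
{
	blk_cleanup_queue(my_dev->queue);	/* drains and releases the queue */
	blk_mq_free_tag_set(&my_dev->tag_set);
}
#endif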
2388
e3a2b3f9
JA
2389int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2390{
2391 struct blk_mq_tag_set *set = q->tag_set;
2392 struct blk_mq_hw_ctx *hctx;
2393 int i, ret;
2394
2395 if (!set || nr > set->queue_depth)
2396 return -EINVAL;
2397
2398 ret = 0;
2399 queue_for_each_hw_ctx(q, hctx, i) {
e9137d4b
KB
2400 if (!hctx->tags)
2401 continue;
e3a2b3f9
JA
2402 ret = blk_mq_tag_update_depth(hctx->tags, nr);
2403 if (ret)
2404 break;
2405 }
2406
2407 if (!ret)
2408 q->nr_requests = nr;
2409
2410 return ret;
2411}
2412
868f2f0b
KB
2413void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
2414{
2415 struct request_queue *q;
2416
2417 if (nr_hw_queues > nr_cpu_ids)
2418 nr_hw_queues = nr_cpu_ids;
2419 if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
2420 return;
2421
2422 list_for_each_entry(q, &set->tag_list, tag_set_list)
2423 blk_mq_freeze_queue(q);
2424
2425 set->nr_hw_queues = nr_hw_queues;
2426 list_for_each_entry(q, &set->tag_list, tag_set_list) {
2427 blk_mq_realloc_hw_ctxs(set, q);
2428
2429 if (q->nr_hw_queues > 1)
2430 blk_queue_make_request(q, blk_mq_make_request);
2431 else
2432 blk_queue_make_request(q, blk_sq_make_request);
2433
2434 blk_mq_queue_reinit(q, cpu_online_mask);
2435 }
2436
2437 list_for_each_entry(q, &set->tag_list, tag_set_list)
2438 blk_mq_unfreeze_queue(q);
2439}
2440EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
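/*
 * Usage sketch (hypothetical "my_dev" and "my_count_hw_queues()"): a driver
 * calls this after its underlying resources change, e.g. after
 * re-negotiating interrupt vectors on a reset; freezing and remapping of
 * every queue in the set is handled here.
 */
#if 0	/* illustrative only, not compiled */
static void my_driver_reset_done(struct my_device *my_dev)
{
	int nr = my_count_hw_queues(my_dev);	/* hypothetical helper */

	blk_mq_update_nr_hw_queues(&my_dev->tag_set, nr);
}
#endif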
2441
676141e4
JA
2442void blk_mq_disable_hotplug(void)
2443{
2444 mutex_lock(&all_q_mutex);
2445}
2446
2447void blk_mq_enable_hotplug(void)
2448{
2449 mutex_unlock(&all_q_mutex);
2450}
2451
320ae51f
JA
2452static int __init blk_mq_init(void)
2453{
9467f859
TG
2454 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
2455 blk_mq_hctx_notify_dead);
320ae51f 2456
65d5291e
SAS
2457 cpuhp_setup_state_nocalls(CPUHP_BLK_MQ_PREPARE, "block/mq:prepare",
2458 blk_mq_queue_reinit_prepare,
2459 blk_mq_queue_reinit_dead);
320ae51f
JA
2460 return 0;
2461}
2462subsys_initcall(blk_mq_init);