1 /*
2 * Block multiqueue core code
3 *
4 * Copyright (C) 2013-2014 Jens Axboe
5 * Copyright (C) 2013-2014 Christoph Hellwig
6 */
7 #include <linux/kernel.h>
8 #include <linux/module.h>
9 #include <linux/backing-dev.h>
10 #include <linux/bio.h>
11 #include <linux/blkdev.h>
12 #include <linux/kmemleak.h>
13 #include <linux/mm.h>
14 #include <linux/init.h>
15 #include <linux/slab.h>
16 #include <linux/workqueue.h>
17 #include <linux/smp.h>
18 #include <linux/llist.h>
19 #include <linux/list_sort.h>
20 #include <linux/cpu.h>
21 #include <linux/cache.h>
22 #include <linux/sched/sysctl.h>
23 #include <linux/delay.h>
24 #include <linux/crash_dump.h>
25 #include <linux/prefetch.h>
26
27 #include <trace/events/block.h>
28
29 #include <linux/blk-mq.h>
30 #include "blk.h"
31 #include "blk-mq.h"
32 #include "blk-mq-tag.h"
33 #include "blk-stat.h"
34
35 static DEFINE_MUTEX(all_q_mutex);
36 static LIST_HEAD(all_q_list);
37
38 /*
39 * Check if any of the ctx's have pending work in this hardware queue
40 */
41 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
42 {
43 return sbitmap_any_bit_set(&hctx->ctx_map);
44 }
45
46 /*
47 * Mark this ctx as having pending work in this hardware queue
48 */
49 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
50 struct blk_mq_ctx *ctx)
51 {
52 if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
53 sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
54 }
55
56 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
57 struct blk_mq_ctx *ctx)
58 {
59 sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
60 }
61
62 void blk_mq_freeze_queue_start(struct request_queue *q)
63 {
64 int freeze_depth;
65
66 freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
67 if (freeze_depth == 1) {
68 percpu_ref_kill(&q->q_usage_counter);
69 blk_mq_run_hw_queues(q, false);
70 }
71 }
72 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
73
74 static void blk_mq_freeze_queue_wait(struct request_queue *q)
75 {
76 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
77 }
78
79 /*
80 * Guarantee no request is in use, so we can change any data structure of
81 * the queue afterward.
82 */
83 void blk_freeze_queue(struct request_queue *q)
84 {
85 /*
86 * In the !blk_mq case we are only calling this to kill the
87 * q_usage_counter, otherwise this increases the freeze depth
88 * and waits for it to return to zero. For this reason there is
89 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
90 * exported to drivers as the only user for unfreeze is blk_mq.
91 */
92 blk_mq_freeze_queue_start(q);
93 blk_mq_freeze_queue_wait(q);
94 }
95
96 void blk_mq_freeze_queue(struct request_queue *q)
97 {
98 /*
99 * ...just an alias to keep freeze and unfreeze actions balanced
100 * in the blk_mq_* namespace
101 */
102 blk_freeze_queue(q);
103 }
104 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
105
106 void blk_mq_unfreeze_queue(struct request_queue *q)
107 {
108 int freeze_depth;
109
110 freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
111 WARN_ON_ONCE(freeze_depth < 0);
112 if (!freeze_depth) {
113 percpu_ref_reinit(&q->q_usage_counter);
114 wake_up_all(&q->mq_freeze_wq);
115 }
116 }
117 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
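/*
 * Usage sketch only (not a requirement of this file): drivers typically
 * bracket queue reconfiguration with the freeze/unfreeze pair, much like
 * blk_mq_update_tag_set_depth() further below does. "q" is assumed to be
 * a fully initialized blk-mq request queue:
 *
 *	blk_mq_freeze_queue(q);		// drains q_usage_counter to zero
 *	// ...safely modify queue data structures here...
 *	blk_mq_unfreeze_queue(q);	// re-enable request entry, wake waiters
 */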
118
119 /**
120 * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished
121 * @q: request queue.
122 *
123  * Note: this function does not prevent the struct request end_io()
124  * callback from being invoked. Additionally, new queue_rq() calls are
125  * not prevented unless the queue has been stopped first.
126 */
127 void blk_mq_quiesce_queue(struct request_queue *q)
128 {
129 struct blk_mq_hw_ctx *hctx;
130 unsigned int i;
131 bool rcu = false;
132
133 blk_mq_stop_hw_queues(q);
134
135 queue_for_each_hw_ctx(q, hctx, i) {
136 if (hctx->flags & BLK_MQ_F_BLOCKING)
137 synchronize_srcu(&hctx->queue_rq_srcu);
138 else
139 rcu = true;
140 }
141 if (rcu)
142 synchronize_rcu();
143 }
144 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
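/*
 * Hedged usage sketch: a driver that must not race with in-flight
 * ->queue_rq() calls while it updates state those calls read could do
 * roughly the following; the update step is a hypothetical driver action:
 *
 *	blk_mq_quiesce_queue(q);	// stop hw queues, sync RCU/SRCU
 *	// ...update state consumed by ->queue_rq()...
 *	blk_mq_start_stopped_hw_queues(q, true);	// resume dispatch
 */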
145
146 void blk_mq_wake_waiters(struct request_queue *q)
147 {
148 struct blk_mq_hw_ctx *hctx;
149 unsigned int i;
150
151 queue_for_each_hw_ctx(q, hctx, i)
152 if (blk_mq_hw_queue_mapped(hctx))
153 blk_mq_tag_wakeup_all(hctx->tags, true);
154
155 /*
156 * If we are called because the queue has now been marked as
157 * dying, we need to ensure that processes currently waiting on
158 * the queue are notified as well.
159 */
160 wake_up_all(&q->mq_freeze_wq);
161 }
162
163 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
164 {
165 return blk_mq_has_free_tags(hctx->tags);
166 }
167 EXPORT_SYMBOL(blk_mq_can_queue);
168
169 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
170 struct request *rq, unsigned int op)
171 {
172 INIT_LIST_HEAD(&rq->queuelist);
173 /* csd/requeue_work/fifo_time is initialized before use */
174 rq->q = q;
175 rq->mq_ctx = ctx;
176 rq->cmd_flags = op;
177 if (blk_queue_io_stat(q))
178 rq->rq_flags |= RQF_IO_STAT;
179 /* do not touch atomic flags, it needs atomic ops against the timer */
180 rq->cpu = -1;
181 INIT_HLIST_NODE(&rq->hash);
182 RB_CLEAR_NODE(&rq->rb_node);
183 rq->rq_disk = NULL;
184 rq->part = NULL;
185 rq->start_time = jiffies;
186 #ifdef CONFIG_BLK_CGROUP
187 rq->rl = NULL;
188 set_start_time_ns(rq);
189 rq->io_start_time_ns = 0;
190 #endif
191 rq->nr_phys_segments = 0;
192 #if defined(CONFIG_BLK_DEV_INTEGRITY)
193 rq->nr_integrity_segments = 0;
194 #endif
195 rq->special = NULL;
196 /* tag was already set */
197 rq->errors = 0;
198
199 rq->cmd = rq->__cmd;
200
201 rq->extra_len = 0;
202 rq->sense_len = 0;
203 rq->resid_len = 0;
204 rq->sense = NULL;
205
206 INIT_LIST_HEAD(&rq->timeout_list);
207 rq->timeout = 0;
208
209 rq->end_io = NULL;
210 rq->end_io_data = NULL;
211 rq->next_rq = NULL;
212
213 ctx->rq_dispatched[op_is_sync(op)]++;
214 }
215
216 static struct request *
217 __blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op)
218 {
219 struct request *rq;
220 unsigned int tag;
221
222 tag = blk_mq_get_tag(data);
223 if (tag != BLK_MQ_TAG_FAIL) {
224 rq = data->hctx->tags->rqs[tag];
225
226 if (blk_mq_tag_busy(data->hctx)) {
227 rq->rq_flags = RQF_MQ_INFLIGHT;
228 atomic_inc(&data->hctx->nr_active);
229 }
230
231 rq->tag = tag;
232 blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
233 return rq;
234 }
235
236 return NULL;
237 }
238
239 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
240 unsigned int flags)
241 {
242 struct blk_mq_ctx *ctx;
243 struct blk_mq_hw_ctx *hctx;
244 struct request *rq;
245 struct blk_mq_alloc_data alloc_data;
246 int ret;
247
248 ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
249 if (ret)
250 return ERR_PTR(ret);
251
252 ctx = blk_mq_get_ctx(q);
253 hctx = blk_mq_map_queue(q, ctx->cpu);
254 blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
255 rq = __blk_mq_alloc_request(&alloc_data, rw);
256 blk_mq_put_ctx(ctx);
257
258 if (!rq) {
259 blk_queue_exit(q);
260 return ERR_PTR(-EWOULDBLOCK);
261 }
262
263 rq->__data_len = 0;
264 rq->__sector = (sector_t) -1;
265 rq->bio = rq->biotail = NULL;
266 return rq;
267 }
268 EXPORT_SYMBOL(blk_mq_alloc_request);
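/*
 * Minimal allocation sketch (assumptions: "q" is a live blk-mq queue, a
 * read-direction request is wanted, and error handling is trimmed):
 *
 *	struct request *rq;
 *
 *	rq = blk_mq_alloc_request(q, REQ_OP_READ, BLK_MQ_REQ_NOWAIT);
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 *	// ...fill in driver-specific payload and issue it, or simply...
 *	blk_mq_free_request(rq);
 */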
269
270 struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
271 unsigned int flags, unsigned int hctx_idx)
272 {
273 struct blk_mq_hw_ctx *hctx;
274 struct blk_mq_ctx *ctx;
275 struct request *rq;
276 struct blk_mq_alloc_data alloc_data;
277 int ret;
278
279 /*
280 * If the tag allocator sleeps we could get an allocation for a
281 * different hardware context. No need to complicate the low level
282 * allocator for this for the rare use case of a command tied to
283 * a specific queue.
284 */
285 if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
286 return ERR_PTR(-EINVAL);
287
288 if (hctx_idx >= q->nr_hw_queues)
289 return ERR_PTR(-EIO);
290
291 ret = blk_queue_enter(q, true);
292 if (ret)
293 return ERR_PTR(ret);
294
295 /*
296 * Check if the hardware context is actually mapped to anything.
297 * If not tell the caller that it should skip this queue.
298 */
299 hctx = q->queue_hw_ctx[hctx_idx];
300 if (!blk_mq_hw_queue_mapped(hctx)) {
301 ret = -EXDEV;
302 goto out_queue_exit;
303 }
304 ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
305
306 blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
307 rq = __blk_mq_alloc_request(&alloc_data, rw);
308 if (!rq) {
309 ret = -EWOULDBLOCK;
310 goto out_queue_exit;
311 }
312
313 return rq;
314
315 out_queue_exit:
316 blk_queue_exit(q);
317 return ERR_PTR(ret);
318 }
319 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
320
321 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
322 struct blk_mq_ctx *ctx, struct request *rq)
323 {
324 const int tag = rq->tag;
325 struct request_queue *q = rq->q;
326
327 if (rq->rq_flags & RQF_MQ_INFLIGHT)
328 atomic_dec(&hctx->nr_active);
329 rq->rq_flags = 0;
330
331 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
332 blk_mq_put_tag(hctx, ctx, tag);
333 blk_queue_exit(q);
334 }
335
336 void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
337 {
338 struct blk_mq_ctx *ctx = rq->mq_ctx;
339
340 ctx->rq_completed[rq_is_sync(rq)]++;
341 __blk_mq_free_request(hctx, ctx, rq);
342
343 }
344 EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
345
346 void blk_mq_free_request(struct request *rq)
347 {
348 blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
349 }
350 EXPORT_SYMBOL_GPL(blk_mq_free_request);
351
352 inline void __blk_mq_end_request(struct request *rq, int error)
353 {
354 blk_account_io_done(rq);
355
356 if (rq->end_io) {
357 rq->end_io(rq, error);
358 } else {
359 if (unlikely(blk_bidi_rq(rq)))
360 blk_mq_free_request(rq->next_rq);
361 blk_mq_free_request(rq);
362 }
363 }
364 EXPORT_SYMBOL(__blk_mq_end_request);
365
366 void blk_mq_end_request(struct request *rq, int error)
367 {
368 if (blk_update_request(rq, error, blk_rq_bytes(rq)))
369 BUG();
370 __blk_mq_end_request(rq, error);
371 }
372 EXPORT_SYMBOL(blk_mq_end_request);
373
374 static void __blk_mq_complete_request_remote(void *data)
375 {
376 struct request *rq = data;
377
378 rq->q->softirq_done_fn(rq);
379 }
380
381 static void blk_mq_ipi_complete_request(struct request *rq)
382 {
383 struct blk_mq_ctx *ctx = rq->mq_ctx;
384 bool shared = false;
385 int cpu;
386
387 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
388 rq->q->softirq_done_fn(rq);
389 return;
390 }
391
392 cpu = get_cpu();
393 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
394 shared = cpus_share_cache(cpu, ctx->cpu);
395
396 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
397 rq->csd.func = __blk_mq_complete_request_remote;
398 rq->csd.info = rq;
399 rq->csd.flags = 0;
400 smp_call_function_single_async(ctx->cpu, &rq->csd);
401 } else {
402 rq->q->softirq_done_fn(rq);
403 }
404 put_cpu();
405 }
406
407 static void blk_mq_stat_add(struct request *rq)
408 {
409 if (rq->rq_flags & RQF_STATS) {
410 /*
411 		 * We could use rq->mq_ctx here, but there's less of a risk
412 * of races if we have the completion event add the stats
413 * to the local software queue.
414 */
415 struct blk_mq_ctx *ctx;
416
417 ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id());
418 blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq);
419 }
420 }
421
422 static void __blk_mq_complete_request(struct request *rq)
423 {
424 struct request_queue *q = rq->q;
425
426 blk_mq_stat_add(rq);
427
428 if (!q->softirq_done_fn)
429 blk_mq_end_request(rq, rq->errors);
430 else
431 blk_mq_ipi_complete_request(rq);
432 }
433
434 /**
435 * blk_mq_complete_request - end I/O on a request
436 * @rq: the request being processed
437 *
438 * Description:
439 * Ends all I/O on a request. It does not handle partial completions.
440  *	The actual completion happens out-of-order, through an IPI handler.
441 **/
442 void blk_mq_complete_request(struct request *rq, int error)
443 {
444 struct request_queue *q = rq->q;
445
446 if (unlikely(blk_should_fake_timeout(q)))
447 return;
448 if (!blk_mark_rq_complete(rq)) {
449 rq->errors = error;
450 __blk_mq_complete_request(rq);
451 }
452 }
453 EXPORT_SYMBOL(blk_mq_complete_request);
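/*
 * Completion-flow sketch (driver function names are hypothetical): the
 * hard-irq handler hands the request to the core, and the driver's
 * ->complete() callback (installed as softirq_done_fn via
 * blk_queue_softirq_done() in blk_mq_init_allocated_queue() below)
 * finishes it:
 *
 *	// in the device interrupt handler
 *	blk_mq_complete_request(rq, 0);
 *
 *	// later, in the driver's ->complete() callback
 *	static void mydrv_complete_rq(struct request *rq)
 *	{
 *		blk_mq_end_request(rq, rq->errors);
 *	}
 */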
454
455 int blk_mq_request_started(struct request *rq)
456 {
457 return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
458 }
459 EXPORT_SYMBOL_GPL(blk_mq_request_started);
460
461 void blk_mq_start_request(struct request *rq)
462 {
463 struct request_queue *q = rq->q;
464
465 trace_block_rq_issue(q, rq);
466
467 rq->resid_len = blk_rq_bytes(rq);
468 if (unlikely(blk_bidi_rq(rq)))
469 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
470
471 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
472 blk_stat_set_issue_time(&rq->issue_stat);
473 rq->rq_flags |= RQF_STATS;
474 }
475
476 blk_add_timer(rq);
477
478 /*
479 	 * Ensure that ->deadline is visible before we set the started
480 * flag and clear the completed flag.
481 */
482 smp_mb__before_atomic();
483
484 /*
485 * Mark us as started and clear complete. Complete might have been
486 * set if requeue raced with timeout, which then marked it as
487 * complete. So be sure to clear complete again when we start
488 * the request, otherwise we'll ignore the completion event.
489 */
490 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
491 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
492 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
493 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
494
495 if (q->dma_drain_size && blk_rq_bytes(rq)) {
496 /*
497 * Make sure space for the drain appears. We know we can do
498 * this because max_hw_segments has been adjusted to be one
499 * fewer than the device can handle.
500 */
501 rq->nr_phys_segments++;
502 }
503 }
504 EXPORT_SYMBOL(blk_mq_start_request);
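/*
 * Where this fits in a driver (sketch with hypothetical names): the
 * ->queue_rq() handler is expected to call blk_mq_start_request() before
 * issuing to hardware and to return one of the BLK_MQ_RQ_QUEUE_* codes
 * that the dispatch loop below (blk_mq_process_rq_list()) acts on:
 *
 *	static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
 *				  const struct blk_mq_queue_data *bd)
 *	{
 *		struct request *rq = bd->rq;
 *
 *		blk_mq_start_request(rq);
 *		if (!mydrv_submit(rq))			// hypothetical helper
 *			return BLK_MQ_RQ_QUEUE_BUSY;	// core retries later
 *		return BLK_MQ_RQ_QUEUE_OK;
 *	}
 */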
505
506 static void __blk_mq_requeue_request(struct request *rq)
507 {
508 struct request_queue *q = rq->q;
509
510 trace_block_rq_requeue(q, rq);
511
512 if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
513 if (q->dma_drain_size && blk_rq_bytes(rq))
514 rq->nr_phys_segments--;
515 }
516 }
517
518 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
519 {
520 __blk_mq_requeue_request(rq);
521
522 BUG_ON(blk_queued_rq(rq));
523 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
524 }
525 EXPORT_SYMBOL(blk_mq_requeue_request);
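/*
 * Sketch (hedged): when a started request hits a transient resource
 * shortage, a driver can hand it back to the core; passing true kicks the
 * requeue work immediately so the request is re-dispatched soon:
 *
 *	blk_mq_requeue_request(rq, true);
 */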
526
527 static void blk_mq_requeue_work(struct work_struct *work)
528 {
529 struct request_queue *q =
530 container_of(work, struct request_queue, requeue_work.work);
531 LIST_HEAD(rq_list);
532 struct request *rq, *next;
533 unsigned long flags;
534
535 spin_lock_irqsave(&q->requeue_lock, flags);
536 list_splice_init(&q->requeue_list, &rq_list);
537 spin_unlock_irqrestore(&q->requeue_lock, flags);
538
539 list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
540 if (!(rq->rq_flags & RQF_SOFTBARRIER))
541 continue;
542
543 rq->rq_flags &= ~RQF_SOFTBARRIER;
544 list_del_init(&rq->queuelist);
545 blk_mq_insert_request(rq, true, false, false);
546 }
547
548 while (!list_empty(&rq_list)) {
549 rq = list_entry(rq_list.next, struct request, queuelist);
550 list_del_init(&rq->queuelist);
551 blk_mq_insert_request(rq, false, false, false);
552 }
553
554 blk_mq_run_hw_queues(q, false);
555 }
556
557 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
558 bool kick_requeue_list)
559 {
560 struct request_queue *q = rq->q;
561 unsigned long flags;
562
563 /*
564 * We abuse this flag that is otherwise used by the I/O scheduler to
565 	 * request head insertion from the workqueue.
566 */
567 BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
568
569 spin_lock_irqsave(&q->requeue_lock, flags);
570 if (at_head) {
571 rq->rq_flags |= RQF_SOFTBARRIER;
572 list_add(&rq->queuelist, &q->requeue_list);
573 } else {
574 list_add_tail(&rq->queuelist, &q->requeue_list);
575 }
576 spin_unlock_irqrestore(&q->requeue_lock, flags);
577
578 if (kick_requeue_list)
579 blk_mq_kick_requeue_list(q);
580 }
581 EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
582
583 void blk_mq_kick_requeue_list(struct request_queue *q)
584 {
585 kblockd_schedule_delayed_work(&q->requeue_work, 0);
586 }
587 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
588
589 void blk_mq_delay_kick_requeue_list(struct request_queue *q,
590 unsigned long msecs)
591 {
592 kblockd_schedule_delayed_work(&q->requeue_work,
593 msecs_to_jiffies(msecs));
594 }
595 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
596
597 void blk_mq_abort_requeue_list(struct request_queue *q)
598 {
599 unsigned long flags;
600 LIST_HEAD(rq_list);
601
602 spin_lock_irqsave(&q->requeue_lock, flags);
603 list_splice_init(&q->requeue_list, &rq_list);
604 spin_unlock_irqrestore(&q->requeue_lock, flags);
605
606 while (!list_empty(&rq_list)) {
607 struct request *rq;
608
609 rq = list_first_entry(&rq_list, struct request, queuelist);
610 list_del_init(&rq->queuelist);
611 rq->errors = -EIO;
612 blk_mq_end_request(rq, rq->errors);
613 }
614 }
615 EXPORT_SYMBOL(blk_mq_abort_requeue_list);
616
617 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
618 {
619 if (tag < tags->nr_tags) {
620 prefetch(tags->rqs[tag]);
621 return tags->rqs[tag];
622 }
623
624 return NULL;
625 }
626 EXPORT_SYMBOL(blk_mq_tag_to_rq);
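/*
 * Typical caller sketch (the completion entry "cqe" is a made-up example):
 * drivers that only get a hardware tag back on completion translate it to
 * the request and then complete it:
 *
 *	rq = blk_mq_tag_to_rq(hctx->tags, cqe->tag);
 *	if (rq)
 *		blk_mq_complete_request(rq, cqe->status ? -EIO : 0);
 */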
627
628 struct blk_mq_timeout_data {
629 unsigned long next;
630 unsigned int next_set;
631 };
632
633 void blk_mq_rq_timed_out(struct request *req, bool reserved)
634 {
635 struct blk_mq_ops *ops = req->q->mq_ops;
636 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
637
638 /*
639 * We know that complete is set at this point. If STARTED isn't set
640 * anymore, then the request isn't active and the "timeout" should
641 * just be ignored. This can happen due to the bitflag ordering.
642 * Timeout first checks if STARTED is set, and if it is, assumes
643 * the request is active. But if we race with completion, then
644 	 * both flags will get cleared. So check here again, and ignore
645 * a timeout event with a request that isn't active.
646 */
647 if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
648 return;
649
650 if (ops->timeout)
651 ret = ops->timeout(req, reserved);
652
653 switch (ret) {
654 case BLK_EH_HANDLED:
655 __blk_mq_complete_request(req);
656 break;
657 case BLK_EH_RESET_TIMER:
658 blk_add_timer(req);
659 blk_clear_rq_complete(req);
660 break;
661 case BLK_EH_NOT_HANDLED:
662 break;
663 default:
664 printk(KERN_ERR "block: bad eh return: %d\n", ret);
665 break;
666 }
667 }
668
669 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
670 struct request *rq, void *priv, bool reserved)
671 {
672 struct blk_mq_timeout_data *data = priv;
673
674 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
675 /*
676 * If a request wasn't started before the queue was
677 * marked dying, kill it here or it'll go unnoticed.
678 */
679 if (unlikely(blk_queue_dying(rq->q))) {
680 rq->errors = -EIO;
681 blk_mq_end_request(rq, rq->errors);
682 }
683 return;
684 }
685
686 if (time_after_eq(jiffies, rq->deadline)) {
687 if (!blk_mark_rq_complete(rq))
688 blk_mq_rq_timed_out(rq, reserved);
689 } else if (!data->next_set || time_after(data->next, rq->deadline)) {
690 data->next = rq->deadline;
691 data->next_set = 1;
692 }
693 }
694
695 static void blk_mq_timeout_work(struct work_struct *work)
696 {
697 struct request_queue *q =
698 container_of(work, struct request_queue, timeout_work);
699 struct blk_mq_timeout_data data = {
700 .next = 0,
701 .next_set = 0,
702 };
703 int i;
704
705 /* A deadlock might occur if a request is stuck requiring a
706 	 * timeout at the same time a queue freeze is waiting for
707 	 * completion, since the timeout code would not be able to
708 * acquire the queue reference here.
709 *
710 * That's why we don't use blk_queue_enter here; instead, we use
711 * percpu_ref_tryget directly, because we need to be able to
712 * obtain a reference even in the short window between the queue
713 * starting to freeze, by dropping the first reference in
714 * blk_mq_freeze_queue_start, and the moment the last request is
715 * consumed, marked by the instant q_usage_counter reaches
716 * zero.
717 */
718 if (!percpu_ref_tryget(&q->q_usage_counter))
719 return;
720
721 blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
722
723 if (data.next_set) {
724 data.next = blk_rq_timeout(round_jiffies_up(data.next));
725 mod_timer(&q->timeout, data.next);
726 } else {
727 struct blk_mq_hw_ctx *hctx;
728
729 queue_for_each_hw_ctx(q, hctx, i) {
730 /* the hctx may be unmapped, so check it here */
731 if (blk_mq_hw_queue_mapped(hctx))
732 blk_mq_tag_idle(hctx);
733 }
734 }
735 blk_queue_exit(q);
736 }
737
738 /*
739 * Reverse check our software queue for entries that we could potentially
740 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
741 * too much time checking for merges.
742 */
743 static bool blk_mq_attempt_merge(struct request_queue *q,
744 struct blk_mq_ctx *ctx, struct bio *bio)
745 {
746 struct request *rq;
747 int checked = 8;
748
749 list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
750 int el_ret;
751
752 if (!checked--)
753 break;
754
755 if (!blk_rq_merge_ok(rq, bio))
756 continue;
757
758 el_ret = blk_try_merge(rq, bio);
759 if (el_ret == ELEVATOR_BACK_MERGE) {
760 if (bio_attempt_back_merge(q, rq, bio)) {
761 ctx->rq_merged++;
762 return true;
763 }
764 break;
765 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
766 if (bio_attempt_front_merge(q, rq, bio)) {
767 ctx->rq_merged++;
768 return true;
769 }
770 break;
771 }
772 }
773
774 return false;
775 }
776
777 struct flush_busy_ctx_data {
778 struct blk_mq_hw_ctx *hctx;
779 struct list_head *list;
780 };
781
782 static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
783 {
784 struct flush_busy_ctx_data *flush_data = data;
785 struct blk_mq_hw_ctx *hctx = flush_data->hctx;
786 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
787
788 sbitmap_clear_bit(sb, bitnr);
789 spin_lock(&ctx->lock);
790 list_splice_tail_init(&ctx->rq_list, flush_data->list);
791 spin_unlock(&ctx->lock);
792 return true;
793 }
794
795 /*
796 * Process software queues that have been marked busy, splicing them
797  * to the for-dispatch list.
798 */
799 static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
800 {
801 struct flush_busy_ctx_data data = {
802 .hctx = hctx,
803 .list = list,
804 };
805
806 sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
807 }
808
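/*
 * Bucket a dispatch batch size for the hctx->dispatched[] histogram:
 * 0 stays in slot 0, 1 maps to slot 1, 2-3 to slot 2, 4-7 to slot 3, and
 * so on, capped at BLK_MQ_MAX_DISPATCH_ORDER - 1.
 */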
809 static inline unsigned int queued_to_index(unsigned int queued)
810 {
811 if (!queued)
812 return 0;
813
814 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
815 }
816
817 /*
818 * Run this hardware queue, pulling any software queues mapped to it in.
819 * Note that this function currently has various problems around ordering
820 * of IO. In particular, we'd like FIFO behaviour on handling existing
821 * items on the hctx->dispatch list. Ignore that for now.
822 */
823 static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
824 {
825 struct request_queue *q = hctx->queue;
826 struct request *rq;
827 LIST_HEAD(rq_list);
828 LIST_HEAD(driver_list);
829 struct list_head *dptr;
830 int queued;
831
832 if (unlikely(blk_mq_hctx_stopped(hctx)))
833 return;
834
835 hctx->run++;
836
837 /*
838 * Touch any software queue that has pending entries.
839 */
840 flush_busy_ctxs(hctx, &rq_list);
841
842 /*
843 * If we have previous entries on our dispatch list, grab them
844 * and stuff them at the front for more fair dispatch.
845 */
846 if (!list_empty_careful(&hctx->dispatch)) {
847 spin_lock(&hctx->lock);
848 if (!list_empty(&hctx->dispatch))
849 list_splice_init(&hctx->dispatch, &rq_list);
850 spin_unlock(&hctx->lock);
851 }
852
853 /*
854 * Start off with dptr being NULL, so we start the first request
855 * immediately, even if we have more pending.
856 */
857 dptr = NULL;
858
859 /*
860 * Now process all the entries, sending them to the driver.
861 */
862 queued = 0;
863 while (!list_empty(&rq_list)) {
864 struct blk_mq_queue_data bd;
865 int ret;
866
867 rq = list_first_entry(&rq_list, struct request, queuelist);
868 list_del_init(&rq->queuelist);
869
870 bd.rq = rq;
871 bd.list = dptr;
872 bd.last = list_empty(&rq_list);
873
874 ret = q->mq_ops->queue_rq(hctx, &bd);
875 switch (ret) {
876 case BLK_MQ_RQ_QUEUE_OK:
877 queued++;
878 break;
879 case BLK_MQ_RQ_QUEUE_BUSY:
880 list_add(&rq->queuelist, &rq_list);
881 __blk_mq_requeue_request(rq);
882 break;
883 default:
884 pr_err("blk-mq: bad return on queue: %d\n", ret);
885 case BLK_MQ_RQ_QUEUE_ERROR:
886 rq->errors = -EIO;
887 blk_mq_end_request(rq, rq->errors);
888 break;
889 }
890
891 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
892 break;
893
894 /*
895 * We've done the first request. If we have more than 1
896 * left in the list, set dptr to defer issue.
897 */
898 if (!dptr && rq_list.next != rq_list.prev)
899 dptr = &driver_list;
900 }
901
902 hctx->dispatched[queued_to_index(queued)]++;
903
904 /*
905 * Any items that need requeuing? Stuff them into hctx->dispatch,
906 * that is where we will continue on next queue run.
907 */
908 if (!list_empty(&rq_list)) {
909 spin_lock(&hctx->lock);
910 list_splice(&rq_list, &hctx->dispatch);
911 spin_unlock(&hctx->lock);
912 /*
913 		 * the queue is expected to be stopped with BLK_MQ_RQ_QUEUE_BUSY, but
914 * it's possible the queue is stopped and restarted again
915 * before this. Queue restart will dispatch requests. And since
916 * requests in rq_list aren't added into hctx->dispatch yet,
917 * the requests in rq_list might get lost.
918 *
919 * blk_mq_run_hw_queue() already checks the STOPPED bit
920 **/
921 blk_mq_run_hw_queue(hctx, true);
922 }
923 }
924
925 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
926 {
927 int srcu_idx;
928
929 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
930 cpu_online(hctx->next_cpu));
931
932 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
933 rcu_read_lock();
934 blk_mq_process_rq_list(hctx);
935 rcu_read_unlock();
936 } else {
937 srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
938 blk_mq_process_rq_list(hctx);
939 srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
940 }
941 }
942
943 /*
944 * It'd be great if the workqueue API had a way to pass
945 * in a mask and had some smarts for more clever placement.
946 * For now we just round-robin here, switching for every
947 * BLK_MQ_CPU_WORK_BATCH queued items.
948 */
949 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
950 {
951 if (hctx->queue->nr_hw_queues == 1)
952 return WORK_CPU_UNBOUND;
953
954 if (--hctx->next_cpu_batch <= 0) {
955 int next_cpu;
956
957 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
958 if (next_cpu >= nr_cpu_ids)
959 next_cpu = cpumask_first(hctx->cpumask);
960
961 hctx->next_cpu = next_cpu;
962 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
963 }
964
965 return hctx->next_cpu;
966 }
967
968 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
969 {
970 if (unlikely(blk_mq_hctx_stopped(hctx) ||
971 !blk_mq_hw_queue_mapped(hctx)))
972 return;
973
974 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
975 int cpu = get_cpu();
976 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
977 __blk_mq_run_hw_queue(hctx);
978 put_cpu();
979 return;
980 }
981
982 put_cpu();
983 }
984
985 kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
986 }
987
988 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
989 {
990 struct blk_mq_hw_ctx *hctx;
991 int i;
992
993 queue_for_each_hw_ctx(q, hctx, i) {
994 if ((!blk_mq_hctx_has_pending(hctx) &&
995 list_empty_careful(&hctx->dispatch)) ||
996 blk_mq_hctx_stopped(hctx))
997 continue;
998
999 blk_mq_run_hw_queue(hctx, async);
1000 }
1001 }
1002 EXPORT_SYMBOL(blk_mq_run_hw_queues);
1003
1004 /**
1005 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
1006 * @q: request queue.
1007 *
1008 * The caller is responsible for serializing this function against
1009 * blk_mq_{start,stop}_hw_queue().
1010 */
1011 bool blk_mq_queue_stopped(struct request_queue *q)
1012 {
1013 struct blk_mq_hw_ctx *hctx;
1014 int i;
1015
1016 queue_for_each_hw_ctx(q, hctx, i)
1017 if (blk_mq_hctx_stopped(hctx))
1018 return true;
1019
1020 return false;
1021 }
1022 EXPORT_SYMBOL(blk_mq_queue_stopped);
1023
1024 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
1025 {
1026 cancel_work(&hctx->run_work);
1027 cancel_delayed_work(&hctx->delay_work);
1028 set_bit(BLK_MQ_S_STOPPED, &hctx->state);
1029 }
1030 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
1031
1032 void blk_mq_stop_hw_queues(struct request_queue *q)
1033 {
1034 struct blk_mq_hw_ctx *hctx;
1035 int i;
1036
1037 queue_for_each_hw_ctx(q, hctx, i)
1038 blk_mq_stop_hw_queue(hctx);
1039 }
1040 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
1041
1042 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
1043 {
1044 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1045
1046 blk_mq_run_hw_queue(hctx, false);
1047 }
1048 EXPORT_SYMBOL(blk_mq_start_hw_queue);
1049
1050 void blk_mq_start_hw_queues(struct request_queue *q)
1051 {
1052 struct blk_mq_hw_ctx *hctx;
1053 int i;
1054
1055 queue_for_each_hw_ctx(q, hctx, i)
1056 blk_mq_start_hw_queue(hctx);
1057 }
1058 EXPORT_SYMBOL(blk_mq_start_hw_queues);
1059
1060 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
1061 {
1062 struct blk_mq_hw_ctx *hctx;
1063 int i;
1064
1065 queue_for_each_hw_ctx(q, hctx, i) {
1066 if (!blk_mq_hctx_stopped(hctx))
1067 continue;
1068
1069 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1070 blk_mq_run_hw_queue(hctx, async);
1071 }
1072 }
1073 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
1074
1075 static void blk_mq_run_work_fn(struct work_struct *work)
1076 {
1077 struct blk_mq_hw_ctx *hctx;
1078
1079 hctx = container_of(work, struct blk_mq_hw_ctx, run_work);
1080
1081 __blk_mq_run_hw_queue(hctx);
1082 }
1083
1084 static void blk_mq_delay_work_fn(struct work_struct *work)
1085 {
1086 struct blk_mq_hw_ctx *hctx;
1087
1088 hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
1089
1090 if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
1091 __blk_mq_run_hw_queue(hctx);
1092 }
1093
1094 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1095 {
1096 if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
1097 return;
1098
1099 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
1100 &hctx->delay_work, msecs_to_jiffies(msecs));
1101 }
1102 EXPORT_SYMBOL(blk_mq_delay_queue);
1103
1104 static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
1105 struct request *rq,
1106 bool at_head)
1107 {
1108 struct blk_mq_ctx *ctx = rq->mq_ctx;
1109
1110 trace_block_rq_insert(hctx->queue, rq);
1111
1112 if (at_head)
1113 list_add(&rq->queuelist, &ctx->rq_list);
1114 else
1115 list_add_tail(&rq->queuelist, &ctx->rq_list);
1116 }
1117
1118 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
1119 struct request *rq, bool at_head)
1120 {
1121 struct blk_mq_ctx *ctx = rq->mq_ctx;
1122
1123 __blk_mq_insert_req_list(hctx, rq, at_head);
1124 blk_mq_hctx_mark_pending(hctx, ctx);
1125 }
1126
1127 void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
1128 bool async)
1129 {
1130 struct blk_mq_ctx *ctx = rq->mq_ctx;
1131 struct request_queue *q = rq->q;
1132 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
1133
1134 spin_lock(&ctx->lock);
1135 __blk_mq_insert_request(hctx, rq, at_head);
1136 spin_unlock(&ctx->lock);
1137
1138 if (run_queue)
1139 blk_mq_run_hw_queue(hctx, async);
1140 }
1141
1142 static void blk_mq_insert_requests(struct request_queue *q,
1143 struct blk_mq_ctx *ctx,
1144 struct list_head *list,
1145 int depth,
1146 bool from_schedule)
1147
1148 {
1149 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
1150
1151 trace_block_unplug(q, depth, !from_schedule);
1152
1153 /*
1154 	 * Preemption doesn't flush the plug list, so it's possible that
1155 	 * ctx->cpu is offline now.
1156 */
1157 spin_lock(&ctx->lock);
1158 while (!list_empty(list)) {
1159 struct request *rq;
1160
1161 rq = list_first_entry(list, struct request, queuelist);
1162 BUG_ON(rq->mq_ctx != ctx);
1163 list_del_init(&rq->queuelist);
1164 __blk_mq_insert_req_list(hctx, rq, false);
1165 }
1166 blk_mq_hctx_mark_pending(hctx, ctx);
1167 spin_unlock(&ctx->lock);
1168
1169 blk_mq_run_hw_queue(hctx, from_schedule);
1170 }
1171
1172 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1173 {
1174 struct request *rqa = container_of(a, struct request, queuelist);
1175 struct request *rqb = container_of(b, struct request, queuelist);
1176
1177 return !(rqa->mq_ctx < rqb->mq_ctx ||
1178 (rqa->mq_ctx == rqb->mq_ctx &&
1179 blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1180 }
1181
1182 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1183 {
1184 struct blk_mq_ctx *this_ctx;
1185 struct request_queue *this_q;
1186 struct request *rq;
1187 LIST_HEAD(list);
1188 LIST_HEAD(ctx_list);
1189 unsigned int depth;
1190
1191 list_splice_init(&plug->mq_list, &list);
1192
1193 list_sort(NULL, &list, plug_ctx_cmp);
1194
1195 this_q = NULL;
1196 this_ctx = NULL;
1197 depth = 0;
1198
1199 while (!list_empty(&list)) {
1200 rq = list_entry_rq(list.next);
1201 list_del_init(&rq->queuelist);
1202 BUG_ON(!rq->q);
1203 if (rq->mq_ctx != this_ctx) {
1204 if (this_ctx) {
1205 blk_mq_insert_requests(this_q, this_ctx,
1206 &ctx_list, depth,
1207 from_schedule);
1208 }
1209
1210 this_ctx = rq->mq_ctx;
1211 this_q = rq->q;
1212 depth = 0;
1213 }
1214
1215 depth++;
1216 list_add_tail(&rq->queuelist, &ctx_list);
1217 }
1218
1219 /*
1220 * If 'this_ctx' is set, we know we have entries to complete
1221 * on 'ctx_list'. Do those.
1222 */
1223 if (this_ctx) {
1224 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
1225 from_schedule);
1226 }
1227 }
1228
1229 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1230 {
1231 init_request_from_bio(rq, bio);
1232
1233 blk_account_io_start(rq, 1);
1234 }
1235
1236 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
1237 {
1238 return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
1239 !blk_queue_nomerges(hctx->queue);
1240 }
1241
1242 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
1243 struct blk_mq_ctx *ctx,
1244 struct request *rq, struct bio *bio)
1245 {
1246 if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) {
1247 blk_mq_bio_to_request(rq, bio);
1248 spin_lock(&ctx->lock);
1249 insert_rq:
1250 __blk_mq_insert_request(hctx, rq, false);
1251 spin_unlock(&ctx->lock);
1252 return false;
1253 } else {
1254 struct request_queue *q = hctx->queue;
1255
1256 spin_lock(&ctx->lock);
1257 if (!blk_mq_attempt_merge(q, ctx, bio)) {
1258 blk_mq_bio_to_request(rq, bio);
1259 goto insert_rq;
1260 }
1261
1262 spin_unlock(&ctx->lock);
1263 __blk_mq_free_request(hctx, ctx, rq);
1264 return true;
1265 }
1266 }
1267
1268 static struct request *blk_mq_map_request(struct request_queue *q,
1269 struct bio *bio,
1270 struct blk_mq_alloc_data *data)
1271 {
1272 struct blk_mq_hw_ctx *hctx;
1273 struct blk_mq_ctx *ctx;
1274 struct request *rq;
1275
1276 blk_queue_enter_live(q);
1277 ctx = blk_mq_get_ctx(q);
1278 hctx = blk_mq_map_queue(q, ctx->cpu);
1279
1280 trace_block_getrq(q, bio, bio->bi_opf);
1281 blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
1282 rq = __blk_mq_alloc_request(data, bio->bi_opf);
1283
1284 data->hctx->queued++;
1285 return rq;
1286 }
1287
1288 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1289 struct request *rq, blk_qc_t *cookie)
1290 {
1291 int ret;
1292 struct request_queue *q = rq->q;
1293 struct blk_mq_queue_data bd = {
1294 .rq = rq,
1295 .list = NULL,
1296 .last = 1
1297 };
1298 blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
1299
1300 if (blk_mq_hctx_stopped(hctx))
1301 goto insert;
1302
1303 /*
1304 	 * If queueing succeeds (OK), we are done. On error, kill the
1305 	 * request. For any other return (busy), just add it to our list
1306 	 * as we previously would have done.
1307 */
1308 ret = q->mq_ops->queue_rq(hctx, &bd);
1309 if (ret == BLK_MQ_RQ_QUEUE_OK) {
1310 *cookie = new_cookie;
1311 return;
1312 }
1313
1314 __blk_mq_requeue_request(rq);
1315
1316 if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1317 *cookie = BLK_QC_T_NONE;
1318 rq->errors = -EIO;
1319 blk_mq_end_request(rq, rq->errors);
1320 return;
1321 }
1322
1323 insert:
1324 blk_mq_insert_request(rq, false, true, true);
1325 }
1326
1327 /*
1328 * Multiple hardware queue variant. This will not use per-process plugs,
1329 * but will attempt to bypass the hctx queueing if we can go straight to
1330 * hardware for SYNC IO.
1331 */
1332 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1333 {
1334 const int is_sync = op_is_sync(bio->bi_opf);
1335 const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
1336 struct blk_mq_alloc_data data;
1337 struct request *rq;
1338 unsigned int request_count = 0, srcu_idx;
1339 struct blk_plug *plug;
1340 struct request *same_queue_rq = NULL;
1341 blk_qc_t cookie;
1342
1343 blk_queue_bounce(q, &bio);
1344
1345 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1346 bio_io_error(bio);
1347 return BLK_QC_T_NONE;
1348 }
1349
1350 blk_queue_split(q, &bio, q->bio_split);
1351
1352 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1353 blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1354 return BLK_QC_T_NONE;
1355
1356 rq = blk_mq_map_request(q, bio, &data);
1357 if (unlikely(!rq))
1358 return BLK_QC_T_NONE;
1359
1360 cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
1361
1362 if (unlikely(is_flush_fua)) {
1363 blk_mq_bio_to_request(rq, bio);
1364 blk_insert_flush(rq);
1365 goto run_queue;
1366 }
1367
1368 plug = current->plug;
1369 /*
1370 	 * If the driver supports deferred issue based on 'last', then
1371 * queue it up like normal since we can potentially save some
1372 * CPU this way.
1373 */
1374 if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
1375 !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
1376 struct request *old_rq = NULL;
1377
1378 blk_mq_bio_to_request(rq, bio);
1379
1380 /*
1381 * We do limited plugging. If the bio can be merged, do that.
1382 * Otherwise the existing request in the plug list will be
1383 * issued. So the plug list will have one request at most
1384 */
1385 if (plug) {
1386 /*
1387 * The plug list might get flushed before this. If that
1388 * happens, same_queue_rq is invalid and plug list is
1389 * empty
1390 */
1391 if (same_queue_rq && !list_empty(&plug->mq_list)) {
1392 old_rq = same_queue_rq;
1393 list_del_init(&old_rq->queuelist);
1394 }
1395 list_add_tail(&rq->queuelist, &plug->mq_list);
1396 } else /* is_sync */
1397 old_rq = rq;
1398 blk_mq_put_ctx(data.ctx);
1399 if (!old_rq)
1400 goto done;
1401
1402 if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
1403 rcu_read_lock();
1404 blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
1405 rcu_read_unlock();
1406 } else {
1407 srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
1408 blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
1409 srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
1410 }
1411 goto done;
1412 }
1413
1414 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1415 /*
1416 * For a SYNC request, send it to the hardware immediately. For
1417 * an ASYNC request, just ensure that we run it later on. The
1418 * latter allows for merging opportunities and more efficient
1419 * dispatching.
1420 */
1421 run_queue:
1422 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1423 }
1424 blk_mq_put_ctx(data.ctx);
1425 done:
1426 return cookie;
1427 }
1428
1429 /*
1430 * Single hardware queue variant. This will attempt to use any per-process
1431 * plug for merging and IO deferral.
1432 */
1433 static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
1434 {
1435 const int is_sync = op_is_sync(bio->bi_opf);
1436 const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
1437 struct blk_plug *plug;
1438 unsigned int request_count = 0;
1439 struct blk_mq_alloc_data data;
1440 struct request *rq;
1441 blk_qc_t cookie;
1442
1443 blk_queue_bounce(q, &bio);
1444
1445 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1446 bio_io_error(bio);
1447 return BLK_QC_T_NONE;
1448 }
1449
1450 blk_queue_split(q, &bio, q->bio_split);
1451
1452 if (!is_flush_fua && !blk_queue_nomerges(q)) {
1453 if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
1454 return BLK_QC_T_NONE;
1455 } else
1456 request_count = blk_plug_queued_count(q);
1457
1458 rq = blk_mq_map_request(q, bio, &data);
1459 if (unlikely(!rq))
1460 return BLK_QC_T_NONE;
1461
1462 cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
1463
1464 if (unlikely(is_flush_fua)) {
1465 blk_mq_bio_to_request(rq, bio);
1466 blk_insert_flush(rq);
1467 goto run_queue;
1468 }
1469
1470 /*
1471 * A task plug currently exists. Since this is completely lockless,
1472 * utilize that to temporarily store requests until the task is
1473 * either done or scheduled away.
1474 */
1475 plug = current->plug;
1476 if (plug) {
1477 struct request *last = NULL;
1478
1479 blk_mq_bio_to_request(rq, bio);
1480 if (!request_count)
1481 trace_block_plug(q);
1482 else
1483 last = list_entry_rq(plug->mq_list.prev);
1484
1485 blk_mq_put_ctx(data.ctx);
1486
1487 if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
1488 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
1489 blk_flush_plug_list(plug, false);
1490 trace_block_plug(q);
1491 }
1492
1493 list_add_tail(&rq->queuelist, &plug->mq_list);
1494 return cookie;
1495 }
1496
1497 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1498 /*
1499 * For a SYNC request, send it to the hardware immediately. For
1500 * an ASYNC request, just ensure that we run it later on. The
1501 * latter allows for merging opportunities and more efficient
1502 * dispatching.
1503 */
1504 run_queue:
1505 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1506 }
1507
1508 blk_mq_put_ctx(data.ctx);
1509 return cookie;
1510 }
1511
1512 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
1513 struct blk_mq_tags *tags, unsigned int hctx_idx)
1514 {
1515 struct page *page;
1516
1517 if (tags->rqs && set->ops->exit_request) {
1518 int i;
1519
1520 for (i = 0; i < tags->nr_tags; i++) {
1521 if (!tags->rqs[i])
1522 continue;
1523 set->ops->exit_request(set->driver_data, tags->rqs[i],
1524 hctx_idx, i);
1525 tags->rqs[i] = NULL;
1526 }
1527 }
1528
1529 while (!list_empty(&tags->page_list)) {
1530 page = list_first_entry(&tags->page_list, struct page, lru);
1531 list_del_init(&page->lru);
1532 /*
1533 * Remove kmemleak object previously allocated in
1534 * blk_mq_init_rq_map().
1535 */
1536 kmemleak_free(page_address(page));
1537 __free_pages(page, page->private);
1538 }
1539
1540 kfree(tags->rqs);
1541
1542 blk_mq_free_tags(tags);
1543 }
1544
1545 static size_t order_to_size(unsigned int order)
1546 {
1547 return (size_t)PAGE_SIZE << order;
1548 }
1549
1550 static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1551 unsigned int hctx_idx)
1552 {
1553 struct blk_mq_tags *tags;
1554 unsigned int i, j, entries_per_page, max_order = 4;
1555 size_t rq_size, left;
1556
1557 tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
1558 set->numa_node,
1559 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
1560 if (!tags)
1561 return NULL;
1562
1563 INIT_LIST_HEAD(&tags->page_list);
1564
1565 tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
1566 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
1567 set->numa_node);
1568 if (!tags->rqs) {
1569 blk_mq_free_tags(tags);
1570 return NULL;
1571 }
1572
1573 /*
1574 * rq_size is the size of the request plus driver payload, rounded
1575 * to the cacheline size
1576 */
1577 rq_size = round_up(sizeof(struct request) + set->cmd_size,
1578 cache_line_size());
1579 left = rq_size * set->queue_depth;
1580
1581 for (i = 0; i < set->queue_depth; ) {
1582 int this_order = max_order;
1583 struct page *page;
1584 int to_do;
1585 void *p;
1586
1587 while (this_order && left < order_to_size(this_order - 1))
1588 this_order--;
1589
1590 do {
1591 page = alloc_pages_node(set->numa_node,
1592 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
1593 this_order);
1594 if (page)
1595 break;
1596 if (!this_order--)
1597 break;
1598 if (order_to_size(this_order) < rq_size)
1599 break;
1600 } while (1);
1601
1602 if (!page)
1603 goto fail;
1604
1605 page->private = this_order;
1606 list_add_tail(&page->lru, &tags->page_list);
1607
1608 p = page_address(page);
1609 /*
1610 * Allow kmemleak to scan these pages as they contain pointers
1611 		 * to additional allocations made via ops->init_request().
1612 */
1613 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL);
1614 entries_per_page = order_to_size(this_order) / rq_size;
1615 to_do = min(entries_per_page, set->queue_depth - i);
1616 left -= to_do * rq_size;
1617 for (j = 0; j < to_do; j++) {
1618 tags->rqs[i] = p;
1619 if (set->ops->init_request) {
1620 if (set->ops->init_request(set->driver_data,
1621 tags->rqs[i], hctx_idx, i,
1622 set->numa_node)) {
1623 tags->rqs[i] = NULL;
1624 goto fail;
1625 }
1626 }
1627
1628 p += rq_size;
1629 i++;
1630 }
1631 }
1632 return tags;
1633
1634 fail:
1635 blk_mq_free_rq_map(set, tags, hctx_idx);
1636 return NULL;
1637 }
1638
1639 /*
1640  * 'cpu' is going away. Splice any existing rq_list entries from this
1641 * software queue to the hw queue dispatch list, and ensure that it
1642 * gets run.
1643 */
1644 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
1645 {
1646 struct blk_mq_hw_ctx *hctx;
1647 struct blk_mq_ctx *ctx;
1648 LIST_HEAD(tmp);
1649
1650 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
1651 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
1652
1653 spin_lock(&ctx->lock);
1654 if (!list_empty(&ctx->rq_list)) {
1655 list_splice_init(&ctx->rq_list, &tmp);
1656 blk_mq_hctx_clear_pending(hctx, ctx);
1657 }
1658 spin_unlock(&ctx->lock);
1659
1660 if (list_empty(&tmp))
1661 return 0;
1662
1663 spin_lock(&hctx->lock);
1664 list_splice_tail_init(&tmp, &hctx->dispatch);
1665 spin_unlock(&hctx->lock);
1666
1667 blk_mq_run_hw_queue(hctx, true);
1668 return 0;
1669 }
1670
1671 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
1672 {
1673 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
1674 &hctx->cpuhp_dead);
1675 }
1676
1677 /* hctx->ctxs will be freed in queue's release handler */
1678 static void blk_mq_exit_hctx(struct request_queue *q,
1679 struct blk_mq_tag_set *set,
1680 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
1681 {
1682 unsigned flush_start_tag = set->queue_depth;
1683
1684 blk_mq_tag_idle(hctx);
1685
1686 if (set->ops->exit_request)
1687 set->ops->exit_request(set->driver_data,
1688 hctx->fq->flush_rq, hctx_idx,
1689 flush_start_tag + hctx_idx);
1690
1691 if (set->ops->exit_hctx)
1692 set->ops->exit_hctx(hctx, hctx_idx);
1693
1694 if (hctx->flags & BLK_MQ_F_BLOCKING)
1695 cleanup_srcu_struct(&hctx->queue_rq_srcu);
1696
1697 blk_mq_remove_cpuhp(hctx);
1698 blk_free_flush_queue(hctx->fq);
1699 sbitmap_free(&hctx->ctx_map);
1700 }
1701
1702 static void blk_mq_exit_hw_queues(struct request_queue *q,
1703 struct blk_mq_tag_set *set, int nr_queue)
1704 {
1705 struct blk_mq_hw_ctx *hctx;
1706 unsigned int i;
1707
1708 queue_for_each_hw_ctx(q, hctx, i) {
1709 if (i == nr_queue)
1710 break;
1711 blk_mq_exit_hctx(q, set, hctx, i);
1712 }
1713 }
1714
1715 static void blk_mq_free_hw_queues(struct request_queue *q,
1716 struct blk_mq_tag_set *set)
1717 {
1718 struct blk_mq_hw_ctx *hctx;
1719 unsigned int i;
1720
1721 queue_for_each_hw_ctx(q, hctx, i)
1722 free_cpumask_var(hctx->cpumask);
1723 }
1724
1725 static int blk_mq_init_hctx(struct request_queue *q,
1726 struct blk_mq_tag_set *set,
1727 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
1728 {
1729 int node;
1730 unsigned flush_start_tag = set->queue_depth;
1731
1732 node = hctx->numa_node;
1733 if (node == NUMA_NO_NODE)
1734 node = hctx->numa_node = set->numa_node;
1735
1736 INIT_WORK(&hctx->run_work, blk_mq_run_work_fn);
1737 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1738 spin_lock_init(&hctx->lock);
1739 INIT_LIST_HEAD(&hctx->dispatch);
1740 hctx->queue = q;
1741 hctx->queue_num = hctx_idx;
1742 hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
1743
1744 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
1745
1746 hctx->tags = set->tags[hctx_idx];
1747
1748 /*
1749 * Allocate space for all possible cpus to avoid allocation at
1750 * runtime
1751 */
1752 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1753 GFP_KERNEL, node);
1754 if (!hctx->ctxs)
1755 goto unregister_cpu_notifier;
1756
1757 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL,
1758 node))
1759 goto free_ctxs;
1760
1761 hctx->nr_ctx = 0;
1762
1763 if (set->ops->init_hctx &&
1764 set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
1765 goto free_bitmap;
1766
1767 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
1768 if (!hctx->fq)
1769 goto exit_hctx;
1770
1771 if (set->ops->init_request &&
1772 set->ops->init_request(set->driver_data,
1773 hctx->fq->flush_rq, hctx_idx,
1774 flush_start_tag + hctx_idx, node))
1775 goto free_fq;
1776
1777 if (hctx->flags & BLK_MQ_F_BLOCKING)
1778 init_srcu_struct(&hctx->queue_rq_srcu);
1779
1780 return 0;
1781
1782 free_fq:
1783 kfree(hctx->fq);
1784 exit_hctx:
1785 if (set->ops->exit_hctx)
1786 set->ops->exit_hctx(hctx, hctx_idx);
1787 free_bitmap:
1788 sbitmap_free(&hctx->ctx_map);
1789 free_ctxs:
1790 kfree(hctx->ctxs);
1791 unregister_cpu_notifier:
1792 blk_mq_remove_cpuhp(hctx);
1793 return -1;
1794 }
1795
1796 static void blk_mq_init_cpu_queues(struct request_queue *q,
1797 unsigned int nr_hw_queues)
1798 {
1799 unsigned int i;
1800
1801 for_each_possible_cpu(i) {
1802 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1803 struct blk_mq_hw_ctx *hctx;
1804
1805 memset(__ctx, 0, sizeof(*__ctx));
1806 __ctx->cpu = i;
1807 spin_lock_init(&__ctx->lock);
1808 INIT_LIST_HEAD(&__ctx->rq_list);
1809 __ctx->queue = q;
1810 blk_stat_init(&__ctx->stat[BLK_STAT_READ]);
1811 blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]);
1812
1813 /* If the cpu isn't online, the cpu is mapped to first hctx */
1814 if (!cpu_online(i))
1815 continue;
1816
1817 hctx = blk_mq_map_queue(q, i);
1818
1819 /*
1820 * Set local node, IFF we have more than one hw queue. If
1821 * not, we remain on the home node of the device
1822 */
1823 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
1824 hctx->numa_node = local_memory_node(cpu_to_node(i));
1825 }
1826 }
1827
1828 static void blk_mq_map_swqueue(struct request_queue *q,
1829 const struct cpumask *online_mask)
1830 {
1831 unsigned int i;
1832 struct blk_mq_hw_ctx *hctx;
1833 struct blk_mq_ctx *ctx;
1834 struct blk_mq_tag_set *set = q->tag_set;
1835
1836 /*
1837 	 * Avoid others reading an incomplete hctx->cpumask through sysfs
1838 */
1839 mutex_lock(&q->sysfs_lock);
1840
1841 queue_for_each_hw_ctx(q, hctx, i) {
1842 cpumask_clear(hctx->cpumask);
1843 hctx->nr_ctx = 0;
1844 }
1845
1846 /*
1847 * Map software to hardware queues
1848 */
1849 for_each_possible_cpu(i) {
1850 /* If the cpu isn't online, the cpu is mapped to first hctx */
1851 if (!cpumask_test_cpu(i, online_mask))
1852 continue;
1853
1854 ctx = per_cpu_ptr(q->queue_ctx, i);
1855 hctx = blk_mq_map_queue(q, i);
1856
1857 cpumask_set_cpu(i, hctx->cpumask);
1858 ctx->index_hw = hctx->nr_ctx;
1859 hctx->ctxs[hctx->nr_ctx++] = ctx;
1860 }
1861
1862 mutex_unlock(&q->sysfs_lock);
1863
1864 queue_for_each_hw_ctx(q, hctx, i) {
1865 /*
1866 * If no software queues are mapped to this hardware queue,
1867 * disable it and free the request entries.
1868 */
1869 if (!hctx->nr_ctx) {
1870 if (set->tags[i]) {
1871 blk_mq_free_rq_map(set, set->tags[i], i);
1872 set->tags[i] = NULL;
1873 }
1874 hctx->tags = NULL;
1875 continue;
1876 }
1877
1878 /* unmapped hw queue can be remapped after CPU topo changed */
1879 if (!set->tags[i])
1880 set->tags[i] = blk_mq_init_rq_map(set, i);
1881 hctx->tags = set->tags[i];
1882 WARN_ON(!hctx->tags);
1883
1884 /*
1885 * Set the map size to the number of mapped software queues.
1886 * This is more accurate and more efficient than looping
1887 * over all possibly mapped software queues.
1888 */
1889 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
1890
1891 /*
1892 * Initialize batch roundrobin counts
1893 */
1894 hctx->next_cpu = cpumask_first(hctx->cpumask);
1895 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1896 }
1897 }
1898
1899 static void queue_set_hctx_shared(struct request_queue *q, bool shared)
1900 {
1901 struct blk_mq_hw_ctx *hctx;
1902 int i;
1903
1904 queue_for_each_hw_ctx(q, hctx, i) {
1905 if (shared)
1906 hctx->flags |= BLK_MQ_F_TAG_SHARED;
1907 else
1908 hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
1909 }
1910 }
1911
1912 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared)
1913 {
1914 struct request_queue *q;
1915
1916 list_for_each_entry(q, &set->tag_list, tag_set_list) {
1917 blk_mq_freeze_queue(q);
1918 queue_set_hctx_shared(q, shared);
1919 blk_mq_unfreeze_queue(q);
1920 }
1921 }
1922
1923 static void blk_mq_del_queue_tag_set(struct request_queue *q)
1924 {
1925 struct blk_mq_tag_set *set = q->tag_set;
1926
1927 mutex_lock(&set->tag_list_lock);
1928 list_del_init(&q->tag_set_list);
1929 if (list_is_singular(&set->tag_list)) {
1930 /* just transitioned to unshared */
1931 set->flags &= ~BLK_MQ_F_TAG_SHARED;
1932 /* update existing queue */
1933 blk_mq_update_tag_set_depth(set, false);
1934 }
1935 mutex_unlock(&set->tag_list_lock);
1936 }
1937
1938 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
1939 struct request_queue *q)
1940 {
1941 q->tag_set = set;
1942
1943 mutex_lock(&set->tag_list_lock);
1944
1945 /* Check to see if we're transitioning to shared (from 1 to 2 queues). */
1946 if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) {
1947 set->flags |= BLK_MQ_F_TAG_SHARED;
1948 /* update existing queue */
1949 blk_mq_update_tag_set_depth(set, true);
1950 }
1951 if (set->flags & BLK_MQ_F_TAG_SHARED)
1952 queue_set_hctx_shared(q, true);
1953 list_add_tail(&q->tag_set_list, &set->tag_list);
1954
1955 mutex_unlock(&set->tag_list_lock);
1956 }
1957
1958 /*
1959  * This is the actual release handler for mq, but we do it from the
1960  * request queue's release handler to avoid use-after-free and other
1961  * headaches: q->mq_kobj shouldn't have been introduced, but we
1962  * can't group the ctx/hctx kobjects without it.
1963 */
1964 void blk_mq_release(struct request_queue *q)
1965 {
1966 struct blk_mq_hw_ctx *hctx;
1967 unsigned int i;
1968
1969 /* hctx kobj stays in hctx */
1970 queue_for_each_hw_ctx(q, hctx, i) {
1971 if (!hctx)
1972 continue;
1973 kfree(hctx->ctxs);
1974 kfree(hctx);
1975 }
1976
1977 q->mq_map = NULL;
1978
1979 kfree(q->queue_hw_ctx);
1980
1981 /* ctx kobj stays in queue_ctx */
1982 free_percpu(q->queue_ctx);
1983 }
1984
1985 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
1986 {
1987 struct request_queue *uninit_q, *q;
1988
1989 uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
1990 if (!uninit_q)
1991 return ERR_PTR(-ENOMEM);
1992
1993 q = blk_mq_init_allocated_queue(set, uninit_q);
1994 if (IS_ERR(q))
1995 blk_cleanup_queue(uninit_q);
1996
1997 return q;
1998 }
1999 EXPORT_SYMBOL(blk_mq_init_queue);
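/*
 * End-to-end setup sketch (hedged: the field values and the ops table are
 * driver-specific assumptions, not requirements of this file):
 *
 *	static struct blk_mq_tag_set set = {
 *		.ops		= &mydrv_mq_ops,	// at least .queue_rq
 *		.nr_hw_queues	= 1,
 *		.queue_depth	= 64,
 *		.cmd_size	= sizeof(struct mydrv_cmd),	// per-rq payload
 *		.numa_node	= NUMA_NO_NODE,
 *		.flags		= BLK_MQ_F_SHOULD_MERGE,
 *	};
 *
 *	if (blk_mq_alloc_tag_set(&set))
 *		goto fail;
 *	q = blk_mq_init_queue(&set);
 *	if (IS_ERR(q))
 *		goto fail_free_tag_set;
 */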
2000
2001 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2002 struct request_queue *q)
2003 {
2004 int i, j;
2005 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
2006
2007 blk_mq_sysfs_unregister(q);
2008 for (i = 0; i < set->nr_hw_queues; i++) {
2009 int node;
2010
2011 if (hctxs[i])
2012 continue;
2013
2014 node = blk_mq_hw_queue_to_node(q->mq_map, i);
2015 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
2016 GFP_KERNEL, node);
2017 if (!hctxs[i])
2018 break;
2019
2020 if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
2021 node)) {
2022 kfree(hctxs[i]);
2023 hctxs[i] = NULL;
2024 break;
2025 }
2026
2027 atomic_set(&hctxs[i]->nr_active, 0);
2028 hctxs[i]->numa_node = node;
2029 hctxs[i]->queue_num = i;
2030
2031 if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
2032 free_cpumask_var(hctxs[i]->cpumask);
2033 kfree(hctxs[i]);
2034 hctxs[i] = NULL;
2035 break;
2036 }
2037 blk_mq_hctx_kobj_init(hctxs[i]);
2038 }
2039 for (j = i; j < q->nr_hw_queues; j++) {
2040 struct blk_mq_hw_ctx *hctx = hctxs[j];
2041
2042 if (hctx) {
2043 if (hctx->tags) {
2044 blk_mq_free_rq_map(set, hctx->tags, j);
2045 set->tags[j] = NULL;
2046 }
2047 blk_mq_exit_hctx(q, set, hctx, j);
2048 free_cpumask_var(hctx->cpumask);
2049 kobject_put(&hctx->kobj);
2050 kfree(hctx->ctxs);
2051 kfree(hctx);
2052 hctxs[j] = NULL;
2053
2054 }
2055 }
2056 q->nr_hw_queues = i;
2057 blk_mq_sysfs_register(q);
2058 }
2059
2060 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2061 struct request_queue *q)
2062 {
2063 /* mark the queue as mq asap */
2064 q->mq_ops = set->ops;
2065
2066 q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2067 if (!q->queue_ctx)
2068 goto err_exit;
2069
2070 q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
2071 GFP_KERNEL, set->numa_node);
2072 if (!q->queue_hw_ctx)
2073 goto err_percpu;
2074
2075 q->mq_map = set->mq_map;
2076
2077 blk_mq_realloc_hw_ctxs(set, q);
2078 if (!q->nr_hw_queues)
2079 goto err_hctxs;
2080
2081 INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
2082 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
2083
2084 q->nr_queues = nr_cpu_ids;
2085
2086 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2087
2088 if (!(set->flags & BLK_MQ_F_SG_MERGE))
2089 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
2090
2091 q->sg_reserved_size = INT_MAX;
2092
2093 INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
2094 INIT_LIST_HEAD(&q->requeue_list);
2095 spin_lock_init(&q->requeue_lock);
2096
2097 if (q->nr_hw_queues > 1)
2098 blk_queue_make_request(q, blk_mq_make_request);
2099 else
2100 blk_queue_make_request(q, blk_sq_make_request);
2101
2102 /*
2103 * Do this after blk_queue_make_request() overrides it...
2104 */
2105 q->nr_requests = set->queue_depth;
2106
2107 if (set->ops->complete)
2108 blk_queue_softirq_done(q, set->ops->complete);
2109
2110 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
2111
2112 get_online_cpus();
2113 mutex_lock(&all_q_mutex);
2114
2115 list_add_tail(&q->all_q_node, &all_q_list);
2116 blk_mq_add_queue_tag_set(set, q);
2117 blk_mq_map_swqueue(q, cpu_online_mask);
2118
2119 mutex_unlock(&all_q_mutex);
2120 put_online_cpus();
2121
2122 return q;
2123
2124 err_hctxs:
2125 kfree(q->queue_hw_ctx);
2126 err_percpu:
2127 free_percpu(q->queue_ctx);
2128 err_exit:
2129 q->mq_ops = NULL;
2130 return ERR_PTR(-ENOMEM);
2131 }
2132 EXPORT_SYMBOL(blk_mq_init_allocated_queue);
2133
2134 void blk_mq_free_queue(struct request_queue *q)
2135 {
2136 struct blk_mq_tag_set *set = q->tag_set;
2137
2138 mutex_lock(&all_q_mutex);
2139 list_del_init(&q->all_q_node);
2140 mutex_unlock(&all_q_mutex);
2141
2142 blk_mq_del_queue_tag_set(q);
2143
2144 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
2145 blk_mq_free_hw_queues(q, set);
2146 }
2147
2148 /* Basically redo blk_mq_init_queue with queue frozen */
2149 static void blk_mq_queue_reinit(struct request_queue *q,
2150 const struct cpumask *online_mask)
2151 {
2152 WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2153
2154 blk_mq_sysfs_unregister(q);
2155
2156 /*
2157 * Redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2158 * we should change the hctx numa_node according to the new topology
2159 * (this involves freeing and re-allocating memory; is it worth doing?)
2160 */
2161
2162 blk_mq_map_swqueue(q, online_mask);
2163
2164 blk_mq_sysfs_register(q);
2165 }
2166
2167 /*
2168 * New online cpumask which is going to be set in this hotplug event.
2169 * Declare this cpumask as global, since cpu-hotplug operations are
2170 * invoked one at a time and dynamically allocating it could fail.
2171 */
2172 static struct cpumask cpuhp_online_new;
2173
2174 static void blk_mq_queue_reinit_work(void)
2175 {
2176 struct request_queue *q;
2177
2178 mutex_lock(&all_q_mutex);
2179 /*
2180 * We need to freeze and reinit all existing queues. Freezing
2181 * involves a synchronous wait for an RCU grace period, and doing it
2182 * one queue at a time may take a long time. Start freezing all
2183 * queues in one swoop and then wait for the completions so that
2184 * freezing can take place in parallel.
2185 */
2186 list_for_each_entry(q, &all_q_list, all_q_node)
2187 blk_mq_freeze_queue_start(q);
2188 list_for_each_entry(q, &all_q_list, all_q_node) {
2189 blk_mq_freeze_queue_wait(q);
2190
2191 /*
2192 * The timeout handler can't touch the hw queue during
2193 * reinitialization.
2194 */
2195 del_timer_sync(&q->timeout);
2196 }
2197
2198 list_for_each_entry(q, &all_q_list, all_q_node)
2199 blk_mq_queue_reinit(q, &cpuhp_online_new);
2200
2201 list_for_each_entry(q, &all_q_list, all_q_node)
2202 blk_mq_unfreeze_queue(q);
2203
2204 mutex_unlock(&all_q_mutex);
2205 }
2206
2207 static int blk_mq_queue_reinit_dead(unsigned int cpu)
2208 {
2209 cpumask_copy(&cpuhp_online_new, cpu_online_mask);
2210 blk_mq_queue_reinit_work();
2211 return 0;
2212 }
2213
2214 /*
2215 * Before a hot-added CPU starts handling requests, new mappings must be
2216 * established. Otherwise, requests in the hw queue might never be
2217 * dispatched.
2218 *
2219 * For example, assume a single hw queue (hctx) and two CPU queues (ctx0
2220 * for CPU0, and ctx1 for CPU1).
2221 *
2222 * Now CPU1 has just been onlined and a request is inserted into
2223 * ctx1->rq_list, setting bit0 in the pending bitmap as ctx1->index_hw is
2224 * still zero.
2225 *
2226 * Then, while running the hw queue, flush_busy_ctxs() finds bit0 set in the
2227 * pending bitmap and tries to retrieve requests from hctx->ctxs[0]->rq_list.
2228 * But hctx->ctxs[0] points to ctx0, so the request in ctx1->rq_list is ignored.
2229 */
2230 static int blk_mq_queue_reinit_prepare(unsigned int cpu)
2231 {
2232 cpumask_copy(&cpuhp_online_new, cpu_online_mask);
2233 cpumask_set_cpu(cpu, &cpuhp_online_new);
2234 blk_mq_queue_reinit_work();
2235 return 0;
2236 }
2237
2238 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2239 {
2240 int i;
2241
2242 for (i = 0; i < set->nr_hw_queues; i++) {
2243 set->tags[i] = blk_mq_init_rq_map(set, i);
2244 if (!set->tags[i])
2245 goto out_unwind;
2246 }
2247
2248 return 0;
2249
2250 out_unwind:
2251 while (--i >= 0)
2252 blk_mq_free_rq_map(set, set->tags[i], i);
2253
2254 return -ENOMEM;
2255 }
2256
2257 /*
2258 * Allocate the request maps associated with this tag_set. Note that this
2259 * may reduce the depth asked for, if memory is tight. set->queue_depth
2260 * will be updated to reflect the allocated depth.
2261 */
2262 static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2263 {
2264 unsigned int depth;
2265 int err;
2266
2267 depth = set->queue_depth;
2268 do {
2269 err = __blk_mq_alloc_rq_maps(set);
2270 if (!err)
2271 break;
2272
2273 set->queue_depth >>= 1;
2274 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
2275 err = -ENOMEM;
2276 break;
2277 }
2278 } while (set->queue_depth);
2279
2280 if (!set->queue_depth || err) {
2281 pr_err("blk-mq: failed to allocate request map\n");
2282 return -ENOMEM;
2283 }
2284
2285 if (depth != set->queue_depth)
2286 pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
2287 depth, set->queue_depth);
2288
2289 return 0;
2290 }
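
/*
 * For example, a requested queue_depth of 1024 that fails to allocate is
 * retried at 512, 256, 128, ... and the loop gives up once the depth
 * would fall below set->reserved_tags + BLK_MQ_TAG_MIN. The depth that
 * finally succeeded is left in set->queue_depth and reported via the
 * "reduced tag depth" message above.
 */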
2291
2292 /*
2293 * Alloc a tag set to be associated with one or more request queues.
2294 * May fail with EINVAL for various error conditions. May adjust the
2295 * requested depth down, if it is too large. In that case, the adjusted
2296 * value will be stored in set->queue_depth.
2297 */
2298 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2299 {
2300 int ret;
2301
2302 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
2303
2304 if (!set->nr_hw_queues)
2305 return -EINVAL;
2306 if (!set->queue_depth)
2307 return -EINVAL;
2308 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
2309 return -EINVAL;
2310
2311 if (!set->ops->queue_rq)
2312 return -EINVAL;
2313
2314 if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
2315 pr_info("blk-mq: reduced tag depth to %u\n",
2316 BLK_MQ_MAX_DEPTH);
2317 set->queue_depth = BLK_MQ_MAX_DEPTH;
2318 }
2319
2320 /*
2321 * If a crashdump is active, then we are potentially in a very
2322 * memory constrained environment. Limit us to 1 queue and
2323 * 64 tags to prevent using too much memory.
2324 */
2325 if (is_kdump_kernel()) {
2326 set->nr_hw_queues = 1;
2327 set->queue_depth = min(64U, set->queue_depth);
2328 }
2329 /*
2330 * There is no use for more h/w queues than cpus.
2331 */
2332 if (set->nr_hw_queues > nr_cpu_ids)
2333 set->nr_hw_queues = nr_cpu_ids;
2334
2335 set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
2336 GFP_KERNEL, set->numa_node);
2337 if (!set->tags)
2338 return -ENOMEM;
2339
2340 ret = -ENOMEM;
2341 set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
2342 GFP_KERNEL, set->numa_node);
2343 if (!set->mq_map)
2344 goto out_free_tags;
2345
2346 if (set->ops->map_queues)
2347 ret = set->ops->map_queues(set);
2348 else
2349 ret = blk_mq_map_queues(set);
2350 if (ret)
2351 goto out_free_mq_map;
2352
2353 ret = blk_mq_alloc_rq_maps(set);
2354 if (ret)
2355 goto out_free_mq_map;
2356
2357 mutex_init(&set->tag_list_lock);
2358 INIT_LIST_HEAD(&set->tag_list);
2359
2360 return 0;
2361
2362 out_free_mq_map:
2363 kfree(set->mq_map);
2364 set->mq_map = NULL;
2365 out_free_tags:
2366 kfree(set->tags);
2367 set->tags = NULL;
2368 return ret;
2369 }
2370 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
2371
2372 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2373 {
2374 int i;
2375
2376 for (i = 0; i < nr_cpu_ids; i++) {
2377 if (set->tags[i])
2378 blk_mq_free_rq_map(set, set->tags[i], i);
2379 }
2380
2381 kfree(set->mq_map);
2382 set->mq_map = NULL;
2383
2384 kfree(set->tags);
2385 set->tags = NULL;
2386 }
2387 EXPORT_SYMBOL(blk_mq_free_tag_set);
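
/*
 * Teardown sketch for the hypothetical "mydrv" example above: every
 * queue created from the tag set must be cleaned up before the set is
 * freed, because blk_mq_free_tag_set() releases the rq maps that the
 * queues' hardware contexts still point at:
 *
 *	blk_cleanup_queue(mydrv->queue);
 *	blk_mq_free_tag_set(&mydrv->tag_set);
 */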
2388
2389 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2390 {
2391 struct blk_mq_tag_set *set = q->tag_set;
2392 struct blk_mq_hw_ctx *hctx;
2393 int i, ret;
2394
2395 if (!set || nr > set->queue_depth)
2396 return -EINVAL;
2397
2398 ret = 0;
2399 queue_for_each_hw_ctx(q, hctx, i) {
2400 if (!hctx->tags)
2401 continue;
2402 ret = blk_mq_tag_update_depth(hctx->tags, nr);
2403 if (ret)
2404 break;
2405 }
2406
2407 if (!ret)
2408 q->nr_requests = nr;
2409
2410 return ret;
2411 }
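
/*
 * This is normally reached when user space adjusts the queue depth
 * through sysfs, e.g. (device name purely illustrative):
 *
 *	echo 64 > /sys/block/vda/queue/nr_requests
 *
 * The new value cannot exceed the depth the tag set was allocated with.
 */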
2412
2413 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
2414 {
2415 struct request_queue *q;
2416
2417 if (nr_hw_queues > nr_cpu_ids)
2418 nr_hw_queues = nr_cpu_ids;
2419 if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
2420 return;
2421
2422 list_for_each_entry(q, &set->tag_list, tag_set_list)
2423 blk_mq_freeze_queue(q);
2424
2425 set->nr_hw_queues = nr_hw_queues;
2426 list_for_each_entry(q, &set->tag_list, tag_set_list) {
2427 blk_mq_realloc_hw_ctxs(set, q);
2428
2429 if (q->nr_hw_queues > 1)
2430 blk_queue_make_request(q, blk_mq_make_request);
2431 else
2432 blk_queue_make_request(q, blk_sq_make_request);
2433
2434 blk_mq_queue_reinit(q, cpu_online_mask);
2435 }
2436
2437 list_for_each_entry(q, &set->tag_list, tag_set_list)
2438 blk_mq_unfreeze_queue(q);
2439 }
2440 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
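
/*
 * Sketch of a driver-side call, assuming the hypothetical "mydrv" has
 * discovered a new number of hardware channels. The function freezes
 * and unfreezes every queue sharing the tag set itself, so the caller
 * only supplies the new count:
 *
 *	blk_mq_update_nr_hw_queues(&mydrv->tag_set, nr_channels);
 */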
2441
2442 void blk_mq_disable_hotplug(void)
2443 {
2444 mutex_lock(&all_q_mutex);
2445 }
2446
2447 void blk_mq_enable_hotplug(void)
2448 {
2449 mutex_unlock(&all_q_mutex);
2450 }
2451
2452 static int __init blk_mq_init(void)
2453 {
2454 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
2455 blk_mq_hctx_notify_dead);
2456
2457 cpuhp_setup_state_nocalls(CPUHP_BLK_MQ_PREPARE, "block/mq:prepare",
2458 blk_mq_queue_reinit_prepare,
2459 blk_mq_queue_reinit_dead);
2460 return 0;
2461 }
2462 subsys_initcall(blk_mq_init);