block/blk-mq.c

   1 /*
   2  * Block multiqueue core code
   3  *
   4  * Copyright (C) 2013-2014 Jens Axboe
   5  * Copyright (C) 2013-2014 Christoph Hellwig
   6  */
   7 #include <linux/kernel.h>
   8 #include <linux/module.h>
   9 #include <linux/backing-dev.h>
  10 #include <linux/bio.h>
  11 #include <linux/blkdev.h>
  12 #include <linux/kmemleak.h>
  13 #include <linux/mm.h>
  14 #include <linux/init.h>
  15 #include <linux/slab.h>
  16 #include <linux/workqueue.h>
  17 #include <linux/smp.h>
  18 #include <linux/llist.h>
  19 #include <linux/list_sort.h>
  20 #include <linux/cpu.h>
  21 #include <linux/cache.h>
  22 #include <linux/sched/sysctl.h>
  23 #include <linux/delay.h>
  24 #include <linux/crash_dump.h>
  25 #include <linux/prefetch.h>
  26
  27 #include <trace/events/block.h>
  28
  29 #include <linux/blk-mq.h>
  30 #include "blk.h"
  31 #include "blk-mq.h"
  32 #include "blk-mq-tag.h"
  33
  34 static DEFINE_MUTEX(all_q_mutex);
  35 static LIST_HEAD(all_q_list);
  36
  37 /*
  38  * Check if any of the ctx's have pending work in this hardware queue
  39  */
  40 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
  41 {
  42         return sbitmap_any_bit_set(&hctx->ctx_map);
  43 }
  44
  45 /*
  46  * Mark this ctx as having pending work in this hardware queue
  47  */
  48 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
  49                                      struct blk_mq_ctx *ctx)
  50 {
  51         if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
  52                 sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
  53 }
  54
  55 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
  56                                       struct blk_mq_ctx *ctx)
  57 {
  58         sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
  59 }
  60
  61 void blk_mq_freeze_queue_start(struct request_queue *q)
  62 {
  63         int freeze_depth;
  64
  65         freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
  66         if (freeze_depth == 1) {
  67                 percpu_ref_kill(&q->q_usage_counter);
  68                 blk_mq_run_hw_queues(q, false);
  69         }
  70 }
  71 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
  72
  73 static void blk_mq_freeze_queue_wait(struct request_queue *q)
  74 {
  75         wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
  76 }
  77
  78 /*
  79  * Guarantee no request is in use, so we can change any data structure of
  80  * the queue afterward.
  81  */
  82 void blk_freeze_queue(struct request_queue *q)
  83 {
  84         /*
  85          * In the !blk_mq case we are only calling this to kill the
  86          * q_usage_counter, otherwise this increases the freeze depth
  87          * and waits for it to return to zero.  For this reason there is
  88          * no blk_unfreeze_queue(), and blk_freeze_queue() is not
  89          * exported to drivers as the only user for unfreeze is blk_mq.
  90          */
  91         blk_mq_freeze_queue_start(q);
  92         blk_mq_freeze_queue_wait(q);
  93 }
  94
  95 void blk_mq_freeze_queue(struct request_queue *q)
  96 {
  97         /*
  98          * ...just an alias to keep freeze and unfreeze actions balanced
  99          * in the blk_mq_* namespace
 100          */
 101         blk_freeze_queue(q);
 102 }
 103 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 104
 105 void blk_mq_unfreeze_queue(struct request_queue *q)
 106 {
 107         int freeze_depth;
 108
 109         freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
 110         WARN_ON_ONCE(freeze_depth < 0);
 111         if (!freeze_depth) {
 112                 percpu_ref_reinit(&q->q_usage_counter);
 113                 wake_up_all(&q->mq_freeze_wq);
 114         }
 115 }
 116 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 117
 118 /**
 119  * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished
 120  * @q: request queue.
 121  *
 122  * Note: this function does not prevent that the struct request end_io()
 123  * callback function is invoked. Additionally, it is not prevented that
 124  * new queue_rq() calls occur unless the queue has been stopped first.
 125  */
 126 void blk_mq_quiesce_queue(struct request_queue *q)
 127 {
 128         struct blk_mq_hw_ctx *hctx;
 129         unsigned int i;
 130         bool rcu = false;
 131
 132         blk_mq_stop_hw_queues(q);
 133
 134         queue_for_each_hw_ctx(q, hctx, i) {
 135                 if (hctx->flags & BLK_MQ_F_BLOCKING)
 136                         synchronize_srcu(&hctx->queue_rq_srcu);
 137                 else
 138                         rcu = true;
 139         }
 140         if (rcu)
 141                 synchronize_rcu();
 142 }
 143 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
 144
 145 void blk_mq_wake_waiters(struct request_queue *q)
 146 {
 147         struct blk_mq_hw_ctx *hctx;
 148         unsigned int i;
 149
 150         queue_for_each_hw_ctx(q, hctx, i)
 151                 if (blk_mq_hw_queue_mapped(hctx))
 152                         blk_mq_tag_wakeup_all(hctx->tags, true);
 153
 154         /*
 155          * If we are called because the queue has now been marked as
 156          * dying, we need to ensure that processes currently waiting on
 157          * the queue are notified as well.
 158          */
 159         wake_up_all(&q->mq_freeze_wq);
 160 }
 161
 162 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 163 {
 164         return blk_mq_has_free_tags(hctx->tags);
 165 }
 166 EXPORT_SYMBOL(blk_mq_can_queue);
 167
 168 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 169                                struct request *rq, unsigned int op)
 170 {
 171         INIT_LIST_HEAD(&rq->queuelist);
 172         /* csd/requeue_work/fifo_time is initialized before use */
 173         rq->q = q;
 174         rq->mq_ctx = ctx;
 175         rq->cmd_flags = op;
 176         if (blk_queue_io_stat(q))
 177                 rq->rq_flags |= RQF_IO_STAT;
 178         /* do not touch atomic flags, it needs atomic ops against the timer */
 179         rq->cpu = -1;
 180         INIT_HLIST_NODE(&rq->hash);
 181         RB_CLEAR_NODE(&rq->rb_node);
 182         rq->rq_disk = NULL;
 183         rq->part = NULL;
 184         rq->start_time = jiffies;
 185 #ifdef CONFIG_BLK_CGROUP
 186         rq->rl = NULL;
 187         set_start_time_ns(rq);
 188         rq->io_start_time_ns = 0;
 189 #endif
 190         rq->nr_phys_segments = 0;
 191 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 192         rq->nr_integrity_segments = 0;
 193 #endif
 194         rq->special = NULL;
 195         /* tag was already set */
 196         rq->errors = 0;
 197
 198         rq->cmd = rq->__cmd;
 199
 200         rq->extra_len = 0;
 201         rq->sense_len = 0;
 202         rq->resid_len = 0;
 203         rq->sense = NULL;
 204
 205         INIT_LIST_HEAD(&rq->timeout_list);
 206         rq->timeout = 0;
 207
 208         rq->end_io = NULL;
 209         rq->end_io_data = NULL;
 210         rq->next_rq = NULL;
 211
 212         ctx->rq_dispatched[op_is_sync(op)]++;
 213 }
 214
 215 static struct request *
 216 __blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op)
 217 {
 218         struct request *rq;
 219         unsigned int tag;
 220
 221         tag = blk_mq_get_tag(data);
 222         if (tag != BLK_MQ_TAG_FAIL) {
 223                 rq = data->hctx->tags->rqs[tag];
 224
 225                 if (blk_mq_tag_busy(data->hctx)) {
 226                         rq->rq_flags = RQF_MQ_INFLIGHT;
 227                         atomic_inc(&data->hctx->nr_active);
 228                 }
 229
 230                 rq->tag = tag;
 231                 blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
 232                 return rq;
 233         }
 234
 235         return NULL;
 236 }
 237
 238 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 239                 unsigned int flags)
 240 {
 241         struct blk_mq_ctx *ctx;
 242         struct blk_mq_hw_ctx *hctx;
 243         struct request *rq;
 244         struct blk_mq_alloc_data alloc_data;
 245         int ret;
 246
 247         ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
 248         if (ret)
 249                 return ERR_PTR(ret);
 250
 251         ctx = blk_mq_get_ctx(q);
 252         hctx = blk_mq_map_queue(q, ctx->cpu);
 253         blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
 254         rq = __blk_mq_alloc_request(&alloc_data, rw);
 255         blk_mq_put_ctx(ctx);
 256
 257         if (!rq) {
 258                 blk_queue_exit(q);
 259                 return ERR_PTR(-EWOULDBLOCK);
 260         }
 261
 262         rq->__data_len = 0;
 263         rq->__sector = (sector_t) -1;
 264         rq->bio = rq->biotail = NULL;
 265         return rq;
 266 }
 267 EXPORT_SYMBOL(blk_mq_alloc_request);
 268
 269 struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
 270                 unsigned int flags, unsigned int hctx_idx)
 271 {
 272         struct blk_mq_hw_ctx *hctx;
 273         struct blk_mq_ctx *ctx;
 274         struct request *rq;
 275         struct blk_mq_alloc_data alloc_data;
 276         int ret;
 277
 278         /*
 279          * If the tag allocator sleeps we could get an allocation for a
 280          * different hardware context.  No need to complicate the low level
 281          * allocator for this for the rare use case of a command tied to
 282          * a specific queue.
 283          */
 284         if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
 285                 return ERR_PTR(-EINVAL);
 286
 287         if (hctx_idx >= q->nr_hw_queues)
 288                 return ERR_PTR(-EIO);
 289
 290         ret = blk_queue_enter(q, true);
 291         if (ret)
 292                 return ERR_PTR(ret);
 293
 294         /*
 295          * Check if the hardware context is actually mapped to anything.
 296          * If not tell the caller that it should skip this queue.
 297          */
 298         hctx = q->queue_hw_ctx[hctx_idx];
 299         if (!blk_mq_hw_queue_mapped(hctx)) {
 300                 ret = -EXDEV;
 301                 goto out_queue_exit;
 302         }
 303         ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
 304
 305         blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
 306         rq = __blk_mq_alloc_request(&alloc_data, rw);
 307         if (!rq) {
 308                 ret = -EWOULDBLOCK;
 309                 goto out_queue_exit;
 310         }
 311
 312         return rq;
 313
 314 out_queue_exit:
 315         blk_queue_exit(q);
 316         return ERR_PTR(ret);
 317 }
 318 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 319
 320 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 321                                   struct blk_mq_ctx *ctx, struct request *rq)
 322 {
 323         const int tag = rq->tag;
 324         struct request_queue *q = rq->q;
 325
 326         if (rq->rq_flags & RQF_MQ_INFLIGHT)
 327                 atomic_dec(&hctx->nr_active);
 328         rq->rq_flags = 0;
 329
 330         clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 331         blk_mq_put_tag(hctx, ctx, tag);
 332         blk_queue_exit(q);
 333 }
 334
 335 void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 336 {
 337         struct blk_mq_ctx *ctx = rq->mq_ctx;
 338
 339         ctx->rq_completed[rq_is_sync(rq)]++;
 340         __blk_mq_free_request(hctx, ctx, rq);
 341
 342 }
 343 EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
 344
 345 void blk_mq_free_request(struct request *rq)
 346 {
 347         blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
 348 }
 349 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 350
 351 inline void __blk_mq_end_request(struct request *rq, int error)
 352 {
 353         blk_account_io_done(rq);
 354
 355         if (rq->end_io) {
 356                 rq->end_io(rq, error);
 357         } else {
 358                 if (unlikely(blk_bidi_rq(rq)))
 359                         blk_mq_free_request(rq->next_rq);
 360                 blk_mq_free_request(rq);
 361         }
 362 }
 363 EXPORT_SYMBOL(__blk_mq_end_request);
 364
 365 void blk_mq_end_request(struct request *rq, int error)
 366 {
 367         if (blk_update_request(rq, error, blk_rq_bytes(rq)))
 368                 BUG();
 369         __blk_mq_end_request(rq, error);
 370 }
 371 EXPORT_SYMBOL(blk_mq_end_request);
 372
 373 static void __blk_mq_complete_request_remote(void *data)
 374 {
 375         struct request *rq = data;
 376
 377         rq->q->softirq_done_fn(rq);
 378 }
 379
 380 static void blk_mq_ipi_complete_request(struct request *rq)
 381 {
 382         struct blk_mq_ctx *ctx = rq->mq_ctx;
 383         bool shared = false;
 384         int cpu;
 385
 386         if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
 387                 rq->q->softirq_done_fn(rq);
 388                 return;
 389         }
 390
 391         cpu = get_cpu();
 392         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
 393                 shared = cpus_share_cache(cpu, ctx->cpu);
 394
 395         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
 396                 rq->csd.func = __blk_mq_complete_request_remote;
 397                 rq->csd.info = rq;
 398                 rq->csd.flags = 0;
 399                 smp_call_function_single_async(ctx->cpu, &rq->csd);
 400         } else {
 401                 rq->q->softirq_done_fn(rq);
 402         }
 403         put_cpu();
 404 }
 405
 406 static void __blk_mq_complete_request(struct request *rq)
 407 {
 408         struct request_queue *q = rq->q;
 409
 410         if (!q->softirq_done_fn)
 411                 blk_mq_end_request(rq, rq->errors);
 412         else
 413                 blk_mq_ipi_complete_request(rq);
 414 }
 415
 416 /**
 417  * blk_mq_complete_request - end I/O on a request
 418  * @rq:         the request being processed
 419  *
 420  * Description:
 421  *      Ends all I/O on a request. It does not handle partial completions.
 422  *      The actual completion happens out-of-order, through a IPI handler.
 423  **/
 424 void blk_mq_complete_request(struct request *rq, int error)
 425 {
 426         struct request_queue *q = rq->q;
 427
 428         if (unlikely(blk_should_fake_timeout(q)))
 429                 return;
 430         if (!blk_mark_rq_complete(rq)) {
 431                 rq->errors = error;
 432                 __blk_mq_complete_request(rq);
 433         }
 434 }
 435 EXPORT_SYMBOL(blk_mq_complete_request);
 436
 437 int blk_mq_request_started(struct request *rq)
 438 {
 439         return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 440 }
 441 EXPORT_SYMBOL_GPL(blk_mq_request_started);
 442
 443 void blk_mq_start_request(struct request *rq)
 444 {
 445         struct request_queue *q = rq->q;
 446
 447         trace_block_rq_issue(q, rq);
 448
 449         rq->resid_len = blk_rq_bytes(rq);
 450         if (unlikely(blk_bidi_rq(rq)))
 451                 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
 452
 453         blk_add_timer(rq);
 454
 455         /*
 456          * Ensure that ->deadline is visible before set the started
 457          * flag and clear the completed flag.
 458          */
 459         smp_mb__before_atomic();
 460
 461         /*
 462          * Mark us as started and clear complete. Complete might have been
 463          * set if requeue raced with timeout, which then marked it as
 464          * complete. So be sure to clear complete again when we start
 465          * the request, otherwise we'll ignore the completion event.
 466          */
 467         if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 468                 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 469         if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
 470                 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
 471
 472         if (q->dma_drain_size && blk_rq_bytes(rq)) {
 473                 /*
 474                  * Make sure space for the drain appears.  We know we can do
 475                  * this because max_hw_segments has been adjusted to be one
 476                  * fewer than the device can handle.
 477                  */
 478                 rq->nr_phys_segments++;
 479         }
 480 }
 481 EXPORT_SYMBOL(blk_mq_start_request);
 482
 483 static void __blk_mq_requeue_request(struct request *rq)
 484 {
 485         struct request_queue *q = rq->q;
 486
 487         trace_block_rq_requeue(q, rq);
 488
 489         if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
 490                 if (q->dma_drain_size && blk_rq_bytes(rq))
 491                         rq->nr_phys_segments--;
 492         }
 493 }
 494
 495 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 496 {
 497         __blk_mq_requeue_request(rq);
 498
 499         BUG_ON(blk_queued_rq(rq));
 500         blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
 501 }
 502 EXPORT_SYMBOL(blk_mq_requeue_request);
 503
 504 static void blk_mq_requeue_work(struct work_struct *work)
 505 {
 506         struct request_queue *q =
 507                 container_of(work, struct request_queue, requeue_work.work);
 508         LIST_HEAD(rq_list);
 509         struct request *rq, *next;
 510         unsigned long flags;
 511
 512         spin_lock_irqsave(&q->requeue_lock, flags);
 513         list_splice_init(&q->requeue_list, &rq_list);
 514         spin_unlock_irqrestore(&q->requeue_lock, flags);
 515
 516         list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
 517                 if (!(rq->rq_flags & RQF_SOFTBARRIER))
 518                         continue;
 519
 520                 rq->rq_flags &= ~RQF_SOFTBARRIER;
 521                 list_del_init(&rq->queuelist);
 522                 blk_mq_insert_request(rq, true, false, false);
 523         }
 524
 525         while (!list_empty(&rq_list)) {
 526                 rq = list_entry(rq_list.next, struct request, queuelist);
 527                 list_del_init(&rq->queuelist);
 528                 blk_mq_insert_request(rq, false, false, false);
 529         }
 530
 531         blk_mq_run_hw_queues(q, false);
 532 }
 533
 534 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 535                                 bool kick_requeue_list)
 536 {
 537         struct request_queue *q = rq->q;
 538         unsigned long flags;
 539
 540         /*
 541          * We abuse this flag that is otherwise used by the I/O scheduler to
 542          * request head insertation from the workqueue.
 543          */
 544         BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
 545
 546         spin_lock_irqsave(&q->requeue_lock, flags);
 547         if (at_head) {
 548                 rq->rq_flags |= RQF_SOFTBARRIER;
 549                 list_add(&rq->queuelist, &q->requeue_list);
 550         } else {
 551                 list_add_tail(&rq->queuelist, &q->requeue_list);
 552         }
 553         spin_unlock_irqrestore(&q->requeue_lock, flags);
 554
 555         if (kick_requeue_list)
 556                 blk_mq_kick_requeue_list(q);
 557 }
 558 EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 559
 560 void blk_mq_kick_requeue_list(struct request_queue *q)
 561 {
 562         kblockd_schedule_delayed_work(&q->requeue_work, 0);
 563 }
 564 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 565
 566 void blk_mq_delay_kick_requeue_list(struct request_queue *q,
 567                                     unsigned long msecs)
 568 {
 569         kblockd_schedule_delayed_work(&q->requeue_work,
 570                                       msecs_to_jiffies(msecs));
 571 }
 572 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 573
 574 void blk_mq_abort_requeue_list(struct request_queue *q)
 575 {
 576         unsigned long flags;
 577         LIST_HEAD(rq_list);
 578
 579         spin_lock_irqsave(&q->requeue_lock, flags);
 580         list_splice_init(&q->requeue_list, &rq_list);
 581         spin_unlock_irqrestore(&q->requeue_lock, flags);
 582
 583         while (!list_empty(&rq_list)) {
 584                 struct request *rq;
 585
 586                 rq = list_first_entry(&rq_list, struct request, queuelist);
 587                 list_del_init(&rq->queuelist);
 588                 rq->errors = -EIO;
 589                 blk_mq_end_request(rq, rq->errors);
 590         }
 591 }
 592 EXPORT_SYMBOL(blk_mq_abort_requeue_list);
 593
 594 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 595 {
 596         if (tag < tags->nr_tags) {
 597                 prefetch(tags->rqs[tag]);
 598                 return tags->rqs[tag];
 599         }
 600
 601         return NULL;
 602 }
 603 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 604
 605 struct blk_mq_timeout_data {
 606         unsigned long next;
 607         unsigned int next_set;
 608 };
 609
 610 void blk_mq_rq_timed_out(struct request *req, bool reserved)
 611 {
 612         struct blk_mq_ops *ops = req->q->mq_ops;
 613         enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
 614
 615         /*
 616          * We know that complete is set at this point. If STARTED isn't set
 617          * anymore, then the request isn't active and the "timeout" should
 618          * just be ignored. This can happen due to the bitflag ordering.
 619          * Timeout first checks if STARTED is set, and if it is, assumes
 620          * the request is active. But if we race with completion, then
 621          * we both flags will get cleared. So check here again, and ignore
 622          * a timeout event with a request that isn't active.
 623          */
 624         if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
 625                 return;
 626
 627         if (ops->timeout)
 628                 ret = ops->timeout(req, reserved);
 629
 630         switch (ret) {
 631         case BLK_EH_HANDLED:
 632                 __blk_mq_complete_request(req);
 633                 break;
 634         case BLK_EH_RESET_TIMER:
 635                 blk_add_timer(req);
 636                 blk_clear_rq_complete(req);
 637                 break;
 638         case BLK_EH_NOT_HANDLED:
 639                 break;
 640         default:
 641                 printk(KERN_ERR "block: bad eh return: %d\n", ret);
 642                 break;
 643         }
 644 }
 645
 646 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 647                 struct request *rq, void *priv, bool reserved)
 648 {
 649         struct blk_mq_timeout_data *data = priv;
 650
 651         if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
 652                 /*
 653                  * If a request wasn't started before the queue was
 654                  * marked dying, kill it here or it'll go unnoticed.
 655                  */
 656                 if (unlikely(blk_queue_dying(rq->q))) {
 657                         rq->errors = -EIO;
 658                         blk_mq_end_request(rq, rq->errors);
 659                 }
 660                 return;
 661         }
 662
 663         if (time_after_eq(jiffies, rq->deadline)) {
 664                 if (!blk_mark_rq_complete(rq))
 665                         blk_mq_rq_timed_out(rq, reserved);
 666         } else if (!data->next_set || time_after(data->next, rq->deadline)) {
 667                 data->next = rq->deadline;
 668                 data->next_set = 1;
 669         }
 670 }
 671
 672 static void blk_mq_timeout_work(struct work_struct *work)
 673 {
 674         struct request_queue *q =
 675                 container_of(work, struct request_queue, timeout_work);
 676         struct blk_mq_timeout_data data = {
 677                 .next           = 0,
 678                 .next_set       = 0,
 679         };
 680         int i;
 681
 682         /* A deadlock might occur if a request is stuck requiring a
 683          * timeout at the same time a queue freeze is waiting
 684          * completion, since the timeout code would not be able to
 685          * acquire the queue reference here.
 686          *
 687          * That's why we don't use blk_queue_enter here; instead, we use
 688          * percpu_ref_tryget directly, because we need to be able to
 689          * obtain a reference even in the short window between the queue
 690          * starting to freeze, by dropping the first reference in
 691          * blk_mq_freeze_queue_start, and the moment the last request is
 692          * consumed, marked by the instant q_usage_counter reaches
 693          * zero.
 694          */
 695         if (!percpu_ref_tryget(&q->q_usage_counter))
 696                 return;
 697
 698         blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
 699
 700         if (data.next_set) {
 701                 data.next = blk_rq_timeout(round_jiffies_up(data.next));
 702                 mod_timer(&q->timeout, data.next);
 703         } else {
 704                 struct blk_mq_hw_ctx *hctx;
 705
 706                 queue_for_each_hw_ctx(q, hctx, i) {
 707                         /* the hctx may be unmapped, so check it here */
 708                         if (blk_mq_hw_queue_mapped(hctx))
 709                                 blk_mq_tag_idle(hctx);
 710                 }
 711         }
 712         blk_queue_exit(q);
 713 }
 714
 715 /*
 716  * Reverse check our software queue for entries that we could potentially
 717  * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 718  * too much time checking for merges.
 719  */
 720 static bool blk_mq_attempt_merge(struct request_queue *q,
 721                                  struct blk_mq_ctx *ctx, struct bio *bio)
 722 {
 723         struct request *rq;
 724         int checked = 8;
 725
 726         list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
 727                 int el_ret;
 728
 729                 if (!checked--)
 730                         break;
 731
 732                 if (!blk_rq_merge_ok(rq, bio))
 733                         continue;
 734
 735                 el_ret = blk_try_merge(rq, bio);
 736                 if (el_ret == ELEVATOR_BACK_MERGE) {
 737                         if (bio_attempt_back_merge(q, rq, bio)) {
 738                                 ctx->rq_merged++;
 739                                 return true;
 740                         }
 741                         break;
 742                 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
 743                         if (bio_attempt_front_merge(q, rq, bio)) {
 744                                 ctx->rq_merged++;
 745                                 return true;
 746                         }
 747                         break;
 748                 }
 749         }
 750
 751         return false;
 752 }
 753
 754 struct flush_busy_ctx_data {
 755         struct blk_mq_hw_ctx *hctx;
 756         struct list_head *list;
 757 };
 758
 759 static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
 760 {
 761         struct flush_busy_ctx_data *flush_data = data;
 762         struct blk_mq_hw_ctx *hctx = flush_data->hctx;
 763         struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
 764
 765         sbitmap_clear_bit(sb, bitnr);
 766         spin_lock(&ctx->lock);
 767         list_splice_tail_init(&ctx->rq_list, flush_data->list);
 768         spin_unlock(&ctx->lock);
 769         return true;
 770 }
 771
 772 /*
 773  * Process software queues that have been marked busy, splicing them
 774  * to the for-dispatch
 775  */
 776 static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 777 {
 778         struct flush_busy_ctx_data data = {
 779                 .hctx = hctx,
 780                 .list = list,
 781         };
 782
 783         sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
 784 }
 785
 786 static inline unsigned int queued_to_index(unsigned int queued)
 787 {
 788         if (!queued)
 789                 return 0;
 790
 791         return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 792 }
 793
 794 /*
 795  * Run this hardware queue, pulling any software queues mapped to it in.
 796  * Note that this function currently has various problems around ordering
 797  * of IO. In particular, we'd like FIFO behaviour on handling existing
 798  * items on the hctx->dispatch list. Ignore that for now.
 799  */
 800 static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
 801 {
 802         struct request_queue *q = hctx->queue;
 803         struct request *rq;
 804         LIST_HEAD(rq_list);
 805         LIST_HEAD(driver_list);
 806         struct list_head *dptr;
 807         int queued;
 808
 809         if (unlikely(blk_mq_hctx_stopped(hctx)))
 810                 return;
 811
 812         hctx->run++;
 813
 814         /*
 815          * Touch any software queue that has pending entries.
 816          */
 817         flush_busy_ctxs(hctx, &rq_list);
 818
 819         /*
 820          * If we have previous entries on our dispatch list, grab them
 821          * and stuff them at the front for more fair dispatch.
 822          */
 823         if (!list_empty_careful(&hctx->dispatch)) {
 824                 spin_lock(&hctx->lock);
 825                 if (!list_empty(&hctx->dispatch))
 826                         list_splice_init(&hctx->dispatch, &rq_list);
 827                 spin_unlock(&hctx->lock);
 828         }
 829
 830         /*
 831          * Start off with dptr being NULL, so we start the first request
 832          * immediately, even if we have more pending.
 833          */
 834         dptr = NULL;
 835
 836         /*
 837          * Now process all the entries, sending them to the driver.
 838          */
 839         queued = 0;
 840         while (!list_empty(&rq_list)) {
 841                 struct blk_mq_queue_data bd;
 842                 int ret;
 843
 844                 rq = list_first_entry(&rq_list, struct request, queuelist);
 845                 list_del_init(&rq->queuelist);
 846
 847                 bd.rq = rq;
 848                 bd.list = dptr;
 849                 bd.last = list_empty(&rq_list);
 850
 851                 ret = q->mq_ops->queue_rq(hctx, &bd);
 852                 switch (ret) {
 853                 case BLK_MQ_RQ_QUEUE_OK:
 854                         queued++;
 855                         break;
 856                 case BLK_MQ_RQ_QUEUE_BUSY:
 857                         list_add(&rq->queuelist, &rq_list);
 858                         __blk_mq_requeue_request(rq);
 859                         break;
 860                 default:
 861                         pr_err("blk-mq: bad return on queue: %d\n", ret);
 862                 case BLK_MQ_RQ_QUEUE_ERROR:
 863                         rq->errors = -EIO;
 864                         blk_mq_end_request(rq, rq->errors);
 865                         break;
 866                 }
 867
 868                 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
 869                         break;
 870
 871                 /*
 872                  * We've done the first request. If we have more than 1
 873                  * left in the list, set dptr to defer issue.
 874                  */
 875                 if (!dptr && rq_list.next != rq_list.prev)
 876                         dptr = &driver_list;
 877         }
 878
 879         hctx->dispatched[queued_to_index(queued)]++;
 880
 881         /*
 882          * Any items that need requeuing? Stuff them into hctx->dispatch,
 883          * that is where we will continue on next queue run.
 884          */
 885         if (!list_empty(&rq_list)) {
 886                 spin_lock(&hctx->lock);
 887                 list_splice(&rq_list, &hctx->dispatch);
 888                 spin_unlock(&hctx->lock);
 889                 /*
 890                  * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but
 891                  * it's possible the queue is stopped and restarted again
 892                  * before this. Queue restart will dispatch requests. And since
 893                  * requests in rq_list aren't added into hctx->dispatch yet,
 894                  * the requests in rq_list might get lost.
 895                  *
 896                  * blk_mq_run_hw_queue() already checks the STOPPED bit
 897                  **/
 898                 blk_mq_run_hw_queue(hctx, true);
 899         }
 900 }
 901
 902 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 903 {
 904         int srcu_idx;
 905
 906         WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
 907                 cpu_online(hctx->next_cpu));
 908
 909         if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
 910                 rcu_read_lock();
 911                 blk_mq_process_rq_list(hctx);
 912                 rcu_read_unlock();
 913         } else {
 914                 srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
 915                 blk_mq_process_rq_list(hctx);
 916                 srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
 917         }
 918 }
 919
 920 /*
 921  * It'd be great if the workqueue API had a way to pass
 922  * in a mask and had some smarts for more clever placement.
 923  * For now we just round-robin here, switching for every
 924  * BLK_MQ_CPU_WORK_BATCH queued items.
 925  */
 926 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 927 {
 928         if (hctx->queue->nr_hw_queues == 1)
 929                 return WORK_CPU_UNBOUND;
 930
 931         if (--hctx->next_cpu_batch <= 0) {
 932                 int cpu = hctx->next_cpu, next_cpu;
 933
 934                 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
 935                 if (next_cpu >= nr_cpu_ids)
 936                         next_cpu = cpumask_first(hctx->cpumask);
 937
 938                 hctx->next_cpu = next_cpu;
 939                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 940
 941                 return cpu;
 942         }
 943
 944         return hctx->next_cpu;
 945 }
 946
 947 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 948 {
 949         if (unlikely(blk_mq_hctx_stopped(hctx) ||
 950                      !blk_mq_hw_queue_mapped(hctx)))
 951                 return;
 952
 953         if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
 954                 int cpu = get_cpu();
 955                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
 956                         __blk_mq_run_hw_queue(hctx);
 957                         put_cpu();
 958                         return;
 959                 }
 960
 961                 put_cpu();
 962         }
 963
 964         kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
 965 }
 966
 967 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 968 {
 969         struct blk_mq_hw_ctx *hctx;
 970         int i;
 971
 972         queue_for_each_hw_ctx(q, hctx, i) {
 973                 if ((!blk_mq_hctx_has_pending(hctx) &&
 974                     list_empty_careful(&hctx->dispatch)) ||
 975                     blk_mq_hctx_stopped(hctx))
 976                         continue;
 977
 978                 blk_mq_run_hw_queue(hctx, async);
 979         }
 980 }
 981 EXPORT_SYMBOL(blk_mq_run_hw_queues);
 982
 983 /**
 984  * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
 985  * @q: request queue.
 986  *
 987  * The caller is responsible for serializing this function against
 988  * blk_mq_{start,stop}_hw_queue().
 989  */
 990 bool blk_mq_queue_stopped(struct request_queue *q)
 991 {
 992         struct blk_mq_hw_ctx *hctx;
 993         int i;
 994
 995         queue_for_each_hw_ctx(q, hctx, i)
 996                 if (blk_mq_hctx_stopped(hctx))
 997                         return true;
 998
 999         return false;
1000 }
1001 EXPORT_SYMBOL(blk_mq_queue_stopped);
1002
1003 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
1004 {
1005         cancel_work(&hctx->run_work);
1006         cancel_delayed_work(&hctx->delay_work);
1007         set_bit(BLK_MQ_S_STOPPED, &hctx->state);
1008 }
1009 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
1010
1011 void blk_mq_stop_hw_queues(struct request_queue *q)
1012 {
1013         struct blk_mq_hw_ctx *hctx;
1014         int i;
1015
1016         queue_for_each_hw_ctx(q, hctx, i)
1017                 blk_mq_stop_hw_queue(hctx);
1018 }
1019 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
1020
1021 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
1022 {
1023         clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1024
1025         blk_mq_run_hw_queue(hctx, false);
1026 }
1027 EXPORT_SYMBOL(blk_mq_start_hw_queue);
1028
1029 void blk_mq_start_hw_queues(struct request_queue *q)
1030 {
1031         struct blk_mq_hw_ctx *hctx;
1032         int i;
1033
1034         queue_for_each_hw_ctx(q, hctx, i)
1035                 blk_mq_start_hw_queue(hctx);
1036 }
1037 EXPORT_SYMBOL(blk_mq_start_hw_queues);
1038
1039 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
1040 {
1041         struct blk_mq_hw_ctx *hctx;
1042         int i;
1043
1044         queue_for_each_hw_ctx(q, hctx, i) {
1045                 if (!blk_mq_hctx_stopped(hctx))
1046                         continue;
1047
1048                 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1049                 blk_mq_run_hw_queue(hctx, async);
1050         }
1051 }
1052 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
1053
1054 static void blk_mq_run_work_fn(struct work_struct *work)
1055 {
1056         struct blk_mq_hw_ctx *hctx;
1057
1058         hctx = container_of(work, struct blk_mq_hw_ctx, run_work);
1059
1060         __blk_mq_run_hw_queue(hctx);
1061 }
1062
1063 static void blk_mq_delay_work_fn(struct work_struct *work)
1064 {
1065         struct blk_mq_hw_ctx *hctx;
1066
1067         hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
1068
1069         if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
1070                 __blk_mq_run_hw_queue(hctx);
1071 }
1072
1073 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1074 {
1075         if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
1076                 return;
1077
1078         kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
1079                         &hctx->delay_work, msecs_to_jiffies(msecs));
1080 }
1081 EXPORT_SYMBOL(blk_mq_delay_queue);
1082
1083 static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
1084                                             struct request *rq,
1085                                             bool at_head)
1086 {
1087         struct blk_mq_ctx *ctx = rq->mq_ctx;
1088
1089         trace_block_rq_insert(hctx->queue, rq);
1090
1091         if (at_head)
1092                 list_add(&rq->queuelist, &ctx->rq_list);
1093         else
1094                 list_add_tail(&rq->queuelist, &ctx->rq_list);
1095 }
1096
1097 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
1098                                     struct request *rq, bool at_head)
1099 {
1100         struct blk_mq_ctx *ctx = rq->mq_ctx;
1101
1102         __blk_mq_insert_req_list(hctx, rq, at_head);
1103         blk_mq_hctx_mark_pending(hctx, ctx);
1104 }
1105
1106 void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
1107                            bool async)
1108 {
1109         struct blk_mq_ctx *ctx = rq->mq_ctx;
1110         struct request_queue *q = rq->q;
1111         struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
1112
1113         spin_lock(&ctx->lock);
1114         __blk_mq_insert_request(hctx, rq, at_head);
1115         spin_unlock(&ctx->lock);
1116
1117         if (run_queue)
1118                 blk_mq_run_hw_queue(hctx, async);
1119 }
1120
1121 static void blk_mq_insert_requests(struct request_queue *q,
1122                                      struct blk_mq_ctx *ctx,
1123                                      struct list_head *list,
1124                                      int depth,
1125                                      bool from_schedule)
1126
1127 {
1128         struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
1129
1130         trace_block_unplug(q, depth, !from_schedule);
1131
1132         /*
1133          * preemption doesn't flush plug list, so it's possible ctx->cpu is
1134          * offline now
1135          */
1136         spin_lock(&ctx->lock);
1137         while (!list_empty(list)) {
1138                 struct request *rq;
1139
1140                 rq = list_first_entry(list, struct request, queuelist);
1141                 BUG_ON(rq->mq_ctx != ctx);
1142                 list_del_init(&rq->queuelist);
1143                 __blk_mq_insert_req_list(hctx, rq, false);
1144         }
1145         blk_mq_hctx_mark_pending(hctx, ctx);
1146         spin_unlock(&ctx->lock);
1147
1148         blk_mq_run_hw_queue(hctx, from_schedule);
1149 }
1150
1151 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1152 {
1153         struct request *rqa = container_of(a, struct request, queuelist);
1154         struct request *rqb = container_of(b, struct request, queuelist);
1155
1156         return !(rqa->mq_ctx < rqb->mq_ctx ||
1157                  (rqa->mq_ctx == rqb->mq_ctx &&
1158                   blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1159 }
1160
1161 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1162 {
1163         struct blk_mq_ctx *this_ctx;
1164         struct request_queue *this_q;
1165         struct request *rq;
1166         LIST_HEAD(list);
1167         LIST_HEAD(ctx_list);
1168         unsigned int depth;
1169
1170         list_splice_init(&plug->mq_list, &list);
1171
1172         list_sort(NULL, &list, plug_ctx_cmp);
1173
1174         this_q = NULL;
1175         this_ctx = NULL;
1176         depth = 0;
1177
1178         while (!list_empty(&list)) {
1179                 rq = list_entry_rq(list.next);
1180                 list_del_init(&rq->queuelist);
1181                 BUG_ON(!rq->q);
1182                 if (rq->mq_ctx != this_ctx) {
1183                         if (this_ctx) {
1184                                 blk_mq_insert_requests(this_q, this_ctx,
1185                                                         &ctx_list, depth,
1186                                                         from_schedule);
1187                         }
1188
1189                         this_ctx = rq->mq_ctx;
1190                         this_q = rq->q;
1191                         depth = 0;
1192                 }
1193
1194                 depth++;
1195                 list_add_tail(&rq->queuelist, &ctx_list);
1196         }
1197
1198         /*
1199          * If 'this_ctx' is set, we know we have entries to complete
1200          * on 'ctx_list'. Do those.
1201          */
1202         if (this_ctx) {
1203                 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
1204                                        from_schedule);
1205         }
1206 }
1207
/*
 * Fill in @rq from @bio and start I/O accounting for it.
 */
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
        init_request_from_bio(rq, bio);
        blk_account_io_start(rq, 1);
}
1214
1215 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
1216 {
1217         return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
1218                 !blk_queue_nomerges(hctx->queue);
1219 }
1220
1221 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
1222                                          struct blk_mq_ctx *ctx,
1223                                          struct request *rq, struct bio *bio)
1224 {
1225         if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) {
1226                 blk_mq_bio_to_request(rq, bio);
1227                 spin_lock(&ctx->lock);
1228 insert_rq:
1229                 __blk_mq_insert_request(hctx, rq, false);
1230                 spin_unlock(&ctx->lock);
1231                 return false;
1232         } else {
1233                 struct request_queue *q = hctx->queue;
1234
1235                 spin_lock(&ctx->lock);
1236                 if (!blk_mq_attempt_merge(q, ctx, bio)) {
1237                         blk_mq_bio_to_request(rq, bio);
1238                         goto insert_rq;
1239                 }
1240
1241                 spin_unlock(&ctx->lock);
1242                 __blk_mq_free_request(hctx, ctx, rq);
1243                 return true;
1244         }
1245 }
1246
1247 static struct request *blk_mq_map_request(struct request_queue *q,
1248                                           struct bio *bio,
1249                                           struct blk_mq_alloc_data *data)
1250 {
1251         struct blk_mq_hw_ctx *hctx;
1252         struct blk_mq_ctx *ctx;
1253         struct request *rq;
1254
1255         blk_queue_enter_live(q);
1256         ctx = blk_mq_get_ctx(q);
1257         hctx = blk_mq_map_queue(q, ctx->cpu);
1258
1259         trace_block_getrq(q, bio, bio->bi_opf);
1260         blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
1261         rq = __blk_mq_alloc_request(data, bio->bi_opf);
1262
1263         data->hctx->queued++;
1264         return rq;
1265 }
1266
1267 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1268                                       struct request *rq, blk_qc_t *cookie)
1269 {
1270         int ret;
1271         struct request_queue *q = rq->q;
1272         struct blk_mq_queue_data bd = {
1273                 .rq = rq,
1274                 .list = NULL,
1275                 .last = 1
1276         };
1277         blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
1278
1279         if (blk_mq_hctx_stopped(hctx))
1280                 goto insert;
1281
1282         /*
1283          * For OK queue, we are done. For error, kill it. Any other
1284          * error (busy), just add it to our list as we previously
1285          * would have done
1286          */
1287         ret = q->mq_ops->queue_rq(hctx, &bd);
1288         if (ret == BLK_MQ_RQ_QUEUE_OK) {
1289                 *cookie = new_cookie;
1290                 return;
1291         }
1292
1293         __blk_mq_requeue_request(rq);
1294
1295         if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1296                 *cookie = BLK_QC_T_NONE;
1297                 rq->errors = -EIO;
1298                 blk_mq_end_request(rq, rq->errors);
1299                 return;
1300         }
1301
1302 insert:
1303         blk_mq_insert_request(rq, false, true, true);
1304 }
1305
1306 /*
1307  * Multiple hardware queue variant. This will not use per-process plugs,
1308  * but will attempt to bypass the hctx queueing if we can go straight to
1309  * hardware for SYNC IO.
1310  */
1311 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1312 {
1313         const int is_sync = op_is_sync(bio->bi_opf);
1314         const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
1315         struct blk_mq_alloc_data data;
1316         struct request *rq;
1317         unsigned int request_count = 0, srcu_idx;
1318         struct blk_plug *plug;
1319         struct request *same_queue_rq = NULL;
1320         blk_qc_t cookie;
1321
1322         blk_queue_bounce(q, &bio);
1323
1324         if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1325                 bio_io_error(bio);
1326                 return BLK_QC_T_NONE;
1327         }
1328
1329         blk_queue_split(q, &bio, q->bio_split);
1330
1331         if (!is_flush_fua && !blk_queue_nomerges(q) &&
1332             blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1333                 return BLK_QC_T_NONE;
1334
1335         rq = blk_mq_map_request(q, bio, &data);
1336         if (unlikely(!rq))
1337                 return BLK_QC_T_NONE;
1338
1339         cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
1340
1341         if (unlikely(is_flush_fua)) {
1342                 blk_mq_bio_to_request(rq, bio);
1343                 blk_insert_flush(rq);
1344                 goto run_queue;
1345         }
1346
1347         plug = current->plug;
1348         /*
1349          * If the driver supports defer issued based on 'last', then
1350          * queue it up like normal since we can potentially save some
1351          * CPU this way.
1352          */
1353         if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
1354             !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
1355                 struct request *old_rq = NULL;
1356
1357                 blk_mq_bio_to_request(rq, bio);
1358
1359                 /*
1360                  * We do limited plugging. If the bio can be merged, do that.
1361                  * Otherwise the existing request in the plug list will be
1362                  * issued. So the plug list will have one request at most
1363                  */
1364                 if (plug) {
1365                         /*
1366                          * The plug list might get flushed before this. If that
1367                          * happens, same_queue_rq is invalid and plug list is
1368                          * empty
1369                          */
1370                         if (same_queue_rq && !list_empty(&plug->mq_list)) {
1371                                 old_rq = same_queue_rq;
1372                                 list_del_init(&old_rq->queuelist);
1373                         }
1374                         list_add_tail(&rq->queuelist, &plug->mq_list);
1375                 } else /* is_sync */
1376                         old_rq = rq;
1377                 blk_mq_put_ctx(data.ctx);
1378                 if (!old_rq)
1379                         goto done;
1380
1381                 if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
1382                         rcu_read_lock();
1383                         blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
1384                         rcu_read_unlock();
1385                 } else {
1386                         srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
1387                         blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
1388                         srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
1389                 }
1390                 goto done;
1391         }
1392
1393         if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1394                 /*
1395                  * For a SYNC request, send it to the hardware immediately. For
1396                  * an ASYNC request, just ensure that we run it later on. The
1397                  * latter allows for merging opportunities and more efficient
1398                  * dispatching.
1399                  */
1400 run_queue:
1401                 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1402         }
1403         blk_mq_put_ctx(data.ctx);
1404 done:
1405         return cookie;
1406 }
1407
1408 /*
1409  * Single hardware queue variant. This will attempt to use any per-process
1410  * plug for merging and IO deferral.
1411  */
1412 static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
1413 {
1414         const int is_sync = op_is_sync(bio->bi_opf);
1415         const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
1416         struct blk_plug *plug;
1417         unsigned int request_count = 0;
1418         struct blk_mq_alloc_data data;
1419         struct request *rq;
1420         blk_qc_t cookie;
1421
1422         blk_queue_bounce(q, &bio);
1423
1424         if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1425                 bio_io_error(bio);
1426                 return BLK_QC_T_NONE;
1427         }
1428
1429         blk_queue_split(q, &bio, q->bio_split);
1430
1431         if (!is_flush_fua && !blk_queue_nomerges(q)) {
1432                 if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
1433                         return BLK_QC_T_NONE;
1434         } else
1435                 request_count = blk_plug_queued_count(q);
1436
1437         rq = blk_mq_map_request(q, bio, &data);
1438         if (unlikely(!rq))
1439                 return BLK_QC_T_NONE;
1440
1441         cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
1442
1443         if (unlikely(is_flush_fua)) {
1444                 blk_mq_bio_to_request(rq, bio);
1445                 blk_insert_flush(rq);
1446                 goto run_queue;
1447         }
1448
1449         /*
1450          * A task plug currently exists. Since this is completely lockless,
1451          * utilize that to temporarily store requests until the task is
1452          * either done or scheduled away.
1453          */
1454         plug = current->plug;
1455         if (plug) {
1456                 blk_mq_bio_to_request(rq, bio);
1457                 if (!request_count)
1458                         trace_block_plug(q);
1459
1460                 blk_mq_put_ctx(data.ctx);
1461
1462                 if (request_count >= BLK_MAX_REQUEST_COUNT) {
1463                         blk_flush_plug_list(plug, false);
1464                         trace_block_plug(q);
1465                 }
1466
1467                 list_add_tail(&rq->queuelist, &plug->mq_list);
1468                 return cookie;
1469         }
1470
1471         if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1472                 /*
1473                  * For a SYNC request, send it to the hardware immediately. For
1474                  * an ASYNC request, just ensure that we run it later on. The
1475                  * latter allows for merging opportunities and more efficient
1476                  * dispatching.
1477                  */
1478 run_queue:
1479                 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1480         }
1481
1482         blk_mq_put_ctx(data.ctx);
1483         return cookie;
1484 }
1485
1486 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
1487                 struct blk_mq_tags *tags, unsigned int hctx_idx)
1488 {
1489         struct page *page;
1490
1491         if (tags->rqs && set->ops->exit_request) {
1492                 int i;
1493
1494                 for (i = 0; i < tags->nr_tags; i++) {
1495                         if (!tags->rqs[i])
1496                                 continue;
1497                         set->ops->exit_request(set->driver_data, tags->rqs[i],
1498                                                 hctx_idx, i);
1499                         tags->rqs[i] = NULL;
1500                 }
1501         }
1502
1503         while (!list_empty(&tags->page_list)) {
1504                 page = list_first_entry(&tags->page_list, struct page, lru);
1505                 list_del_init(&page->lru);
1506                 /*
1507                  * Remove kmemleak object previously allocated in
1508                  * blk_mq_init_rq_map().
1509                  */
1510                 kmemleak_free(page_address(page));
1511                 __free_pages(page, page->private);
1512         }
1513
1514         kfree(tags->rqs);
1515
1516         blk_mq_free_tags(tags);
1517 }
1518
1519 static size_t order_to_size(unsigned int order)
1520 {
1521         return (size_t)PAGE_SIZE << order;
1522 }
1523
1524 static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1525                 unsigned int hctx_idx)
1526 {
1527         struct blk_mq_tags *tags;
1528         unsigned int i, j, entries_per_page, max_order = 4;
1529         size_t rq_size, left;
1530
1531         tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
1532                                 set->numa_node,
1533                                 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
1534         if (!tags)
1535                 return NULL;
1536
1537         INIT_LIST_HEAD(&tags->page_list);
1538
1539         tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
1540                                  GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
1541                                  set->numa_node);
1542         if (!tags->rqs) {
1543                 blk_mq_free_tags(tags);
1544                 return NULL;
1545         }
1546
1547         /*
1548          * rq_size is the size of the request plus driver payload, rounded
1549          * to the cacheline size
1550          */
1551         rq_size = round_up(sizeof(struct request) + set->cmd_size,
1552                                 cache_line_size());
1553         left = rq_size * set->queue_depth;
1554
1555         for (i = 0; i < set->queue_depth; ) {
1556                 int this_order = max_order;
1557                 struct page *page;
1558                 int to_do;
1559                 void *p;
1560
1561                 while (this_order && left < order_to_size(this_order - 1))
1562                         this_order--;
1563
1564                 do {
1565                         page = alloc_pages_node(set->numa_node,
1566                                 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
1567                                 this_order);
1568                         if (page)
1569                                 break;
1570                         if (!this_order--)
1571                                 break;
1572                         if (order_to_size(this_order) < rq_size)
1573                                 break;
1574                 } while (1);
1575
1576                 if (!page)
1577                         goto fail;
1578
1579                 page->private = this_order;
1580                 list_add_tail(&page->lru, &tags->page_list);
1581
1582                 p = page_address(page);
1583                 /*
1584                  * Allow kmemleak to scan these pages as they contain pointers
1585                  * to additional allocations like via ops->init_request().
1586                  */
1587                 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL);
1588                 entries_per_page = order_to_size(this_order) / rq_size;
1589                 to_do = min(entries_per_page, set->queue_depth - i);
1590                 left -= to_do * rq_size;
1591                 for (j = 0; j < to_do; j++) {
1592                         tags->rqs[i] = p;
1593                         if (set->ops->init_request) {
1594                                 if (set->ops->init_request(set->driver_data,
1595                                                 tags->rqs[i], hctx_idx, i,
1596                                                 set->numa_node)) {
1597                                         tags->rqs[i] = NULL;
1598                                         goto fail;
1599                                 }
1600                         }
1601
1602                         p += rq_size;
1603                         i++;
1604                 }
1605         }
1606         return tags;
1607
1608 fail:
1609         blk_mq_free_rq_map(set, tags, hctx_idx);
1610         return NULL;
1611 }
1612
1613 /*
1614  * 'cpu' is going away. splice any existing rq_list entries from this
1615  * software queue to the hw queue dispatch list, and ensure that it
1616  * gets run.
1617  */
1618 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
1619 {
1620         struct blk_mq_hw_ctx *hctx;
1621         struct blk_mq_ctx *ctx;
1622         LIST_HEAD(tmp);
1623
1624         hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
1625         ctx = __blk_mq_get_ctx(hctx->queue, cpu);
1626
1627         spin_lock(&ctx->lock);
1628         if (!list_empty(&ctx->rq_list)) {
1629                 list_splice_init(&ctx->rq_list, &tmp);
1630                 blk_mq_hctx_clear_pending(hctx, ctx);
1631         }
1632         spin_unlock(&ctx->lock);
1633
1634         if (list_empty(&tmp))
1635                 return 0;
1636
1637         spin_lock(&hctx->lock);
1638         list_splice_tail_init(&tmp, &hctx->dispatch);
1639         spin_unlock(&hctx->lock);
1640
1641         blk_mq_run_hw_queue(hctx, true);
1642         return 0;
1643 }
1644
1645 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
1646 {
1647         cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
1648                                             &hctx->cpuhp_dead);
1649 }
1650
1651 /* hctx->ctxs will be freed in queue's release handler */
1652 static void blk_mq_exit_hctx(struct request_queue *q,
1653                 struct blk_mq_tag_set *set,
1654                 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
1655 {
1656         unsigned flush_start_tag = set->queue_depth;
1657
1658         blk_mq_tag_idle(hctx);
1659
1660         if (set->ops->exit_request)
1661                 set->ops->exit_request(set->driver_data,
1662                                        hctx->fq->flush_rq, hctx_idx,
1663                                        flush_start_tag + hctx_idx);
1664
1665         if (set->ops->exit_hctx)
1666                 set->ops->exit_hctx(hctx, hctx_idx);
1667
1668         if (hctx->flags & BLK_MQ_F_BLOCKING)
1669                 cleanup_srcu_struct(&hctx->queue_rq_srcu);
1670
1671         blk_mq_remove_cpuhp(hctx);
1672         blk_free_flush_queue(hctx->fq);
1673         sbitmap_free(&hctx->ctx_map);
1674 }
1675
1676 static void blk_mq_exit_hw_queues(struct request_queue *q,
1677                 struct blk_mq_tag_set *set, int nr_queue)
1678 {
1679         struct blk_mq_hw_ctx *hctx;
1680         unsigned int i;
1681
1682         queue_for_each_hw_ctx(q, hctx, i) {
1683                 if (i == nr_queue)
1684                         break;
1685                 blk_mq_exit_hctx(q, set, hctx, i);
1686         }
1687 }
1688
1689 static void blk_mq_free_hw_queues(struct request_queue *q,
1690                 struct blk_mq_tag_set *set)
1691 {
1692         struct blk_mq_hw_ctx *hctx;
1693         unsigned int i;
1694
1695         queue_for_each_hw_ctx(q, hctx, i)
1696                 free_cpumask_var(hctx->cpumask);
1697 }
1698
1699 static int blk_mq_init_hctx(struct request_queue *q,
1700                 struct blk_mq_tag_set *set,
1701                 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
1702 {
1703         int node;
1704         unsigned flush_start_tag = set->queue_depth;
1705
1706         node = hctx->numa_node;
1707         if (node == NUMA_NO_NODE)
1708                 node = hctx->numa_node = set->numa_node;
1709
1710         INIT_WORK(&hctx->run_work, blk_mq_run_work_fn);
1711         INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1712         spin_lock_init(&hctx->lock);
1713         INIT_LIST_HEAD(&hctx->dispatch);
1714         hctx->queue = q;
1715         hctx->queue_num = hctx_idx;
1716         hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
1717
1718         cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
1719
1720         hctx->tags = set->tags[hctx_idx];
1721
1722         /*
1723          * Allocate space for all possible cpus to avoid allocation at
1724          * runtime
1725          */
1726         hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1727                                         GFP_KERNEL, node);
1728         if (!hctx->ctxs)
1729                 goto unregister_cpu_notifier;
1730
1731         if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL,
1732                               node))
1733                 goto free_ctxs;
1734
1735         hctx->nr_ctx = 0;
1736
1737         if (set->ops->init_hctx &&
1738             set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
1739                 goto free_bitmap;
1740
1741         hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
1742         if (!hctx->fq)
1743                 goto exit_hctx;
1744
1745         if (set->ops->init_request &&
1746             set->ops->init_request(set->driver_data,
1747                                    hctx->fq->flush_rq, hctx_idx,
1748                                    flush_start_tag + hctx_idx, node))
1749                 goto free_fq;
1750
1751         if (hctx->flags & BLK_MQ_F_BLOCKING)
1752                 init_srcu_struct(&hctx->queue_rq_srcu);
1753
1754         return 0;
1755
1756  free_fq:
1757         kfree(hctx->fq);
1758  exit_hctx:
1759         if (set->ops->exit_hctx)
1760                 set->ops->exit_hctx(hctx, hctx_idx);
1761  free_bitmap:
1762         sbitmap_free(&hctx->ctx_map);
1763  free_ctxs:
1764         kfree(hctx->ctxs);
1765  unregister_cpu_notifier:
1766         blk_mq_remove_cpuhp(hctx);
1767         return -1;
1768 }
1769
1770 static void blk_mq_init_cpu_queues(struct request_queue *q,
1771                                    unsigned int nr_hw_queues)
1772 {
1773         unsigned int i;
1774
1775         for_each_possible_cpu(i) {
1776                 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1777                 struct blk_mq_hw_ctx *hctx;
1778
1779                 memset(__ctx, 0, sizeof(*__ctx));
1780                 __ctx->cpu = i;
1781                 spin_lock_init(&__ctx->lock);
1782                 INIT_LIST_HEAD(&__ctx->rq_list);
1783                 __ctx->queue = q;
1784
1785                 /* If the cpu isn't online, the cpu is mapped to first hctx */
1786                 if (!cpu_online(i))
1787                         continue;
1788
1789                 hctx = blk_mq_map_queue(q, i);
1790
1791                 /*
1792                  * Set local node, IFF we have more than one hw queue. If
1793                  * not, we remain on the home node of the device
1794                  */
1795                 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
1796                         hctx->numa_node = local_memory_node(cpu_to_node(i));
1797         }
1798 }
1799
1800 static void blk_mq_map_swqueue(struct request_queue *q,
1801                                const struct cpumask *online_mask)
1802 {
1803         unsigned int i;
1804         struct blk_mq_hw_ctx *hctx;
1805         struct blk_mq_ctx *ctx;
1806         struct blk_mq_tag_set *set = q->tag_set;
1807
1808         /*
1809          * Avoid others reading imcomplete hctx->cpumask through sysfs
1810          */
1811         mutex_lock(&q->sysfs_lock);
1812
1813         queue_for_each_hw_ctx(q, hctx, i) {
1814                 cpumask_clear(hctx->cpumask);
1815                 hctx->nr_ctx = 0;
1816         }
1817
1818         /*
1819          * Map software to hardware queues
1820          */
1821         for_each_possible_cpu(i) {
1822                 /* If the cpu isn't online, the cpu is mapped to first hctx */
1823                 if (!cpumask_test_cpu(i, online_mask))
1824                         continue;
1825
1826                 ctx = per_cpu_ptr(q->queue_ctx, i);
1827                 hctx = blk_mq_map_queue(q, i);
1828
1829                 cpumask_set_cpu(i, hctx->cpumask);
1830                 ctx->index_hw = hctx->nr_ctx;
1831                 hctx->ctxs[hctx->nr_ctx++] = ctx;
1832         }
1833
1834         mutex_unlock(&q->sysfs_lock);
1835
1836         queue_for_each_hw_ctx(q, hctx, i) {
1837                 /*
1838                  * If no software queues are mapped to this hardware queue,
1839                  * disable it and free the request entries.
1840                  */
1841                 if (!hctx->nr_ctx) {
1842                         if (set->tags[i]) {
1843                                 blk_mq_free_rq_map(set, set->tags[i], i);
1844                                 set->tags[i] = NULL;
1845                         }
1846                         hctx->tags = NULL;
1847                         continue;
1848                 }
1849
1850                 /* unmapped hw queue can be remapped after CPU topo changed */
1851                 if (!set->tags[i])
1852                         set->tags[i] = blk_mq_init_rq_map(set, i);
1853                 hctx->tags = set->tags[i];
1854                 WARN_ON(!hctx->tags);
1855
1856                 /*
1857                  * Set the map size to the number of mapped software queues.
1858                  * This is more accurate and more efficient than looping
1859                  * over all possibly mapped software queues.
1860                  */
1861                 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
1862
1863                 /*
1864                  * Initialize batch roundrobin counts
1865                  */
1866                 hctx->next_cpu = cpumask_first(hctx->cpumask);
1867                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1868         }
1869 }
1870
1871 static void queue_set_hctx_shared(struct request_queue *q, bool shared)
1872 {
1873         struct blk_mq_hw_ctx *hctx;
1874         int i;
1875
1876         queue_for_each_hw_ctx(q, hctx, i) {
1877                 if (shared)
1878                         hctx->flags |= BLK_MQ_F_TAG_SHARED;
1879                 else
1880                         hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
1881         }
1882 }
1883
1884 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared)
1885 {
1886         struct request_queue *q;
1887
1888         list_for_each_entry(q, &set->tag_list, tag_set_list) {
1889                 blk_mq_freeze_queue(q);
1890                 queue_set_hctx_shared(q, shared);
1891                 blk_mq_unfreeze_queue(q);
1892         }
1893 }
1894
1895 static void blk_mq_del_queue_tag_set(struct request_queue *q)
1896 {
1897         struct blk_mq_tag_set *set = q->tag_set;
1898
1899         mutex_lock(&set->tag_list_lock);
1900         list_del_init(&q->tag_set_list);
1901         if (list_is_singular(&set->tag_list)) {
1902                 /* just transitioned to unshared */
1903                 set->flags &= ~BLK_MQ_F_TAG_SHARED;
1904                 /* update existing queue */
1905                 blk_mq_update_tag_set_depth(set, false);
1906         }
1907         mutex_unlock(&set->tag_list_lock);
1908 }
1909
1910 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
1911                                      struct request_queue *q)
1912 {
1913         q->tag_set = set;
1914
1915         mutex_lock(&set->tag_list_lock);
1916
1917         /* Check to see if we're transitioning to shared (from 1 to 2 queues). */
1918         if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) {
1919                 set->flags |= BLK_MQ_F_TAG_SHARED;
1920                 /* update existing queue */
1921                 blk_mq_update_tag_set_depth(set, true);
1922         }
1923         if (set->flags & BLK_MQ_F_TAG_SHARED)
1924                 queue_set_hctx_shared(q, true);
1925         list_add_tail(&q->tag_set_list, &set->tag_list);
1926
1927         mutex_unlock(&set->tag_list_lock);
1928 }
1929
1930 /*
1931  * It is the actual release handler for mq, but we do it from
1932  * request queue's release handler for avoiding use-after-free
1933  * and headache because q->mq_kobj shouldn't have been introduced,
1934  * but we can't group ctx/kctx kobj without it.
1935  */
1936 void blk_mq_release(struct request_queue *q)
1937 {
1938         struct blk_mq_hw_ctx *hctx;
1939         unsigned int i;
1940
1941         /* hctx kobj stays in hctx */
1942         queue_for_each_hw_ctx(q, hctx, i) {
1943                 if (!hctx)
1944                         continue;
1945                 kfree(hctx->ctxs);
1946                 kfree(hctx);
1947         }
1948
1949         q->mq_map = NULL;
1950
1951         kfree(q->queue_hw_ctx);
1952
1953         /* ctx kobj stays in queue_ctx */
1954         free_percpu(q->queue_ctx);
1955 }
1956
1957 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
1958 {
1959         struct request_queue *uninit_q, *q;
1960
1961         uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
1962         if (!uninit_q)
1963                 return ERR_PTR(-ENOMEM);
1964
1965         q = blk_mq_init_allocated_queue(set, uninit_q);
1966         if (IS_ERR(q))
1967                 blk_cleanup_queue(uninit_q);
1968
1969         return q;
1970 }
1971 EXPORT_SYMBOL(blk_mq_init_queue);
1972
1973 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
1974                                                 struct request_queue *q)
1975 {
1976         int i, j;
1977         struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
1978
1979         blk_mq_sysfs_unregister(q);
1980         for (i = 0; i < set->nr_hw_queues; i++) {
1981                 int node;
1982
1983                 if (hctxs[i])
1984                         continue;
1985
1986                 node = blk_mq_hw_queue_to_node(q->mq_map, i);
1987                 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
1988                                         GFP_KERNEL, node);
1989                 if (!hctxs[i])
1990                         break;
1991
1992                 if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
1993                                                 node)) {
1994                         kfree(hctxs[i]);
1995                         hctxs[i] = NULL;
1996                         break;
1997                 }
1998
1999                 atomic_set(&hctxs[i]->nr_active, 0);
2000                 hctxs[i]->numa_node = node;
2001                 hctxs[i]->queue_num = i;
2002
2003                 if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
2004                         free_cpumask_var(hctxs[i]->cpumask);
2005                         kfree(hctxs[i]);
2006                         hctxs[i] = NULL;
2007                         break;
2008                 }
2009                 blk_mq_hctx_kobj_init(hctxs[i]);
2010         }
2011         for (j = i; j < q->nr_hw_queues; j++) {
2012                 struct blk_mq_hw_ctx *hctx = hctxs[j];
2013
2014                 if (hctx) {
2015                         if (hctx->tags) {
2016                                 blk_mq_free_rq_map(set, hctx->tags, j);
2017                                 set->tags[j] = NULL;
2018                         }
2019                         blk_mq_exit_hctx(q, set, hctx, j);
2020                         free_cpumask_var(hctx->cpumask);
2021                         kobject_put(&hctx->kobj);
2022                         kfree(hctx->ctxs);
2023                         kfree(hctx);
2024                         hctxs[j] = NULL;
2025
2026                 }
2027         }
2028         q->nr_hw_queues = i;
2029         blk_mq_sysfs_register(q);
2030 }
2031
2032 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2033                                                   struct request_queue *q)
2034 {
2035         /* mark the queue as mq asap */
2036         q->mq_ops = set->ops;
2037
2038         q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2039         if (!q->queue_ctx)
2040                 goto err_exit;
2041
2042         q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
2043                                                 GFP_KERNEL, set->numa_node);
2044         if (!q->queue_hw_ctx)
2045                 goto err_percpu;
2046
2047         q->mq_map = set->mq_map;
2048
2049         blk_mq_realloc_hw_ctxs(set, q);
2050         if (!q->nr_hw_queues)
2051                 goto err_hctxs;
2052
2053         INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
2054         blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
2055
2056         q->nr_queues = nr_cpu_ids;
2057
2058         q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2059
2060         if (!(set->flags & BLK_MQ_F_SG_MERGE))
2061                 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
2062
2063         q->sg_reserved_size = INT_MAX;
2064
2065         INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
2066         INIT_LIST_HEAD(&q->requeue_list);
2067         spin_lock_init(&q->requeue_lock);
2068
2069         if (q->nr_hw_queues > 1)
2070                 blk_queue_make_request(q, blk_mq_make_request);
2071         else
2072                 blk_queue_make_request(q, blk_sq_make_request);
2073
2074         /*
2075          * Do this after blk_queue_make_request() overrides it...
2076          */
2077         q->nr_requests = set->queue_depth;
2078
2079         if (set->ops->complete)
2080                 blk_queue_softirq_done(q, set->ops->complete);
2081
2082         blk_mq_init_cpu_queues(q, set->nr_hw_queues);
2083
2084         get_online_cpus();
2085         mutex_lock(&all_q_mutex);
2086
2087         list_add_tail(&q->all_q_node, &all_q_list);
2088         blk_mq_add_queue_tag_set(set, q);
2089         blk_mq_map_swqueue(q, cpu_online_mask);
2090
2091         mutex_unlock(&all_q_mutex);
2092         put_online_cpus();
2093
2094         return q;
2095
2096 err_hctxs:
2097         kfree(q->queue_hw_ctx);
2098 err_percpu:
2099         free_percpu(q->queue_ctx);
2100 err_exit:
2101         q->mq_ops = NULL;
2102         return ERR_PTR(-ENOMEM);
2103 }
2104 EXPORT_SYMBOL(blk_mq_init_allocated_queue);
2105
2106 void blk_mq_free_queue(struct request_queue *q)
2107 {
2108         struct blk_mq_tag_set   *set = q->tag_set;
2109
2110         mutex_lock(&all_q_mutex);
2111         list_del_init(&q->all_q_node);
2112         mutex_unlock(&all_q_mutex);
2113
2114         blk_mq_del_queue_tag_set(q);
2115
2116         blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
2117         blk_mq_free_hw_queues(q, set);
2118 }
2119
2120 /* Basically redo blk_mq_init_queue with queue frozen */
2121 static void blk_mq_queue_reinit(struct request_queue *q,
2122                                 const struct cpumask *online_mask)
2123 {
2124         WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2125
2126         blk_mq_sysfs_unregister(q);
2127
2128         /*
2129          * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2130          * we should change hctx numa_node according to new topology (this
2131          * involves free and re-allocate memory, worthy doing?)
2132          */
2133
2134         blk_mq_map_swqueue(q, online_mask);
2135
2136         blk_mq_sysfs_register(q);
2137 }
2138
2139 /*
2140  * New online cpumask which is going to be set in this hotplug event.
2141  * Declare this cpumask as a global, because cpu-hotplug operations are
2142  * invoked one-by-one and allocating it dynamically could fail.
2143  */
2144 static struct cpumask cpuhp_online_new;
2145
2146 static void blk_mq_queue_reinit_work(void)
2147 {
2148         struct request_queue *q;
2149
2150         mutex_lock(&all_q_mutex);
2151         /*
2152          * We need to freeze and reinit all existing queues.  Freezing
2153          * involves synchronous wait for an RCU grace period and doing it
2154          * one by one may take a long time.  Start freezing all queues in
2155          * one swoop and then wait for the completions so that freezing can
2156          * take place in parallel.
2157          */
2158         list_for_each_entry(q, &all_q_list, all_q_node)
2159                 blk_mq_freeze_queue_start(q);
2160         list_for_each_entry(q, &all_q_list, all_q_node) {
2161                 blk_mq_freeze_queue_wait(q);
2162
2163                 /*
2164                  * timeout handler can't touch hw queue during the
2165                  * reinitialization
2166                  */
2167                 del_timer_sync(&q->timeout);
2168         }
2169
2170         list_for_each_entry(q, &all_q_list, all_q_node)
2171                 blk_mq_queue_reinit(q, &cpuhp_online_new);
2172
2173         list_for_each_entry(q, &all_q_list, all_q_node)
2174                 blk_mq_unfreeze_queue(q);
2175
2176         mutex_unlock(&all_q_mutex);
2177 }
2178
2179 static int blk_mq_queue_reinit_dead(unsigned int cpu)
2180 {
2181         cpumask_copy(&cpuhp_online_new, cpu_online_mask);
2182         blk_mq_queue_reinit_work();
2183         return 0;
2184 }
2185
2186 /*
2187  * Before hotadded cpu starts handling requests, new mappings must be
2188  * established.  Otherwise, these requests in hw queue might never be
2189  * dispatched.
2190  *
2191  * For example, there is a single hw queue (hctx) and two CPU queues (ctx0
2192  * for CPU0, and ctx1 for CPU1).
2193  *
2194  * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list
2195  * and set bit0 in pending bitmap as ctx1->index_hw is still zero.
2196  *
2197  * And then while running hw queue, flush_busy_ctxs() finds bit0 is set in
2198  * pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list.
2199  * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list
2200  * is ignored.
2201  */
2202 static int blk_mq_queue_reinit_prepare(unsigned int cpu)
2203 {
2204         cpumask_copy(&cpuhp_online_new, cpu_online_mask);
2205         cpumask_set_cpu(cpu, &cpuhp_online_new);
2206         blk_mq_queue_reinit_work();
2207         return 0;
2208 }
2209
2210 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2211 {
2212         int i;
2213
2214         for (i = 0; i < set->nr_hw_queues; i++) {
2215                 set->tags[i] = blk_mq_init_rq_map(set, i);
2216                 if (!set->tags[i])
2217                         goto out_unwind;
2218         }
2219
2220         return 0;
2221
2222 out_unwind:
2223         while (--i >= 0)
2224                 blk_mq_free_rq_map(set, set->tags[i], i);
2225
2226         return -ENOMEM;
2227 }
2228
2229 /*
2230  * Allocate the request maps associated with this tag_set. Note that this
2231  * may reduce the depth asked for, if memory is tight. set->queue_depth
2232  * will be updated to reflect the allocated depth.
2233  */
2234 static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2235 {
2236         unsigned int depth;
2237         int err;
2238
2239         depth = set->queue_depth;
2240         do {
2241                 err = __blk_mq_alloc_rq_maps(set);
2242                 if (!err)
2243                         break;
2244
2245                 set->queue_depth >>= 1;
2246                 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
2247                         err = -ENOMEM;
2248                         break;
2249                 }
2250         } while (set->queue_depth);
2251
2252         if (!set->queue_depth || err) {
2253                 pr_err("blk-mq: failed to allocate request map\n");
2254                 return -ENOMEM;
2255         }
2256
2257         if (depth != set->queue_depth)
2258                 pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
2259                                                 depth, set->queue_depth);
2260
2261         return 0;
2262 }
2263
2264 /*
2265  * Alloc a tag set to be associated with one or more request queues.
2266  * May fail with EINVAL for various error conditions. May adjust the
2267  * requested depth down, if if it too large. In that case, the set
2268  * value will be stored in set->queue_depth.
2269  */
2270 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2271 {
2272         int ret;
2273
2274         BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
2275
2276         if (!set->nr_hw_queues)
2277                 return -EINVAL;
2278         if (!set->queue_depth)
2279                 return -EINVAL;
2280         if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
2281                 return -EINVAL;
2282
2283         if (!set->ops->queue_rq)
2284                 return -EINVAL;
2285
2286         if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
2287                 pr_info("blk-mq: reduced tag depth to %u\n",
2288                         BLK_MQ_MAX_DEPTH);
2289                 set->queue_depth = BLK_MQ_MAX_DEPTH;
2290         }
2291
2292         /*
2293          * If a crashdump is active, then we are potentially in a very
2294          * memory constrained environment. Limit us to 1 queue and
2295          * 64 tags to prevent using too much memory.
2296          */
2297         if (is_kdump_kernel()) {
2298                 set->nr_hw_queues = 1;
2299                 set->queue_depth = min(64U, set->queue_depth);
2300         }
2301         /*
2302          * There is no use for more h/w queues than cpus.
2303          */
2304         if (set->nr_hw_queues > nr_cpu_ids)
2305                 set->nr_hw_queues = nr_cpu_ids;
2306
2307         set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
2308                                  GFP_KERNEL, set->numa_node);
2309         if (!set->tags)
2310                 return -ENOMEM;
2311
2312         ret = -ENOMEM;
2313         set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
2314                         GFP_KERNEL, set->numa_node);
2315         if (!set->mq_map)
2316                 goto out_free_tags;
2317
2318         if (set->ops->map_queues)
2319                 ret = set->ops->map_queues(set);
2320         else
2321                 ret = blk_mq_map_queues(set);
2322         if (ret)
2323                 goto out_free_mq_map;
2324
2325         ret = blk_mq_alloc_rq_maps(set);
2326         if (ret)
2327                 goto out_free_mq_map;
2328
2329         mutex_init(&set->tag_list_lock);
2330         INIT_LIST_HEAD(&set->tag_list);
2331
2332         return 0;
2333
2334 out_free_mq_map:
2335         kfree(set->mq_map);
2336         set->mq_map = NULL;
2337 out_free_tags:
2338         kfree(set->tags);
2339         set->tags = NULL;
2340         return ret;
2341 }
2342 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
2343
2344 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2345 {
2346         int i;
2347
2348         for (i = 0; i < nr_cpu_ids; i++) {
2349                 if (set->tags[i])
2350                         blk_mq_free_rq_map(set, set->tags[i], i);
2351         }
2352
2353         kfree(set->mq_map);
2354         set->mq_map = NULL;
2355
2356         kfree(set->tags);
2357         set->tags = NULL;
2358 }
2359 EXPORT_SYMBOL(blk_mq_free_tag_set);
2360
2361 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2362 {
2363         struct blk_mq_tag_set *set = q->tag_set;
2364         struct blk_mq_hw_ctx *hctx;
2365         int i, ret;
2366
2367         if (!set || nr > set->queue_depth)
2368                 return -EINVAL;
2369
2370         ret = 0;
2371         queue_for_each_hw_ctx(q, hctx, i) {
2372                 if (!hctx->tags)
2373                         continue;
2374                 ret = blk_mq_tag_update_depth(hctx->tags, nr);
2375                 if (ret)
2376                         break;
2377         }
2378
2379         if (!ret)
2380                 q->nr_requests = nr;
2381
2382         return ret;
2383 }
2384
2385 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
2386 {
2387         struct request_queue *q;
2388
2389         if (nr_hw_queues > nr_cpu_ids)
2390                 nr_hw_queues = nr_cpu_ids;
2391         if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
2392                 return;
2393
2394         list_for_each_entry(q, &set->tag_list, tag_set_list)
2395                 blk_mq_freeze_queue(q);
2396
2397         set->nr_hw_queues = nr_hw_queues;
2398         list_for_each_entry(q, &set->tag_list, tag_set_list) {
2399                 blk_mq_realloc_hw_ctxs(set, q);
2400
2401                 if (q->nr_hw_queues > 1)
2402                         blk_queue_make_request(q, blk_mq_make_request);
2403                 else
2404                         blk_queue_make_request(q, blk_sq_make_request);
2405
2406                 blk_mq_queue_reinit(q, cpu_online_mask);
2407         }
2408
2409         list_for_each_entry(q, &set->tag_list, tag_set_list)
2410                 blk_mq_unfreeze_queue(q);
2411 }
2412 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
2413
2414 void blk_mq_disable_hotplug(void)
2415 {
2416         mutex_lock(&all_q_mutex);
2417 }
2418
2419 void blk_mq_enable_hotplug(void)
2420 {
2421         mutex_unlock(&all_q_mutex);
2422 }
2423
2424 static int __init blk_mq_init(void)
2425 {
2426         cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
2427                                 blk_mq_hctx_notify_dead);
2428
2429         cpuhp_setup_state_nocalls(CPUHP_BLK_MQ_PREPARE, "block/mq:prepare",
2430                                   blk_mq_queue_reinit_prepare,
2431                                   blk_mq_queue_reinit_dead);
2432         return 0;
2433 }
2434 subsys_initcall(blk_mq_init);