block/blk-mq.c

   1 #include <linux/kernel.h>
   2 #include <linux/module.h>
   3 #include <linux/backing-dev.h>
   4 #include <linux/bio.h>
   5 #include <linux/blkdev.h>
   6 #include <linux/mm.h>
   7 #include <linux/init.h>
   8 #include <linux/slab.h>
   9 #include <linux/workqueue.h>
  10 #include <linux/smp.h>
  11 #include <linux/llist.h>
  12 #include <linux/list_sort.h>
  13 #include <linux/cpu.h>
  14 #include <linux/cache.h>
  15 #include <linux/sched/sysctl.h>
  16 #include <linux/delay.h>
  17
  18 #include <trace/events/block.h>
  19
  20 #include <linux/blk-mq.h>
  21 #include "blk.h"
  22 #include "blk-mq.h"
  23 #include "blk-mq-tag.h"
  24
/*
 * File-wide mutex and list head. NOTE(review): presumably all_q_mutex
 * serializes access to all_q_list; their users are outside this view.
 */
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

/* Forward declaration: the request allocator calls this before its definition. */
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
  29
  30 static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
  31                                            unsigned int cpu)
  32 {
  33         return per_cpu_ptr(q->queue_ctx, cpu);
  34 }
  35
  36 /*
  37  * This assumes per-cpu software queueing queues. They could be per-node
  38  * as well, for instance. For now this is hardcoded as-is. Note that we don't
  39  * care about preemption, since we know the ctx's are persistent. This does
  40  * mean that we can't rely on ctx always matching the currently running CPU.
  41  */
  42 static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
  43 {
  44         return __blk_mq_get_ctx(q, get_cpu());
  45 }
  46
  47 static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
  48 {
  49         put_cpu();
  50 }
  51
  52 /*
  53  * Check if any of the ctx's have pending work in this hardware queue
  54  */
  55 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
  56 {
  57         unsigned int i;
  58
  59         for (i = 0; i < hctx->nr_ctx_map; i++)
  60                 if (hctx->ctx_map[i])
  61                         return true;
  62
  63         return false;
  64 }
  65
  66 /*
  67  * Mark this ctx as having pending work in this hardware queue
  68  */
  69 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
  70                                      struct blk_mq_ctx *ctx)
  71 {
  72         if (!test_bit(ctx->index_hw, hctx->ctx_map))
  73                 set_bit(ctx->index_hw, hctx->ctx_map);
  74 }
  75
  76 static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
  77                                               struct blk_mq_ctx *ctx,
  78                                               gfp_t gfp, bool reserved)
  79 {
  80         struct request *rq;
  81         unsigned int tag;
  82
  83         tag = blk_mq_get_tag(hctx->tags, hctx, &ctx->last_tag, gfp, reserved);
  84         if (tag != BLK_MQ_TAG_FAIL) {
  85                 rq = hctx->tags->rqs[tag];
  86                 rq->tag = tag;
  87                 return rq;
  88         }
  89
  90         return NULL;
  91 }
  92
  93 static int blk_mq_queue_enter(struct request_queue *q)
  94 {
  95         int ret;
  96
  97         __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
  98         smp_wmb();
  99         /* we have problems to freeze the queue if it's initializing */
 100         if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
 101                 return 0;
 102
 103         __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
 104
 105         spin_lock_irq(q->queue_lock);
 106         ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
 107                 !blk_queue_bypass(q) || blk_queue_dying(q),
 108                 *q->queue_lock);
 109         /* inc usage with lock hold to avoid freeze_queue runs here */
 110         if (!ret && !blk_queue_dying(q))
 111                 __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
 112         else if (blk_queue_dying(q))
 113                 ret = -ENODEV;
 114         spin_unlock_irq(q->queue_lock);
 115
 116         return ret;
 117 }
 118
 119 static void blk_mq_queue_exit(struct request_queue *q)
 120 {
 121         __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
 122 }
 123
 124 static void __blk_mq_drain_queue(struct request_queue *q)
 125 {
 126         while (true) {
 127                 s64 count;
 128
 129                 spin_lock_irq(q->queue_lock);
 130                 count = percpu_counter_sum(&q->mq_usage_counter);
 131                 spin_unlock_irq(q->queue_lock);
 132
 133                 if (count == 0)
 134                         break;
 135                 blk_mq_run_queues(q, false);
 136                 msleep(10);
 137         }
 138 }
 139
 140 /*
 141  * Guarantee no request is in use, so we can change any data structure of
 142  * the queue afterward.
 143  */
 144 static void blk_mq_freeze_queue(struct request_queue *q)
 145 {
 146         bool drain;
 147
 148         spin_lock_irq(q->queue_lock);
 149         drain = !q->bypass_depth++;
 150         queue_flag_set(QUEUE_FLAG_BYPASS, q);
 151         spin_unlock_irq(q->queue_lock);
 152
 153         if (drain)
 154                 __blk_mq_drain_queue(q);
 155 }
 156
 157 void blk_mq_drain_queue(struct request_queue *q)
 158 {
 159         __blk_mq_drain_queue(q);
 160 }
 161
 162 static void blk_mq_unfreeze_queue(struct request_queue *q)
 163 {
 164         bool wake = false;
 165
 166         spin_lock_irq(q->queue_lock);
 167         if (!--q->bypass_depth) {
 168                 queue_flag_clear(QUEUE_FLAG_BYPASS, q);
 169                 wake = true;
 170         }
 171         WARN_ON_ONCE(q->bypass_depth < 0);
 172         spin_unlock_irq(q->queue_lock);
 173         if (wake)
 174                 wake_up_all(&q->mq_freeze_wq);
 175 }
 176
 177 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 178 {
 179         return blk_mq_has_free_tags(hctx->tags);
 180 }
 181 EXPORT_SYMBOL(blk_mq_can_queue);
 182
 183 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 184                                struct request *rq, unsigned int rw_flags)
 185 {
 186         if (blk_queue_io_stat(q))
 187                 rw_flags |= REQ_IO_STAT;
 188
 189         INIT_LIST_HEAD(&rq->queuelist);
 190         /* csd/requeue_work/fifo_time is initialized before use */
 191         rq->q = q;
 192         rq->mq_ctx = ctx;
 193         rq->cmd_flags = rw_flags;
 194         rq->cmd_type = 0;
 195         /* do not touch atomic flags, it needs atomic ops against the timer */
 196         rq->cpu = -1;
 197         rq->__data_len = 0;
 198         rq->__sector = (sector_t) -1;
 199         rq->bio = NULL;
 200         rq->biotail = NULL;
 201         INIT_HLIST_NODE(&rq->hash);
 202         RB_CLEAR_NODE(&rq->rb_node);
 203         memset(&rq->flush, 0, max(sizeof(rq->flush), sizeof(rq->elv)));
 204         rq->rq_disk = NULL;
 205         rq->part = NULL;
 206         rq->start_time = jiffies;
 207 #ifdef CONFIG_BLK_CGROUP
 208         rq->rl = NULL;
 209         set_start_time_ns(rq);
 210         rq->io_start_time_ns = 0;
 211 #endif
 212         rq->nr_phys_segments = 0;
 213 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 214         rq->nr_integrity_segments = 0;
 215 #endif
 216         rq->ioprio = 0;
 217         rq->special = NULL;
 218         /* tag was already set */
 219         rq->errors = 0;
 220         memset(rq->__cmd, 0, sizeof(rq->__cmd));
 221         rq->cmd = rq->__cmd;
 222         rq->cmd_len = BLK_MAX_CDB;
 223
 224         rq->extra_len = 0;
 225         rq->sense_len = 0;
 226         rq->resid_len = 0;
 227         rq->sense = NULL;
 228
 229         rq->deadline = 0;
 230         INIT_LIST_HEAD(&rq->timeout_list);
 231         rq->timeout = 0;
 232         rq->retries = 0;
 233         rq->end_io = NULL;
 234         rq->end_io_data = NULL;
 235         rq->next_rq = NULL;
 236
 237         ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
 238 }
 239
 240 static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
 241                                                    int rw, gfp_t gfp,
 242                                                    bool reserved)
 243 {
 244         struct request *rq;
 245
 246         do {
 247                 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
 248                 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
 249
 250                 rq = __blk_mq_alloc_request(hctx, ctx, gfp & ~__GFP_WAIT,
 251                                                 reserved);
 252                 if (rq) {
 253                         blk_mq_rq_ctx_init(q, ctx, rq, rw);
 254                         break;
 255                 }
 256
 257                 if (gfp & __GFP_WAIT) {
 258                         __blk_mq_run_hw_queue(hctx);
 259                         blk_mq_put_ctx(ctx);
 260                 } else {
 261                         blk_mq_put_ctx(ctx);
 262                         break;
 263                 }
 264
 265                 blk_mq_wait_for_tags(hctx->tags, hctx, reserved);
 266         } while (1);
 267
 268         return rq;
 269 }
 270
 271 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp)
 272 {
 273         struct request *rq;
 274
 275         if (blk_mq_queue_enter(q))
 276                 return NULL;
 277
 278         rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
 279         if (rq)
 280                 blk_mq_put_ctx(rq->mq_ctx);
 281         return rq;
 282 }
 283 EXPORT_SYMBOL(blk_mq_alloc_request);
 284
 285 struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
 286                                               gfp_t gfp)
 287 {
 288         struct request *rq;
 289
 290         if (blk_mq_queue_enter(q))
 291                 return NULL;
 292
 293         rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
 294         if (rq)
 295                 blk_mq_put_ctx(rq->mq_ctx);
 296         return rq;
 297 }
 298 EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
 299
 300 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 301                                   struct blk_mq_ctx *ctx, struct request *rq)
 302 {
 303         const int tag = rq->tag;
 304         struct request_queue *q = rq->q;
 305
 306         clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 307         blk_mq_put_tag(hctx->tags, tag, &ctx->last_tag);
 308         blk_mq_queue_exit(q);
 309 }
 310
 311 void blk_mq_free_request(struct request *rq)
 312 {
 313         struct blk_mq_ctx *ctx = rq->mq_ctx;
 314         struct blk_mq_hw_ctx *hctx;
 315         struct request_queue *q = rq->q;
 316
 317         ctx->rq_completed[rq_is_sync(rq)]++;
 318
 319         hctx = q->mq_ops->map_queue(q, ctx->cpu);
 320         __blk_mq_free_request(hctx, ctx, rq);
 321 }
 322
 323 /*
 324  * Clone all relevant state from a request that has been put on hold in
 325  * the flush state machine into the preallocated flush request that hangs
 326  * off the request queue.
 327  *
 328  * For a driver the flush request should be invisible, that's why we are
 329  * impersonating the original request here.
 330  */
 331 void blk_mq_clone_flush_request(struct request *flush_rq,
 332                 struct request *orig_rq)
 333 {
 334         struct blk_mq_hw_ctx *hctx =
 335                 orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu);
 336
 337         flush_rq->mq_ctx = orig_rq->mq_ctx;
 338         flush_rq->tag = orig_rq->tag;
 339         memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq),
 340                 hctx->cmd_size);
 341 }
 342
 343 inline void __blk_mq_end_io(struct request *rq, int error)
 344 {
 345         blk_account_io_done(rq);
 346
 347         if (rq->end_io) {
 348                 rq->end_io(rq, error);
 349         } else {
 350                 if (unlikely(blk_bidi_rq(rq)))
 351                         blk_mq_free_request(rq->next_rq);
 352                 blk_mq_free_request(rq);
 353         }
 354 }
 355 EXPORT_SYMBOL(__blk_mq_end_io);
 356
 357 void blk_mq_end_io(struct request *rq, int error)
 358 {
 359         if (blk_update_request(rq, error, blk_rq_bytes(rq)))
 360                 BUG();
 361         __blk_mq_end_io(rq, error);
 362 }
 363 EXPORT_SYMBOL(blk_mq_end_io);
 364
 365 static void __blk_mq_complete_request_remote(void *data)
 366 {
 367         struct request *rq = data;
 368
 369         rq->q->softirq_done_fn(rq);
 370 }
 371
 372 void __blk_mq_complete_request(struct request *rq)
 373 {
 374         struct blk_mq_ctx *ctx = rq->mq_ctx;
 375         bool shared = false;
 376         int cpu;
 377
 378         if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
 379                 rq->q->softirq_done_fn(rq);
 380                 return;
 381         }
 382
 383         cpu = get_cpu();
 384         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
 385                 shared = cpus_share_cache(cpu, ctx->cpu);
 386
 387         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
 388                 rq->csd.func = __blk_mq_complete_request_remote;
 389                 rq->csd.info = rq;
 390                 rq->csd.flags = 0;
 391                 smp_call_function_single_async(ctx->cpu, &rq->csd);
 392         } else {
 393                 rq->q->softirq_done_fn(rq);
 394         }
 395         put_cpu();
 396 }
 397
 398 /**
 399  * blk_mq_complete_request - end I/O on a request
 400  * @rq:         the request being processed
 401  *
 402  * Description:
 403  *      Ends all I/O on a request. It does not handle partial completions.
 404  *      The actual completion happens out-of-order, through a IPI handler.
 405  **/
 406 void blk_mq_complete_request(struct request *rq)
 407 {
 408         if (unlikely(blk_should_fake_timeout(rq->q)))
 409                 return;
 410         if (!blk_mark_rq_complete(rq))
 411                 __blk_mq_complete_request(rq);
 412 }
 413 EXPORT_SYMBOL(blk_mq_complete_request);
 414
 415 static void blk_mq_start_request(struct request *rq, bool last)
 416 {
 417         struct request_queue *q = rq->q;
 418
 419         trace_block_rq_issue(q, rq);
 420
 421         rq->resid_len = blk_rq_bytes(rq);
 422         if (unlikely(blk_bidi_rq(rq)))
 423                 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
 424
 425         /*
 426          * Just mark start time and set the started bit. Due to memory
 427          * ordering, we know we'll see the correct deadline as long as
 428          * REQ_ATOMIC_STARTED is seen.
 429          */
 430         rq->deadline = jiffies + q->rq_timeout;
 431
 432         /*
 433          * Mark us as started and clear complete. Complete might have been
 434          * set if requeue raced with timeout, which then marked it as
 435          * complete. So be sure to clear complete again when we start
 436          * the request, otherwise we'll ignore the completion event.
 437          */
 438         set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 439         clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
 440
 441         if (q->dma_drain_size && blk_rq_bytes(rq)) {
 442                 /*
 443                  * Make sure space for the drain appears.  We know we can do
 444                  * this because max_hw_segments has been adjusted to be one
 445                  * fewer than the device can handle.
 446                  */
 447                 rq->nr_phys_segments++;
 448         }
 449
 450         /*
 451          * Flag the last request in the series so that drivers know when IO
 452          * should be kicked off, if they don't do it on a per-request basis.
 453          *
 454          * Note: the flag isn't the only condition drivers should do kick off.
 455          * If drive is busy, the last request might not have the bit set.
 456          */
 457         if (last)
 458                 rq->cmd_flags |= REQ_END;
 459 }
 460
 461 static void __blk_mq_requeue_request(struct request *rq)
 462 {
 463         struct request_queue *q = rq->q;
 464
 465         trace_block_rq_requeue(q, rq);
 466         clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 467
 468         rq->cmd_flags &= ~REQ_END;
 469
 470         if (q->dma_drain_size && blk_rq_bytes(rq))
 471                 rq->nr_phys_segments--;
 472 }
 473
 474 void blk_mq_requeue_request(struct request *rq)
 475 {
 476         __blk_mq_requeue_request(rq);
 477         blk_clear_rq_complete(rq);
 478
 479         BUG_ON(blk_queued_rq(rq));
 480         blk_mq_insert_request(rq, true, true, false);
 481 }
 482 EXPORT_SYMBOL(blk_mq_requeue_request);
 483
 484 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 485 {
 486         return tags->rqs[tag];
 487 }
 488 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 489
/*
 * Argument bundle passed through blk_mq_tag_busy_iter() to
 * blk_mq_timeout_check().
 */
struct blk_mq_timeout_data {
        struct blk_mq_hw_ctx *hctx;     /* hardware queue being scanned */
        unsigned long *next;            /* forwarded to blk_rq_check_expired() */
        unsigned int *next_set;         /* forwarded to blk_rq_check_expired() */
};
 495
 496 static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
 497 {
 498         struct blk_mq_timeout_data *data = __data;
 499         struct blk_mq_hw_ctx *hctx = data->hctx;
 500         unsigned int tag;
 501
 502          /* It may not be in flight yet (this is where
 503          * the REQ_ATOMIC_STARTED flag comes in). The requests are
 504          * statically allocated, so we know it's always safe to access the
 505          * memory associated with a bit offset into ->rqs[].
 506          */
 507         tag = 0;
 508         do {
 509                 struct request *rq;
 510
 511                 tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag);
 512                 if (tag >= hctx->tags->nr_tags)
 513                         break;
 514
 515                 rq = blk_mq_tag_to_rq(hctx->tags, tag++);
 516                 if (rq->q != hctx->queue)
 517                         continue;
 518                 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 519                         continue;
 520
 521                 blk_rq_check_expired(rq, data->next, data->next_set);
 522         } while (1);
 523 }
 524
 525 static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
 526                                         unsigned long *next,
 527                                         unsigned int *next_set)
 528 {
 529         struct blk_mq_timeout_data data = {
 530                 .hctx           = hctx,
 531                 .next           = next,
 532                 .next_set       = next_set,
 533         };
 534
 535         /*
 536          * Ask the tagging code to iterate busy requests, so we can
 537          * check them for timeout.
 538          */
 539         blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
 540 }
 541
 542 static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq)
 543 {
 544         struct request_queue *q = rq->q;
 545
 546         /*
 547          * We know that complete is set at this point. If STARTED isn't set
 548          * anymore, then the request isn't active and the "timeout" should
 549          * just be ignored. This can happen due to the bitflag ordering.
 550          * Timeout first checks if STARTED is set, and if it is, assumes
 551          * the request is active. But if we race with completion, then
 552          * we both flags will get cleared. So check here again, and ignore
 553          * a timeout event with a request that isn't active.
 554          */
 555         if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 556                 return BLK_EH_NOT_HANDLED;
 557
 558         if (!q->mq_ops->timeout)
 559                 return BLK_EH_RESET_TIMER;
 560
 561         return q->mq_ops->timeout(rq);
 562 }
 563
 564 static void blk_mq_rq_timer(unsigned long data)
 565 {
 566         struct request_queue *q = (struct request_queue *) data;
 567         struct blk_mq_hw_ctx *hctx;
 568         unsigned long next = 0;
 569         int i, next_set = 0;
 570
 571         queue_for_each_hw_ctx(q, hctx, i)
 572                 blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
 573
 574         if (next_set)
 575                 mod_timer(&q->timeout, round_jiffies_up(next));
 576 }
 577
 578 /*
 579  * Reverse check our software queue for entries that we could potentially
 580  * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 581  * too much time checking for merges.
 582  */
 583 static bool blk_mq_attempt_merge(struct request_queue *q,
 584                                  struct blk_mq_ctx *ctx, struct bio *bio)
 585 {
 586         struct request *rq;
 587         int checked = 8;
 588
 589         list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
 590                 int el_ret;
 591
 592                 if (!checked--)
 593                         break;
 594
 595                 if (!blk_rq_merge_ok(rq, bio))
 596                         continue;
 597
 598                 el_ret = blk_try_merge(rq, bio);
 599                 if (el_ret == ELEVATOR_BACK_MERGE) {
 600                         if (bio_attempt_back_merge(q, rq, bio)) {
 601                                 ctx->rq_merged++;
 602                                 return true;
 603                         }
 604                         break;
 605                 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
 606                         if (bio_attempt_front_merge(q, rq, bio)) {
 607                                 ctx->rq_merged++;
 608                                 return true;
 609                         }
 610                         break;
 611                 }
 612         }
 613
 614         return false;
 615 }
 616
 617 /*
 618  * Run this hardware queue, pulling any software queues mapped to it in.
 619  * Note that this function currently has various problems around ordering
 620  * of IO. In particular, we'd like FIFO behaviour on handling existing
 621  * items on the hctx->dispatch list. Ignore that for now.
 622  */
 623 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 624 {
 625         struct request_queue *q = hctx->queue;
 626         struct blk_mq_ctx *ctx;
 627         struct request *rq;
 628         LIST_HEAD(rq_list);
 629         int bit, queued;
 630
 631         WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
 632
 633         if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
 634                 return;
 635
 636         hctx->run++;
 637
 638         /*
 639          * Touch any software queue that has pending entries.
 640          */
 641         for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
 642                 clear_bit(bit, hctx->ctx_map);
 643                 ctx = hctx->ctxs[bit];
 644
 645                 spin_lock(&ctx->lock);
 646                 list_splice_tail_init(&ctx->rq_list, &rq_list);
 647                 spin_unlock(&ctx->lock);
 648         }
 649
 650         /*
 651          * If we have previous entries on our dispatch list, grab them
 652          * and stuff them at the front for more fair dispatch.
 653          */
 654         if (!list_empty_careful(&hctx->dispatch)) {
 655                 spin_lock(&hctx->lock);
 656                 if (!list_empty(&hctx->dispatch))
 657                         list_splice_init(&hctx->dispatch, &rq_list);
 658                 spin_unlock(&hctx->lock);
 659         }
 660
 661         /*
 662          * Delete and return all entries from our dispatch list
 663          */
 664         queued = 0;
 665
 666         /*
 667          * Now process all the entries, sending them to the driver.
 668          */
 669         while (!list_empty(&rq_list)) {
 670                 int ret;
 671
 672                 rq = list_first_entry(&rq_list, struct request, queuelist);
 673                 list_del_init(&rq->queuelist);
 674
 675                 blk_mq_start_request(rq, list_empty(&rq_list));
 676
 677                 ret = q->mq_ops->queue_rq(hctx, rq);
 678                 switch (ret) {
 679                 case BLK_MQ_RQ_QUEUE_OK:
 680                         queued++;
 681                         continue;
 682                 case BLK_MQ_RQ_QUEUE_BUSY:
 683                         list_add(&rq->queuelist, &rq_list);
 684                         __blk_mq_requeue_request(rq);
 685                         break;
 686                 default:
 687                         pr_err("blk-mq: bad return on queue: %d\n", ret);
 688                 case BLK_MQ_RQ_QUEUE_ERROR:
 689                         rq->errors = -EIO;
 690                         blk_mq_end_io(rq, rq->errors);
 691                         break;
 692                 }
 693
 694                 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
 695                         break;
 696         }
 697
 698         if (!queued)
 699                 hctx->dispatched[0]++;
 700         else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
 701                 hctx->dispatched[ilog2(queued) + 1]++;
 702
 703         /*
 704          * Any items that need requeuing? Stuff them into hctx->dispatch,
 705          * that is where we will continue on next queue run.
 706          */
 707         if (!list_empty(&rq_list)) {
 708                 spin_lock(&hctx->lock);
 709                 list_splice(&rq_list, &hctx->dispatch);
 710                 spin_unlock(&hctx->lock);
 711         }
 712 }
 713
 714 /*
 715  * It'd be great if the workqueue API had a way to pass
 716  * in a mask and had some smarts for more clever placement.
 717  * For now we just round-robin here, switching for every
 718  * BLK_MQ_CPU_WORK_BATCH queued items.
 719  */
 720 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 721 {
 722         int cpu = hctx->next_cpu;
 723
 724         if (--hctx->next_cpu_batch <= 0) {
 725                 int next_cpu;
 726
 727                 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
 728                 if (next_cpu >= nr_cpu_ids)
 729                         next_cpu = cpumask_first(hctx->cpumask);
 730
 731                 hctx->next_cpu = next_cpu;
 732                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 733         }
 734
 735         return cpu;
 736 }
 737
 738 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 739 {
 740         if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
 741                 return;
 742
 743         if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
 744                 __blk_mq_run_hw_queue(hctx);
 745         else if (hctx->queue->nr_hw_queues == 1)
 746                 kblockd_schedule_delayed_work(&hctx->run_work, 0);
 747         else {
 748                 unsigned int cpu;
 749
 750                 cpu = blk_mq_hctx_next_cpu(hctx);
 751                 kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
 752         }
 753 }
 754
 755 void blk_mq_run_queues(struct request_queue *q, bool async)
 756 {
 757         struct blk_mq_hw_ctx *hctx;
 758         int i;
 759
 760         queue_for_each_hw_ctx(q, hctx, i) {
 761                 if ((!blk_mq_hctx_has_pending(hctx) &&
 762                     list_empty_careful(&hctx->dispatch)) ||
 763                     test_bit(BLK_MQ_S_STOPPED, &hctx->state))
 764                         continue;
 765
 766                 preempt_disable();
 767                 blk_mq_run_hw_queue(hctx, async);
 768                 preempt_enable();
 769         }
 770 }
 771 EXPORT_SYMBOL(blk_mq_run_queues);
 772
 773 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 774 {
 775         cancel_delayed_work(&hctx->run_work);
 776         cancel_delayed_work(&hctx->delay_work);
 777         set_bit(BLK_MQ_S_STOPPED, &hctx->state);
 778 }
 779 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
 780
 781 void blk_mq_stop_hw_queues(struct request_queue *q)
 782 {
 783         struct blk_mq_hw_ctx *hctx;
 784         int i;
 785
 786         queue_for_each_hw_ctx(q, hctx, i)
 787                 blk_mq_stop_hw_queue(hctx);
 788 }
 789 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
 790
 791 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 792 {
 793         clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 794
 795         preempt_disable();
 796         __blk_mq_run_hw_queue(hctx);
 797         preempt_enable();
 798 }
 799 EXPORT_SYMBOL(blk_mq_start_hw_queue);
 800
 801 void blk_mq_start_hw_queues(struct request_queue *q)
 802 {
 803         struct blk_mq_hw_ctx *hctx;
 804         int i;
 805
 806         queue_for_each_hw_ctx(q, hctx, i)
 807                 blk_mq_start_hw_queue(hctx);
 808 }
 809 EXPORT_SYMBOL(blk_mq_start_hw_queues);
 810
 811
 812 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
 813 {
 814         struct blk_mq_hw_ctx *hctx;
 815         int i;
 816
 817         queue_for_each_hw_ctx(q, hctx, i) {
 818                 if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
 819                         continue;
 820
 821                 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 822                 preempt_disable();
 823                 blk_mq_run_hw_queue(hctx, async);
 824                 preempt_enable();
 825         }
 826 }
 827 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
 828
 829 static void blk_mq_run_work_fn(struct work_struct *work)
 830 {
 831         struct blk_mq_hw_ctx *hctx;
 832
 833         hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
 834
 835         __blk_mq_run_hw_queue(hctx);
 836 }
 837
 838 static void blk_mq_delay_work_fn(struct work_struct *work)
 839 {
 840         struct blk_mq_hw_ctx *hctx;
 841
 842         hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
 843
 844         if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
 845                 __blk_mq_run_hw_queue(hctx);
 846 }
 847
 848 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 849 {
 850         unsigned long tmo = msecs_to_jiffies(msecs);
 851
 852         if (hctx->queue->nr_hw_queues == 1)
 853                 kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
 854         else {
 855                 unsigned int cpu;
 856
 857                 cpu = blk_mq_hctx_next_cpu(hctx);
 858                 kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
 859         }
 860 }
 861 EXPORT_SYMBOL(blk_mq_delay_queue);
 862
 863 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
 864                                     struct request *rq, bool at_head)
 865 {
 866         struct blk_mq_ctx *ctx = rq->mq_ctx;
 867
 868         trace_block_rq_insert(hctx->queue, rq);
 869
 870         if (at_head)
 871                 list_add(&rq->queuelist, &ctx->rq_list);
 872         else
 873                 list_add_tail(&rq->queuelist, &ctx->rq_list);
 874
 875         blk_mq_hctx_mark_pending(hctx, ctx);
 876
 877         /*
 878          * We do this early, to ensure we are on the right CPU.
 879          */
 880         blk_add_timer(rq);
 881 }
 882
 883 void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
 884                 bool async)
 885 {
 886         struct request_queue *q = rq->q;
 887         struct blk_mq_hw_ctx *hctx;
 888         struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;
 889
 890         current_ctx = blk_mq_get_ctx(q);
 891         if (!cpu_online(ctx->cpu))
 892                 rq->mq_ctx = ctx = current_ctx;
 893
 894         hctx = q->mq_ops->map_queue(q, ctx->cpu);
 895
 896         if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA) &&
 897             !(rq->cmd_flags & (REQ_FLUSH_SEQ))) {
 898                 blk_insert_flush(rq);
 899         } else {
 900                 spin_lock(&ctx->lock);
 901                 __blk_mq_insert_request(hctx, rq, at_head);
 902                 spin_unlock(&ctx->lock);
 903         }
 904
 905         if (run_queue)
 906                 blk_mq_run_hw_queue(hctx, async);
 907
 908         blk_mq_put_ctx(current_ctx);
 909 }
 910
 911 static void blk_mq_insert_requests(struct request_queue *q,
 912                                      struct blk_mq_ctx *ctx,
 913                                      struct list_head *list,
 914                                      int depth,
 915                                      bool from_schedule)
 916
 917 {
 918         struct blk_mq_hw_ctx *hctx;
 919         struct blk_mq_ctx *current_ctx;
 920
 921         trace_block_unplug(q, depth, !from_schedule);
 922
 923         current_ctx = blk_mq_get_ctx(q);
 924
 925         if (!cpu_online(ctx->cpu))
 926                 ctx = current_ctx;
 927         hctx = q->mq_ops->map_queue(q, ctx->cpu);
 928
 929         /*
 930          * preemption doesn't flush plug list, so it's possible ctx->cpu is
 931          * offline now
 932          */
 933         spin_lock(&ctx->lock);
 934         while (!list_empty(list)) {
 935                 struct request *rq;
 936
 937                 rq = list_first_entry(list, struct request, queuelist);
 938                 list_del_init(&rq->queuelist);
 939                 rq->mq_ctx = ctx;
 940                 __blk_mq_insert_request(hctx, rq, false);
 941         }
 942         spin_unlock(&ctx->lock);
 943
 944         blk_mq_run_hw_queue(hctx, from_schedule);
 945         blk_mq_put_ctx(current_ctx);
 946 }
 947
 948 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
 949 {
 950         struct request *rqa = container_of(a, struct request, queuelist);
 951         struct request *rqb = container_of(b, struct request, queuelist);
 952
 953         return !(rqa->mq_ctx < rqb->mq_ctx ||
 954                  (rqa->mq_ctx == rqb->mq_ctx &&
 955                   blk_rq_pos(rqa) < blk_rq_pos(rqb)));
 956 }
 957
 958 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 959 {
 960         struct blk_mq_ctx *this_ctx;
 961         struct request_queue *this_q;
 962         struct request *rq;
 963         LIST_HEAD(list);
 964         LIST_HEAD(ctx_list);
 965         unsigned int depth;
 966
 967         list_splice_init(&plug->mq_list, &list);
 968
 969         list_sort(NULL, &list, plug_ctx_cmp);
 970
 971         this_q = NULL;
 972         this_ctx = NULL;
 973         depth = 0;
 974
 975         while (!list_empty(&list)) {
 976                 rq = list_entry_rq(list.next);
 977                 list_del_init(&rq->queuelist);
 978                 BUG_ON(!rq->q);
 979                 if (rq->mq_ctx != this_ctx) {
 980                         if (this_ctx) {
 981                                 blk_mq_insert_requests(this_q, this_ctx,
 982                                                         &ctx_list, depth,
 983                                                         from_schedule);
 984                         }
 985
 986                         this_ctx = rq->mq_ctx;
 987                         this_q = rq->q;
 988                         depth = 0;
 989                 }
 990
 991                 depth++;
 992                 list_add_tail(&rq->queuelist, &ctx_list);
 993         }
 994
 995         /*
 996          * If 'this_ctx' is set, we know we have entries to complete
 997          * on 'ctx_list'. Do those.
 998          */
 999         if (this_ctx) {
1000                 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
1001                                        from_schedule);
1002         }
1003 }
1004
/*
 * Attach @bio to the already-allocated request @rq: initialise the request
 * from the bio, then start I/O accounting for it.
 */
1005 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1006 {
1007         init_request_from_bio(rq, bio);
1008         blk_account_io_start(rq, 1);
1009 }
1010
1011 static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1012 {
1013         struct blk_mq_hw_ctx *hctx;
1014         struct blk_mq_ctx *ctx;
1015         const int is_sync = rw_is_sync(bio->bi_rw);
1016         const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1017         int rw = bio_data_dir(bio);
1018         struct request *rq;
1019         unsigned int use_plug, request_count = 0;
1020
1021         /*
1022          * If we have multiple hardware queues, just go directly to
1023          * one of those for sync IO.
1024          */
1025         use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync);
1026
1027         blk_queue_bounce(q, &bio);
1028
1029         if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1030                 bio_endio(bio, -EIO);
1031                 return;
1032         }
1033
1034         if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
1035                 return;
1036
1037         if (blk_mq_queue_enter(q)) {
1038                 bio_endio(bio, -EIO);
1039                 return;
1040         }
1041
1042         ctx = blk_mq_get_ctx(q);
1043         hctx = q->mq_ops->map_queue(q, ctx->cpu);
1044
1045         if (is_sync)
1046                 rw |= REQ_SYNC;
1047         trace_block_getrq(q, bio, rw);
1048         rq = __blk_mq_alloc_request(hctx, ctx, GFP_ATOMIC, false);
1049         if (likely(rq))
1050                 blk_mq_rq_ctx_init(q, ctx, rq, rw);
1051         else {
1052                 blk_mq_put_ctx(ctx);
1053                 trace_block_sleeprq(q, bio, rw);
1054                 rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
1055                                                         false);
1056                 ctx = rq->mq_ctx;
1057                 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1058         }
1059
1060         hctx->queued++;
1061
1062         if (unlikely(is_flush_fua)) {
1063                 blk_mq_bio_to_request(rq, bio);
1064                 blk_insert_flush(rq);
1065                 goto run_queue;
1066         }
1067
1068         /*
1069          * A task plug currently exists. Since this is completely lockless,
1070          * utilize that to temporarily store requests until the task is
1071          * either done or scheduled away.
1072          */
1073         if (use_plug) {
1074                 struct blk_plug *plug = current->plug;
1075
1076                 if (plug) {
1077                         blk_mq_bio_to_request(rq, bio);
1078                         if (list_empty(&plug->mq_list))
1079                                 trace_block_plug(q);
1080                         else if (request_count >= BLK_MAX_REQUEST_COUNT) {
1081                                 blk_flush_plug_list(plug, false);
1082                                 trace_block_plug(q);
1083                         }
1084                         list_add_tail(&rq->queuelist, &plug->mq_list);
1085                         blk_mq_put_ctx(ctx);
1086                         return;
1087                 }
1088         }
1089
1090         if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) {
1091                 blk_mq_bio_to_request(rq, bio);
1092                 spin_lock(&ctx->lock);
1093 insert_rq:
1094                 __blk_mq_insert_request(hctx, rq, false);
1095                 spin_unlock(&ctx->lock);
1096         } else {
1097                 spin_lock(&ctx->lock);
1098                 if (!blk_mq_attempt_merge(q, ctx, bio)) {
1099                         blk_mq_bio_to_request(rq, bio);
1100                         goto insert_rq;
1101                 }
1102
1103                 spin_unlock(&ctx->lock);
1104                 __blk_mq_free_request(hctx, ctx, rq);
1105         }
1106
1107
1108         /*
1109          * For a SYNC request, send it to the hardware immediately. For an
1110          * ASYNC request, just ensure that we run it later on. The latter
1111          * allows for merging opportunities and more efficient dispatching.
1112          */
1113 run_queue:
1114         blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
1115         blk_mq_put_ctx(ctx);
1116 }
1117
1118 /*
1119  * Default mapping to a software queue, since we use one per CPU.
      *
      * Looks @cpu up in q->mq_map and returns the hardware context stored at
      * that index of q->queue_hw_ctx. No range check is done on @cpu here.
1120  */
1121 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
1122 {
1123         return q->queue_hw_ctx[q->mq_map[cpu]];
1124 }
1125 EXPORT_SYMBOL(blk_mq_map_queue);
1126
1127 struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *set,
1128                                                    unsigned int hctx_index)
1129 {
1130         return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL,
1131                                 set->numa_node);
1132 }
1133 EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
1134
/*
 * Release a hardware context with kfree(); the counterpart of
 * blk_mq_alloc_single_hw_queue(). @hctx_index is not used.
 */
1135 void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
1136                                  unsigned int hctx_index)
1137 {
1138         kfree(hctx);
1139 }
1140 EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
1141
1142 static void blk_mq_hctx_notify(void *data, unsigned long action,
1143                                unsigned int cpu)
1144 {
1145         struct blk_mq_hw_ctx *hctx = data;
1146         struct request_queue *q = hctx->queue;
1147         struct blk_mq_ctx *ctx;
1148         LIST_HEAD(tmp);
1149
1150         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1151                 return;
1152
1153         /*
1154          * Move ctx entries to new CPU, if this one is going away.
1155          */
1156         ctx = __blk_mq_get_ctx(q, cpu);
1157
1158         spin_lock(&ctx->lock);
1159         if (!list_empty(&ctx->rq_list)) {
1160                 list_splice_init(&ctx->rq_list, &tmp);
1161                 clear_bit(ctx->index_hw, hctx->ctx_map);
1162         }
1163         spin_unlock(&ctx->lock);
1164
1165         if (list_empty(&tmp))
1166                 return;
1167
1168         ctx = blk_mq_get_ctx(q);
1169         spin_lock(&ctx->lock);
1170
1171         while (!list_empty(&tmp)) {
1172                 struct request *rq;
1173
1174                 rq = list_first_entry(&tmp, struct request, queuelist);
1175                 rq->mq_ctx = ctx;
1176                 list_move_tail(&rq->queuelist, &ctx->rq_list);
1177         }
1178
1179         hctx = q->mq_ops->map_queue(q, ctx->cpu);
1180         blk_mq_hctx_mark_pending(hctx, ctx);
1181
1182         spin_unlock(&ctx->lock);
1183
1184         blk_mq_run_hw_queue(hctx, true);
1185         blk_mq_put_ctx(ctx);
1186 }
1187
1188 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
1189                 struct blk_mq_tags *tags, unsigned int hctx_idx)
1190 {
1191         struct page *page;
1192
1193         if (tags->rqs && set->ops->exit_request) {
1194                 int i;
1195
1196                 for (i = 0; i < tags->nr_tags; i++) {
1197                         if (!tags->rqs[i])
1198                                 continue;
1199                         set->ops->exit_request(set->driver_data, tags->rqs[i],
1200                                                 hctx_idx, i);
1201                 }
1202         }
1203
1204         while (!list_empty(&tags->page_list)) {
1205                 page = list_first_entry(&tags->page_list, struct page, lru);
1206                 list_del_init(&page->lru);
1207                 __free_pages(page, page->private);
1208         }
1209
1210         kfree(tags->rqs);
1211
1212         blk_mq_free_tags(tags);
1213 }
1214
/*
 * Size in bytes of a page allocation of the given @order, i.e. PAGE_SIZE
 * shifted left by @order. The cast widens PAGE_SIZE to size_t before the
 * shift. @order must be smaller than the bit width of size_t.
 */
1215 static size_t order_to_size(unsigned int order)
1216 {
1217         return (size_t)PAGE_SIZE << order;
1218 }
1219
1220 static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1221                 unsigned int hctx_idx)
1222 {
1223         struct blk_mq_tags *tags;
1224         unsigned int i, j, entries_per_page, max_order = 4;
1225         size_t rq_size, left;
1226
1227         tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
1228                                 set->numa_node);
1229         if (!tags)
1230                 return NULL;
1231
1232         INIT_LIST_HEAD(&tags->page_list);
1233
1234         tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *),
1235                                         GFP_KERNEL, set->numa_node);
1236         if (!tags->rqs) {
1237                 blk_mq_free_tags(tags);
1238                 return NULL;
1239         }
1240
1241         /*
1242          * rq_size is the size of the request plus driver payload, rounded
1243          * to the cacheline size
1244          */
1245         rq_size = round_up(sizeof(struct request) + set->cmd_size,
1246                                 cache_line_size());
1247         left = rq_size * set->queue_depth;
1248
1249         for (i = 0; i < set->queue_depth; ) {
1250                 int this_order = max_order;
1251                 struct page *page;
1252                 int to_do;
1253                 void *p;
1254
1255                 while (left < order_to_size(this_order - 1) && this_order)
1256                         this_order--;
1257
1258                 do {
1259                         page = alloc_pages_node(set->numa_node, GFP_KERNEL,
1260                                                 this_order);
1261                         if (page)
1262                                 break;
1263                         if (!this_order--)
1264                                 break;
1265                         if (order_to_size(this_order) < rq_size)
1266                                 break;
1267                 } while (1);
1268
1269                 if (!page)
1270                         goto fail;
1271
1272                 page->private = this_order;
1273                 list_add_tail(&page->lru, &tags->page_list);
1274
1275                 p = page_address(page);
1276                 entries_per_page = order_to_size(this_order) / rq_size;
1277                 to_do = min(entries_per_page, set->queue_depth - i);
1278                 left -= to_do * rq_size;
1279                 for (j = 0; j < to_do; j++) {
1280                         tags->rqs[i] = p;
1281                         if (set->ops->init_request) {
1282                                 if (set->ops->init_request(set->driver_data,
1283                                                 tags->rqs[i], hctx_idx, i,
1284                                                 set->numa_node))
1285                                         goto fail;
1286                         }
1287
1288                         p += rq_size;
1289                         i++;
1290                 }
1291         }
1292
1293         return tags;
1294
1295 fail:
1296         pr_warn("%s: failed to allocate requests\n", __func__);
1297         blk_mq_free_rq_map(set, tags, hctx_idx);
1298         return NULL;
1299 }
1300
1301 static int blk_mq_init_hw_queues(struct request_queue *q,
1302                 struct blk_mq_tag_set *set)
1303 {
1304         struct blk_mq_hw_ctx *hctx;
1305         unsigned int i, j;
1306
1307         /*
1308          * Initialize hardware queues
1309          */
1310         queue_for_each_hw_ctx(q, hctx, i) {
1311                 unsigned int num_maps;
1312                 int node;
1313
1314                 node = hctx->numa_node;
1315                 if (node == NUMA_NO_NODE)
1316                         node = hctx->numa_node = set->numa_node;
1317
1318                 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
1319                 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1320                 spin_lock_init(&hctx->lock);
1321                 INIT_LIST_HEAD(&hctx->dispatch);
1322                 hctx->queue = q;
1323                 hctx->queue_num = i;
1324                 hctx->flags = set->flags;
1325                 hctx->cmd_size = set->cmd_size;
1326
1327                 blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1328                                                 blk_mq_hctx_notify, hctx);
1329                 blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1330
1331                 hctx->tags = set->tags[i];
1332
1333                 /*
1334                  * Allocate space for all possible cpus to avoid allocation in
1335                  * runtime
1336                  */
1337                 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1338                                                 GFP_KERNEL, node);
1339                 if (!hctx->ctxs)
1340                         break;
1341
1342                 num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
1343                 hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
1344                                                 GFP_KERNEL, node);
1345                 if (!hctx->ctx_map)
1346                         break;
1347
1348                 hctx->nr_ctx_map = num_maps;
1349                 hctx->nr_ctx = 0;
1350
1351                 if (set->ops->init_hctx &&
1352                     set->ops->init_hctx(hctx, set->driver_data, i))
1353                         break;
1354         }
1355
1356         if (i == q->nr_hw_queues)
1357                 return 0;
1358
1359         /*
1360          * Init failed
1361          */
1362         queue_for_each_hw_ctx(q, hctx, j) {
1363                 if (i == j)
1364                         break;
1365
1366                 if (set->ops->exit_hctx)
1367                         set->ops->exit_hctx(hctx, j);
1368
1369                 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1370                 kfree(hctx->ctxs);
1371                 kfree(hctx->ctx_map);
1372         }
1373
1374         return 1;
1375 }
1376
1377 static void blk_mq_init_cpu_queues(struct request_queue *q,
1378                                    unsigned int nr_hw_queues)
1379 {
1380         unsigned int i;
1381
1382         for_each_possible_cpu(i) {
1383                 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1384                 struct blk_mq_hw_ctx *hctx;
1385
1386                 memset(__ctx, 0, sizeof(*__ctx));
1387                 __ctx->cpu = i;
1388                 spin_lock_init(&__ctx->lock);
1389                 INIT_LIST_HEAD(&__ctx->rq_list);
1390                 __ctx->queue = q;
1391
1392                 /* If the cpu isn't online, the cpu is mapped to first hctx */
1393                 if (!cpu_online(i))
1394                         continue;
1395
1396                 hctx = q->mq_ops->map_queue(q, i);
1397                 cpumask_set_cpu(i, hctx->cpumask);
1398                 hctx->nr_ctx++;
1399
1400                 /*
1401                  * Set local node, IFF we have more than one hw queue. If
1402                  * not, we remain on the home node of the device
1403                  */
1404                 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
1405                         hctx->numa_node = cpu_to_node(i);
1406         }
1407 }
1408
1409 static void blk_mq_map_swqueue(struct request_queue *q)
1410 {
1411         unsigned int i;
1412         struct blk_mq_hw_ctx *hctx;
1413         struct blk_mq_ctx *ctx;
1414
1415         queue_for_each_hw_ctx(q, hctx, i) {
1416                 cpumask_clear(hctx->cpumask);
1417                 hctx->nr_ctx = 0;
1418         }
1419
1420         /*
1421          * Map software to hardware queues
1422          */
1423         queue_for_each_ctx(q, ctx, i) {
1424                 /* If the cpu isn't online, the cpu is mapped to first hctx */
1425                 if (!cpu_online(i))
1426                         continue;
1427
1428                 hctx = q->mq_ops->map_queue(q, i);
1429                 cpumask_set_cpu(i, hctx->cpumask);
1430                 ctx->index_hw = hctx->nr_ctx;
1431                 hctx->ctxs[hctx->nr_ctx++] = ctx;
1432         }
1433
1434         queue_for_each_hw_ctx(q, hctx, i) {
1435                 hctx->next_cpu = cpumask_first(hctx->cpumask);
1436                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1437         }
1438 }
1439
1440 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
1441 {
1442         struct blk_mq_hw_ctx **hctxs;
1443         struct blk_mq_ctx *ctx;
1444         struct request_queue *q;
1445         int i;
1446
1447         ctx = alloc_percpu(struct blk_mq_ctx);
1448         if (!ctx)
1449                 return ERR_PTR(-ENOMEM);
1450
1451         hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
1452                         set->numa_node);
1453
1454         if (!hctxs)
1455                 goto err_percpu;
1456
1457         for (i = 0; i < set->nr_hw_queues; i++) {
1458                 hctxs[i] = set->ops->alloc_hctx(set, i);
1459                 if (!hctxs[i])
1460                         goto err_hctxs;
1461
1462                 if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
1463                         goto err_hctxs;
1464
1465                 hctxs[i]->numa_node = NUMA_NO_NODE;
1466                 hctxs[i]->queue_num = i;
1467         }
1468
1469         q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
1470         if (!q)
1471                 goto err_hctxs;
1472
1473         q->mq_map = blk_mq_make_queue_map(set);
1474         if (!q->mq_map)
1475                 goto err_map;
1476
1477         setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
1478         blk_queue_rq_timeout(q, 30000);
1479
1480         q->nr_queues = nr_cpu_ids;
1481         q->nr_hw_queues = set->nr_hw_queues;
1482
1483         q->queue_ctx = ctx;
1484         q->queue_hw_ctx = hctxs;
1485
1486         q->mq_ops = set->ops;
1487         q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
1488
1489         q->sg_reserved_size = INT_MAX;
1490
1491         blk_queue_make_request(q, blk_mq_make_request);
1492         blk_queue_rq_timed_out(q, blk_mq_rq_timed_out);
1493         if (set->timeout)
1494                 blk_queue_rq_timeout(q, set->timeout);
1495
1496         if (set->ops->complete)
1497                 blk_queue_softirq_done(q, set->ops->complete);
1498
1499         blk_mq_init_flush(q);
1500         blk_mq_init_cpu_queues(q, set->nr_hw_queues);
1501
1502         q->flush_rq = kzalloc(round_up(sizeof(struct request) +
1503                                 set->cmd_size, cache_line_size()),
1504                                 GFP_KERNEL);
1505         if (!q->flush_rq)
1506                 goto err_hw;
1507
1508         if (blk_mq_init_hw_queues(q, set))
1509                 goto err_flush_rq;
1510
1511         blk_mq_map_swqueue(q);
1512
1513         mutex_lock(&all_q_mutex);
1514         list_add_tail(&q->all_q_node, &all_q_list);
1515         mutex_unlock(&all_q_mutex);
1516
1517         return q;
1518
1519 err_flush_rq:
1520         kfree(q->flush_rq);
1521 err_hw:
1522         kfree(q->mq_map);
1523 err_map:
1524         blk_cleanup_queue(q);
1525 err_hctxs:
1526         for (i = 0; i < set->nr_hw_queues; i++) {
1527                 if (!hctxs[i])
1528                         break;
1529                 free_cpumask_var(hctxs[i]->cpumask);
1530                 set->ops->free_hctx(hctxs[i], i);
1531         }
1532         kfree(hctxs);
1533 err_percpu:
1534         free_percpu(ctx);
1535         return ERR_PTR(-ENOMEM);
1536 }
1537 EXPORT_SYMBOL(blk_mq_init_queue);
1538
1539 void blk_mq_free_queue(struct request_queue *q)
1540 {
1541         struct blk_mq_hw_ctx *hctx;
1542         int i;
1543
1544         queue_for_each_hw_ctx(q, hctx, i) {
1545                 kfree(hctx->ctx_map);
1546                 kfree(hctx->ctxs);
1547                 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1548                 if (q->mq_ops->exit_hctx)
1549                         q->mq_ops->exit_hctx(hctx, i);
1550                 free_cpumask_var(hctx->cpumask);
1551                 q->mq_ops->free_hctx(hctx, i);
1552         }
1553
1554         free_percpu(q->queue_ctx);
1555         kfree(q->queue_hw_ctx);
1556         kfree(q->mq_map);
1557
1558         q->queue_ctx = NULL;
1559         q->queue_hw_ctx = NULL;
1560         q->mq_map = NULL;
1561
1562         mutex_lock(&all_q_mutex);
1563         list_del_init(&q->all_q_node);
1564         mutex_unlock(&all_q_mutex);
1565 }
1566
1567 /* Basically redo blk_mq_init_queue with queue frozen */
/*
 * With @q frozen, refresh the CPU-to-hardware-queue map through
 * blk_mq_update_queue_map() and then re-attach the software contexts with
 * blk_mq_map_swqueue(). The queue is unfrozen again before returning.
 */
1568 static void blk_mq_queue_reinit(struct request_queue *q)
1569 {
1570         blk_mq_freeze_queue(q);
1571
1572         blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
1573
1574         /*
1575          * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
1576          * we should change hctx numa_node according to new topology (this
1577          * involves free and re-allocate memory, worthy doing?)
1578          */
1579
1580         blk_mq_map_swqueue(q);
1581
1582         blk_mq_unfreeze_queue(q);
1583 }
1584
1585 static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
1586                                       unsigned long action, void *hcpu)
1587 {
1588         struct request_queue *q;
1589
1590         /*
1591          * Before new mappings are established, hotadded cpu might already
1592          * start handling requests. This doesn't break anything as we map
1593          * offline CPUs to first hardware queue. We will re-init the queue
1594          * below to get optimal settings.
1595          */
1596         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
1597             action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
1598                 return NOTIFY_OK;
1599
1600         mutex_lock(&all_q_mutex);
1601         list_for_each_entry(q, &all_q_list, all_q_node)
1602                 blk_mq_queue_reinit(q);
1603         mutex_unlock(&all_q_mutex);
1604         return NOTIFY_OK;
1605 }
1606
1607 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
1608 {
1609         int i;
1610
1611         if (!set->nr_hw_queues)
1612                 return -EINVAL;
1613         if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH)
1614                 return -EINVAL;
1615         if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
1616                 return -EINVAL;
1617
1618         if (!set->nr_hw_queues ||
1619             !set->ops->queue_rq || !set->ops->map_queue ||
1620             !set->ops->alloc_hctx || !set->ops->free_hctx)
1621                 return -EINVAL;
1622
1623
1624         set->tags = kmalloc_node(set->nr_hw_queues *
1625                                  sizeof(struct blk_mq_tags *),
1626                                  GFP_KERNEL, set->numa_node);
1627         if (!set->tags)
1628                 goto out;
1629
1630         for (i = 0; i < set->nr_hw_queues; i++) {
1631                 set->tags[i] = blk_mq_init_rq_map(set, i);
1632                 if (!set->tags[i])
1633                         goto out_unwind;
1634         }
1635
1636         return 0;
1637
1638 out_unwind:
1639         while (--i >= 0)
1640                 blk_mq_free_rq_map(set, set->tags[i], i);
1641 out:
1642         return -ENOMEM;
1643 }
1644 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
1645
1646 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
1647 {
1648         int i;
1649
1650         for (i = 0; i < set->nr_hw_queues; i++)
1651                 blk_mq_free_rq_map(set, set->tags[i], i);
1652         kfree(set->tags);
1653 }
1654 EXPORT_SYMBOL(blk_mq_free_tag_set);
1655
/*
 * Take all_q_mutex and return with it held. The hotplug re-init notifier in
 * this file takes the same mutex, so it blocks until
 * blk_mq_enable_hotplug() is called.
 */
1656 void blk_mq_disable_hotplug(void)
1657 {
1658         mutex_lock(&all_q_mutex);
1659 }
1660
/*
 * Release all_q_mutex; must pair with an earlier blk_mq_disable_hotplug()
 * that took it.
 */
1661 void blk_mq_enable_hotplug(void)
1662 {
1663         mutex_unlock(&all_q_mutex);
1664 }
1665
/*
 * Subsystem init: call blk_mq_cpu_init() and register
 * blk_mq_queue_reinit_notify() as a CPU hotplug notifier with priority -10.
 * Always returns 0.
 */
1666 static int __init blk_mq_init(void)
1667 {
1668         blk_mq_cpu_init();
1669
1670         /* Must be called after percpu_counter_hotcpu_callback() */
1671         hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
1672
1673         return 0;
1674 }
1675 subsys_initcall(blk_mq_init);