block/blk-mq-sched.c

   1 /*
   2  * blk-mq scheduling framework
   3  *
   4  * Copyright (C) 2016 Jens Axboe
   5  */
   6 #include <linux/kernel.h>
   7 #include <linux/module.h>
   8 #include <linux/blk-mq.h>
   9
  10 #include <trace/events/block.h>
  11
  12 #include "blk.h"
  13 #include "blk-mq.h"
  14 #include "blk-mq-debugfs.h"
  15 #include "blk-mq-sched.h"
  16 #include "blk-mq-tag.h"
  17 #include "blk-wbt.h"
  18
  19 void blk_mq_sched_free_hctx_data(struct request_queue *q,
  20                                  void (*exit)(struct blk_mq_hw_ctx *))
  21 {
  22         struct blk_mq_hw_ctx *hctx;
  23         int i;
  24
  25         queue_for_each_hw_ctx(q, hctx, i) {
  26                 if (exit && hctx->sched_data)
  27                         exit(hctx);
  28                 kfree(hctx->sched_data);
  29                 hctx->sched_data = NULL;
  30         }
  31 }
  32 EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
  33
  34 static void __blk_mq_sched_assign_ioc(struct request_queue *q,
  35                                       struct request *rq,
  36                                       struct bio *bio,
  37                                       struct io_context *ioc)
  38 {
  39         struct io_cq *icq;
  40
  41         spin_lock_irq(q->queue_lock);
  42         icq = ioc_lookup_icq(ioc, q);
  43         spin_unlock_irq(q->queue_lock);
  44
  45         if (!icq) {
  46                 icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
  47                 if (!icq)
  48                         return;
  49         }
  50
  51         rq->elv.icq = icq;
  52         if (!blk_mq_sched_get_rq_priv(q, rq, bio)) {
  53                 rq->rq_flags |= RQF_ELVPRIV;
  54                 get_io_context(icq->ioc);
  55                 return;
  56         }
  57
  58         rq->elv.icq = NULL;
  59 }
  60
  61 static void blk_mq_sched_assign_ioc(struct request_queue *q,
  62                                     struct request *rq, struct bio *bio)
  63 {
  64         struct io_context *ioc;
  65
  66         ioc = rq_ioc(bio);
  67         if (ioc)
  68                 __blk_mq_sched_assign_ioc(q, rq, bio, ioc);
  69 }
  70
  71 struct request *blk_mq_sched_get_request(struct request_queue *q,
  72                                          struct bio *bio,
  73                                          unsigned int op,
  74                                          struct blk_mq_alloc_data *data)
  75 {
  76         struct elevator_queue *e = q->elevator;
  77         struct request *rq;
  78
  79         blk_queue_enter_live(q);
  80         data->q = q;
  81         if (likely(!data->ctx))
  82                 data->ctx = blk_mq_get_ctx(q);
  83         if (likely(!data->hctx))
  84                 data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
  85
  86         if (e) {
  87                 data->flags |= BLK_MQ_REQ_INTERNAL;
  88
  89                 /*
  90                  * Flush requests are special and go directly to the
  91                  * dispatch list.
  92                  */
  93                 if (!op_is_flush(op) && e->type->ops.mq.get_request) {
  94                         rq = e->type->ops.mq.get_request(q, op, data);
  95                         if (rq)
  96                                 rq->rq_flags |= RQF_QUEUED;
  97                 } else
  98                         rq = __blk_mq_alloc_request(data, op);
  99         } else {
 100                 rq = __blk_mq_alloc_request(data, op);
 101         }
 102
 103         if (rq) {
 104                 if (!op_is_flush(op)) {
 105                         rq->elv.icq = NULL;
 106                         if (e && e->type->icq_cache)
 107                                 blk_mq_sched_assign_ioc(q, rq, bio);
 108                 }
 109                 data->hctx->queued++;
 110                 return rq;
 111         }
 112
 113         blk_queue_exit(q);
 114         return NULL;
 115 }
 116
 117 void blk_mq_sched_put_request(struct request *rq)
 118 {
 119         struct request_queue *q = rq->q;
 120         struct elevator_queue *e = q->elevator;
 121
 122         if (rq->rq_flags & RQF_ELVPRIV) {
 123                 blk_mq_sched_put_rq_priv(rq->q, rq);
 124                 if (rq->elv.icq) {
 125                         put_io_context(rq->elv.icq->ioc);
 126                         rq->elv.icq = NULL;
 127                 }
 128         }
 129
 130         if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request)
 131                 e->type->ops.mq.put_request(rq);
 132         else
 133                 blk_mq_finish_request(rq);
 134 }
 135
 136 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 137 {
 138         struct request_queue *q = hctx->queue;
 139         struct elevator_queue *e = q->elevator;
 140         const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
 141         bool did_work = false;
 142         LIST_HEAD(rq_list);
 143
 144         if (unlikely(blk_mq_hctx_stopped(hctx)))
 145                 return;
 146
 147         hctx->run++;
 148
 149         /*
 150          * If we have previous entries on our dispatch list, grab them first for
 151          * more fair dispatch.
 152          */
 153         if (!list_empty_careful(&hctx->dispatch)) {
 154                 spin_lock(&hctx->lock);
 155                 if (!list_empty(&hctx->dispatch))
 156                         list_splice_init(&hctx->dispatch, &rq_list);
 157                 spin_unlock(&hctx->lock);
 158         }
 159
 160         /*
 161          * Only ask the scheduler for requests, if we didn't have residual
 162          * requests from the dispatch list. This is to avoid the case where
 163          * we only ever dispatch a fraction of the requests available because
 164          * of low device queue depth. Once we pull requests out of the IO
 165          * scheduler, we can no longer merge or sort them. So it's best to
 166          * leave them there for as long as we can. Mark the hw queue as
 167          * needing a restart in that case.
 168          */
 169         if (!list_empty(&rq_list)) {
 170                 blk_mq_sched_mark_restart_hctx(hctx);
 171                 did_work = blk_mq_dispatch_rq_list(q, &rq_list);
 172         } else if (!has_sched_dispatch) {
 173                 blk_mq_flush_busy_ctxs(hctx, &rq_list);
 174                 blk_mq_dispatch_rq_list(q, &rq_list);
 175         }
 176
 177         /*
 178          * We want to dispatch from the scheduler if we had no work left
 179          * on the dispatch list, OR if we did have work but weren't able
 180          * to make progress.
 181          */
 182         if (!did_work && has_sched_dispatch) {
 183                 do {
 184                         struct request *rq;
 185
 186                         rq = e->type->ops.mq.dispatch_request(hctx);
 187                         if (!rq)
 188                                 break;
 189                         list_add(&rq->queuelist, &rq_list);
 190                 } while (blk_mq_dispatch_rq_list(q, &rq_list));
 191         }
 192 }
 193
 194 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
 195                             struct request **merged_request)
 196 {
 197         struct request *rq;
 198
 199         switch (elv_merge(q, &rq, bio)) {
 200         case ELEVATOR_BACK_MERGE:
 201                 if (!blk_mq_sched_allow_merge(q, rq, bio))
 202                         return false;
 203                 if (!bio_attempt_back_merge(q, rq, bio))
 204                         return false;
 205                 *merged_request = attempt_back_merge(q, rq);
 206                 if (!*merged_request)
 207                         elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
 208                 return true;
 209         case ELEVATOR_FRONT_MERGE:
 210                 if (!blk_mq_sched_allow_merge(q, rq, bio))
 211                         return false;
 212                 if (!bio_attempt_front_merge(q, rq, bio))
 213                         return false;
 214                 *merged_request = attempt_front_merge(q, rq);
 215                 if (!*merged_request)
 216                         elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
 217                 return true;
 218         default:
 219                 return false;
 220         }
 221 }
 222 EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
 223
 224 /*
 225  * Reverse check our software queue for entries that we could potentially
 226  * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 227  * too much time checking for merges.
 228  */
 229 static bool blk_mq_attempt_merge(struct request_queue *q,
 230                                  struct blk_mq_ctx *ctx, struct bio *bio)
 231 {
 232         struct request *rq;
 233         int checked = 8;
 234
 235         list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
 236                 bool merged = false;
 237
 238                 if (!checked--)
 239                         break;
 240
 241                 if (!blk_rq_merge_ok(rq, bio))
 242                         continue;
 243
 244                 switch (blk_try_merge(rq, bio)) {
 245                 case ELEVATOR_BACK_MERGE:
 246                         if (blk_mq_sched_allow_merge(q, rq, bio))
 247                                 merged = bio_attempt_back_merge(q, rq, bio);
 248                         break;
 249                 case ELEVATOR_FRONT_MERGE:
 250                         if (blk_mq_sched_allow_merge(q, rq, bio))
 251                                 merged = bio_attempt_front_merge(q, rq, bio);
 252                         break;
 253                 case ELEVATOR_DISCARD_MERGE:
 254                         merged = bio_attempt_discard_merge(q, rq, bio);
 255                         break;
 256                 default:
 257                         continue;
 258                 }
 259
 260                 if (merged)
 261                         ctx->rq_merged++;
 262                 return merged;
 263         }
 264
 265         return false;
 266 }
 267
 268 bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 269 {
 270         struct elevator_queue *e = q->elevator;
 271         struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
 272         struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 273         bool ret = false;
 274
 275         if (e && e->type->ops.mq.bio_merge) {
 276                 blk_mq_put_ctx(ctx);
 277                 return e->type->ops.mq.bio_merge(hctx, bio);
 278         }
 279
 280         if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) {
 281                 /* default per sw-queue merge */
 282                 spin_lock(&ctx->lock);
 283                 ret = blk_mq_attempt_merge(q, ctx, bio);
 284                 spin_unlock(&ctx->lock);
 285         }
 286
 287         blk_mq_put_ctx(ctx);
 288         return ret;
 289 }
 290
 291 bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
 292 {
 293         return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
 294 }
 295 EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
 296
 297 void blk_mq_sched_request_inserted(struct request *rq)
 298 {
 299         trace_block_rq_insert(rq->q, rq);
 300 }
 301 EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
 302
 303 static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
 304                                        struct request *rq)
 305 {
 306         if (rq->tag == -1) {
 307                 rq->rq_flags |= RQF_SORTED;
 308                 return false;
 309         }
 310
 311         /*
 312          * If we already have a real request tag, send directly to
 313          * the dispatch list.
 314          */
 315         spin_lock(&hctx->lock);
 316         list_add(&rq->queuelist, &hctx->dispatch);
 317         spin_unlock(&hctx->lock);
 318         return true;
 319 }
 320
 321 static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
 322 {
 323         if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
 324                 clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 325                 if (blk_mq_hctx_has_pending(hctx)) {
 326                         blk_mq_run_hw_queue(hctx, true);
 327                         return true;
 328                 }
 329         }
 330         return false;
 331 }
 332
 333 /**
 334  * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
 335  * @pos:    loop cursor.
 336  * @skip:   the list element that will not be examined. Iteration starts at
 337  *          @skip->next.
 338  * @head:   head of the list to examine. This list must have at least one
 339  *          element, namely @skip.
 340  * @member: name of the list_head structure within typeof(*pos).
 341  */
 342 #define list_for_each_entry_rcu_rr(pos, skip, head, member)             \
 343         for ((pos) = (skip);                                            \
 344              (pos = (pos)->member.next != (head) ? list_entry_rcu(      \
 345                         (pos)->member.next, typeof(*pos), member) :     \
 346               list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
 347              (pos) != (skip); )
 348
 349 /*
 350  * Called after a driver tag has been freed to check whether a hctx needs to
 351  * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
 352  * queues in a round-robin fashion if the tag set of @hctx is shared with other
 353  * hardware queues.
 354  */
 355 void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
 356 {
 357         struct blk_mq_tags *const tags = hctx->tags;
 358         struct blk_mq_tag_set *const set = hctx->queue->tag_set;
 359         struct request_queue *const queue = hctx->queue, *q;
 360         struct blk_mq_hw_ctx *hctx2;
 361         unsigned int i, j;
 362
 363         if (set->flags & BLK_MQ_F_TAG_SHARED) {
 364                 rcu_read_lock();
 365                 list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
 366                                            tag_set_list) {
 367                         queue_for_each_hw_ctx(q, hctx2, i)
 368                                 if (hctx2->tags == tags &&
 369                                     blk_mq_sched_restart_hctx(hctx2))
 370                                         goto done;
 371                 }
 372                 j = hctx->queue_num + 1;
 373                 for (i = 0; i < queue->nr_hw_queues; i++, j++) {
 374                         if (j == queue->nr_hw_queues)
 375                                 j = 0;
 376                         hctx2 = queue->queue_hw_ctx[j];
 377                         if (hctx2->tags == tags &&
 378                             blk_mq_sched_restart_hctx(hctx2))
 379                                 break;
 380                 }
 381 done:
 382                 rcu_read_unlock();
 383         } else {
 384                 blk_mq_sched_restart_hctx(hctx);
 385         }
 386 }
 387
 388 /*
 389  * Add flush/fua to the queue. If we fail getting a driver tag, then
 390  * punt to the requeue list. Requeue will re-invoke us from a context
 391  * that's safe to block from.
 392  */
 393 static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
 394                                       struct request *rq, bool can_block)
 395 {
 396         if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
 397                 blk_insert_flush(rq);
 398                 blk_mq_run_hw_queue(hctx, true);
 399         } else
 400                 blk_mq_add_to_requeue_list(rq, false, true);
 401 }
 402
 403 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
 404                                  bool run_queue, bool async, bool can_block)
 405 {
 406         struct request_queue *q = rq->q;
 407         struct elevator_queue *e = q->elevator;
 408         struct blk_mq_ctx *ctx = rq->mq_ctx;
 409         struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 410
 411         if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
 412                 blk_mq_sched_insert_flush(hctx, rq, can_block);
 413                 return;
 414         }
 415
 416         if (e && blk_mq_sched_bypass_insert(hctx, rq))
 417                 goto run;
 418
 419         if (e && e->type->ops.mq.insert_requests) {
 420                 LIST_HEAD(list);
 421
 422                 list_add(&rq->queuelist, &list);
 423                 e->type->ops.mq.insert_requests(hctx, &list, at_head);
 424         } else {
 425                 spin_lock(&ctx->lock);
 426                 __blk_mq_insert_request(hctx, rq, at_head);
 427                 spin_unlock(&ctx->lock);
 428         }
 429
 430 run:
 431         if (run_queue)
 432                 blk_mq_run_hw_queue(hctx, async);
 433 }
 434
 435 void blk_mq_sched_insert_requests(struct request_queue *q,
 436                                   struct blk_mq_ctx *ctx,
 437                                   struct list_head *list, bool run_queue_async)
 438 {
 439         struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 440         struct elevator_queue *e = hctx->queue->elevator;
 441
 442         if (e) {
 443                 struct request *rq, *next;
 444
 445                 /*
 446                  * We bypass requests that already have a driver tag assigned,
 447                  * which should only be flushes. Flushes are only ever inserted
 448                  * as single requests, so we shouldn't ever hit the
 449                  * WARN_ON_ONCE() below (but let's handle it just in case).
 450                  */
 451                 list_for_each_entry_safe(rq, next, list, queuelist) {
 452                         if (WARN_ON_ONCE(rq->tag != -1)) {
 453                                 list_del_init(&rq->queuelist);
 454                                 blk_mq_sched_bypass_insert(hctx, rq);
 455                         }
 456                 }
 457         }
 458
 459         if (e && e->type->ops.mq.insert_requests)
 460                 e->type->ops.mq.insert_requests(hctx, list, false);
 461         else
 462                 blk_mq_insert_requests(hctx, ctx, list);
 463
 464         blk_mq_run_hw_queue(hctx, run_queue_async);
 465 }
 466
 467 static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
 468                                    struct blk_mq_hw_ctx *hctx,
 469                                    unsigned int hctx_idx)
 470 {
 471         if (hctx->sched_tags) {
 472                 blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
 473                 blk_mq_free_rq_map(hctx->sched_tags);
 474                 hctx->sched_tags = NULL;
 475         }
 476 }
 477
 478 static int blk_mq_sched_alloc_tags(struct request_queue *q,
 479                                    struct blk_mq_hw_ctx *hctx,
 480                                    unsigned int hctx_idx)
 481 {
 482         struct blk_mq_tag_set *set = q->tag_set;
 483         int ret;
 484
 485         hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
 486                                                set->reserved_tags);
 487         if (!hctx->sched_tags)
 488                 return -ENOMEM;
 489
 490         ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
 491         if (ret)
 492                 blk_mq_sched_free_tags(set, hctx, hctx_idx);
 493
 494         return ret;
 495 }
 496
 497 static void blk_mq_sched_tags_teardown(struct request_queue *q)
 498 {
 499         struct blk_mq_tag_set *set = q->tag_set;
 500         struct blk_mq_hw_ctx *hctx;
 501         int i;
 502
 503         queue_for_each_hw_ctx(q, hctx, i)
 504                 blk_mq_sched_free_tags(set, hctx, i);
 505 }
 506
 507 int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
 508                            unsigned int hctx_idx)
 509 {
 510         struct elevator_queue *e = q->elevator;
 511         int ret;
 512
 513         if (!e)
 514                 return 0;
 515
 516         ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
 517         if (ret)
 518                 return ret;
 519
 520         if (e->type->ops.mq.init_hctx) {
 521                 ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
 522                 if (ret) {
 523                         blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
 524                         return ret;
 525                 }
 526         }
 527
 528         blk_mq_debugfs_register_sched_hctx(q, hctx);
 529
 530         return 0;
 531 }
 532
 533 void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
 534                             unsigned int hctx_idx)
 535 {
 536         struct elevator_queue *e = q->elevator;
 537
 538         if (!e)
 539                 return;
 540
 541         blk_mq_debugfs_unregister_sched_hctx(hctx);
 542
 543         if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
 544                 e->type->ops.mq.exit_hctx(hctx, hctx_idx);
 545                 hctx->sched_data = NULL;
 546         }
 547
 548         blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
 549 }
 550
 551 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 552 {
 553         struct blk_mq_hw_ctx *hctx;
 554         struct elevator_queue *eq;
 555         unsigned int i;
 556         int ret;
 557
 558         if (!e) {
 559                 q->elevator = NULL;
 560                 return 0;
 561         }
 562
 563         /*
 564          * Default to 256, since we don't split into sync/async like the
 565          * old code did. Additionally, this is a per-hw queue depth.
 566          */
 567         q->nr_requests = 2 * BLKDEV_MAX_RQ;
 568
 569         queue_for_each_hw_ctx(q, hctx, i) {
 570                 ret = blk_mq_sched_alloc_tags(q, hctx, i);
 571                 if (ret)
 572                         goto err;
 573         }
 574
 575         ret = e->ops.mq.init_sched(q, e);
 576         if (ret)
 577                 goto err;
 578
 579         blk_mq_debugfs_register_sched(q);
 580
 581         queue_for_each_hw_ctx(q, hctx, i) {
 582                 if (e->ops.mq.init_hctx) {
 583                         ret = e->ops.mq.init_hctx(hctx, i);
 584                         if (ret) {
 585                                 eq = q->elevator;
 586                                 blk_mq_exit_sched(q, eq);
 587                                 kobject_put(&eq->kobj);
 588                                 return ret;
 589                         }
 590                 }
 591                 blk_mq_debugfs_register_sched_hctx(q, hctx);
 592         }
 593
 594         return 0;
 595
 596 err:
 597         blk_mq_sched_tags_teardown(q);
 598         q->elevator = NULL;
 599         return ret;
 600 }
 601
 602 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 603 {
 604         struct blk_mq_hw_ctx *hctx;
 605         unsigned int i;
 606
 607         queue_for_each_hw_ctx(q, hctx, i) {
 608                 blk_mq_debugfs_unregister_sched_hctx(hctx);
 609                 if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
 610                         e->type->ops.mq.exit_hctx(hctx, i);
 611                         hctx->sched_data = NULL;
 612                 }
 613         }
 614         blk_mq_debugfs_unregister_sched(q);
 615         if (e->type->ops.mq.exit_sched)
 616                 e->type->ops.mq.exit_sched(e);
 617         blk_mq_sched_tags_teardown(q);
 618         q->elevator = NULL;
 619 }
 620
 621 int blk_mq_sched_init(struct request_queue *q)
 622 {
 623         int ret;
 624
 625         mutex_lock(&q->sysfs_lock);
 626         ret = elevator_init(q, NULL);
 627         mutex_unlock(&q->sysfs_lock);
 628
 629         return ret;
 630 }