block/kyber-iosched.c

   1 /*
   2  * The Kyber I/O scheduler. Controls latency by throttling queue depths using
   3  * scalable techniques.
   4  *
   5  * Copyright (C) 2017 Facebook
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public
   9  * License v2 as published by the Free Software Foundation.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  18  */
  19
  20 #include <linux/kernel.h>
  21 #include <linux/blkdev.h>
  22 #include <linux/blk-mq.h>
  23 #include <linux/elevator.h>
  24 #include <linux/module.h>
  25 #include <linux/sbitmap.h>
  26
  27 #include "blk.h"
  28 #include "blk-mq.h"
  29 #include "blk-mq-debugfs.h"
  30 #include "blk-mq-sched.h"
  31 #include "blk-mq-tag.h"
  32 #include "blk-stat.h"
  33
  34 /* Scheduling domains. */
  35 enum {
  36         KYBER_READ,
  37         KYBER_SYNC_WRITE,
  38         KYBER_OTHER, /* Async writes, discard, etc. */
  39         KYBER_NUM_DOMAINS,
  40 };
  41
  42 enum {
  43         KYBER_MIN_DEPTH = 256,
  44
  45         /*
  46          * In order to prevent starvation of synchronous requests by a flood of
  47          * asynchronous requests, we reserve 25% of requests for synchronous
  48          * operations.
  49          */
  50         KYBER_ASYNC_PERCENT = 75,
  51 };
  52
  53 /*
  54  * Initial device-wide depths for each scheduling domain.
  55  *
  56  * Even for fast devices with lots of tags like NVMe, you can saturate
  57  * the device with only a fraction of the maximum possible queue depth.
  58  * So, we cap these to a reasonable value.
  59  */
  60 static const unsigned int kyber_depth[] = {
  61         [KYBER_READ] = 256,
  62         [KYBER_SYNC_WRITE] = 128,
  63         [KYBER_OTHER] = 64,
  64 };
  65
  66 /*
  67  * Scheduling domain batch sizes. We favor reads.
  68  */
  69 static const unsigned int kyber_batch_size[] = {
  70         [KYBER_READ] = 16,
  71         [KYBER_SYNC_WRITE] = 8,
  72         [KYBER_OTHER] = 8,
  73 };
  74
  75 struct kyber_queue_data {
  76         struct request_queue *q;
  77
  78         struct blk_stat_callback *cb;
  79
  80         /*
  81          * The device is divided into multiple scheduling domains based on the
  82          * request type. Each domain has a fixed number of in-flight requests of
  83          * that type device-wide, limited by these tokens.
  84          */
  85         struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
  86
  87         /*
  88          * Async request percentage, converted to per-word depth for
  89          * sbitmap_get_shallow().
  90          */
  91         unsigned int async_depth;
  92
  93         /* Target latencies in nanoseconds. */
  94         u64 read_lat_nsec, write_lat_nsec;
  95 };
  96
  97 struct kyber_hctx_data {
  98         spinlock_t lock;
  99         struct list_head rqs[KYBER_NUM_DOMAINS];
 100         unsigned int cur_domain;
 101         unsigned int batching;
 102         wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
 103         struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
 104         atomic_t wait_index[KYBER_NUM_DOMAINS];
 105 };
 106
 107 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 108                              void *key);
 109
 110 static int rq_sched_domain(const struct request *rq)
 111 {
 112         unsigned int op = rq->cmd_flags;
 113
 114         if ((op & REQ_OP_MASK) == REQ_OP_READ)
 115                 return KYBER_READ;
 116         else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
 117                 return KYBER_SYNC_WRITE;
 118         else
 119                 return KYBER_OTHER;
 120 }
 121
 122 enum {
 123         NONE = 0,
 124         GOOD = 1,
 125         GREAT = 2,
 126         BAD = -1,
 127         AWFUL = -2,
 128 };
 129
 130 #define IS_GOOD(status) ((status) > 0)
 131 #define IS_BAD(status) ((status) < 0)
 132
 133 static int kyber_lat_status(struct blk_stat_callback *cb,
 134                             unsigned int sched_domain, u64 target)
 135 {
 136         u64 latency;
 137
 138         if (!cb->stat[sched_domain].nr_samples)
 139                 return NONE;
 140
 141         latency = cb->stat[sched_domain].mean;
 142         if (latency >= 2 * target)
 143                 return AWFUL;
 144         else if (latency > target)
 145                 return BAD;
 146         else if (latency <= target / 2)
 147                 return GREAT;
 148         else /* (latency <= target) */
 149                 return GOOD;
 150 }
 151
 152 /*
 153  * Adjust the read or synchronous write depth given the status of reads and
 154  * writes. The goal is that the latencies of the two domains are fair (i.e., if
 155  * one is good, then the other is good).
 156  */
 157 static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
 158                                   unsigned int sched_domain, int this_status,
 159                                   int other_status)
 160 {
 161         unsigned int orig_depth, depth;
 162
 163         /*
 164          * If this domain had no samples, or reads and writes are both good or
 165          * both bad, don't adjust the depth.
 166          */
 167         if (this_status == NONE ||
 168             (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
 169             (IS_BAD(this_status) && IS_BAD(other_status)))
 170                 return;
 171
 172         orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
 173
 174         if (other_status == NONE) {
 175                 depth++;
 176         } else {
 177                 switch (this_status) {
 178                 case GOOD:
 179                         if (other_status == AWFUL)
 180                                 depth -= max(depth / 4, 1U);
 181                         else
 182                                 depth -= max(depth / 8, 1U);
 183                         break;
 184                 case GREAT:
 185                         if (other_status == AWFUL)
 186                                 depth /= 2;
 187                         else
 188                                 depth -= max(depth / 4, 1U);
 189                         break;
 190                 case BAD:
 191                         depth++;
 192                         break;
 193                 case AWFUL:
 194                         if (other_status == GREAT)
 195                                 depth += 2;
 196                         else
 197                                 depth++;
 198                         break;
 199                 }
 200         }
 201
 202         depth = clamp(depth, 1U, kyber_depth[sched_domain]);
 203         if (depth != orig_depth)
 204                 sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
 205 }
 206
 207 /*
 208  * Adjust the depth of other requests given the status of reads and synchronous
 209  * writes. As long as either domain is doing fine, we don't throttle, but if
 210  * both domains are doing badly, we throttle heavily.
 211  */
 212 static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
 213                                      int read_status, int write_status,
 214                                      bool have_samples)
 215 {
 216         unsigned int orig_depth, depth;
 217         int status;
 218
 219         orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
 220
 221         if (read_status == NONE && write_status == NONE) {
 222                 depth += 2;
 223         } else if (have_samples) {
 224                 if (read_status == NONE)
 225                         status = write_status;
 226                 else if (write_status == NONE)
 227                         status = read_status;
 228                 else
 229                         status = max(read_status, write_status);
 230                 switch (status) {
 231                 case GREAT:
 232                         depth += 2;
 233                         break;
 234                 case GOOD:
 235                         depth++;
 236                         break;
 237                 case BAD:
 238                         depth -= max(depth / 4, 1U);
 239                         break;
 240                 case AWFUL:
 241                         depth /= 2;
 242                         break;
 243                 }
 244         }
 245
 246         depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
 247         if (depth != orig_depth)
 248                 sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
 249 }
 250
 251 /*
 252  * Apply heuristics for limiting queue depths based on gathered latency
 253  * statistics.
 254  */
 255 static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
 256 {
 257         struct kyber_queue_data *kqd = cb->data;
 258         int read_status, write_status;
 259
 260         read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
 261         write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
 262
 263         kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
 264         kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
 265         kyber_adjust_other_depth(kqd, read_status, write_status,
 266                                  cb->stat[KYBER_OTHER].nr_samples != 0);
 267
 268         /*
 269          * Continue monitoring latencies if we aren't hitting the targets or
 270          * we're still throttling other requests.
 271          */
 272         if (!blk_stat_is_active(kqd->cb) &&
 273             ((IS_BAD(read_status) || IS_BAD(write_status) ||
 274               kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
 275                 blk_stat_activate_msecs(kqd->cb, 100);
 276 }
 277
 278 static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
 279 {
 280         /*
 281          * All of the hardware queues have the same depth, so we can just grab
 282          * the shift of the first one.
 283          */
 284         return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
 285 }
 286
 287 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 288 {
 289         struct kyber_queue_data *kqd;
 290         unsigned int max_tokens;
 291         unsigned int shift;
 292         int ret = -ENOMEM;
 293         int i;
 294
 295         kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
 296         if (!kqd)
 297                 goto err;
 298         kqd->q = q;
 299
 300         kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
 301                                           KYBER_NUM_DOMAINS, kqd);
 302         if (!kqd->cb)
 303                 goto err_kqd;
 304
 305         /*
 306          * The maximum number of tokens for any scheduling domain is at least
 307          * the queue depth of a single hardware queue. If the hardware doesn't
 308          * have many tags, still provide a reasonable number.
 309          */
 310         max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
 311                            KYBER_MIN_DEPTH);
 312         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 313                 WARN_ON(!kyber_depth[i]);
 314                 WARN_ON(!kyber_batch_size[i]);
 315                 ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
 316                                               max_tokens, -1, false, GFP_KERNEL,
 317                                               q->node);
 318                 if (ret) {
 319                         while (--i >= 0)
 320                                 sbitmap_queue_free(&kqd->domain_tokens[i]);
 321                         goto err_cb;
 322                 }
 323                 sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
 324         }
 325
 326         shift = kyber_sched_tags_shift(kqd);
 327         kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
 328
 329         kqd->read_lat_nsec = 2000000ULL;
 330         kqd->write_lat_nsec = 10000000ULL;
 331
 332         return kqd;
 333
 334 err_cb:
 335         blk_stat_free_callback(kqd->cb);
 336 err_kqd:
 337         kfree(kqd);
 338 err:
 339         return ERR_PTR(ret);
 340 }
 341
 342 static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
 343 {
 344         struct kyber_queue_data *kqd;
 345         struct elevator_queue *eq;
 346
 347         eq = elevator_alloc(q, e);
 348         if (!eq)
 349                 return -ENOMEM;
 350
 351         kqd = kyber_queue_data_alloc(q);
 352         if (IS_ERR(kqd)) {
 353                 kobject_put(&eq->kobj);
 354                 return PTR_ERR(kqd);
 355         }
 356
 357         eq->elevator_data = kqd;
 358         q->elevator = eq;
 359
 360         blk_stat_add_callback(q, kqd->cb);
 361
 362         return 0;
 363 }
 364
 365 static void kyber_exit_sched(struct elevator_queue *e)
 366 {
 367         struct kyber_queue_data *kqd = e->elevator_data;
 368         struct request_queue *q = kqd->q;
 369         int i;
 370
 371         blk_stat_remove_callback(q, kqd->cb);
 372
 373         for (i = 0; i < KYBER_NUM_DOMAINS; i++)
 374                 sbitmap_queue_free(&kqd->domain_tokens[i]);
 375         blk_stat_free_callback(kqd->cb);
 376         kfree(kqd);
 377 }
 378
 379 static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 380 {
 381         struct kyber_hctx_data *khd;
 382         int i;
 383
 384         khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
 385         if (!khd)
 386                 return -ENOMEM;
 387
 388         spin_lock_init(&khd->lock);
 389
 390         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 391                 INIT_LIST_HEAD(&khd->rqs[i]);
 392                 init_waitqueue_func_entry(&khd->domain_wait[i],
 393                                           kyber_domain_wake);
 394                 khd->domain_wait[i].private = hctx;
 395                 INIT_LIST_HEAD(&khd->domain_wait[i].entry);
 396                 atomic_set(&khd->wait_index[i], 0);
 397         }
 398
 399         khd->cur_domain = 0;
 400         khd->batching = 0;
 401
 402         hctx->sched_data = khd;
 403
 404         return 0;
 405 }
 406
 407 static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 408 {
 409         kfree(hctx->sched_data);
 410 }
 411
 412 static int rq_get_domain_token(struct request *rq)
 413 {
 414         return (long)rq->elv.priv[0];
 415 }
 416
 417 static void rq_set_domain_token(struct request *rq, int token)
 418 {
 419         rq->elv.priv[0] = (void *)(long)token;
 420 }
 421
 422 static void rq_clear_domain_token(struct kyber_queue_data *kqd,
 423                                   struct request *rq)
 424 {
 425         unsigned int sched_domain;
 426         int nr;
 427
 428         nr = rq_get_domain_token(rq);
 429         if (nr != -1) {
 430                 sched_domain = rq_sched_domain(rq);
 431                 sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
 432                                     rq->mq_ctx->cpu);
 433         }
 434 }
 435
 436 static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
 437 {
 438         /*
 439          * We use the scheduler tags as per-hardware queue queueing tokens.
 440          * Async requests can be limited at this stage.
 441          */
 442         if (!op_is_sync(op)) {
 443                 struct kyber_queue_data *kqd = data->q->elevator->elevator_data;
 444
 445                 data->shallow_depth = kqd->async_depth;
 446         }
 447 }
 448
 449 static void kyber_prepare_request(struct request *rq, struct bio *bio)
 450 {
 451         rq_set_domain_token(rq, -1);
 452 }
 453
 454 static void kyber_finish_request(struct request *rq)
 455 {
 456         struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
 457
 458         rq_clear_domain_token(kqd, rq);
 459 }
 460
 461 static void kyber_completed_request(struct request *rq)
 462 {
 463         struct request_queue *q = rq->q;
 464         struct kyber_queue_data *kqd = q->elevator->elevator_data;
 465         unsigned int sched_domain;
 466         u64 now, latency, target;
 467
 468         /*
 469          * Check if this request met our latency goal. If not, quickly gather
 470          * some statistics and start throttling.
 471          */
 472         sched_domain = rq_sched_domain(rq);
 473         switch (sched_domain) {
 474         case KYBER_READ:
 475                 target = kqd->read_lat_nsec;
 476                 break;
 477         case KYBER_SYNC_WRITE:
 478                 target = kqd->write_lat_nsec;
 479                 break;
 480         default:
 481                 return;
 482         }
 483
 484         /* If we are already monitoring latencies, don't check again. */
 485         if (blk_stat_is_active(kqd->cb))
 486                 return;
 487
 488         now = __blk_stat_time(ktime_to_ns(ktime_get()));
 489         if (now < blk_stat_time(&rq->issue_stat))
 490                 return;
 491
 492         latency = now - blk_stat_time(&rq->issue_stat);
 493
 494         if (latency > target)
 495                 blk_stat_activate_msecs(kqd->cb, 10);
 496 }
 497
 498 static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
 499                                   struct blk_mq_hw_ctx *hctx)
 500 {
 501         LIST_HEAD(rq_list);
 502         struct request *rq, *next;
 503
 504         blk_mq_flush_busy_ctxs(hctx, &rq_list);
 505         list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
 506                 unsigned int sched_domain;
 507
 508                 sched_domain = rq_sched_domain(rq);
 509                 list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
 510         }
 511 }
 512
 513 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 514                              void *key)
 515 {
 516         struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
 517
 518         list_del_init(&wait->entry);
 519         blk_mq_run_hw_queue(hctx, true);
 520         return 1;
 521 }
 522
 523 static int kyber_get_domain_token(struct kyber_queue_data *kqd,
 524                                   struct kyber_hctx_data *khd,
 525                                   struct blk_mq_hw_ctx *hctx)
 526 {
 527         unsigned int sched_domain = khd->cur_domain;
 528         struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
 529         wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
 530         struct sbq_wait_state *ws;
 531         int nr;
 532
 533         nr = __sbitmap_queue_get(domain_tokens);
 534
 535         /*
 536          * If we failed to get a domain token, make sure the hardware queue is
 537          * run when one becomes available. Note that this is serialized on
 538          * khd->lock, but we still need to be careful about the waker.
 539          */
 540         if (nr < 0 && list_empty_careful(&wait->entry)) {
 541                 ws = sbq_wait_ptr(domain_tokens,
 542                                   &khd->wait_index[sched_domain]);
 543                 khd->domain_ws[sched_domain] = ws;
 544                 add_wait_queue(&ws->wait, wait);
 545
 546                 /*
 547                  * Try again in case a token was freed before we got on the wait
 548                  * queue.
 549                  */
 550                 nr = __sbitmap_queue_get(domain_tokens);
 551         }
 552
 553         /*
 554          * If we got a token while we were on the wait queue, remove ourselves
 555          * from the wait queue to ensure that all wake ups make forward
 556          * progress. It's possible that the waker already deleted the entry
 557          * between the !list_empty_careful() check and us grabbing the lock, but
 558          * list_del_init() is okay with that.
 559          */
 560         if (nr >= 0 && !list_empty_careful(&wait->entry)) {
 561                 ws = khd->domain_ws[sched_domain];
 562                 spin_lock_irq(&ws->wait.lock);
 563                 list_del_init(&wait->entry);
 564                 spin_unlock_irq(&ws->wait.lock);
 565         }
 566
 567         return nr;
 568 }
 569
 570 static struct request *
 571 kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
 572                           struct kyber_hctx_data *khd,
 573                           struct blk_mq_hw_ctx *hctx,
 574                           bool *flushed)
 575 {
 576         struct list_head *rqs;
 577         struct request *rq;
 578         int nr;
 579
 580         rqs = &khd->rqs[khd->cur_domain];
 581         rq = list_first_entry_or_null(rqs, struct request, queuelist);
 582
 583         /*
 584          * If there wasn't already a pending request and we haven't flushed the
 585          * software queues yet, flush the software queues and check again.
 586          */
 587         if (!rq && !*flushed) {
 588                 kyber_flush_busy_ctxs(khd, hctx);
 589                 *flushed = true;
 590                 rq = list_first_entry_or_null(rqs, struct request, queuelist);
 591         }
 592
 593         if (rq) {
 594                 nr = kyber_get_domain_token(kqd, khd, hctx);
 595                 if (nr >= 0) {
 596                         khd->batching++;
 597                         rq_set_domain_token(rq, nr);
 598                         list_del_init(&rq->queuelist);
 599                         return rq;
 600                 }
 601         }
 602
 603         /* There were either no pending requests or no tokens. */
 604         return NULL;
 605 }
 606
 607 static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
 608 {
 609         struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
 610         struct kyber_hctx_data *khd = hctx->sched_data;
 611         bool flushed = false;
 612         struct request *rq;
 613         int i;
 614
 615         spin_lock(&khd->lock);
 616
 617         /*
 618          * First, if we are still entitled to batch, try to dispatch a request
 619          * from the batch.
 620          */
 621         if (khd->batching < kyber_batch_size[khd->cur_domain]) {
 622                 rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
 623                 if (rq)
 624                         goto out;
 625         }
 626
 627         /*
 628          * Either,
 629          * 1. We were no longer entitled to a batch.
 630          * 2. The domain we were batching didn't have any requests.
 631          * 3. The domain we were batching was out of tokens.
 632          *
 633          * Start another batch. Note that this wraps back around to the original
 634          * domain if no other domains have requests or tokens.
 635          */
 636         khd->batching = 0;
 637         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 638                 if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
 639                         khd->cur_domain = 0;
 640                 else
 641                         khd->cur_domain++;
 642
 643                 rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
 644                 if (rq)
 645                         goto out;
 646         }
 647
 648         rq = NULL;
 649 out:
 650         spin_unlock(&khd->lock);
 651         return rq;
 652 }
 653
 654 static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
 655 {
 656         struct kyber_hctx_data *khd = hctx->sched_data;
 657         int i;
 658
 659         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 660                 if (!list_empty_careful(&khd->rqs[i]))
 661                         return true;
 662         }
 663         return sbitmap_any_bit_set(&hctx->ctx_map);
 664 }
 665
 666 #define KYBER_LAT_SHOW_STORE(op)                                        \
 667 static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,          \
 668                                      char *page)                        \
 669 {                                                                       \
 670         struct kyber_queue_data *kqd = e->elevator_data;                \
 671                                                                         \
 672         return sprintf(page, "%llu\n", kqd->op##_lat_nsec);             \
 673 }                                                                       \
 674                                                                         \
 675 static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,         \
 676                                       const char *page, size_t count)   \
 677 {                                                                       \
 678         struct kyber_queue_data *kqd = e->elevator_data;                \
 679         unsigned long long nsec;                                        \
 680         int ret;                                                        \
 681                                                                         \
 682         ret = kstrtoull(page, 10, &nsec);                               \
 683         if (ret)                                                        \
 684                 return ret;                                             \
 685                                                                         \
 686         kqd->op##_lat_nsec = nsec;                                      \
 687                                                                         \
 688         return count;                                                   \
 689 }
 690 KYBER_LAT_SHOW_STORE(read);
 691 KYBER_LAT_SHOW_STORE(write);
 692 #undef KYBER_LAT_SHOW_STORE
 693
 694 #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
 695 static struct elv_fs_entry kyber_sched_attrs[] = {
 696         KYBER_LAT_ATTR(read),
 697         KYBER_LAT_ATTR(write),
 698         __ATTR_NULL
 699 };
 700 #undef KYBER_LAT_ATTR
 701
 702 #ifdef CONFIG_BLK_DEBUG_FS
 703 #define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)                        \
 704 static int kyber_##name##_tokens_show(void *data, struct seq_file *m)   \
 705 {                                                                       \
 706         struct request_queue *q = data;                                 \
 707         struct kyber_queue_data *kqd = q->elevator->elevator_data;      \
 708                                                                         \
 709         sbitmap_queue_show(&kqd->domain_tokens[domain], m);             \
 710         return 0;                                                       \
 711 }                                                                       \
 712                                                                         \
 713 static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos)  \
 714         __acquires(&khd->lock)                                          \
 715 {                                                                       \
 716         struct blk_mq_hw_ctx *hctx = m->private;                        \
 717         struct kyber_hctx_data *khd = hctx->sched_data;                 \
 718                                                                         \
 719         spin_lock(&khd->lock);                                          \
 720         return seq_list_start(&khd->rqs[domain], *pos);                 \
 721 }                                                                       \
 722                                                                         \
 723 static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,       \
 724                                      loff_t *pos)                       \
 725 {                                                                       \
 726         struct blk_mq_hw_ctx *hctx = m->private;                        \
 727         struct kyber_hctx_data *khd = hctx->sched_data;                 \
 728                                                                         \
 729         return seq_list_next(v, &khd->rqs[domain], pos);                \
 730 }                                                                       \
 731                                                                         \
 732 static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)        \
 733         __releases(&khd->lock)                                          \
 734 {                                                                       \
 735         struct blk_mq_hw_ctx *hctx = m->private;                        \
 736         struct kyber_hctx_data *khd = hctx->sched_data;                 \
 737                                                                         \
 738         spin_unlock(&khd->lock);                                        \
 739 }                                                                       \
 740                                                                         \
 741 static const struct seq_operations kyber_##name##_rqs_seq_ops = {       \
 742         .start  = kyber_##name##_rqs_start,                             \
 743         .next   = kyber_##name##_rqs_next,                              \
 744         .stop   = kyber_##name##_rqs_stop,                              \
 745         .show   = blk_mq_debugfs_rq_show,                               \
 746 };                                                                      \
 747                                                                         \
 748 static int kyber_##name##_waiting_show(void *data, struct seq_file *m)  \
 749 {                                                                       \
 750         struct blk_mq_hw_ctx *hctx = data;                              \
 751         struct kyber_hctx_data *khd = hctx->sched_data;                 \
 752         wait_queue_entry_t *wait = &khd->domain_wait[domain];           \
 753                                                                         \
 754         seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));       \
 755         return 0;                                                       \
 756 }
 757 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
 758 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
 759 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
 760 #undef KYBER_DEBUGFS_DOMAIN_ATTRS
 761
 762 static int kyber_async_depth_show(void *data, struct seq_file *m)
 763 {
 764         struct request_queue *q = data;
 765         struct kyber_queue_data *kqd = q->elevator->elevator_data;
 766
 767         seq_printf(m, "%u\n", kqd->async_depth);
 768         return 0;
 769 }
 770
 771 static int kyber_cur_domain_show(void *data, struct seq_file *m)
 772 {
 773         struct blk_mq_hw_ctx *hctx = data;
 774         struct kyber_hctx_data *khd = hctx->sched_data;
 775
 776         switch (khd->cur_domain) {
 777         case KYBER_READ:
 778                 seq_puts(m, "READ\n");
 779                 break;
 780         case KYBER_SYNC_WRITE:
 781                 seq_puts(m, "SYNC_WRITE\n");
 782                 break;
 783         case KYBER_OTHER:
 784                 seq_puts(m, "OTHER\n");
 785                 break;
 786         default:
 787                 seq_printf(m, "%u\n", khd->cur_domain);
 788                 break;
 789         }
 790         return 0;
 791 }
 792
 793 static int kyber_batching_show(void *data, struct seq_file *m)
 794 {
 795         struct blk_mq_hw_ctx *hctx = data;
 796         struct kyber_hctx_data *khd = hctx->sched_data;
 797
 798         seq_printf(m, "%u\n", khd->batching);
 799         return 0;
 800 }
 801
 802 #define KYBER_QUEUE_DOMAIN_ATTRS(name)  \
 803         {#name "_tokens", 0400, kyber_##name##_tokens_show}
 804 static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
 805         KYBER_QUEUE_DOMAIN_ATTRS(read),
 806         KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
 807         KYBER_QUEUE_DOMAIN_ATTRS(other),
 808         {"async_depth", 0400, kyber_async_depth_show},
 809         {},
 810 };
 811 #undef KYBER_QUEUE_DOMAIN_ATTRS
 812
 813 #define KYBER_HCTX_DOMAIN_ATTRS(name)                                   \
 814         {#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops},   \
 815         {#name "_waiting", 0400, kyber_##name##_waiting_show}
 816 static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
 817         KYBER_HCTX_DOMAIN_ATTRS(read),
 818         KYBER_HCTX_DOMAIN_ATTRS(sync_write),
 819         KYBER_HCTX_DOMAIN_ATTRS(other),
 820         {"cur_domain", 0400, kyber_cur_domain_show},
 821         {"batching", 0400, kyber_batching_show},
 822         {},
 823 };
 824 #undef KYBER_HCTX_DOMAIN_ATTRS
 825 #endif
 826
 827 static struct elevator_type kyber_sched = {
 828         .ops.mq = {
 829                 .init_sched = kyber_init_sched,
 830                 .exit_sched = kyber_exit_sched,
 831                 .init_hctx = kyber_init_hctx,
 832                 .exit_hctx = kyber_exit_hctx,
 833                 .limit_depth = kyber_limit_depth,
 834                 .prepare_request = kyber_prepare_request,
 835                 .finish_request = kyber_finish_request,
 836                 .requeue_request = kyber_finish_request,
 837                 .completed_request = kyber_completed_request,
 838                 .dispatch_request = kyber_dispatch_request,
 839                 .has_work = kyber_has_work,
 840         },
 841         .uses_mq = true,
 842 #ifdef CONFIG_BLK_DEBUG_FS
 843         .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
 844         .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
 845 #endif
 846         .elevator_attrs = kyber_sched_attrs,
 847         .elevator_name = "kyber",
 848         .elevator_owner = THIS_MODULE,
 849 };
 850
 851 static int __init kyber_init(void)
 852 {
 853         return elv_register(&kyber_sched);
 854 }
 855
 856 static void __exit kyber_exit(void)
 857 {
 858         elv_unregister(&kyber_sched);
 859 }
 860
 861 module_init(kyber_init);
 862 module_exit(kyber_exit);
 863
 864 MODULE_AUTHOR("Omar Sandoval");
 865 MODULE_LICENSE("GPL");
 866 MODULE_DESCRIPTION("Kyber I/O scheduler");