/*
 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
 * scalable techniques.
 *
 * Copyright (C) 2017 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h>
#include <linux/sbitmap.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-stat.h"
/* Scheduling domains. */
enum {
	KYBER_READ,
	KYBER_SYNC_WRITE,
	KYBER_OTHER,		/* Async writes, discard, etc. */
	KYBER_NUM_DOMAINS,
};

enum {
	KYBER_MIN_DEPTH = 256,

	/*
	 * In order to prevent starvation of synchronous requests by a flood of
	 * asynchronous requests, we reserve 25% of requests for synchronous
	 * operations.
	 */
	KYBER_ASYNC_PERCENT = 75,
};
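/*
 * Worked example (illustrative numbers, not tied to any particular device):
 * with 64-bit sbitmap words (shift == 6), the async_depth computed later from
 * this percentage is 64 * 75 / 100 = 48, so asynchronous allocations may only
 * use 48 of the 64 tags in each sbitmap word, leaving the remainder available
 * for synchronous I/O.
 */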
/*
 * Initial device-wide depths for each scheduling domain.
 *
 * Even for fast devices with lots of tags like NVMe, you can saturate
 * the device with only a fraction of the maximum possible queue depth.
 * So, we cap these to a reasonable value.
 */
static const unsigned int kyber_depth[] = {
	[KYBER_READ] = 256,
	[KYBER_SYNC_WRITE] = 128,
	[KYBER_OTHER] = 64,
};
/*
 * Scheduling domain batch sizes. We favor reads.
 */
static const unsigned int kyber_batch_size[] = {
	[KYBER_READ] = 16,
	[KYBER_SYNC_WRITE] = 8,
	[KYBER_OTHER] = 8,
};
struct kyber_queue_data {
	struct request_queue *q;

	struct blk_stat_callback *cb;

	/*
	 * The device is divided into multiple scheduling domains based on the
	 * request type. Each domain has a fixed number of in-flight requests of
	 * that type device-wide, limited by these tokens.
	 */
	struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];

	/*
	 * Async request percentage, converted to per-word depth for
	 * sbitmap_get_shallow().
	 */
	unsigned int async_depth;

	/* Target latencies in nanoseconds. */
	u64 read_lat_nsec, write_lat_nsec;
};
struct kyber_hctx_data {
	spinlock_t lock;
	struct list_head rqs[KYBER_NUM_DOMAINS];
	unsigned int cur_domain;
	unsigned int batching;
	wait_queue_t domain_wait[KYBER_NUM_DOMAINS];
	atomic_t wait_index[KYBER_NUM_DOMAINS];
};
static int rq_sched_domain(const struct request *rq)
{
	unsigned int op = rq->cmd_flags;

	if ((op & REQ_OP_MASK) == REQ_OP_READ)
		return KYBER_READ;
	else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
		return KYBER_SYNC_WRITE;
	else
		return KYBER_OTHER;
}
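/*
 * For example (illustrative, not exhaustive): a REQ_OP_READ maps to
 * KYBER_READ, a REQ_OP_WRITE marked synchronous (e.g. issued by an O_SYNC
 * write or fsync-driven writeback) maps to KYBER_SYNC_WRITE, and everything
 * else, such as background writeback or discards, falls into KYBER_OTHER.
 */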
enum {
	NONE = 0,
	GOOD = 1,
	GREAT = 2,
	BAD = -1,
	AWFUL = -2,
};

#define IS_GOOD(status) ((status) > 0)
#define IS_BAD(status) ((status) < 0)

static int kyber_lat_status(struct blk_stat_callback *cb,
			    unsigned int sched_domain, u64 target)
{
	u64 latency;

	if (!cb->stat[sched_domain].nr_samples)
		return NONE;

	latency = cb->stat[sched_domain].mean;
	if (latency >= 2 * target)
		return AWFUL;
	else if (latency > target)
		return BAD;
	else if (latency <= target / 2)
		return GREAT;
	else /* (latency <= target) */
		return GOOD;
}
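/*
 * For example, with the default read target of 2 ms: a mean read latency of
 * 5 ms is AWFUL (>= 2 * target), 3 ms is BAD (> target), 1.5 ms is GOOD
 * (<= target), and 0.8 ms is GREAT (<= target / 2).
 */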
/*
 * Adjust the read or synchronous write depth given the status of reads and
 * writes. The goal is that the latencies of the two domains are fair (i.e., if
 * one is good, then the other is good).
 */
static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
				  unsigned int sched_domain, int this_status,
				  int other_status)
{
	unsigned int orig_depth, depth;

	/*
	 * If this domain had no samples, or reads and writes are both good or
	 * both bad, don't adjust the depth.
	 */
	if (this_status == NONE ||
	    (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
	    (IS_BAD(this_status) && IS_BAD(other_status)))
		return;

	orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;

	if (other_status == NONE) {
		depth++;
	} else {
		switch (this_status) {
		case GOOD:
			if (other_status == AWFUL)
				depth -= max(depth / 4, 1U);
			else
				depth -= max(depth / 8, 1U);
			break;
		case GREAT:
			if (other_status == AWFUL)
				depth /= 2;
			else
				depth -= max(depth / 4, 1U);
			break;
		case BAD:
			depth++;
			break;
		case AWFUL:
			if (other_status == GREAT)
				depth += 2;
			else
				depth++;
			break;
		}
	}

	depth = clamp(depth, 1U, kyber_depth[sched_domain]);
	if (depth != orig_depth)
		sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
}
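/*
 * For instance, if reads are GOOD while synchronous writes are AWFUL, the
 * read depth shrinks by a quarter and the sync write depth grows by one token
 * (by two if reads were GREAT), shifting device capacity toward the domain
 * that is missing its latency target.
 */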
/*
 * Adjust the depth of other requests given the status of reads and synchronous
 * writes. As long as either domain is doing fine, we don't throttle, but if
 * both domains are doing badly, we throttle heavily.
 */
static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
				     int read_status, int write_status,
				     bool have_samples)
{
	unsigned int orig_depth, depth;
	int status;

	orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;

	if (read_status == NONE && write_status == NONE) {
		depth += 2;
	} else if (have_samples) {
		if (read_status == NONE)
			status = write_status;
		else if (write_status == NONE)
			status = read_status;
		else
			status = max(read_status, write_status);
		switch (status) {
		case GREAT:
			depth += 2;
			break;
		case GOOD:
			depth++;
			break;
		case BAD:
			depth -= max(depth / 4, 1U);
			break;
		case AWFUL:
			depth /= 2;
			break;
		}
	}

	depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
	if (depth != orig_depth)
		sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
}
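/*
 * Because the better of the two statuses is used, asynchronous requests are
 * only throttled when both reads and synchronous writes miss their targets:
 * e.g. reads BAD and writes AWFUL takes max() = BAD and trims a quarter of
 * the KYBER_OTHER tokens, while reads GREAT and writes AWFUL grows them.
 */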
/*
 * Apply heuristics for limiting queue depths based on gathered latency
 * statistics.
 */
static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
{
	struct kyber_queue_data *kqd = cb->data;
	int read_status, write_status;

	read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
	write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);

	kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
	kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
	kyber_adjust_other_depth(kqd, read_status, write_status,
				 cb->stat[KYBER_OTHER].nr_samples != 0);

	/*
	 * Continue monitoring latencies if we aren't hitting the targets or
	 * we're still throttling other requests.
	 */
	if (!blk_stat_is_active(kqd->cb) &&
	    ((IS_BAD(read_status) || IS_BAD(write_status) ||
	      kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
		blk_stat_activate_msecs(kqd->cb, 100);
}
static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
{
	/*
	 * All of the hardware queues have the same depth, so we can just grab
	 * the shift of the first one.
	 */
	return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
}
static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
	struct kyber_queue_data *kqd;
	unsigned int max_tokens;
	unsigned int shift;
	int ret = -ENOMEM;
	int i;

	kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
	if (!kqd)
		goto err;
	kqd->q = q;

	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
					  KYBER_NUM_DOMAINS, kqd);
	if (!kqd->cb)
		goto err_kqd;

	/*
	 * The maximum number of tokens for any scheduling domain is at least
	 * the queue depth of a single hardware queue. If the hardware doesn't
	 * have many tags, still provide a reasonable number.
	 */
	max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
			   KYBER_MIN_DEPTH);
	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		WARN_ON(!kyber_depth[i]);
		WARN_ON(!kyber_batch_size[i]);
		ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
					      max_tokens, -1, false, GFP_KERNEL,
					      q->node);
		if (ret) {
			while (--i >= 0)
				sbitmap_queue_free(&kqd->domain_tokens[i]);
			goto err_cb;
		}
		sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
	}

	shift = kyber_sched_tags_shift(kqd);
	kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;

	kqd->read_lat_nsec = 2000000ULL;
	kqd->write_lat_nsec = 10000000ULL;

	return kqd;

err_cb:
	blk_stat_free_callback(kqd->cb);
err_kqd:
	kfree(kqd);
err:
	return ERR_PTR(ret);
}
static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct kyber_queue_data *kqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	kqd = kyber_queue_data_alloc(q);
	if (IS_ERR(kqd)) {
		kobject_put(&eq->kobj);
		return PTR_ERR(kqd);
	}

	eq->elevator_data = kqd;
	q->elevator = eq;

	blk_stat_add_callback(q, kqd->cb);

	return 0;
}
static void kyber_exit_sched(struct elevator_queue *e)
{
	struct kyber_queue_data *kqd = e->elevator_data;
	struct request_queue *q = kqd->q;
	int i;

	blk_stat_remove_callback(q, kqd->cb);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		sbitmap_queue_free(&kqd->domain_tokens[i]);
	blk_stat_free_callback(kqd->cb);
	kfree(kqd);
}
static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct kyber_hctx_data *khd;
	int i;

	khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
	if (!khd)
		return -ENOMEM;

	spin_lock_init(&khd->lock);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		INIT_LIST_HEAD(&khd->rqs[i]);
		INIT_LIST_HEAD(&khd->domain_wait[i].task_list);
		atomic_set(&khd->wait_index[i], 0);
	}

	khd->cur_domain = 0;
	khd->batching = 0;

	hctx->sched_data = khd;

	return 0;
}
static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	kfree(hctx->sched_data);
}
static int rq_get_domain_token(struct request *rq)
{
	return (long)rq->elv.priv[0];
}
static void rq_set_domain_token(struct request *rq, int token)
{
	rq->elv.priv[0] = (void *)(long)token;
}
static void rq_clear_domain_token(struct kyber_queue_data *kqd,
				  struct request *rq)
{
	unsigned int sched_domain;
	int nr;

	nr = rq_get_domain_token(rq);
	if (nr != -1) {
		sched_domain = rq_sched_domain(rq);
		sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
				    rq->mq_ctx->cpu);
	}
}
static struct request *kyber_get_request(struct request_queue *q,
					 unsigned int op,
					 struct blk_mq_alloc_data *data)
{
	struct kyber_queue_data *kqd = q->elevator->elevator_data;
	struct request *rq;

	/*
	 * We use the scheduler tags as per-hardware queue queueing tokens.
	 * Async requests can be limited at this stage.
	 */
	if (!op_is_sync(op))
		data->shallow_depth = kqd->async_depth;

	rq = __blk_mq_alloc_request(data, op);
	if (rq)
		rq_set_domain_token(rq, -1);
	return rq;
}
static void kyber_put_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct kyber_queue_data *kqd = q->elevator->elevator_data;

	rq_clear_domain_token(kqd, rq);
	blk_mq_finish_request(rq);
}
static void kyber_completed_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct kyber_queue_data *kqd = q->elevator->elevator_data;
	unsigned int sched_domain;
	u64 now, latency, target;

	/*
	 * Check if this request met our latency goal. If not, quickly gather
	 * some statistics and start throttling.
	 */
	sched_domain = rq_sched_domain(rq);
	switch (sched_domain) {
	case KYBER_READ:
		target = kqd->read_lat_nsec;
		break;
	case KYBER_SYNC_WRITE:
		target = kqd->write_lat_nsec;
		break;
	default:
		return;
	}

	/* If we are already monitoring latencies, don't check again. */
	if (blk_stat_is_active(kqd->cb))
		return;

	now = __blk_stat_time(ktime_to_ns(ktime_get()));
	if (now < blk_stat_time(&rq->issue_stat))
		return;

	latency = now - blk_stat_time(&rq->issue_stat);

	if (latency > target)
		blk_stat_activate_msecs(kqd->cb, 10);
}
static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
				  struct blk_mq_hw_ctx *hctx)
{
	LIST_HEAD(rq_list);
	struct request *rq, *next;

	blk_mq_flush_busy_ctxs(hctx, &rq_list);
	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
		unsigned int sched_domain;

		sched_domain = rq_sched_domain(rq);
		list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
	}
}
static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags,
			     void *key)
{
	struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);

	list_del_init(&wait->task_list);
	blk_mq_run_hw_queue(hctx, true);
	return 1;
}
static int kyber_get_domain_token(struct kyber_queue_data *kqd,
				  struct kyber_hctx_data *khd,
				  struct blk_mq_hw_ctx *hctx)
{
	unsigned int sched_domain = khd->cur_domain;
	struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
	wait_queue_t *wait = &khd->domain_wait[sched_domain];
	struct sbq_wait_state *ws;
	int nr;

	nr = __sbitmap_queue_get(domain_tokens);
	if (nr >= 0)
		return nr;

	/*
	 * If we failed to get a domain token, make sure the hardware queue is
	 * run when one becomes available. Note that this is serialized on
	 * khd->lock, but we still need to be careful about the waker.
	 */
	if (list_empty_careful(&wait->task_list)) {
		init_waitqueue_func_entry(wait, kyber_domain_wake);
		wait->private = hctx;
		ws = sbq_wait_ptr(domain_tokens,
				  &khd->wait_index[sched_domain]);
		add_wait_queue(&ws->wait, wait);

		/*
		 * Try again in case a token was freed before we got on the wait
		 * queue.
		 */
		nr = __sbitmap_queue_get(domain_tokens);
	}
	return nr;
}
static struct request *
kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
			  struct kyber_hctx_data *khd,
			  struct blk_mq_hw_ctx *hctx,
			  bool *flushed)
{
	struct list_head *rqs;
	struct request *rq;
	int nr;

	rqs = &khd->rqs[khd->cur_domain];
	rq = list_first_entry_or_null(rqs, struct request, queuelist);

	/*
	 * If there wasn't already a pending request and we haven't flushed the
	 * software queues yet, flush the software queues and check again.
	 */
	if (!rq && !*flushed) {
		kyber_flush_busy_ctxs(khd, hctx);
		*flushed = true;
		rq = list_first_entry_or_null(rqs, struct request, queuelist);
	}

	if (rq) {
		nr = kyber_get_domain_token(kqd, khd, hctx);
		if (nr >= 0) {
			khd->batching++;
			rq_set_domain_token(rq, nr);
			list_del_init(&rq->queuelist);
			return rq;
		}
	}

	/* There were either no pending requests or no tokens. */
	return NULL;
}
static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct kyber_hctx_data *khd = hctx->sched_data;
	bool flushed = false;
	struct request *rq;
	int i;

	spin_lock(&khd->lock);

	/*
	 * First, if we are still entitled to batch, try to dispatch a request
	 * from the batch.
	 */
	if (khd->batching < kyber_batch_size[khd->cur_domain]) {
		rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
		if (rq)
			goto out;
	}

	/*
	 * Either,
	 * 1. We were no longer entitled to a batch.
	 * 2. The domain we were batching didn't have any requests.
	 * 3. The domain we were batching was out of tokens.
	 *
	 * Start another batch. Note that this wraps back around to the original
	 * domain if no other domains have requests or tokens.
	 */
	khd->batching = 0;
	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
			khd->cur_domain = 0;
		else
			khd->cur_domain++;

		rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
		if (rq)
			goto out;
	}

	rq = NULL;
out:
	spin_unlock(&khd->lock);
	return rq;
}
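/*
 * Dispatch therefore proceeds in round-robin batches: for example, up to a
 * full read batch may be issued before the scheduler rotates to the
 * synchronous write domain and then to KYBER_OTHER, skipping any domain that
 * currently has no pending requests or no free tokens.
 */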
static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	int i;

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (!list_empty_careful(&khd->rqs[i]))
			return true;
	}
	return false;
}
#define KYBER_LAT_SHOW_STORE(op)					\
static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,		\
				     char *page)			\
{									\
	struct kyber_queue_data *kqd = e->elevator_data;		\
									\
	return sprintf(page, "%llu\n", kqd->op##_lat_nsec);		\
}									\
									\
static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,	\
				      const char *page, size_t count)	\
{									\
	struct kyber_queue_data *kqd = e->elevator_data;		\
	unsigned long long nsec;					\
	int ret;							\
									\
	ret = kstrtoull(page, 10, &nsec);				\
	if (ret)							\
		return ret;						\
									\
	kqd->op##_lat_nsec = nsec;					\
									\
	return count;							\
}
KYBER_LAT_SHOW_STORE(read);
KYBER_LAT_SHOW_STORE(write);
#undef KYBER_LAT_SHOW_STORE
#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
static struct elv_fs_entry kyber_sched_attrs[] = {
	KYBER_LAT_ATTR(read),
	KYBER_LAT_ATTR(write),
	__ATTR_NULL
};
#undef KYBER_LAT_ATTR
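/*
 * Once the scheduler is selected, these attributes appear under the queue's
 * iosched directory in sysfs. For example (device name is illustrative):
 *
 *   echo kyber > /sys/block/nvme0n1/queue/scheduler
 *   echo 10000000 > /sys/block/nvme0n1/queue/iosched/write_lat_nsec
 *   cat /sys/block/nvme0n1/queue/iosched/read_lat_nsec
 */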
static struct elevator_type kyber_sched = {
	.ops.mq = {
		.init_sched = kyber_init_sched,
		.exit_sched = kyber_exit_sched,
		.init_hctx = kyber_init_hctx,
		.exit_hctx = kyber_exit_hctx,
		.get_request = kyber_get_request,
		.put_request = kyber_put_request,
		.completed_request = kyber_completed_request,
		.dispatch_request = kyber_dispatch_request,
		.has_work = kyber_has_work,
	},
	.uses_mq = true,
	.elevator_attrs = kyber_sched_attrs,
	.elevator_name = "kyber",
	.elevator_owner = THIS_MODULE,
};
static int __init kyber_init(void)
{
	return elv_register(&kyber_sched);
}

static void __exit kyber_exit(void)
{
	elv_unregister(&kyber_sched);
}

module_init(kyber_init);
module_exit(kyber_exit);
MODULE_AUTHOR("Omar Sandoval");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Kyber I/O scheduler");