/*
 * Budget Fair Queueing (BFQ) I/O scheduler.
 *
 * Based on ideas and code from CFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini@google.com>
 *
 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BFQ is a proportional-share I/O scheduler, with some extra
 * low-latency capabilities. BFQ also supports full hierarchical
 * scheduling through cgroups. The next paragraphs provide an
 * introduction on BFQ inner workings. Details on BFQ benefits, usage
 * and limitations can be found in Documentation/block/bfq-iosched.txt.
 *
 * BFQ is a proportional-share storage-I/O scheduling algorithm based
 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
 * budgets, measured in number of sectors, to processes instead of
 * time slices. The device is not granted to the in-service process
 * for a given time slice, but until it has exhausted its assigned
 * budget. This change from the time to the service domain enables BFQ
 * to distribute the device throughput among processes as desired,
 * without any distortion due to throughput fluctuations, or to device
 * internal queueing. BFQ uses an ad hoc internal scheduler, called
 * B-WF2Q+, to schedule processes according to their budgets. More
 * precisely, BFQ schedules queues associated with processes. Each
 * process/queue is assigned a user-configurable weight, and B-WF2Q+
 * guarantees that each queue receives a fraction of the throughput
 * proportional to its weight. Thanks to the accurate policy of
 * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
 * processes issuing sequential requests (to boost the throughput),
 * and yet guarantee a low latency to interactive and soft real-time
 * applications.
 *
 * In particular, to provide these low-latency guarantees, BFQ
 * explicitly privileges the I/O of two classes of time-sensitive
 * applications: interactive and soft real-time. This feature enables
 * BFQ to provide applications in these classes with a very low
 * latency. Finally, BFQ also features additional heuristics for
 * preserving both a low latency and a high throughput on NCQ-capable,
 * rotational or flash-based devices, and to get the job done quickly
 * for applications consisting of many I/O-bound processes.
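 *
 * As a simple illustration of this service-domain guarantee (the
 * numbers are hypothetical, not taken from any measurement): if two
 * continuously backlogged queues are assigned weights 100 and 200,
 * then B-WF2Q+ interleaves their budgets so that, over time, the
 * first queue receives about 1/3 of the sectors transferred by the
 * device and the second about 2/3, regardless of how fast the device
 * happens to serve each individual budget.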
 *
 * BFQ is described in [1], where also a reference to the initial, more
 * theoretical paper on BFQ can be found. The interested reader can find
 * in the latter paper full details on the main algorithm, as well as
 * formulas of the guarantees and formal proofs of all the properties.
 * With respect to the version of BFQ presented in these papers, this
 * implementation adds a few more heuristics, such as the one that
 * guarantees a low latency to soft real-time applications, and a
 * hierarchical extension based on H-WF2Q+.
 *
 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
 * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
 * with O(log N) complexity derives from the one introduced with EEVDF
 * in [3].
 *
 * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
 *     Scheduler", Proceedings of the First Workshop on Mobile System
 *     Technologies (MST-2015), May 2015.
 *     http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
 *
 * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
 *     Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
 *     Oct 1997.
 *     http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
 *
 * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
 *     First: A Flexible and Accurate Mechanism for Proportional Share
 *     Resource Allocation", technical report.
 *     http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>

#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>
#include <linux/blk-cgroup.h>
#define BFQ_IOPRIO_CLASSES	3
#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

#define BFQ_MIN_WEIGHT			1
#define BFQ_MAX_WEIGHT			1000
#define BFQ_WEIGHT_CONVERSION_COEFF	10

#define BFQ_DEFAULT_QUEUE_IOPRIO	4

#define BFQ_WEIGHT_LEGACY_DFL	100
#define BFQ_DEFAULT_GRP_IOPRIO	0
#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE
/**
 * struct bfq_service_tree - per ioprio_class service tree.
 *
 * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
 * ioprio_class has its own independent scheduler, and so its own
 * bfq_service_tree.  All the fields are protected by the queue lock
 * of the containing bfqd.
 */
struct bfq_service_tree {
	/* tree for active entities (i.e., those backlogged) */
	struct rb_root active;
	/* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
	struct rb_root idle;

	/* idle entity with minimum F_i */
	struct bfq_entity *first_idle;
	/* idle entity with maximum F_i */
	struct bfq_entity *last_idle;

	/* scheduler virtual time */
	u64 vtime;
	/* scheduler weight sum; active and idle entities contribute to it */
	unsigned long wsum;
};
/**
 * struct bfq_sched_data - multi-class scheduler.
 *
 * bfq_sched_data is the basic scheduler queue.  It supports three
 * ioprio_classes, and can be used either as a toplevel queue or as an
 * intermediate queue on a hierarchical setup.  @next_in_service
 * points to the active entity of the sched_data service trees that
 * will be scheduled next. It is used to reduce the number of steps
 * needed for each hierarchical-schedule update.
 *
 * The supported ioprio_classes are the same as in CFQ, in descending
 * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
 * Requests from higher priority queues are served before all the
 * requests from lower priority queues; among requests of the same
 * queue requests are served according to B-WF2Q+.
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_sched_data {
	/* entity in service */
	struct bfq_entity *in_service_entity;
	/* head-of-line entity (see comments above) */
	struct bfq_entity *next_in_service;
	/* array of service trees, one per ioprio_class */
	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
	/* last time CLASS_IDLE was served */
	unsigned long bfq_class_idle_last_service;
};
/**
 * struct bfq_entity - schedulable entity.
 *
 * A bfq_entity is used to represent either a bfq_queue (leaf node in the
 * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each
 * entity belongs to the sched_data of the parent group in the cgroup
 * hierarchy.  Non-leaf entities have also their own sched_data, stored
 * in @my_sched_data.
 *
 * Each entity stores independently its priority values; this would
 * allow different weights on different devices, but this
 * functionality is not exported to userspace by now.  Priorities and
 * weights are updated lazily, first storing the new values into the
 * new_* fields, then setting the @prio_changed flag.  As soon as
 * there is a transition in the entity state that allows the priority
 * update to take place the effective and the requested priority
 * values are synchronized.
 *
 * Unless cgroups are used, the weight value is calculated from the
 * ioprio to export the same interface as CFQ.  When dealing with
 * ``well-behaved'' queues (i.e., queues that do not spend too much
 * time to consume their budget and have true sequential behavior, and
 * when there are no external factors breaking anticipation) the
 * relative weights at each level of the cgroups hierarchy should be
 * guaranteed.  All the fields are protected by the queue lock of the
 * containing bfqd.
 */
struct bfq_entity {
	/* service_tree member */
	struct rb_node rb_node;

	/*
	 * Flag, true if the entity is on a tree (either the active or
	 * the idle one of its service_tree) or is in service.
	 */
	bool on_st;

	/* B-WF2Q+ start and finish timestamps [sectors/weight] */
	u64 start, finish;

	/* tree the entity is enqueued into; %NULL if not on a tree */
	struct rb_root *tree;

	/*
	 * minimum start time of the (active) subtree rooted at this
	 * entity; used for O(log N) lookups into active trees
	 */
	u64 min_start;

	/* amount of service received during the last service slot */
	int service;

	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
	int budget;

	/* weight of the queue */
	int weight;
	/* next weight if a change is in progress */
	int new_weight;

	/* original weight, used to implement weight boosting */
	int orig_weight;

	/* parent entity, for hierarchical scheduling */
	struct bfq_entity *parent;

	/*
	 * For non-leaf nodes in the hierarchy, the associated
	 * scheduler queue, %NULL on leaf nodes.
	 */
	struct bfq_sched_data *my_sched_data;
	/* the scheduler queue this entity belongs to */
	struct bfq_sched_data *sched_data;

	/* flag, set to request a weight, ioprio or ioprio_class change */
	int prio_changed;
};
/**
 * struct bfq_ttime - per process thinktime stats.
 */
struct bfq_ttime {
	/* completion time of the last request */
	u64 last_end_request;

	/* total process thinktime */
	u64 ttime_total;
	/* number of thinktime samples */
	unsigned long ttime_samples;
	/* average process thinktime */
	u64 ttime_mean;
};
/**
 * struct bfq_queue - leaf schedulable entity.
 *
 * A bfq_queue is a leaf request queue; it can be associated with an
 * io_context or more, if it is async. @cgroup holds a reference to
 * the cgroup, to be sure that it does not disappear while a bfqq
 * still references it (mostly to avoid races between request issuing
 * and task migration followed by cgroup destruction). All the fields
 * are protected by the queue lock of the containing bfqd.
 */
struct bfq_queue {
	/* reference counter */
	int ref;
	/* parent bfq_data */
	struct bfq_data *bfqd;

	/* current ioprio and ioprio class */
	unsigned short ioprio, ioprio_class;
	/* next ioprio and ioprio class if a change is in progress */
	unsigned short new_ioprio, new_ioprio_class;

	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* number of sync and async requests queued */
	int queued[2];
	/* number of requests currently allocated */
	int allocated;
	/* number of pending metadata requests */
	int meta_pending;
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* entity representing this queue in the scheduler */
	struct bfq_entity entity;

	/* maximum budget allowed from the feedback mechanism */
	int max_budget;
	/* budget expiration (in jiffies) */
	unsigned long budget_timeout;

	/* number of requests on the dispatch list or inside driver */
	int dispatched;

	/* status flags */
	unsigned long flags;

	/* node for active/idle bfqq list inside parent bfqd */
	struct list_head bfqq_list;

	/* associated @bfq_ttime struct */
	struct bfq_ttime ttime;

	/* bit vector: a 1 for each seeky requests in history */
	u32 seek_history;
	/* position of the last request enqueued */
	sector_t last_request_pos;

	/* Number of consecutive pairs of request completion and
	 * arrival, such that the queue becomes idle after the
	 * completion, but the next request arrives within an idle
	 * time slice; used only if the queue's IO_bound flag has been
	 * set.
	 */
	unsigned int requests_within_timer;

	/* pid of the process owning the queue, used for logging purposes */
	pid_t pid;
};
/**
 * struct bfq_io_cq - per (request_queue, io_context) structure.
 */
struct bfq_io_cq {
	/* associated io_cq structure */
	struct io_cq icq; /* must be the first member */
	/* array of two process queues, the sync and the async */
	struct bfq_queue *bfqq[2];
	/* per (request_queue, blkcg) ioprio */
	int ioprio;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	uint64_t blkcg_serial_nr; /* the current blkcg serial */
#endif
};
/**
 * struct bfq_data - per-device data structure.
 *
 * All the fields are protected by @lock.
 */
struct bfq_data {
	/* device request queue */
	struct request_queue *queue;
	/* dispatch queue */
	struct list_head dispatch;

	/* root bfq_group for the device */
	struct bfq_group *root_group;

	/*
	 * Number of bfq_queues containing requests (including the
	 * queue in service, even if it is idling).
	 */
	int busy_queues;
	/* number of queued requests */
	int queued;
	/* number of requests dispatched and waiting for completion */
	int rq_in_driver;

	/*
	 * Maximum number of requests in driver in the last
	 * @hw_tag_samples completed requests.
	 */
	int max_rq_in_driver;
	/* number of samples used to calculate hw_tag */
	int hw_tag_samples;
	/* flag set to one if the driver is showing a queueing behavior */
	int hw_tag;

	/* number of budgets assigned */
	int budgets_assigned;

	/*
	 * Timer set when idling (waiting) for the next request from
	 * the queue in service.
	 */
	struct hrtimer idle_slice_timer;

	/* bfq_queue in service */
	struct bfq_queue *in_service_queue;
	/* bfq_io_cq (bic) associated with the @in_service_queue */
	struct bfq_io_cq *in_service_bic;

	/* on-disk position of the last served request */
	sector_t last_position;

	/* beginning of the last budget */
	ktime_t last_budget_start;
	/* beginning of the last idle slice */
	ktime_t last_idling_start;
	/* number of samples used to calculate @peak_rate */
	int peak_rate_samples;
	/*
	 * Peak read/write rate, observed during the service of a
	 * budget [BFQ_RATE_SHIFT * sectors/usec]. The value is
	 * left-shifted by BFQ_RATE_SHIFT to increase precision in
	 * fixed-point calculations.
	 */
	u32 peak_rate;
	/* maximum budget allotted to a bfq_queue before rescheduling */
	int bfq_max_budget;

	/* list of all the bfq_queues active on the device */
	struct list_head active_list;
	/* list of all the bfq_queues idle on the device */
	struct list_head idle_list;

	/*
	 * Timeout for async/sync requests; when it fires, requests
	 * are served in fifo order.
	 */
	u64 bfq_fifo_expire[2];
	/* weight of backward seeks wrt forward ones */
	unsigned int bfq_back_penalty;
	/* maximum allowed backward seek */
	unsigned int bfq_back_max;
	/* maximum idling time */
	u64 bfq_slice_idle;

	/* user-configured max budget value (0 for auto-tuning) */
	int bfq_user_max_budget;
	/*
	 * Timeout for bfq_queues to consume their budget; used to
	 * prevent seeky queues from imposing long latencies to
	 * sequential or quasi-sequential ones (this also implies that
	 * seeky queues cannot receive guarantees in the service
	 * domain; after a timeout they are charged for the time they
	 * have been in service, to preserve fairness among them, but
	 * without service-domain guarantees).
	 */
	unsigned int bfq_timeout;

	/*
	 * Number of consecutive requests that must be issued within
	 * the idle time slice to set again idling to a queue which
	 * was marked as non-I/O-bound (see the definition of the
	 * IO_bound flag for further details).
	 */
	unsigned int bfq_requests_within_timer;

	/*
	 * Force device idling whenever needed to provide accurate
	 * service guarantees, without caring about throughput
	 * issues. CAVEAT: this may even increase latencies, in case
	 * of useless idling for processes that did stop doing I/O.
	 */
	bool strict_guarantees;

	/* fallback dummy bfqq for extreme OOM conditions */
	struct bfq_queue oom_bfqq;

	spinlock_t lock;

	/*
	 * bic associated with the task issuing current bio for
	 * merging. This and the next field are used as a support to
	 * be able to perform the bic lookup, needed by bio-merge
	 * functions, before the scheduler lock is taken, and thus
	 * avoid taking the request-queue lock while the scheduler
	 * lock is being held.
	 */
	struct bfq_io_cq *bio_bic;
	/* bfqq associated with the task issuing current bio for merging */
	struct bfq_queue *bio_bfqq;
};
enum bfqq_state_flags {
	BFQQF_busy = 0,		/* has requests or is in service */
	BFQQF_wait_request,	/* waiting for a request */
	BFQQF_non_blocking_wait_rq, /*
				     * waiting for a request
				     * without idling the device
				     */
	BFQQF_fifo_expire,	/* FIFO checked in this slice */
	BFQQF_idle_window,	/* slice idling enabled */
	BFQQF_sync,		/* synchronous queue */
	BFQQF_budget_new,	/* no completion with this budget */
	BFQQF_IO_bound,		/*
				 * bfqq has timed-out at least once
				 * having consumed at most 2/10 of
				 * its budget
				 */
};

#define BFQ_BFQQ_FNS(name)						\
static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__set_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__clear_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
{									\
	return test_bit(BFQQF_##name, &(bfqq)->flags);			\
}

BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(non_blocking_wait_rq);
BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(idle_window);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(budget_new);
BFQ_BFQQ_FNS(IO_bound);
/* Logging facilities. */
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
			  __pbuf, ##args);				\
} while (0)

#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
} while (0)

#else /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\
			bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)

#endif /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log(bfqd, fmt, args...) \
	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
/* Expiration reasons. */
enum bfqq_expiration {
	BFQQE_TOO_IDLE = 0,		/*
					 * queue has been idling for
					 * too long
					 */
	BFQQE_BUDGET_TIMEOUT,	/* budget took too long to be used */
	BFQQE_BUDGET_EXHAUSTED,	/* budget consumed */
	BFQQE_NO_MORE_REQUESTS,	/* the queue has no more requests */
	BFQQE_PREEMPTED		/* preemption in progress */
};
struct bfqg_stats {
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	/* number of ios merged */
	struct blkg_rwstat merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat queued;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	uint64_t start_group_wait_time;
	uint64_t start_idle_time;
	uint64_t start_empty_time;
#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED

/*
 * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
 *
 * @ps: @blkcg_policy_storage that this structure inherits
 * @weight: weight of the bfq_group
 */
struct bfq_group_data {
	/* must be the first member */
	struct blkcg_policy_data pd;

	unsigned short weight;
};

/**
 * struct bfq_group - per (device, cgroup) data structure.
 * @entity: schedulable entity to insert into the parent group sched_data.
 * @sched_data: own sched_data, to contain child entities (they may be
 *              both bfq_queues and bfq_groups).
 * @bfqd: the bfq_data for the device this group acts upon.
 * @async_bfqq: array of async queues for all the tasks belonging to
 *              the group, one queue per ioprio value per ioprio_class,
 *              except for the idle class that has only one queue.
 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
 *             to avoid too many special cases during group creation/
 *             migration.
 * @stats: stats for this bfqg.
 *
 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
 * there is a set of bfq_groups, each one collecting the lower-level
 * entities belonging to the group that are acting on the same device.
 *
 * Locking works as follows:
 *    o @bfqd is protected by the queue lock, RCU is used to access it
 *      from the readers.
 *    o All the other fields are protected by the @bfqd queue lock.
 */
struct bfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	struct bfq_entity entity;
	struct bfq_sched_data sched_data;

	void *bfqd;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct bfq_entity *my_entity;

	struct bfqg_stats stats;
};

#else /* CONFIG_BFQ_GROUP_IOSCHED */

struct bfq_group {
	struct bfq_sched_data sched_data;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct rb_root rq_pos_tree;
};
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);

static unsigned int bfq_class_idx(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	return bfqq ? bfqq->ioprio_class - 1 :
		BFQ_DEFAULT_GRP_CLASS - 1;
}

static struct bfq_service_tree *
bfq_entity_service_tree(struct bfq_entity *entity)
{
	struct bfq_sched_data *sched_data = entity->sched_data;
	unsigned int idx = bfq_class_idx(entity);

	return sched_data->service_tree + idx;
}

static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	return bic->bfqq[is_sync];
}

static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
			 bool is_sync)
{
	bic->bfqq[is_sync] = bfqq;
}

static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
{
	return bic->icq.q->elevator->elevator_data;
}

static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
static void bfq_put_queue(struct bfq_queue *bfqq);
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic);
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
/* Expiration time of sync (0) and async (1) requests, in ns. */
static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };

/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
static const int bfq_back_max = 16 * 1024;

/* Penalty of a backwards seek, in number of sectors. */
static const int bfq_back_penalty = 2;

/* Idling period duration, in ns. */
static u64 bfq_slice_idle = NSEC_PER_SEC / 125;

/* Minimum number of assigned budgets for which stats are safe to compute. */
static const int bfq_stats_min_budgets = 194;

/* Default maximum budget values, in sectors and number of requests. */
static const int bfq_default_max_budget = 16 * 1024;

/* Default timeout values, in jiffies, approximating CFQ defaults. */
static const int bfq_timeout = HZ / 8;

static struct kmem_cache *bfq_pool;

/* Below this threshold (in ms), we consider thinktime immediate. */
#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)

/* hw_tag detection: parallel requests threshold and min samples needed. */
#define BFQ_HW_QUEUE_THRESHOLD	4
#define BFQ_HW_QUEUE_SAMPLES	32

#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)

/* Min samples used for peak rate estimation (for autotuning). */
#define BFQ_PEAK_RATE_SAMPLES	32

/* Shift used for peak rate fixed precision calculations. */
#define BFQ_RATE_SHIFT		16

#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
/**
 * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
 * @icq: the iocontext queue.
 */
static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
{
	/* bic->icq is the first member, %NULL will convert to %NULL */
	return container_of(icq, struct bfq_io_cq, icq);
}

/**
 * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
 * @bfqd: the lookup key.
 * @ioc: the io_context of the process doing I/O.
 * @q: the request queue.
 */
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
					struct io_context *ioc,
					struct request_queue *q)
{
	if (ioc) {
		unsigned long flags;
		struct bfq_io_cq *icq;

		spin_lock_irqsave(q->queue_lock, flags);
		icq = icq_to_bic(ioc_lookup_icq(ioc, q));
		spin_unlock_irqrestore(q->queue_lock, flags);

		return icq;
	}

	return NULL;
}
/*
 * Scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing.
 */
static void bfq_schedule_dispatch(struct bfq_data *bfqd)
{
	if (bfqd->queued != 0) {
		bfq_log(bfqd, "schedule dispatch");
		blk_mq_run_hw_queues(bfqd->queue, true);
	}
}

/**
 * bfq_gt - compare two timestamps.
 *
 * Return @a > @b, dealing with wrapping correctly.
 */
static int bfq_gt(u64 a, u64 b)
{
	return (s64)(a - b) > 0;
}
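
/*
 * For instance (hypothetical values, only to illustrate the wrapping
 * arithmetic above): if the timestamps have wrapped around so that
 * a == 1 and b == ULLONG_MAX, then a - b == 2 modulo 2^64, and the
 * signed comparison correctly reports a > b, whereas a plain
 * unsigned comparison would not.
 */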
static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
{
	struct rb_node *node = tree->rb_node;

	return rb_entry(node, struct bfq_entity, rb_node);
}

static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
/*
 * bfq_update_next_in_service - update sd->next_in_service
 * @sd: sched_data for which to perform the update.
 * @new_entity: if not NULL, pointer to the entity whose activation,
 *		requeueing or repositioning triggered the invocation of
 *		this function.
 *
 * This function is called to update sd->next_in_service, which, in
 * its turn, may change as a consequence of the insertion or
 * extraction of an entity into/from one of the active trees of
 * sd. These insertions/extractions occur as a consequence of
 * activations/deactivations of entities, with some activations being
 * 'true' activations, and other activations being requeueings (i.e.,
 * implementing the second, requeueing phase of the mechanism used to
 * reposition an entity in its active tree; see comments on
 * __bfq_activate_entity and __bfq_requeue_entity for details). In
 * both the last two activation sub-cases, new_entity points to the
 * just activated or requeued entity.
 *
 * Returns true if sd->next_in_service changes in such a way that
 * entity->parent may become the next_in_service for its parent
 * entity.
 */
static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
				       struct bfq_entity *new_entity)
{
	struct bfq_entity *next_in_service = sd->next_in_service;
	bool parent_sched_may_change = false;

	/*
	 * If this update is triggered by the activation, requeueing
	 * or repositioning of an entity that does not coincide with
	 * sd->next_in_service, then a full lookup in the active tree
	 * can be avoided. In fact, it is enough to check whether the
	 * just-modified entity has a higher priority than
	 * sd->next_in_service, or, even if it has the same priority
	 * as sd->next_in_service, is eligible and has a lower virtual
	 * finish time than sd->next_in_service. If this compound
	 * condition holds, then the new entity becomes the new
	 * next_in_service. Otherwise no change is needed.
	 */
	if (new_entity && new_entity != sd->next_in_service) {
		/*
		 * Flag used to decide whether to replace
		 * sd->next_in_service with new_entity. Tentatively
		 * set to true, and left as true if
		 * sd->next_in_service is NULL.
		 */
		bool replace_next = true;

		/*
		 * If there is already a next_in_service candidate
		 * entity, then compare class priorities or timestamps
		 * to decide whether to replace sd->service_tree with
		 * new_entity.
		 */
		if (next_in_service) {
			unsigned int new_entity_class_idx =
				bfq_class_idx(new_entity);
			struct bfq_service_tree *st =
				sd->service_tree + new_entity_class_idx;

			/*
			 * For efficiency, evaluate the most likely
			 * sub-condition first.
			 */
			replace_next =
				(new_entity_class_idx ==
				 bfq_class_idx(next_in_service)
				 &&
				 !bfq_gt(new_entity->start, st->vtime)
				 &&
				 bfq_gt(next_in_service->finish,
					new_entity->finish))
				||
				new_entity_class_idx <
				bfq_class_idx(next_in_service);
		}

		if (replace_next)
			next_in_service = new_entity;
	} else /* invoked because of a deactivation: lookup needed */
		next_in_service = bfq_lookup_next_entity(sd);

	if (next_in_service) {
		parent_sched_may_change = !sd->next_in_service ||
			bfq_update_parent_budget(next_in_service);
	}

	sd->next_in_service = next_in_service;

	if (!next_in_service)
		return parent_sched_may_change;

	return parent_sched_may_change;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED
/* both next loops stop at one of the child entities of the root group */
#define for_each_entity(entity)	\
	for (; entity ; entity = entity->parent)

/*
 * For each iteration, compute parent in advance, so as to be safe if
 * entity is deallocated during the iteration. Such a deallocation may
 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
 * containing entity.
 */
#define for_each_entity_safe(entity, parent) \
	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)

/*
 * Returns true if this budget changes may let next_in_service->parent
 * become the next_in_service entity for its parent entity.
 */
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	struct bfq_entity *bfqg_entity;
	struct bfq_group *bfqg;
	struct bfq_sched_data *group_sd;
	bool ret = false;

	group_sd = next_in_service->sched_data;

	bfqg = container_of(group_sd, struct bfq_group, sched_data);
	/*
	 * bfq_group's my_entity field is not NULL only if the group
	 * is not the root group. We must not touch the root entity
	 * as it must never become an in-service entity.
	 */
	bfqg_entity = bfqg->my_entity;
	if (bfqg_entity) {
		if (bfqg_entity->budget > next_in_service->budget)
			ret = true;
		bfqg_entity->budget = next_in_service->budget;
	}

	return ret;
}

/*
 * This function tells whether entity stops being a candidate for next
 * service, according to the following logic.
 *
 * This function is invoked for an entity that is about to be set in
 * service. If such an entity is a queue, then the entity is no longer
 * a candidate for next service (i.e., a candidate entity to serve
 * after the in-service entity is expired). The function then returns
 * true.
 */
static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	if (bfq_entity_to_bfqq(entity))
		return true;

	return false;
}

#else /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Next two macros are fake loops when cgroups support is not
 * enabled. In fact, in such a case, there is only one level to go up
 * (to reach the root group).
 */
#define for_each_entity(entity)	\
	for (; entity ; entity = NULL)

#define for_each_entity_safe(entity, parent) \
	for (parent = NULL; entity ; entity = parent)

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	return false;
}

static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	return true;
}

#endif /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Shift for timestamp calculations.  This actually limits the maximum
 * service allowed in one timestamp delta (small shift values increase it),
 * the maximum total weight that can be used for the queues in the system
 * (big shift values increase it), and the period of virtual time
 * wraparounds.
 */
#define WFQ_SERVICE_SHIFT	22

static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = NULL;

	if (!entity->my_sched_data)
		bfqq = container_of(entity, struct bfq_queue, entity);

	return bfqq;
}


/**
 * bfq_delta - map service into the virtual time domain.
 * @service: amount of service.
 * @weight: scale factor (weight of an entity or weight sum).
 */
static u64 bfq_delta(unsigned long service, unsigned long weight)
{
	u64 d = (u64)service << WFQ_SERVICE_SHIFT;

	do_div(d, weight);
	return d;
}

/**
 * bfq_calc_finish - assign the finish time to an entity.
 * @entity: the entity to act upon.
 * @service: the service to be charged to the entity.
 */
static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->finish = entity->start +
		bfq_delta(service, entity->weight);

	if (bfqq) {
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: serv %lu, w %d",
			service, entity->weight);
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: start %llu, finish %llu, delta %llu",
			entity->start, entity->finish,
			bfq_delta(service, entity->weight));
	}
}
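
/*
 * Worked example (hypothetical numbers, only to illustrate the
 * fixed-point arithmetic above): with WFQ_SERVICE_SHIFT equal to 22,
 * charging a service of 2048 sectors to an entity with weight 8
 * yields bfq_delta(2048, 8) = (2048 << 22) / 8 = 2^30, so the finish
 * timestamp advances by 2^30 units of virtual time. Doubling the
 * weight to 16 halves that advance, which is how a larger weight
 * translates into a larger share of service.
 */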
/**
 * bfq_entity_of - get an entity from a node.
 * @node: the node field of the entity.
 *
 * Convert a node pointer to the relative entity.  This is used only
 * to simplify the logic of some functions and not as the generic
 * conversion mechanism because, e.g., in the tree walking functions,
 * the check for a %NULL value would be redundant.
 */
static struct bfq_entity *bfq_entity_of(struct rb_node *node)
{
	struct bfq_entity *entity = NULL;

	if (node)
		entity = rb_entry(node, struct bfq_entity, rb_node);

	return entity;
}

/**
 * bfq_extract - remove an entity from a tree.
 * @root: the tree root.
 * @entity: the entity to remove.
 */
static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
{
	entity->tree = NULL;
	rb_erase(&entity->rb_node, root);
}

/**
 * bfq_idle_extract - extract an entity from the idle tree.
 * @st: the service tree of the owning @entity.
 * @entity: the entity being removed.
 */
static void bfq_idle_extract(struct bfq_service_tree *st,
			     struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *next;

	if (entity == st->first_idle) {
		next = rb_next(&entity->rb_node);
		st->first_idle = bfq_entity_of(next);
	}

	if (entity == st->last_idle) {
		next = rb_prev(&entity->rb_node);
		st->last_idle = bfq_entity_of(next);
	}

	bfq_extract(&st->idle, entity);

	if (bfqq)
		list_del(&bfqq->bfqq_list);
}
/**
 * bfq_insert - generic tree insertion.
 * @root: tree root.
 * @entity: entity to insert.
 *
 * This is used for the idle and the active tree, since they are both
 * ordered by finish time.
 */
static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
{
	struct bfq_entity *entry;
	struct rb_node **node = &root->rb_node;
	struct rb_node *parent = NULL;

	while (*node) {
		parent = *node;
		entry = rb_entry(parent, struct bfq_entity, rb_node);

		if (bfq_gt(entry->finish, entity->finish))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}

	rb_link_node(&entity->rb_node, parent, node);
	rb_insert_color(&entity->rb_node, root);

	entity->tree = root;
}
/**
 * bfq_update_min - update the min_start field of an entity.
 * @entity: the entity to update.
 * @node: one of its children.
 *
 * This function is called when @entity may store an invalid value for
 * min_start due to updates to the active tree.  The function assumes
 * that the subtree rooted at @node (which may be its left or its right
 * child) has a valid min_start value.
 */
static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
{
	struct bfq_entity *child;

	if (node) {
		child = rb_entry(node, struct bfq_entity, rb_node);
		if (bfq_gt(entity->min_start, child->min_start))
			entity->min_start = child->min_start;
	}
}

/**
 * bfq_update_active_node - recalculate min_start.
 * @node: the node to update.
 *
 * @node may have changed position or one of its children may have moved,
 * this function updates its min_start value.  The left and right subtrees
 * are assumed to hold a correct min_start value.
 */
static void bfq_update_active_node(struct rb_node *node)
{
	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);

	entity->min_start = entity->start;
	bfq_update_min(entity, node->rb_right);
	bfq_update_min(entity, node->rb_left);
}

/**
 * bfq_update_active_tree - update min_start for the whole active tree.
 * @node: the starting node.
 *
 * @node must be the deepest modified node after an update.  This function
 * updates its min_start using the values held by its children, assuming
 * that they did not change, and then updates all the nodes that may have
 * changed in the path to the root.  The only nodes that may have changed
 * are the ones in the path or their siblings.
 */
static void bfq_update_active_tree(struct rb_node *node)
{
	struct rb_node *parent;

up:
	bfq_update_active_node(node);

	parent = rb_parent(node);
	if (!parent)
		return;

	if (node == parent->rb_left && parent->rb_right)
		bfq_update_active_node(parent->rb_right);
	else if (parent->rb_left)
		bfq_update_active_node(parent->rb_left);

	node = parent;
	goto up;
}
/**
 * bfq_active_insert - insert an entity in the active tree of its
 *                     group/device.
 * @st: the service tree of the entity.
 * @entity: the entity being inserted.
 *
 * The active tree is ordered by finish time, but an extra key is kept
 * per each node, containing the minimum value for the start times of
 * its children (and the node itself), so it's possible to search for
 * the eligible node with the lowest finish time in logarithmic time.
 */
static void bfq_active_insert(struct bfq_service_tree *st,
			      struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node = &entity->rb_node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	bfq_insert(&st->active, entity);

	if (node->rb_left)
		node = node->rb_left;
	else if (node->rb_right)
		node = node->rb_right;

	bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
}

/**
 * bfq_ioprio_to_weight - calc a weight from an ioprio.
 * @ioprio: the ioprio value to convert.
 */
static unsigned short bfq_ioprio_to_weight(int ioprio)
{
	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
}
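
/*
 * For instance, with IOPRIO_BE_NR equal to 8 and the conversion
 * coefficient equal to 10, the default ioprio 4 maps to weight
 * (8 - 4) * 10 = 40, while the highest best-effort priority, ioprio
 * 0, maps to weight 80. These figures are purely an illustration of
 * the formula above, not a tuning recommendation.
 */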
/**
 * bfq_weight_to_ioprio - calc an ioprio from a weight.
 * @weight: the weight value to convert.
 *
 * To preserve as much as possible the old only-ioprio user interface,
 * 0 is used as an escape ioprio value for weights (numerically) equal or
 * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
 */
static unsigned short bfq_weight_to_ioprio(int weight)
{
	return max_t(int, 0,
		     IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
}

static void bfq_get_entity(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	if (bfqq) {
		bfqq->ref++;
		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
			     bfqq, bfqq->ref);
	}
}
/**
 * bfq_find_deepest - find the deepest node that an extraction can modify.
 * @node: the node being removed.
 *
 * Do the first step of an extraction in an rb tree, looking for the
 * node that will replace @node, and returning the deepest node that
 * the following modifications to the tree can touch.  If @node is the
 * last node in the tree return %NULL.
 */
static struct rb_node *bfq_find_deepest(struct rb_node *node)
{
	struct rb_node *deepest;

	if (!node->rb_right && !node->rb_left)
		deepest = rb_parent(node);
	else if (!node->rb_right)
		deepest = node->rb_left;
	else if (!node->rb_left)
		deepest = node->rb_right;
	else {
		deepest = rb_next(node);
		if (deepest->rb_right)
			deepest = deepest->rb_right;
		else if (rb_parent(deepest) != node)
			deepest = rb_parent(deepest);
	}

	return deepest;
}
/**
 * bfq_active_extract - remove an entity from the active tree.
 * @st: the service_tree containing the tree.
 * @entity: the entity being removed.
 */
static void bfq_active_extract(struct bfq_service_tree *st,
			       struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	node = bfq_find_deepest(&entity->rb_node);
	bfq_extract(&st->active, entity);

	if (node)
		bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_del(&bfqq->bfqq_list);
}

/**
 * bfq_idle_insert - insert an entity into the idle tree.
 * @st: the service tree containing the tree.
 * @entity: the entity to insert.
 */
static void bfq_idle_insert(struct bfq_service_tree *st,
			    struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
		st->first_idle = entity;
	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
		st->last_idle = entity;

	bfq_insert(&st->idle, entity);

	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
}
/**
 * bfq_forget_entity - do not consider entity any longer for scheduling
 * @st: the service tree.
 * @entity: the entity being removed.
 * @is_in_service: true if entity is currently the in-service entity.
 *
 * Forget everything about @entity. In addition, if entity represents
 * a queue, and the latter is not in service, then release the service
 * reference to the queue (the one taken through bfq_get_entity). In
 * fact, in this case, there is really no more service reference to
 * the queue, as the latter is also outside any service tree. If,
 * instead, the queue is in service, then __bfq_bfqd_reset_in_service
 * will take care of putting the reference when the queue finally
 * stops being served.
 */
static void bfq_forget_entity(struct bfq_service_tree *st,
			      struct bfq_entity *entity,
			      bool is_in_service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->on_st = false;
	st->wsum -= entity->weight;
	if (bfqq && !is_in_service)
		bfq_put_queue(bfqq);
}

/**
 * bfq_put_idle_entity - release the idle tree ref of an entity.
 * @st: service tree for the entity.
 * @entity: the entity being released.
 */
static void bfq_put_idle_entity(struct bfq_service_tree *st,
				struct bfq_entity *entity)
{
	bfq_idle_extract(st, entity);
	bfq_forget_entity(st, entity,
			  entity == entity->sched_data->in_service_entity);
}

/**
 * bfq_forget_idle - update the idle tree if necessary.
 * @st: the service tree to act upon.
 *
 * To preserve the global O(log N) complexity we only remove one entry here;
 * as the idle tree will not grow indefinitely this can be done safely.
 */
static void bfq_forget_idle(struct bfq_service_tree *st)
{
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
	    !bfq_gt(last_idle->finish, st->vtime)) {
		/*
		 * Forget the whole idle tree, increasing the vtime past
		 * the last finish time of idle entities.
		 */
		st->vtime = last_idle->finish;
	}

	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
		bfq_put_idle_entity(st, first_idle);
}
static struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
				struct bfq_entity *entity)
{
	struct bfq_service_tree *new_st = old_st;

	if (entity->prio_changed) {
		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
		unsigned short prev_weight, new_weight;
		struct bfq_data *bfqd = NULL;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		struct bfq_sched_data *sd;
		struct bfq_group *bfqg;
#endif

		if (bfqq)
			bfqd = bfqq->bfqd;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		else {
			sd = entity->my_sched_data;
			bfqg = container_of(sd, struct bfq_group, sched_data);
			bfqd = (struct bfq_data *)bfqg->bfqd;
		}
#endif

		old_st->wsum -= entity->weight;

		if (entity->new_weight != entity->orig_weight) {
			if (entity->new_weight < BFQ_MIN_WEIGHT ||
			    entity->new_weight > BFQ_MAX_WEIGHT) {
				pr_crit("update_weight_prio: new_weight %d\n",
					entity->new_weight);
				if (entity->new_weight < BFQ_MIN_WEIGHT)
					entity->new_weight = BFQ_MIN_WEIGHT;
				else
					entity->new_weight = BFQ_MAX_WEIGHT;
			}
			entity->orig_weight = entity->new_weight;
			if (bfqq)
				bfqq->ioprio =
				  bfq_weight_to_ioprio(entity->orig_weight);
		}

		if (bfqq)
			bfqq->ioprio_class = bfqq->new_ioprio_class;
		entity->prio_changed = 0;

		/*
		 * NOTE: here we may be changing the weight too early,
		 * this will cause unfairness.  The correct approach
		 * would have required additional complexity to defer
		 * weight changes to the proper time instants (i.e.,
		 * when entity->finish <= old_st->vtime).
		 */
		new_st = bfq_entity_service_tree(entity);

		prev_weight = entity->weight;
		new_weight = entity->orig_weight;
		entity->weight = new_weight;

		new_st->wsum += entity->weight;

		if (new_st != old_st)
			entity->start = new_st->vtime;
	}

	return new_st;
}
static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);

/**
 * bfq_bfqq_served - update the scheduler status after selection for
 *                   service.
 * @bfqq: the queue being served.
 * @served: bytes to transfer.
 *
 * NOTE: this can be optimized, as the timestamps of upper level entities
 * are synchronized every time a new bfqq is selected for service.  By now,
 * we keep it to better check consistency.
 */
static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
{
	struct bfq_entity *entity = &bfqq->entity;
	struct bfq_service_tree *st;

	for_each_entity(entity) {
		st = bfq_entity_service_tree(entity);

		entity->service += served;

		st->vtime += bfq_delta(served, st->wsum);
		bfq_forget_idle(st);
	}
	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
}

/**
 * bfq_bfqq_charge_full_budget - set the service to the entity budget.
 * @bfqq: the queue that needs a service update.
 *
 * When it's not possible to be fair in the service domain, because
 * a queue is not consuming its budget fast enough (the meaning of
 * fast depends on the timeout parameter), we charge it a full
 * budget.  In this way we should obtain a sort of time-domain
 * fairness among all the seeky/slow queues.
 */
static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");

	bfq_bfqq_served(bfqq, entity->budget - entity->service);
}
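
/*
 * For example (hypothetical figures, only to illustrate the charging
 * policy above): if a seeky queue received a budget of 8192 sectors
 * but transferred only 1000 of them before its budget timeout fired,
 * bfq_bfqq_charge_full_budget() charges it the remaining 7192
 * sectors as well. Its finish timestamp then advances as if the
 * whole budget had been consumed, restoring a time-domain notion of
 * fairness with respect to well-behaved sequential queues.
 */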
static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
					struct bfq_service_tree *st,
					bool backshifted)
{
	st = __bfq_entity_update_weight_prio(st, entity);
	bfq_calc_finish(entity, entity->budget);

	/*
	 * If some queues enjoy backshifting for a while, then their
	 * (virtual) finish timestamps may happen to become lower and
	 * lower than the system virtual time.  In particular, if
	 * these queues often happen to be idle for short time
	 * periods, and during such time periods other queues with
	 * higher timestamps happen to be busy, then the backshifted
	 * timestamps of the former queues can become much lower than
	 * the system virtual time. In fact, to serve the queues with
	 * higher timestamps while the ones with lower timestamps are
	 * idle, the system virtual time may be pushed-up to much
	 * higher values than the finish timestamps of the idle
	 * queues. As a consequence, the finish timestamps of all new
	 * or newly activated queues may end up being much larger than
	 * those of lucky queues with backshifted timestamps. The
	 * latter queues may then monopolize the device for a lot of
	 * time. This would simply break service guarantees.
	 *
	 * To reduce this problem, push up a little bit the
	 * backshifted timestamps of the queue associated with this
	 * entity (only a queue can happen to have the backshifted
	 * flag set): just enough to let the finish timestamp of the
	 * queue be equal to the current value of the system virtual
	 * time. This may introduce a little unfairness among queues
	 * with backshifted timestamps, but it does not break
	 * worst-case fairness guarantees.
	 */
	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
		unsigned long delta = st->vtime - entity->finish;

		entity->start += delta;
		entity->finish += delta;
	}

	bfq_active_insert(st, entity);
}
/**
 * __bfq_activate_entity - handle activation of entity.
 * @entity: the entity being activated.
 * @non_blocking_wait_rq: true if entity was waiting for a request
 *
 * Called for a 'true' activation, i.e., if entity is not active and
 * one of its children receives a new request.
 *
 * Basically, this function updates the timestamps of entity and
 * inserts entity into its active tree, after possibly extracting it
 * from its idle tree.
 */
static void __bfq_activate_entity(struct bfq_entity *entity,
				  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	bool backshifted = false;
	unsigned long long min_vstart;

	/* See comments on bfq_fqq_update_budg_for_activation */
	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
		backshifted = true;
		min_vstart = entity->finish;
	} else
		min_vstart = st->vtime;

	if (entity->tree == &st->idle) {
		/*
		 * Must be on the idle tree, bfq_idle_extract() will
		 * check for that.
		 */
		bfq_idle_extract(st, entity);
		entity->start = bfq_gt(min_vstart, entity->finish) ?
			min_vstart : entity->finish;
	} else {
		/*
		 * The finish time of the entity may be invalid, and
		 * it is in the past for sure, otherwise the queue
		 * would have been on the idle tree.
		 */
		entity->start = min_vstart;
		st->wsum += entity->weight;
		/*
		 * entity is about to be inserted into a service tree,
		 * and then set in service: get a reference to make
		 * sure entity does not disappear until it is no
		 * longer in service or scheduled for service.
		 */
		bfq_get_entity(entity);

		entity->on_st = true;
	}

	bfq_update_fin_time_enqueue(entity, st, backshifted);
}
/**
 * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
 * @entity: the entity being requeued or repositioned.
 *
 * Requeueing is needed if this entity stops being served, which
 * happens if a leaf descendant entity has expired. On the other hand,
 * repositioning is needed if the next_in_service entity for the child
 * entity has changed. See the comments inside the function for
 * details.
 *
 * Basically, this function: 1) removes entity from its active tree if
 * present there, 2) updates the timestamps of entity and 3) inserts
 * entity back into its active tree (in the new, right position for
 * the new values of the timestamps).
 */
static void __bfq_requeue_entity(struct bfq_entity *entity)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (entity == sd->in_service_entity) {
		/*
		 * We are requeueing the current in-service entity,
		 * which may have to be done for one of the following
		 * reasons:
		 * - entity represents the in-service queue, and the
		 *   in-service queue is being requeued after an
		 *   expiration;
		 * - entity represents a group, and its budget has
		 *   changed because one of its child entities has
		 *   just been either activated or requeued for some
		 *   reason; the timestamps of the entity need then to
		 *   be updated, and the entity needs to be enqueued
		 *   or repositioned accordingly.
		 *
		 * In particular, before requeueing, the start time of
		 * the entity must be moved forward to account for the
		 * service that the entity has received while in
		 * service. This is done by the next instructions. The
		 * finish time will then be updated according to this
		 * new value of the start time, and to the budget of
		 * the entity.
		 */
		bfq_calc_finish(entity, entity->service);
		entity->start = entity->finish;
		/*
		 * In addition, if the entity had more than one child
		 * when set in service, then it was not extracted from
		 * the active tree. This implies that the position of
		 * the entity in the active tree may need to be
		 * changed now, because we have just updated the start
		 * time of the entity, and we will update its finish
		 * time in a moment (the requeueing is then, more
		 * precisely, a repositioning in this case). To
		 * implement this repositioning, we: 1) dequeue the
		 * entity here, 2) update the finish time and
		 * requeue the entity according to the new
		 * timestamps below.
		 */
		if (entity->tree)
			bfq_active_extract(st, entity);
	} else { /* The entity is already active, and not in service */
		/*
		 * In this case, this function gets called only if the
		 * next_in_service entity below this entity has
		 * changed, and this change has caused the budget of
		 * this entity to change, which, finally implies that
		 * the finish time of this entity must be
		 * updated. Such an update may cause the scheduling,
		 * i.e., the position in the active tree, of this
		 * entity to change. We handle this change by: 1)
		 * dequeueing the entity here, 2) updating the finish
		 * time and requeueing the entity according to the new
		 * timestamps below. This is the same approach as the
		 * non-extracted-entity sub-case above.
		 */
		bfq_active_extract(st, entity);
	}

	bfq_update_fin_time_enqueue(entity, st, false);
}
static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
					  struct bfq_sched_data *sd,
					  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (sd->in_service_entity == entity || entity->tree == &st->active)
		/*
		 * in service or already queued on the active tree,
		 * requeue or reposition
		 */
		__bfq_requeue_entity(entity);
	else
		/*
		 * Not in service and not queued on its active tree:
		 * the activity is idle and this is a true activation.
		 */
		__bfq_activate_entity(entity, non_blocking_wait_rq);
}


/**
 * bfq_activate_requeue_entity - activate or requeue an entity representing a
 *				 bfq_queue, and activate, requeue or reposition
 *				 all ancestors for which such an update becomes
 *				 necessary.
 * @entity: the entity to activate.
 * @non_blocking_wait_rq: true if this entity was waiting for a request
 * @requeue: true if this is a requeue, which implies that bfqq is
 *	     being expired; thus ALL its ancestors stop being served and must
 *	     therefore be requeued
 */
static void bfq_activate_requeue_entity(struct bfq_entity *entity,
					bool non_blocking_wait_rq,
					bool requeue)
{
	struct bfq_sched_data *sd;

	for_each_entity(entity) {
		sd = entity->sched_data;
		__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);

		if (!bfq_update_next_in_service(sd, entity) && !requeue)
			break;
	}
}
/**
 * __bfq_deactivate_entity - deactivate an entity from its service tree.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: if false, the entity will not be put into the
 *			idle tree.
 *
 * Deactivates an entity, independently from its previous state.  Must
 * be invoked only if entity is on a service tree. Extracts the entity
 * from that tree, and if necessary and allowed, puts it on the idle
 * tree.
 */
static bool __bfq_deactivate_entity(struct bfq_entity *entity,
				    bool ins_into_idle_tree)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	int is_in_service = entity == sd->in_service_entity;

	if (!entity->on_st) /* entity never activated, or already inactive */
		return false;

	if (is_in_service)
		bfq_calc_finish(entity, entity->service);

	if (entity->tree == &st->active)
		bfq_active_extract(st, entity);
	else if (!is_in_service && entity->tree == &st->idle)
		bfq_idle_extract(st, entity);

	if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
		bfq_forget_entity(st, entity, is_in_service);
	else
		bfq_idle_insert(st, entity);

	return true;
}

/**
 * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: true if the entity can be put on the idle tree
 * @expiration: true if this deactivation is due to the expiration of bfqq
 */
static void bfq_deactivate_entity(struct bfq_entity *entity,
				  bool ins_into_idle_tree,
				  bool expiration)
{
	struct bfq_sched_data *sd;
	struct bfq_entity *parent = NULL;

	for_each_entity_safe(entity, parent) {
		sd = entity->sched_data;

		if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
			/*
			 * entity is not in any tree any more, so
			 * this deactivation is a no-op, and there is
			 * nothing to change for upper-level entities
			 * (in case of expiration, this can never
			 * happen).
			 */
			return;
		}

		if (sd->next_in_service == entity)
			/*
			 * entity was the next_in_service entity,
			 * then, since entity has just been
			 * deactivated, a new one must be found.
			 */
			bfq_update_next_in_service(sd, NULL);

		if (sd->next_in_service)
			/*
			 * The parent entity is still backlogged,
			 * because next_in_service is not NULL. So, no
			 * further upwards deactivation must be
			 * performed. Yet, next_in_service has
			 * changed. Then the schedule does need to be
			 * updated upwards.
			 */
			break;

		/*
		 * If we get here, then the parent is no more
		 * backlogged and we need to propagate the
		 * deactivation upwards. Thus let the loop go on.
		 */

		/*
		 * Also let parent be queued into the idle tree on
		 * deactivation, to preserve service guarantees, and
		 * assuming that who invoked this function does not
		 * need parent entities too to be removed completely.
		 */
		ins_into_idle_tree = true;
	}

	/*
	 * If the deactivation loop is fully executed, then there are
	 * no more entities to touch and next loop is not executed at
	 * all. Otherwise, requeue remaining entities if they are
	 * about to stop receiving service, or reposition them if this
	 * is not the case.
	 */
	entity = parent;
	for_each_entity(entity) {
		/*
		 * Invoke __bfq_requeue_entity on entity, even if
		 * already active, to requeue/reposition it in the
		 * active tree (because sd->next_in_service has
		 * changed as well).
		 */
		__bfq_requeue_entity(entity);

		sd = entity->sched_data;
		if (!bfq_update_next_in_service(sd, entity) &&
		    !expiration)
			/*
			 * next_in_service unchanged or not causing
			 * any change in entity->parent->sd, and no
			 * requeueing needed for expiration: stop
			 * here.
			 */
			break;
	}
}
1926 * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
1927 * if needed, to have at least one entity eligible.
1928 * @st: the service tree to act upon.
1930 * Assumes that st is not empty.
1932 static u64
bfq_calc_vtime_jump(struct bfq_service_tree
*st
)
1934 struct bfq_entity
*root_entity
= bfq_root_active_entity(&st
->active
);
1936 if (bfq_gt(root_entity
->min_start
, st
->vtime
))
1937 return root_entity
->min_start
;
1942 static void bfq_update_vtime(struct bfq_service_tree
*st
, u64 new_value
)
1944 if (new_value
> st
->vtime
) {
1945 st
->vtime
= new_value
;
1946 bfq_forget_idle(st
);
1951 * bfq_first_active_entity - find the eligible entity with
1952 * the smallest finish time
1953 * @st: the service tree to select from.
1954 * @vtime: the system virtual to use as a reference for eligibility
1956 * This function searches the first schedulable entity, starting from the
1957 * root of the tree and going on the left every time on this side there is
1958 * a subtree with at least one eligible (start >= vtime) entity. The path on
1959 * the right is followed only if a) the left subtree contains no eligible
1960 * entities and b) no eligible entity has been found yet.
1962 static struct bfq_entity
*bfq_first_active_entity(struct bfq_service_tree
*st
,
1965 struct bfq_entity
*entry
, *first
= NULL
;
1966 struct rb_node
*node
= st
->active
.rb_node
;
1969 entry
= rb_entry(node
, struct bfq_entity
, rb_node
);
1971 if (!bfq_gt(entry
->start
, vtime
))
1974 if (node
->rb_left
) {
1975 entry
= rb_entry(node
->rb_left
,
1976 struct bfq_entity
, rb_node
);
1977 if (!bfq_gt(entry
->min_start
, vtime
)) {
1978 node
= node
->rb_left
;
1984 node
= node
->rb_right
;
1991 * __bfq_lookup_next_entity - return the first eligible entity in @st.
1992 * @st: the service tree.
1994 * If there is no in-service entity for the sched_data st belongs to,
1995 * then return the entity that will be set in service if:
1996 * 1) the parent entity this st belongs to is set in service;
1997 * 2) no entity belonging to such parent entity undergoes a state change
1998 * that would influence the timestamps of the entity (e.g., becomes idle,
1999 * becomes backlogged, changes its budget, ...).
2001 * In this first case, update the virtual time in @st too (see the
2002 * comments on this update inside the function).
2004 * In constrast, if there is an in-service entity, then return the
2005 * entity that would be set in service if not only the above
2006 * conditions, but also the next one held true: the currently
2007 * in-service entity, on expiration,
2008 * 1) gets a finish time equal to the current one, or
2009 * 2) is not eligible any more, or
2012 static struct bfq_entity
*
2013 __bfq_lookup_next_entity(struct bfq_service_tree
*st
, bool in_service
)
2015 struct bfq_entity
*entity
;
2018 if (RB_EMPTY_ROOT(&st
->active
))
2022 * Get the value of the system virtual time for which at
2023 * least one entity is eligible.
2025 new_vtime
= bfq_calc_vtime_jump(st
);
2028 * If there is no in-service entity for the sched_data this
2029 * active tree belongs to, then push the system virtual time
2030 * up to the value that guarantees that at least one entity is
2031 * eligible. If, instead, there is an in-service entity, then
2032 * do not make any such update, because there is already an
2033 * eligible entity, namely the in-service one (even if the
2034 * entity is not on st, because it was extracted when set in
2038 bfq_update_vtime(st
, new_vtime
);
2040 entity
= bfq_first_active_entity(st
, new_vtime
);
2046 * bfq_lookup_next_entity - return the first eligible entity in @sd.
2047 * @sd: the sched_data.
2049 * This function is invoked when there has been a change in the trees
2050 * for sd, and we need know what is the new next entity after this
2053 static struct bfq_entity
*bfq_lookup_next_entity(struct bfq_sched_data
*sd
)
2055 struct bfq_service_tree
*st
= sd
->service_tree
;
2056 struct bfq_service_tree
*idle_class_st
= st
+ (BFQ_IOPRIO_CLASSES
- 1);
2057 struct bfq_entity
*entity
= NULL
;
2061 * Choose from idle class, if needed to guarantee a minimum
2062 * bandwidth to this class (and if there is some active entity
2063 * in idle class). This should also mitigate
2064 * priority-inversion problems in case a low priority task is
2065 * holding file system resources.
2067 if (time_is_before_jiffies(sd
->bfq_class_idle_last_service
+
2068 BFQ_CL_IDLE_TIMEOUT
)) {
2069 if (!RB_EMPTY_ROOT(&idle_class_st
->active
))
2070 class_idx
= BFQ_IOPRIO_CLASSES
- 1;
2071 /* About to be served if backlogged, or not yet backlogged */
2072 sd
->bfq_class_idle_last_service
= jiffies
;
2076 * Find the next entity to serve for the highest-priority
2077 * class, unless the idle class needs to be served.
2079 for (; class_idx
< BFQ_IOPRIO_CLASSES
; class_idx
++) {
2080 entity
= __bfq_lookup_next_entity(st
+ class_idx
,
2081 sd
->in_service_entity
);
2093 static bool next_queue_may_preempt(struct bfq_data
*bfqd
)
2095 struct bfq_sched_data
*sd
= &bfqd
->root_group
->sched_data
;
2097 return sd
->next_in_service
!= sd
->in_service_entity
;
2101 * Get next queue for service.
2103 static struct bfq_queue
*bfq_get_next_queue(struct bfq_data
*bfqd
)
2105 struct bfq_entity
*entity
= NULL
;
2106 struct bfq_sched_data
*sd
;
2107 struct bfq_queue
*bfqq
;
2109 if (bfqd
->busy_queues
== 0)
2113 * Traverse the path from the root to the leaf entity to
2114 * serve. Set in service all the entities visited along the
2117 sd
= &bfqd
->root_group
->sched_data
;
2118 for (; sd
; sd
= entity
->my_sched_data
) {
2120 * WARNING. We are about to set the in-service entity
2121 * to sd->next_in_service, i.e., to the (cached) value
2122 * returned by bfq_lookup_next_entity(sd) the last
2123 * time it was invoked, i.e., the last time when the
2124 * service order in sd changed as a consequence of the
2125 * activation or deactivation of an entity. In this
2126 * respect, if we execute bfq_lookup_next_entity(sd)
2127 * in this very moment, it may, although with low
2128 * probability, yield a different entity than that
2129 * pointed to by sd->next_in_service. This rare event
2130 * happens in case there was no CLASS_IDLE entity to
2131 * serve for sd when bfq_lookup_next_entity(sd) was
2132 * invoked for the last time, while there is now one
2135 * If the above event happens, then the scheduling of
2136 * such entity in CLASS_IDLE is postponed until the
2137 * service of the sd->next_in_service entity
2138 * finishes. In fact, when the latter is expired,
2139 * bfq_lookup_next_entity(sd) gets called again,
2140 * exactly to update sd->next_in_service.
2143 /* Make next_in_service entity become in_service_entity */
2144 entity
= sd
->next_in_service
;
2145 sd
->in_service_entity
= entity
;
2148 * Reset the accumulator of the amount of service that
2149 * the entity is about to receive.
2151 entity
->service
= 0;
2154 * If entity is no longer a candidate for next
2155 * service, then we extract it from its active tree,
2156 * for the following reason. To further boost the
2157 * throughput in some special case, BFQ needs to know
2158 * which is the next candidate entity to serve, while
2159 * there is already an entity in service. In this
2160 * respect, to make it easy to compute/update the next
2161 * candidate entity to serve after the current
2162 * candidate has been set in service, there is a case
2163 * where it is necessary to extract the current
2164 * candidate from its service tree. Such a case is
2165 * when the entity just set in service cannot be also
2166 * a candidate for next service. Details about when
2167 * this conditions holds are reported in the comments
2168 * on the function bfq_no_longer_next_in_service()
2171 if (bfq_no_longer_next_in_service(entity
))
2172 bfq_active_extract(bfq_entity_service_tree(entity
),
2176 * For the same reason why we may have just extracted
2177 * entity from its active tree, we may need to update
2178 * next_in_service for the sched_data of entity too,
2179 * regardless of whether entity has been extracted.
2180 * In fact, even if entity has not been extracted, a
2181 * descendant entity may get extracted. Such an event
2182 * would cause a change in next_in_service for the
2183 * level of the descendant entity, and thus possibly
2184 * back to upper levels.
2186 * We cannot perform the resulting needed update
2187 * before the end of this loop, because, to know which
2188 * is the correct next-to-serve candidate entity for
2189 * each level, we need first to find the leaf entity
2190 * to set in service. In fact, only after we know
2191 * which is the next-to-serve leaf entity, we can
2192 * discover whether the parent entity of the leaf
2193 * entity becomes the next-to-serve, and so on.
2198 bfqq
= bfq_entity_to_bfqq(entity
);
2201 * We can finally update all next-to-serve entities along the
2202 * path from the leaf entity just set in service to the root.
2204 for_each_entity(entity
) {
2205 struct bfq_sched_data
*sd
= entity
->sched_data
;
2207 if (!bfq_update_next_in_service(sd
, NULL
))
2214 static void __bfq_bfqd_reset_in_service(struct bfq_data
*bfqd
)
2216 struct bfq_queue
*in_serv_bfqq
= bfqd
->in_service_queue
;
2217 struct bfq_entity
*in_serv_entity
= &in_serv_bfqq
->entity
;
2218 struct bfq_entity
*entity
= in_serv_entity
;
2220 if (bfqd
->in_service_bic
) {
2221 put_io_context(bfqd
->in_service_bic
->icq
.ioc
);
2222 bfqd
->in_service_bic
= NULL
;
2225 bfq_clear_bfqq_wait_request(in_serv_bfqq
);
2226 hrtimer_try_to_cancel(&bfqd
->idle_slice_timer
);
2227 bfqd
->in_service_queue
= NULL
;
2230 * When this function is called, all in-service entities have
2231 * been properly deactivated or requeued, so we can safely
2232 * execute the final step: reset in_service_entity along the
2233 * path from entity to the root.
2235 for_each_entity(entity
)
2236 entity
->sched_data
->in_service_entity
= NULL
;
2239 * in_serv_entity is no longer in service, so, if it is in no
2240 * service tree either, then release the service reference to
2241 * the queue it represents (taken with bfq_get_entity).
2243 if (!in_serv_entity
->on_st
)
2244 bfq_put_queue(in_serv_bfqq
);
2247 static void bfq_deactivate_bfqq(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
2248 bool ins_into_idle_tree
, bool expiration
)
2250 struct bfq_entity
*entity
= &bfqq
->entity
;
2252 bfq_deactivate_entity(entity
, ins_into_idle_tree
, expiration
);
2255 static void bfq_activate_bfqq(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
)
2257 struct bfq_entity
*entity
= &bfqq
->entity
;
2259 bfq_activate_requeue_entity(entity
, bfq_bfqq_non_blocking_wait_rq(bfqq
),
2261 bfq_clear_bfqq_non_blocking_wait_rq(bfqq
);
2264 static void bfq_requeue_bfqq(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
)
2266 struct bfq_entity
*entity
= &bfqq
->entity
;
2268 bfq_activate_requeue_entity(entity
, false,
2269 bfqq
== bfqd
->in_service_queue
);
2272 static void bfqg_stats_update_dequeue(struct bfq_group
*bfqg
);
2275 * Called when the bfqq no longer has requests pending, remove it from
2276 * the service tree. As a special case, it can be invoked during an
2279 static void bfq_del_bfqq_busy(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
2282 bfq_log_bfqq(bfqd
, bfqq
, "del from busy");
2284 bfq_clear_bfqq_busy(bfqq
);
2286 bfqd
->busy_queues
--;
2288 bfqg_stats_update_dequeue(bfqq_group(bfqq
));
2290 bfq_deactivate_bfqq(bfqd
, bfqq
, true, expiration
);
2294 * Called when an inactive queue receives a new request.
2296 static void bfq_add_bfqq_busy(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
)
2298 bfq_log_bfqq(bfqd
, bfqq
, "add to busy");
2300 bfq_activate_bfqq(bfqd
, bfqq
);
2302 bfq_mark_bfqq_busy(bfqq
);
2303 bfqd
->busy_queues
++;
2306 #ifdef CONFIG_BFQ_GROUP_IOSCHED
2308 /* bfqg stats flags */
2309 enum bfqg_stats_flags
{
2310 BFQG_stats_waiting
= 0,
2315 #define BFQG_FLAG_FNS(name) \
2316 static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
2318 stats->flags |= (1 << BFQG_stats_##name); \
2320 static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
2322 stats->flags &= ~(1 << BFQG_stats_##name); \
2324 static int bfqg_stats_##name(struct bfqg_stats *stats) \
2326 return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
2329 BFQG_FLAG_FNS(waiting)
2330 BFQG_FLAG_FNS(idling
)
2331 BFQG_FLAG_FNS(empty
)
2332 #undef BFQG_FLAG_FNS
2334 /* This should be called with the queue_lock held. */
2335 static void bfqg_stats_update_group_wait_time(struct bfqg_stats
*stats
)
2337 unsigned long long now
;
2339 if (!bfqg_stats_waiting(stats
))
2342 now
= sched_clock();
2343 if (time_after64(now
, stats
->start_group_wait_time
))
2344 blkg_stat_add(&stats
->group_wait_time
,
2345 now
- stats
->start_group_wait_time
);
2346 bfqg_stats_clear_waiting(stats
);
2349 /* This should be called with the queue_lock held. */
2350 static void bfqg_stats_set_start_group_wait_time(struct bfq_group
*bfqg
,
2351 struct bfq_group
*curr_bfqg
)
2353 struct bfqg_stats
*stats
= &bfqg
->stats
;
2355 if (bfqg_stats_waiting(stats
))
2357 if (bfqg
== curr_bfqg
)
2359 stats
->start_group_wait_time
= sched_clock();
2360 bfqg_stats_mark_waiting(stats
);
2363 /* This should be called with the queue_lock held. */
2364 static void bfqg_stats_end_empty_time(struct bfqg_stats
*stats
)
2366 unsigned long long now
;
2368 if (!bfqg_stats_empty(stats
))
2371 now
= sched_clock();
2372 if (time_after64(now
, stats
->start_empty_time
))
2373 blkg_stat_add(&stats
->empty_time
,
2374 now
- stats
->start_empty_time
);
2375 bfqg_stats_clear_empty(stats
);
2378 static void bfqg_stats_update_dequeue(struct bfq_group
*bfqg
)
2380 blkg_stat_add(&bfqg
->stats
.dequeue
, 1);
2383 static void bfqg_stats_set_start_empty_time(struct bfq_group
*bfqg
)
2385 struct bfqg_stats
*stats
= &bfqg
->stats
;
2387 if (blkg_rwstat_total(&stats
->queued
))
2391 * group is already marked empty. This can happen if bfqq got new
2392 * request in parent group and moved to this group while being added
2393 * to service tree. Just ignore the event and move on.
2395 if (bfqg_stats_empty(stats
))
2398 stats
->start_empty_time
= sched_clock();
2399 bfqg_stats_mark_empty(stats
);
2402 static void bfqg_stats_update_idle_time(struct bfq_group
*bfqg
)
2404 struct bfqg_stats
*stats
= &bfqg
->stats
;
2406 if (bfqg_stats_idling(stats
)) {
2407 unsigned long long now
= sched_clock();
2409 if (time_after64(now
, stats
->start_idle_time
))
2410 blkg_stat_add(&stats
->idle_time
,
2411 now
- stats
->start_idle_time
);
2412 bfqg_stats_clear_idling(stats
);
2416 static void bfqg_stats_set_start_idle_time(struct bfq_group
*bfqg
)
2418 struct bfqg_stats
*stats
= &bfqg
->stats
;
2420 stats
->start_idle_time
= sched_clock();
2421 bfqg_stats_mark_idling(stats
);
2424 static void bfqg_stats_update_avg_queue_size(struct bfq_group
*bfqg
)
2426 struct bfqg_stats
*stats
= &bfqg
->stats
;
2428 blkg_stat_add(&stats
->avg_queue_size_sum
,
2429 blkg_rwstat_total(&stats
->queued
));
2430 blkg_stat_add(&stats
->avg_queue_size_samples
, 1);
2431 bfqg_stats_update_group_wait_time(stats
);
2435 * blk-cgroup policy-related handlers
2436 * The following functions help in converting between blk-cgroup
2437 * internal structures and BFQ-specific structures.
2440 static struct bfq_group
*pd_to_bfqg(struct blkg_policy_data
*pd
)
2442 return pd
? container_of(pd
, struct bfq_group
, pd
) : NULL
;
2445 static struct blkcg_gq
*bfqg_to_blkg(struct bfq_group
*bfqg
)
2447 return pd_to_blkg(&bfqg
->pd
);
2450 static struct blkcg_policy blkcg_policy_bfq
;
2452 static struct bfq_group
*blkg_to_bfqg(struct blkcg_gq
*blkg
)
2454 return pd_to_bfqg(blkg_to_pd(blkg
, &blkcg_policy_bfq
));
2458 * bfq_group handlers
2459 * The following functions help in navigating the bfq_group hierarchy
2460 * by allowing to find the parent of a bfq_group or the bfq_group
2461 * associated to a bfq_queue.
2464 static struct bfq_group
*bfqg_parent(struct bfq_group
*bfqg
)
2466 struct blkcg_gq
*pblkg
= bfqg_to_blkg(bfqg
)->parent
;
2468 return pblkg
? blkg_to_bfqg(pblkg
) : NULL
;
2471 static struct bfq_group
*bfqq_group(struct bfq_queue
*bfqq
)
2473 struct bfq_entity
*group_entity
= bfqq
->entity
.parent
;
2475 return group_entity
? container_of(group_entity
, struct bfq_group
,
2477 bfqq
->bfqd
->root_group
;
2481 * The following two functions handle get and put of a bfq_group by
2482 * wrapping the related blk-cgroup hooks.
2485 static void bfqg_get(struct bfq_group
*bfqg
)
2487 return blkg_get(bfqg_to_blkg(bfqg
));
2490 static void bfqg_put(struct bfq_group
*bfqg
)
2492 return blkg_put(bfqg_to_blkg(bfqg
));
2495 static void bfqg_stats_update_io_add(struct bfq_group
*bfqg
,
2496 struct bfq_queue
*bfqq
,
2499 blkg_rwstat_add(&bfqg
->stats
.queued
, op
, 1);
2500 bfqg_stats_end_empty_time(&bfqg
->stats
);
2501 if (!(bfqq
== ((struct bfq_data
*)bfqg
->bfqd
)->in_service_queue
))
2502 bfqg_stats_set_start_group_wait_time(bfqg
, bfqq_group(bfqq
));
2505 static void bfqg_stats_update_io_remove(struct bfq_group
*bfqg
, unsigned int op
)
2507 blkg_rwstat_add(&bfqg
->stats
.queued
, op
, -1);
2510 static void bfqg_stats_update_io_merged(struct bfq_group
*bfqg
, unsigned int op
)
2512 blkg_rwstat_add(&bfqg
->stats
.merged
, op
, 1);
2515 static void bfqg_stats_update_completion(struct bfq_group
*bfqg
,
2516 uint64_t start_time
, uint64_t io_start_time
,
2519 struct bfqg_stats
*stats
= &bfqg
->stats
;
2520 unsigned long long now
= sched_clock();
2522 if (time_after64(now
, io_start_time
))
2523 blkg_rwstat_add(&stats
->service_time
, op
,
2524 now
- io_start_time
);
2525 if (time_after64(io_start_time
, start_time
))
2526 blkg_rwstat_add(&stats
->wait_time
, op
,
2527 io_start_time
- start_time
);
2531 static void bfqg_stats_reset(struct bfqg_stats
*stats
)
2533 /* queued stats shouldn't be cleared */
2534 blkg_rwstat_reset(&stats
->merged
);
2535 blkg_rwstat_reset(&stats
->service_time
);
2536 blkg_rwstat_reset(&stats
->wait_time
);
2537 blkg_stat_reset(&stats
->time
);
2538 blkg_stat_reset(&stats
->avg_queue_size_sum
);
2539 blkg_stat_reset(&stats
->avg_queue_size_samples
);
2540 blkg_stat_reset(&stats
->dequeue
);
2541 blkg_stat_reset(&stats
->group_wait_time
);
2542 blkg_stat_reset(&stats
->idle_time
);
2543 blkg_stat_reset(&stats
->empty_time
);
2547 static void bfqg_stats_add_aux(struct bfqg_stats
*to
, struct bfqg_stats
*from
)
2552 /* queued stats shouldn't be cleared */
2553 blkg_rwstat_add_aux(&to
->merged
, &from
->merged
);
2554 blkg_rwstat_add_aux(&to
->service_time
, &from
->service_time
);
2555 blkg_rwstat_add_aux(&to
->wait_time
, &from
->wait_time
);
2556 blkg_stat_add_aux(&from
->time
, &from
->time
);
2557 blkg_stat_add_aux(&to
->avg_queue_size_sum
, &from
->avg_queue_size_sum
);
2558 blkg_stat_add_aux(&to
->avg_queue_size_samples
,
2559 &from
->avg_queue_size_samples
);
2560 blkg_stat_add_aux(&to
->dequeue
, &from
->dequeue
);
2561 blkg_stat_add_aux(&to
->group_wait_time
, &from
->group_wait_time
);
2562 blkg_stat_add_aux(&to
->idle_time
, &from
->idle_time
);
2563 blkg_stat_add_aux(&to
->empty_time
, &from
->empty_time
);
2567 * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
2568 * recursive stats can still account for the amount used by this bfqg after
2571 static void bfqg_stats_xfer_dead(struct bfq_group
*bfqg
)
2573 struct bfq_group
*parent
;
2575 if (!bfqg
) /* root_group */
2578 parent
= bfqg_parent(bfqg
);
2580 lockdep_assert_held(bfqg_to_blkg(bfqg
)->q
->queue_lock
);
2582 if (unlikely(!parent
))
2585 bfqg_stats_add_aux(&parent
->stats
, &bfqg
->stats
);
2586 bfqg_stats_reset(&bfqg
->stats
);
2589 static void bfq_init_entity(struct bfq_entity
*entity
,
2590 struct bfq_group
*bfqg
)
2592 struct bfq_queue
*bfqq
= bfq_entity_to_bfqq(entity
);
2594 entity
->weight
= entity
->new_weight
;
2595 entity
->orig_weight
= entity
->new_weight
;
2597 bfqq
->ioprio
= bfqq
->new_ioprio
;
2598 bfqq
->ioprio_class
= bfqq
->new_ioprio_class
;
2601 entity
->parent
= bfqg
->my_entity
; /* NULL for root group */
2602 entity
->sched_data
= &bfqg
->sched_data
;
2605 static void bfqg_stats_exit(struct bfqg_stats
*stats
)
2607 blkg_rwstat_exit(&stats
->merged
);
2608 blkg_rwstat_exit(&stats
->service_time
);
2609 blkg_rwstat_exit(&stats
->wait_time
);
2610 blkg_rwstat_exit(&stats
->queued
);
2611 blkg_stat_exit(&stats
->time
);
2612 blkg_stat_exit(&stats
->avg_queue_size_sum
);
2613 blkg_stat_exit(&stats
->avg_queue_size_samples
);
2614 blkg_stat_exit(&stats
->dequeue
);
2615 blkg_stat_exit(&stats
->group_wait_time
);
2616 blkg_stat_exit(&stats
->idle_time
);
2617 blkg_stat_exit(&stats
->empty_time
);
2620 static int bfqg_stats_init(struct bfqg_stats
*stats
, gfp_t gfp
)
2622 if (blkg_rwstat_init(&stats
->merged
, gfp
) ||
2623 blkg_rwstat_init(&stats
->service_time
, gfp
) ||
2624 blkg_rwstat_init(&stats
->wait_time
, gfp
) ||
2625 blkg_rwstat_init(&stats
->queued
, gfp
) ||
2626 blkg_stat_init(&stats
->time
, gfp
) ||
2627 blkg_stat_init(&stats
->avg_queue_size_sum
, gfp
) ||
2628 blkg_stat_init(&stats
->avg_queue_size_samples
, gfp
) ||
2629 blkg_stat_init(&stats
->dequeue
, gfp
) ||
2630 blkg_stat_init(&stats
->group_wait_time
, gfp
) ||
2631 blkg_stat_init(&stats
->idle_time
, gfp
) ||
2632 blkg_stat_init(&stats
->empty_time
, gfp
)) {
2633 bfqg_stats_exit(stats
);
2640 static struct bfq_group_data
*cpd_to_bfqgd(struct blkcg_policy_data
*cpd
)
2642 return cpd
? container_of(cpd
, struct bfq_group_data
, pd
) : NULL
;
2645 static struct bfq_group_data
*blkcg_to_bfqgd(struct blkcg
*blkcg
)
2647 return cpd_to_bfqgd(blkcg_to_cpd(blkcg
, &blkcg_policy_bfq
));
2650 static struct blkcg_policy_data
*bfq_cpd_alloc(gfp_t gfp
)
2652 struct bfq_group_data
*bgd
;
2654 bgd
= kzalloc(sizeof(*bgd
), gfp
);
2660 static void bfq_cpd_init(struct blkcg_policy_data
*cpd
)
2662 struct bfq_group_data
*d
= cpd_to_bfqgd(cpd
);
2664 d
->weight
= cgroup_subsys_on_dfl(io_cgrp_subsys
) ?
2665 CGROUP_WEIGHT_DFL
: BFQ_WEIGHT_LEGACY_DFL
;
2668 static void bfq_cpd_free(struct blkcg_policy_data
*cpd
)
2670 kfree(cpd_to_bfqgd(cpd
));
2673 static struct blkg_policy_data
*bfq_pd_alloc(gfp_t gfp
, int node
)
2675 struct bfq_group
*bfqg
;
2677 bfqg
= kzalloc_node(sizeof(*bfqg
), gfp
, node
);
2681 if (bfqg_stats_init(&bfqg
->stats
, gfp
)) {
2689 static void bfq_pd_init(struct blkg_policy_data
*pd
)
2691 struct blkcg_gq
*blkg
= pd_to_blkg(pd
);
2692 struct bfq_group
*bfqg
= blkg_to_bfqg(blkg
);
2693 struct bfq_data
*bfqd
= blkg
->q
->elevator
->elevator_data
;
2694 struct bfq_entity
*entity
= &bfqg
->entity
;
2695 struct bfq_group_data
*d
= blkcg_to_bfqgd(blkg
->blkcg
);
2697 entity
->orig_weight
= entity
->weight
= entity
->new_weight
= d
->weight
;
2698 entity
->my_sched_data
= &bfqg
->sched_data
;
2699 bfqg
->my_entity
= entity
; /*
2700 * the root_group's will be set to NULL
2701 * in bfq_init_queue()
2706 static void bfq_pd_free(struct blkg_policy_data
*pd
)
2708 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
2710 bfqg_stats_exit(&bfqg
->stats
);
2714 static void bfq_pd_reset_stats(struct blkg_policy_data
*pd
)
2716 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
2718 bfqg_stats_reset(&bfqg
->stats
);
2721 static void bfq_group_set_parent(struct bfq_group
*bfqg
,
2722 struct bfq_group
*parent
)
2724 struct bfq_entity
*entity
;
2726 entity
= &bfqg
->entity
;
2727 entity
->parent
= parent
->my_entity
;
2728 entity
->sched_data
= &parent
->sched_data
;
2731 static struct bfq_group
*bfq_lookup_bfqg(struct bfq_data
*bfqd
,
2732 struct blkcg
*blkcg
)
2734 struct blkcg_gq
*blkg
;
2736 blkg
= blkg_lookup(blkcg
, bfqd
->queue
);
2738 return blkg_to_bfqg(blkg
);
2742 static struct bfq_group
*bfq_find_set_group(struct bfq_data
*bfqd
,
2743 struct blkcg
*blkcg
)
2745 struct bfq_group
*bfqg
, *parent
;
2746 struct bfq_entity
*entity
;
2748 bfqg
= bfq_lookup_bfqg(bfqd
, blkcg
);
2750 if (unlikely(!bfqg
))
2754 * Update chain of bfq_groups as we might be handling a leaf group
2755 * which, along with some of its relatives, has not been hooked yet
2756 * to the private hierarchy of BFQ.
2758 entity
= &bfqg
->entity
;
2759 for_each_entity(entity
) {
2760 bfqg
= container_of(entity
, struct bfq_group
, entity
);
2761 if (bfqg
!= bfqd
->root_group
) {
2762 parent
= bfqg_parent(bfqg
);
2764 parent
= bfqd
->root_group
;
2765 bfq_group_set_parent(bfqg
, parent
);
2772 static void bfq_bfqq_expire(struct bfq_data
*bfqd
,
2773 struct bfq_queue
*bfqq
,
2775 enum bfqq_expiration reason
);
2778 * bfq_bfqq_move - migrate @bfqq to @bfqg.
2779 * @bfqd: queue descriptor.
2780 * @bfqq: the queue to move.
2781 * @bfqg: the group to move to.
2783 * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
2784 * it on the new one. Avoid putting the entity on the old group idle tree.
2786 * Must be called under the queue lock; the cgroup owning @bfqg must
2787 * not disappear (by now this just means that we are called under
2790 static void bfq_bfqq_move(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
2791 struct bfq_group
*bfqg
)
2793 struct bfq_entity
*entity
= &bfqq
->entity
;
2795 /* If bfqq is empty, then bfq_bfqq_expire also invokes
2796 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
2797 * from data structures related to current group. Otherwise we
2798 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
2801 if (bfqq
== bfqd
->in_service_queue
)
2802 bfq_bfqq_expire(bfqd
, bfqd
->in_service_queue
,
2803 false, BFQQE_PREEMPTED
);
2805 if (bfq_bfqq_busy(bfqq
))
2806 bfq_deactivate_bfqq(bfqd
, bfqq
, false, false);
2807 else if (entity
->on_st
)
2808 bfq_put_idle_entity(bfq_entity_service_tree(entity
), entity
);
2809 bfqg_put(bfqq_group(bfqq
));
2812 * Here we use a reference to bfqg. We don't need a refcounter
2813 * as the cgroup reference will not be dropped, so that its
2814 * destroy() callback will not be invoked.
2816 entity
->parent
= bfqg
->my_entity
;
2817 entity
->sched_data
= &bfqg
->sched_data
;
2820 if (bfq_bfqq_busy(bfqq
))
2821 bfq_activate_bfqq(bfqd
, bfqq
);
2823 if (!bfqd
->in_service_queue
&& !bfqd
->rq_in_driver
)
2824 bfq_schedule_dispatch(bfqd
);
2828 * __bfq_bic_change_cgroup - move @bic to @cgroup.
2829 * @bfqd: the queue descriptor.
2830 * @bic: the bic to move.
2831 * @blkcg: the blk-cgroup to move to.
2833 * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
2834 * has to make sure that the reference to cgroup is valid across the call.
2836 * NOTE: an alternative approach might have been to store the current
2837 * cgroup in bfqq and getting a reference to it, reducing the lookup
2838 * time here, at the price of slightly more complex code.
2840 static struct bfq_group
*__bfq_bic_change_cgroup(struct bfq_data
*bfqd
,
2841 struct bfq_io_cq
*bic
,
2842 struct blkcg
*blkcg
)
2844 struct bfq_queue
*async_bfqq
= bic_to_bfqq(bic
, 0);
2845 struct bfq_queue
*sync_bfqq
= bic_to_bfqq(bic
, 1);
2846 struct bfq_group
*bfqg
;
2847 struct bfq_entity
*entity
;
2849 bfqg
= bfq_find_set_group(bfqd
, blkcg
);
2851 if (unlikely(!bfqg
))
2852 bfqg
= bfqd
->root_group
;
2855 entity
= &async_bfqq
->entity
;
2857 if (entity
->sched_data
!= &bfqg
->sched_data
) {
2858 bic_set_bfqq(bic
, NULL
, 0);
2859 bfq_log_bfqq(bfqd
, async_bfqq
,
2860 "bic_change_group: %p %d",
2863 bfq_put_queue(async_bfqq
);
2868 entity
= &sync_bfqq
->entity
;
2869 if (entity
->sched_data
!= &bfqg
->sched_data
)
2870 bfq_bfqq_move(bfqd
, sync_bfqq
, bfqg
);
2876 static void bfq_bic_update_cgroup(struct bfq_io_cq
*bic
, struct bio
*bio
)
2878 struct bfq_data
*bfqd
= bic_to_bfqd(bic
);
2879 struct bfq_group
*bfqg
= NULL
;
2883 serial_nr
= bio_blkcg(bio
)->css
.serial_nr
;
2886 * Check whether blkcg has changed. The condition may trigger
2887 * spuriously on a newly created cic but there's no harm.
2889 if (unlikely(!bfqd
) || likely(bic
->blkcg_serial_nr
== serial_nr
))
2892 bfqg
= __bfq_bic_change_cgroup(bfqd
, bic
, bio_blkcg(bio
));
2893 bic
->blkcg_serial_nr
= serial_nr
;
2899 * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
2900 * @st: the service tree being flushed.
2902 static void bfq_flush_idle_tree(struct bfq_service_tree
*st
)
2904 struct bfq_entity
*entity
= st
->first_idle
;
2906 for (; entity
; entity
= st
->first_idle
)
2907 __bfq_deactivate_entity(entity
, false);
2911 * bfq_reparent_leaf_entity - move leaf entity to the root_group.
2912 * @bfqd: the device data structure with the root group.
2913 * @entity: the entity to move.
2915 static void bfq_reparent_leaf_entity(struct bfq_data
*bfqd
,
2916 struct bfq_entity
*entity
)
2918 struct bfq_queue
*bfqq
= bfq_entity_to_bfqq(entity
);
2920 bfq_bfqq_move(bfqd
, bfqq
, bfqd
->root_group
);
2924 * bfq_reparent_active_entities - move to the root group all active
2926 * @bfqd: the device data structure with the root group.
2927 * @bfqg: the group to move from.
2928 * @st: the service tree with the entities.
2930 * Needs queue_lock to be taken and reference to be valid over the call.
2932 static void bfq_reparent_active_entities(struct bfq_data
*bfqd
,
2933 struct bfq_group
*bfqg
,
2934 struct bfq_service_tree
*st
)
2936 struct rb_root
*active
= &st
->active
;
2937 struct bfq_entity
*entity
= NULL
;
2939 if (!RB_EMPTY_ROOT(&st
->active
))
2940 entity
= bfq_entity_of(rb_first(active
));
2942 for (; entity
; entity
= bfq_entity_of(rb_first(active
)))
2943 bfq_reparent_leaf_entity(bfqd
, entity
);
2945 if (bfqg
->sched_data
.in_service_entity
)
2946 bfq_reparent_leaf_entity(bfqd
,
2947 bfqg
->sched_data
.in_service_entity
);
2951 * bfq_pd_offline - deactivate the entity associated with @pd,
2952 * and reparent its children entities.
2953 * @pd: descriptor of the policy going offline.
2955 * blkio already grabs the queue_lock for us, so no need to use
2958 static void bfq_pd_offline(struct blkg_policy_data
*pd
)
2960 struct bfq_service_tree
*st
;
2961 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
2962 struct bfq_data
*bfqd
= bfqg
->bfqd
;
2963 struct bfq_entity
*entity
= bfqg
->my_entity
;
2964 unsigned long flags
;
2967 if (!entity
) /* root group */
2970 spin_lock_irqsave(&bfqd
->lock
, flags
);
2972 * Empty all service_trees belonging to this group before
2973 * deactivating the group itself.
2975 for (i
= 0; i
< BFQ_IOPRIO_CLASSES
; i
++) {
2976 st
= bfqg
->sched_data
.service_tree
+ i
;
2979 * The idle tree may still contain bfq_queues belonging
2980 * to exited task because they never migrated to a different
2981 * cgroup from the one being destroyed now. No one else
2982 * can access them so it's safe to act without any lock.
2984 bfq_flush_idle_tree(st
);
2987 * It may happen that some queues are still active
2988 * (busy) upon group destruction (if the corresponding
2989 * processes have been forced to terminate). We move
2990 * all the leaf entities corresponding to these queues
2991 * to the root_group.
2992 * Also, it may happen that the group has an entity
2993 * in service, which is disconnected from the active
2994 * tree: it must be moved, too.
2995 * There is no need to put the sync queues, as the
2996 * scheduler has taken no reference.
2998 bfq_reparent_active_entities(bfqd
, bfqg
, st
);
3001 __bfq_deactivate_entity(entity
, false);
3002 bfq_put_async_queues(bfqd
, bfqg
);
3004 spin_unlock_irqrestore(&bfqd
->lock
, flags
);
3006 * @blkg is going offline and will be ignored by
3007 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
3008 * that they don't get lost. If IOs complete after this point, the
3009 * stats for them will be lost. Oh well...
3011 bfqg_stats_xfer_dead(bfqg
);
3014 static int bfq_io_show_weight(struct seq_file
*sf
, void *v
)
3016 struct blkcg
*blkcg
= css_to_blkcg(seq_css(sf
));
3017 struct bfq_group_data
*bfqgd
= blkcg_to_bfqgd(blkcg
);
3018 unsigned int val
= 0;
3021 val
= bfqgd
->weight
;
3023 seq_printf(sf
, "%u\n", val
);
3028 static int bfq_io_set_weight_legacy(struct cgroup_subsys_state
*css
,
3029 struct cftype
*cftype
,
3032 struct blkcg
*blkcg
= css_to_blkcg(css
);
3033 struct bfq_group_data
*bfqgd
= blkcg_to_bfqgd(blkcg
);
3034 struct blkcg_gq
*blkg
;
3037 if (val
< BFQ_MIN_WEIGHT
|| val
> BFQ_MAX_WEIGHT
)
3041 spin_lock_irq(&blkcg
->lock
);
3042 bfqgd
->weight
= (unsigned short)val
;
3043 hlist_for_each_entry(blkg
, &blkcg
->blkg_list
, blkcg_node
) {
3044 struct bfq_group
*bfqg
= blkg_to_bfqg(blkg
);
3049 * Setting the prio_changed flag of the entity
3050 * to 1 with new_weight == weight would re-set
3051 * the value of the weight to its ioprio mapping.
3052 * Set the flag only if necessary.
3054 if ((unsigned short)val
!= bfqg
->entity
.new_weight
) {
3055 bfqg
->entity
.new_weight
= (unsigned short)val
;
3057 * Make sure that the above new value has been
3058 * stored in bfqg->entity.new_weight before
3059 * setting the prio_changed flag. In fact,
3060 * this flag may be read asynchronously (in
3061 * critical sections protected by a different
3062 * lock than that held here), and finding this
3063 * flag set may cause the execution of the code
3064 * for updating parameters whose value may
3065 * depend also on bfqg->entity.new_weight (in
3066 * __bfq_entity_update_weight_prio).
3067 * This barrier makes sure that the new value
3068 * of bfqg->entity.new_weight is correctly
3069 * seen in that code.
3072 bfqg
->entity
.prio_changed
= 1;
3075 spin_unlock_irq(&blkcg
->lock
);
3080 static ssize_t
bfq_io_set_weight(struct kernfs_open_file
*of
,
3081 char *buf
, size_t nbytes
,
3085 /* First unsigned long found in the file is used */
3086 int ret
= kstrtoull(strim(buf
), 0, &weight
);
3091 return bfq_io_set_weight_legacy(of_css(of
), NULL
, weight
);
3094 static int bfqg_print_stat(struct seq_file
*sf
, void *v
)
3096 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)), blkg_prfill_stat
,
3097 &blkcg_policy_bfq
, seq_cft(sf
)->private, false);
3101 static int bfqg_print_rwstat(struct seq_file
*sf
, void *v
)
3103 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)), blkg_prfill_rwstat
,
3104 &blkcg_policy_bfq
, seq_cft(sf
)->private, true);
3108 static u64
bfqg_prfill_stat_recursive(struct seq_file
*sf
,
3109 struct blkg_policy_data
*pd
, int off
)
3111 u64 sum
= blkg_stat_recursive_sum(pd_to_blkg(pd
),
3112 &blkcg_policy_bfq
, off
);
3113 return __blkg_prfill_u64(sf
, pd
, sum
);
3116 static u64
bfqg_prfill_rwstat_recursive(struct seq_file
*sf
,
3117 struct blkg_policy_data
*pd
, int off
)
3119 struct blkg_rwstat sum
= blkg_rwstat_recursive_sum(pd_to_blkg(pd
),
3122 return __blkg_prfill_rwstat(sf
, pd
, &sum
);
3125 static int bfqg_print_stat_recursive(struct seq_file
*sf
, void *v
)
3127 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
3128 bfqg_prfill_stat_recursive
, &blkcg_policy_bfq
,
3129 seq_cft(sf
)->private, false);
3133 static int bfqg_print_rwstat_recursive(struct seq_file
*sf
, void *v
)
3135 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
3136 bfqg_prfill_rwstat_recursive
, &blkcg_policy_bfq
,
3137 seq_cft(sf
)->private, true);
3141 static u64
bfqg_prfill_sectors(struct seq_file
*sf
, struct blkg_policy_data
*pd
,
3144 u64 sum
= blkg_rwstat_total(&pd
->blkg
->stat_bytes
);
3146 return __blkg_prfill_u64(sf
, pd
, sum
>> 9);
3149 static int bfqg_print_stat_sectors(struct seq_file
*sf
, void *v
)
3151 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
3152 bfqg_prfill_sectors
, &blkcg_policy_bfq
, 0, false);
3156 static u64
bfqg_prfill_sectors_recursive(struct seq_file
*sf
,
3157 struct blkg_policy_data
*pd
, int off
)
3159 struct blkg_rwstat tmp
= blkg_rwstat_recursive_sum(pd
->blkg
, NULL
,
3160 offsetof(struct blkcg_gq
, stat_bytes
));
3161 u64 sum
= atomic64_read(&tmp
.aux_cnt
[BLKG_RWSTAT_READ
]) +
3162 atomic64_read(&tmp
.aux_cnt
[BLKG_RWSTAT_WRITE
]);
3164 return __blkg_prfill_u64(sf
, pd
, sum
>> 9);
3167 static int bfqg_print_stat_sectors_recursive(struct seq_file
*sf
, void *v
)
3169 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
3170 bfqg_prfill_sectors_recursive
, &blkcg_policy_bfq
, 0,
3175 static u64
bfqg_prfill_avg_queue_size(struct seq_file
*sf
,
3176 struct blkg_policy_data
*pd
, int off
)
3178 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
3179 u64 samples
= blkg_stat_read(&bfqg
->stats
.avg_queue_size_samples
);
3183 v
= blkg_stat_read(&bfqg
->stats
.avg_queue_size_sum
);
3184 v
= div64_u64(v
, samples
);
3186 __blkg_prfill_u64(sf
, pd
, v
);
3190 /* print avg_queue_size */
3191 static int bfqg_print_avg_queue_size(struct seq_file
*sf
, void *v
)
3193 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
3194 bfqg_prfill_avg_queue_size
, &blkcg_policy_bfq
,
3199 static struct bfq_group
*
3200 bfq_create_group_hierarchy(struct bfq_data
*bfqd
, int node
)
3204 ret
= blkcg_activate_policy(bfqd
->queue
, &blkcg_policy_bfq
);
3208 return blkg_to_bfqg(bfqd
->queue
->root_blkg
);
3211 static struct cftype bfq_blkcg_legacy_files
[] = {
3213 .name
= "bfq.weight",
3214 .flags
= CFTYPE_NOT_ON_ROOT
,
3215 .seq_show
= bfq_io_show_weight
,
3216 .write_u64
= bfq_io_set_weight_legacy
,
3219 /* statistics, covers only the tasks in the bfqg */
3222 .private = offsetof(struct bfq_group
, stats
.time
),
3223 .seq_show
= bfqg_print_stat
,
3226 .name
= "bfq.sectors",
3227 .seq_show
= bfqg_print_stat_sectors
,
3230 .name
= "bfq.io_service_bytes",
3231 .private = (unsigned long)&blkcg_policy_bfq
,
3232 .seq_show
= blkg_print_stat_bytes
,
3235 .name
= "bfq.io_serviced",
3236 .private = (unsigned long)&blkcg_policy_bfq
,
3237 .seq_show
= blkg_print_stat_ios
,
3240 .name
= "bfq.io_service_time",
3241 .private = offsetof(struct bfq_group
, stats
.service_time
),
3242 .seq_show
= bfqg_print_rwstat
,
3245 .name
= "bfq.io_wait_time",
3246 .private = offsetof(struct bfq_group
, stats
.wait_time
),
3247 .seq_show
= bfqg_print_rwstat
,
3250 .name
= "bfq.io_merged",
3251 .private = offsetof(struct bfq_group
, stats
.merged
),
3252 .seq_show
= bfqg_print_rwstat
,
3255 .name
= "bfq.io_queued",
3256 .private = offsetof(struct bfq_group
, stats
.queued
),
3257 .seq_show
= bfqg_print_rwstat
,
3260 /* the same statictics which cover the bfqg and its descendants */
3262 .name
= "bfq.time_recursive",
3263 .private = offsetof(struct bfq_group
, stats
.time
),
3264 .seq_show
= bfqg_print_stat_recursive
,
3267 .name
= "bfq.sectors_recursive",
3268 .seq_show
= bfqg_print_stat_sectors_recursive
,
3271 .name
= "bfq.io_service_bytes_recursive",
3272 .private = (unsigned long)&blkcg_policy_bfq
,
3273 .seq_show
= blkg_print_stat_bytes_recursive
,
3276 .name
= "bfq.io_serviced_recursive",
3277 .private = (unsigned long)&blkcg_policy_bfq
,
3278 .seq_show
= blkg_print_stat_ios_recursive
,
3281 .name
= "bfq.io_service_time_recursive",
3282 .private = offsetof(struct bfq_group
, stats
.service_time
),
3283 .seq_show
= bfqg_print_rwstat_recursive
,
3286 .name
= "bfq.io_wait_time_recursive",
3287 .private = offsetof(struct bfq_group
, stats
.wait_time
),
3288 .seq_show
= bfqg_print_rwstat_recursive
,
3291 .name
= "bfq.io_merged_recursive",
3292 .private = offsetof(struct bfq_group
, stats
.merged
),
3293 .seq_show
= bfqg_print_rwstat_recursive
,
3296 .name
= "bfq.io_queued_recursive",
3297 .private = offsetof(struct bfq_group
, stats
.queued
),
3298 .seq_show
= bfqg_print_rwstat_recursive
,
3301 .name
= "bfq.avg_queue_size",
3302 .seq_show
= bfqg_print_avg_queue_size
,
3305 .name
= "bfq.group_wait_time",
3306 .private = offsetof(struct bfq_group
, stats
.group_wait_time
),
3307 .seq_show
= bfqg_print_stat
,
3310 .name
= "bfq.idle_time",
3311 .private = offsetof(struct bfq_group
, stats
.idle_time
),
3312 .seq_show
= bfqg_print_stat
,
3315 .name
= "bfq.empty_time",
3316 .private = offsetof(struct bfq_group
, stats
.empty_time
),
3317 .seq_show
= bfqg_print_stat
,
3320 .name
= "bfq.dequeue",
3321 .private = offsetof(struct bfq_group
, stats
.dequeue
),
3322 .seq_show
= bfqg_print_stat
,
3327 static struct cftype bfq_blkg_files
[] = {
3329 .name
= "bfq.weight",
3330 .flags
= CFTYPE_NOT_ON_ROOT
,
3331 .seq_show
= bfq_io_show_weight
,
3332 .write
= bfq_io_set_weight
,
3337 #else /* CONFIG_BFQ_GROUP_IOSCHED */
3339 static inline void bfqg_stats_update_io_add(struct bfq_group
*bfqg
,
3340 struct bfq_queue
*bfqq
, unsigned int op
) { }
3342 bfqg_stats_update_io_remove(struct bfq_group
*bfqg
, unsigned int op
) { }
3344 bfqg_stats_update_io_merged(struct bfq_group
*bfqg
, unsigned int op
) { }
3345 static inline void bfqg_stats_update_completion(struct bfq_group
*bfqg
,
3346 uint64_t start_time
, uint64_t io_start_time
,
3347 unsigned int op
) { }
3349 bfqg_stats_set_start_group_wait_time(struct bfq_group
*bfqg
,
3350 struct bfq_group
*curr_bfqg
) { }
3351 static inline void bfqg_stats_end_empty_time(struct bfqg_stats
*stats
) { }
3352 static inline void bfqg_stats_update_dequeue(struct bfq_group
*bfqg
) { }
3353 static inline void bfqg_stats_set_start_empty_time(struct bfq_group
*bfqg
) { }
3354 static inline void bfqg_stats_update_idle_time(struct bfq_group
*bfqg
) { }
3355 static inline void bfqg_stats_set_start_idle_time(struct bfq_group
*bfqg
) { }
3356 static inline void bfqg_stats_update_avg_queue_size(struct bfq_group
*bfqg
) { }
3358 static void bfq_bfqq_move(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
3359 struct bfq_group
*bfqg
) {}
3361 static void bfq_init_entity(struct bfq_entity
*entity
,
3362 struct bfq_group
*bfqg
)
3364 struct bfq_queue
*bfqq
= bfq_entity_to_bfqq(entity
);
3366 entity
->weight
= entity
->new_weight
;
3367 entity
->orig_weight
= entity
->new_weight
;
3369 bfqq
->ioprio
= bfqq
->new_ioprio
;
3370 bfqq
->ioprio_class
= bfqq
->new_ioprio_class
;
3372 entity
->sched_data
= &bfqg
->sched_data
;
3375 static void bfq_bic_update_cgroup(struct bfq_io_cq
*bic
, struct bio
*bio
) {}
3377 static struct bfq_group
*bfq_find_set_group(struct bfq_data
*bfqd
,
3378 struct blkcg
*blkcg
)
3380 return bfqd
->root_group
;
3383 static struct bfq_group
*bfqq_group(struct bfq_queue
*bfqq
)
3385 return bfqq
->bfqd
->root_group
;
3388 static struct bfq_group
*bfq_create_group_hierarchy(struct bfq_data
*bfqd
,
3391 struct bfq_group
*bfqg
;
3394 bfqg
= kmalloc_node(sizeof(*bfqg
), GFP_KERNEL
| __GFP_ZERO
, node
);
3398 for (i
= 0; i
< BFQ_IOPRIO_CLASSES
; i
++)
3399 bfqg
->sched_data
.service_tree
[i
] = BFQ_SERVICE_TREE_INIT
;
3403 #endif /* CONFIG_BFQ_GROUP_IOSCHED */
3405 #define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
3406 #define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
3408 #define bfq_sample_valid(samples) ((samples) > 80)
3411 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
3412 * We choose the request that is closesr to the head right now. Distance
3413 * behind the head is penalized and only allowed to a certain extent.
3415 static struct request
*bfq_choose_req(struct bfq_data
*bfqd
,
3416 struct request
*rq1
,
3417 struct request
*rq2
,
3420 sector_t s1
, s2
, d1
= 0, d2
= 0;
3421 unsigned long back_max
;
3422 #define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
3423 #define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
3424 unsigned int wrap
= 0; /* bit mask: requests behind the disk head? */
3426 if (!rq1
|| rq1
== rq2
)
3431 if (rq_is_sync(rq1
) && !rq_is_sync(rq2
))
3433 else if (rq_is_sync(rq2
) && !rq_is_sync(rq1
))
3435 if ((rq1
->cmd_flags
& REQ_META
) && !(rq2
->cmd_flags
& REQ_META
))
3437 else if ((rq2
->cmd_flags
& REQ_META
) && !(rq1
->cmd_flags
& REQ_META
))
3440 s1
= blk_rq_pos(rq1
);
3441 s2
= blk_rq_pos(rq2
);
3444 * By definition, 1KiB is 2 sectors.
3446 back_max
= bfqd
->bfq_back_max
* 2;
3449 * Strict one way elevator _except_ in the case where we allow
3450 * short backward seeks which are biased as twice the cost of a
3451 * similar forward seek.
3455 else if (s1
+ back_max
>= last
)
3456 d1
= (last
- s1
) * bfqd
->bfq_back_penalty
;
3458 wrap
|= BFQ_RQ1_WRAP
;
3462 else if (s2
+ back_max
>= last
)
3463 d2
= (last
- s2
) * bfqd
->bfq_back_penalty
;
3465 wrap
|= BFQ_RQ2_WRAP
;
3467 /* Found required data */
3470 * By doing switch() on the bit mask "wrap" we avoid having to
3471 * check two variables for all permutations: --> faster!
3474 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
3489 case BFQ_RQ1_WRAP
|BFQ_RQ2_WRAP
: /* both rqs wrapped */
3492 * Since both rqs are wrapped,
3493 * start with the one that's further behind head
3494 * (--> only *one* back seek required),
3495 * since back seek takes more time than forward.
3505 * Return expired entry, or NULL to just start from scratch in rbtree.
3507 static struct request
*bfq_check_fifo(struct bfq_queue
*bfqq
,
3508 struct request
*last
)
3512 if (bfq_bfqq_fifo_expire(bfqq
))
3515 bfq_mark_bfqq_fifo_expire(bfqq
);
3517 rq
= rq_entry_fifo(bfqq
->fifo
.next
);
3519 if (rq
== last
|| ktime_get_ns() < rq
->fifo_time
)
3522 bfq_log_bfqq(bfqq
->bfqd
, bfqq
, "check_fifo: returned %p", rq
);
3526 static struct request
*bfq_find_next_rq(struct bfq_data
*bfqd
,
3527 struct bfq_queue
*bfqq
,
3528 struct request
*last
)
3530 struct rb_node
*rbnext
= rb_next(&last
->rb_node
);
3531 struct rb_node
*rbprev
= rb_prev(&last
->rb_node
);
3532 struct request
*next
, *prev
= NULL
;
3534 /* Follow expired path, else get first next available. */
3535 next
= bfq_check_fifo(bfqq
, last
);
3540 prev
= rb_entry_rq(rbprev
);
3543 next
= rb_entry_rq(rbnext
);
3545 rbnext
= rb_first(&bfqq
->sort_list
);
3546 if (rbnext
&& rbnext
!= &last
->rb_node
)
3547 next
= rb_entry_rq(rbnext
);
3550 return bfq_choose_req(bfqd
, next
, prev
, blk_rq_pos(last
));
3553 static unsigned long bfq_serv_to_charge(struct request
*rq
,
3554 struct bfq_queue
*bfqq
)
3556 return blk_rq_sectors(rq
);
3560 * bfq_updated_next_req - update the queue after a new next_rq selection.
3561 * @bfqd: the device data the queue belongs to.
3562 * @bfqq: the queue to update.
3564 * If the first request of a queue changes we make sure that the queue
3565 * has enough budget to serve at least its first request (if the
3566 * request has grown). We do this because if the queue has not enough
3567 * budget for its first request, it has to go through two dispatch
3568 * rounds to actually get it dispatched.
3570 static void bfq_updated_next_req(struct bfq_data
*bfqd
,
3571 struct bfq_queue
*bfqq
)
3573 struct bfq_entity
*entity
= &bfqq
->entity
;
3574 struct request
*next_rq
= bfqq
->next_rq
;
3575 unsigned long new_budget
;
3580 if (bfqq
== bfqd
->in_service_queue
)
3582 * In order not to break guarantees, budgets cannot be
3583 * changed after an entity has been selected.
3587 new_budget
= max_t(unsigned long, bfqq
->max_budget
,
3588 bfq_serv_to_charge(next_rq
, bfqq
));
3589 if (entity
->budget
!= new_budget
) {
3590 entity
->budget
= new_budget
;
3591 bfq_log_bfqq(bfqd
, bfqq
, "updated next rq: new budget %lu",
3593 bfq_requeue_bfqq(bfqd
, bfqq
);
3597 static int bfq_bfqq_budget_left(struct bfq_queue
*bfqq
)
3599 struct bfq_entity
*entity
= &bfqq
->entity
;
3601 return entity
->budget
- entity
->service
;
3605 * If enough samples have been computed, return the current max budget
3606 * stored in bfqd, which is dynamically updated according to the
3607 * estimated disk peak rate; otherwise return the default max budget
3609 static int bfq_max_budget(struct bfq_data
*bfqd
)
3611 if (bfqd
->budgets_assigned
< bfq_stats_min_budgets
)
3612 return bfq_default_max_budget
;
3614 return bfqd
->bfq_max_budget
;
3618 * Return min budget, which is a fraction of the current or default
3619 * max budget (trying with 1/32)
3621 static int bfq_min_budget(struct bfq_data
*bfqd
)
3623 if (bfqd
->budgets_assigned
< bfq_stats_min_budgets
)
3624 return bfq_default_max_budget
/ 32;
3626 return bfqd
->bfq_max_budget
/ 32;
3629 static void bfq_bfqq_expire(struct bfq_data
*bfqd
,
3630 struct bfq_queue
*bfqq
,
3632 enum bfqq_expiration reason
);
3635 * The next function, invoked after the input queue bfqq switches from
3636 * idle to busy, updates the budget of bfqq. The function also tells
3637 * whether the in-service queue should be expired, by returning
3638 * true. The purpose of expiring the in-service queue is to give bfqq
3639 * the chance to possibly preempt the in-service queue, and the reason
3640 * for preempting the in-service queue is to achieve the following
3641 * goal: guarantee to bfqq its reserved bandwidth even if bfqq has
3642 * expired because it has remained idle.
3644 * In particular, bfqq may have expired for one of the following two
3647 * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
3648 * and did not make it to issue a new request before its last
3649 * request was served;
3651 * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
3652 * a new request before the expiration of the idling-time.
3654 * Even if bfqq has expired for one of the above reasons, the process
3655 * associated with the queue may be however issuing requests greedily,
3656 * and thus be sensitive to the bandwidth it receives (bfqq may have
3657 * remained idle for other reasons: CPU high load, bfqq not enjoying
3658 * idling, I/O throttling somewhere in the path from the process to
3659 * the I/O scheduler, ...). But if, after every expiration for one of
3660 * the above two reasons, bfqq has to wait for the service of at least
3661 * one full budget of another queue before being served again, then
3662 * bfqq is likely to get a much lower bandwidth or resource time than
3663 * its reserved ones. To address this issue, two countermeasures need
3666 * First, the budget and the timestamps of bfqq need to be updated in
3667 * a special way on bfqq reactivation: they need to be updated as if
3668 * bfqq did not remain idle and did not expire. In fact, if they are
3669 * computed as if bfqq expired and remained idle until reactivation,
3670 * then the process associated with bfqq is treated as if, instead of
3671 * being greedy, it stopped issuing requests when bfqq remained idle,
3672 * and restarts issuing requests only on this reactivation. In other
3673 * words, the scheduler does not help the process recover the "service
3674 * hole" between bfqq expiration and reactivation. As a consequence,
3675 * the process receives a lower bandwidth than its reserved one. In
3676 * contrast, to recover this hole, the budget must be updated as if
3677 * bfqq was not expired at all before this reactivation, i.e., it must
3678 * be set to the value of the remaining budget when bfqq was
3679 * expired. Along the same line, timestamps need to be assigned the
3680 * value they had the last time bfqq was selected for service, i.e.,
3681 * before last expiration. Thus timestamps need to be back-shifted
3682 * with respect to their normal computation (see [1] for more details
3683 * on this tricky aspect).
3685 * Secondly, to allow the process to recover the hole, the in-service
3686 * queue must be expired too, to give bfqq the chance to preempt it
3687 * immediately. In fact, if bfqq has to wait for a full budget of the
3688 * in-service queue to be completed, then it may become impossible to
3689 * let the process recover the hole, even if the back-shifted
3690 * timestamps of bfqq are lower than those of the in-service queue. If
3691 * this happens for most or all of the holes, then the process may not
3692 * receive its reserved bandwidth. In this respect, it is worth noting
3693 * that, being the service of outstanding requests unpreemptible, a
3694 * little fraction of the holes may however be unrecoverable, thereby
3695 * causing a little loss of bandwidth.
3697 * The last important point is detecting whether bfqq does need this
3698 * bandwidth recovery. In this respect, the next function deems the
3699 * process associated with bfqq greedy, and thus allows it to recover
3700 * the hole, if: 1) the process is waiting for the arrival of a new
3701 * request (which implies that bfqq expired for one of the above two
3702 * reasons), and 2) such a request has arrived soon. The first
3703 * condition is controlled through the flag non_blocking_wait_rq,
3704 * while the second through the flag arrived_in_time. If both
3705 * conditions hold, then the function computes the budget in the
3706 * above-described special way, and signals that the in-service queue
3707 * should be expired. Timestamp back-shifting is done later in
3708 * __bfq_activate_entity.
3710 static bool bfq_bfqq_update_budg_for_activation(struct bfq_data
*bfqd
,
3711 struct bfq_queue
*bfqq
,
3712 bool arrived_in_time
)
3714 struct bfq_entity
*entity
= &bfqq
->entity
;
3716 if (bfq_bfqq_non_blocking_wait_rq(bfqq
) && arrived_in_time
) {
3718 * We do not clear the flag non_blocking_wait_rq here, as
3719 * the latter is used in bfq_activate_bfqq to signal
3720 * that timestamps need to be back-shifted (and is
3721 * cleared right after).
3725 * In next assignment we rely on that either
3726 * entity->service or entity->budget are not updated
3727 * on expiration if bfqq is empty (see
3728 * __bfq_bfqq_recalc_budget). Thus both quantities
3729 * remain unchanged after such an expiration, and the
3730 * following statement therefore assigns to
3731 * entity->budget the remaining budget on such an
3732 * expiration. For clarity, entity->service is not
3733 * updated on expiration in any case, and, in normal
3734 * operation, is reset only when bfqq is selected for
3735 * service (see bfq_get_next_queue).
3737 entity
->budget
= min_t(unsigned long,
3738 bfq_bfqq_budget_left(bfqq
),
3744 entity
->budget
= max_t(unsigned long, bfqq
->max_budget
,
3745 bfq_serv_to_charge(bfqq
->next_rq
, bfqq
));
3746 bfq_clear_bfqq_non_blocking_wait_rq(bfqq
);
static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
					     struct bfq_queue *bfqq,
					     struct request *rq)
{
	bool bfqq_wants_to_preempt,
		/*
		 * See the comments on
		 * bfq_bfqq_update_budg_for_activation for
		 * details on the usage of the next variable.
		 */
		arrived_in_time = ktime_get_ns() <=
			bfqq->ttime.last_end_request +
			bfqd->bfq_slice_idle * 3;

	bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);

	/*
	 * Update budget and check whether bfqq may want to preempt
	 * the in-service queue.
	 */
	bfqq_wants_to_preempt =
		bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
						    arrived_in_time);

	if (!bfq_bfqq_IO_bound(bfqq)) {
		if (arrived_in_time) {
			bfqq->requests_within_timer++;
			if (bfqq->requests_within_timer >=
			    bfqd->bfq_requests_within_timer)
				bfq_mark_bfqq_IO_bound(bfqq);
		} else
			bfqq->requests_within_timer = 0;
	}

	bfq_add_bfqq_busy(bfqd, bfqq);

	/*
	 * Expire in-service queue only if preemption may be needed
	 * for guarantees. In this respect, the function
	 * next_queue_may_preempt just checks a simple, necessary
	 * condition, and not a sufficient condition based on
	 * timestamps. In fact, for the latter condition to be
	 * evaluated, timestamps would need first to be updated, and
	 * this operation is quite costly (see the comments on the
	 * function bfq_bfqq_update_budg_for_activation).
	 */
	if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
	    next_queue_may_preempt(bfqd))
		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
				false, BFQQE_PREEMPTED);
}

static void bfq_add_request(struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;
	struct request *next_rq, *prev;

	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
	bfqq->queued[rq_is_sync(rq)]++;
	bfqd->queued++;

	elv_rb_add(&bfqq->sort_list, rq);

	/*
	 * Check if this request is a better next-serve candidate.
	 */
	prev = bfqq->next_rq;
	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
	bfqq->next_rq = next_rq;

	if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
		bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, rq);
	else if (prev != bfqq->next_rq)
		bfq_updated_next_req(bfqd, bfqq);
}

static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
					  struct bio *bio,
					  struct request_queue *q)
{
	struct bfq_queue *bfqq = bfqd->bio_bfqq;

	if (bfqq)
		return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));

	return NULL;
}

#if 0 /* Still not clear if we can do without next two functions */
static void bfq_activate_request(struct request_queue *q, struct request *rq)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;

	bfqd->rq_in_driver++;
	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
	bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
		(unsigned long long)bfqd->last_position);
}

static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;

	bfqd->rq_in_driver--;
}
#endif

static void bfq_remove_request(struct request_queue *q,
			       struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;
	const int sync = rq_is_sync(rq);

	if (bfqq->next_rq == rq) {
		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
		bfq_updated_next_req(bfqd, bfqq);
	}

	if (rq->queuelist.prev != &rq->queuelist)
		list_del_init(&rq->queuelist);
	bfqq->queued[sync]--;
	bfqd->queued--;
	elv_rb_del(&bfqq->sort_list, rq);

	elv_rqhash_del(q, rq);
	if (q->last_merge == rq)
		q->last_merge = NULL;

	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
		bfqq->next_rq = NULL;

		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
			bfq_del_bfqq_busy(bfqd, bfqq, false);
			/*
			 * bfqq emptied. In normal operation, when
			 * bfqq is empty, bfqq->entity.service and
			 * bfqq->entity.budget must contain,
			 * respectively, the service received and the
			 * budget used last time bfqq emptied. These
			 * facts do not hold in this case, as at least
			 * this last removal occurred while bfqq is
			 * not in service. To avoid inconsistencies,
			 * reset both bfqq->entity.service and
			 * bfqq->entity.budget, if bfqq has still a
			 * process that may issue I/O requests to it.
			 */
			bfqq->entity.budget = bfqq->entity.service = 0;
		}
	}

	if (rq->cmd_flags & REQ_META)
		bfqq->meta_pending--;

	bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
}

static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
{
	struct request_queue *q = hctx->queue;
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct request *free = NULL;
	/*
	 * bfq_bic_lookup grabs the queue_lock: invoke it now and
	 * store its return value for later use, to avoid nesting
	 * queue_lock inside the bfqd->lock. We assume that the bic
	 * returned by bfq_bic_lookup does not go away before
	 * bfqd->lock is taken.
	 */
	struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
	bool ret;

	spin_lock_irq(&bfqd->lock);

	if (bic)
		bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
	else
		bfqd->bio_bfqq = NULL;
	bfqd->bio_bic = bic;

	ret = blk_mq_sched_try_merge(q, bio, &free);

	if (free)
		blk_mq_free_request(free);
	spin_unlock_irq(&bfqd->lock);

	return ret;
}

static int bfq_request_merge(struct request_queue *q, struct request **req,
			     struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct request *__rq;

	__rq = bfq_find_rq_fmerge(bfqd, bio, q);
	if (__rq && elv_bio_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_FRONT_MERGE;
	}

	return ELEVATOR_NO_MERGE;
}

static void bfq_request_merged(struct request_queue *q, struct request *req,
			       enum elv_merge type)
{
	if (type == ELEVATOR_FRONT_MERGE &&
	    rb_prev(&req->rb_node) &&
	    blk_rq_pos(req) <
	    blk_rq_pos(container_of(rb_prev(&req->rb_node),
				    struct request, rb_node))) {
		struct bfq_queue *bfqq = RQ_BFQQ(req);
		struct bfq_data *bfqd = bfqq->bfqd;
		struct request *prev, *next_rq;

		/* Reposition request in its sort_list */
		elv_rb_del(&bfqq->sort_list, req);
		elv_rb_add(&bfqq->sort_list, req);

		/* Choose next request to be served for bfqq */
		prev = bfqq->next_rq;
		next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
					 bfqd->last_position);
		bfqq->next_rq = next_rq;
		/*
		 * If next_rq changes, update the queue's budget to fit
		 * the new request.
		 */
		if (prev != bfqq->next_rq)
			bfq_updated_next_req(bfqd, bfqq);
	}
}

static void bfq_requests_merged(struct request_queue *q, struct request *rq,
				struct request *next)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);

	if (!RB_EMPTY_NODE(&rq->rb_node))
		return;
	spin_lock_irq(&bfqq->bfqd->lock);

	/*
	 * If next and rq belong to the same bfq_queue and next is older
	 * than rq, then reposition rq in the fifo (by substituting next
	 * with rq). Otherwise, if next and rq belong to different
	 * bfq_queues, never reposition rq: in fact, we would have to
	 * reposition it with respect to next's position in its own fifo,
	 * which would most certainly be too expensive with respect to
	 * the benefits.
	 */
	if (bfqq == next_bfqq &&
	    !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
	    next->fifo_time < rq->fifo_time) {
		list_del_init(&rq->queuelist);
		list_replace_init(&next->queuelist, &rq->queuelist);
		rq->fifo_time = next->fifo_time;
	}

	if (bfqq->next_rq == next)
		bfqq->next_rq = rq;

	bfq_remove_request(q, next);

	spin_unlock_irq(&bfqq->bfqd->lock);

	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
}

static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
				struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	bool is_sync = op_is_sync(bio->bi_opf);
	struct bfq_queue *bfqq = bfqd->bio_bfqq;

	/*
	 * Disallow merge of a sync bio into an async request.
	 */
	if (is_sync && !rq_is_sync(rq))
		return false;

	/*
	 * Lookup the bfqq that this bio will be queued with. Allow
	 * merge only if rq is queued there.
	 */
	if (!bfqq)
		return false;

	return bfqq == RQ_BFQQ(rq);
}

static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
				       struct bfq_queue *bfqq)
{
	if (bfqq) {
		bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
		bfq_mark_bfqq_budget_new(bfqq);
		bfq_clear_bfqq_fifo_expire(bfqq);

		bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;

		bfq_log_bfqq(bfqd, bfqq,
			     "set_in_service_queue, cur-budget = %d",
			     bfqq->entity.budget);
	}

	bfqd->in_service_queue = bfqq;
}

/*
 * Get and set a new queue for service.
 */
static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);

	__bfq_set_in_service_queue(bfqd, bfqq);
	return bfqq;
}

static void bfq_arm_slice_timer(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfqd->in_service_queue;
	struct bfq_io_cq *bic;
	u64 sl;

	/* Processes have exited, don't wait. */
	bic = bfqd->in_service_bic;
	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
		return;

	bfq_mark_bfqq_wait_request(bfqq);

	/*
	 * We don't want to idle for seeks, but we do want to allow
	 * fair distribution of slice time for a process doing back-to-back
	 * seeks. So allow a little bit of time for him to submit a new rq.
	 */
	sl = bfqd->bfq_slice_idle;
	/*
	 * Grant only minimum idle time if the queue is seeky.
	 */
	if (BFQQ_SEEKY(bfqq))
		sl = min_t(u64, sl, BFQ_MIN_TT);

	bfqd->last_idling_start = ktime_get();
	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
		      HRTIMER_MODE_REL);
	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
}

/*
 * Set the maximum time for the in-service queue to consume its
 * budget. This prevents seeky processes from lowering the disk
 * throughput (always guaranteed with a time slice scheme as in CFQ).
 */
static void bfq_set_budget_timeout(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfqd->in_service_queue;
	unsigned int timeout_coeff = bfqq->entity.weight /
				     bfqq->entity.orig_weight;

	bfqd->last_budget_start = ktime_get();

	bfq_clear_bfqq_budget_new(bfqq);
	bfqq->budget_timeout = jiffies +
		bfqd->bfq_timeout * timeout_coeff;

	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
		jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff));
}

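/*
 * Illustration (hypothetical weights): a queue whose weight is currently
 * twice its original weight has timeout_coeff = 2 and is therefore
 * allowed up to 2 * bfq_timeout jiffies to consume its budget, while a
 * queue at its original weight gets exactly bfq_timeout. Scaling the
 * timeout with the weight keeps the time-domain cap roughly proportional
 * to the service share the queue is entitled to.
 */
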
/*
 * Remove request from internal lists.
 */
static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);

	/*
	 * For consistency, the next instruction should have been
	 * executed after removing the request from the queue and
	 * dispatching it. We execute instead this instruction before
	 * bfq_remove_request() (and hence introduce a temporary
	 * inconsistency), for efficiency. In fact, should this
	 * dispatch occur for a non in-service bfqq, this anticipated
	 * increment prevents two counters related to bfqq->dispatched
	 * from risking to be, first, uselessly decremented, and then
	 * incremented again when the (new) value of bfqq->dispatched
	 * happens to be taken into account.
	 */
	bfqq->dispatched++;

	bfq_remove_request(q, rq);
}

static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	if (RB_EMPTY_ROOT(&bfqq->sort_list))
		bfq_del_bfqq_busy(bfqd, bfqq, true);
	else
		bfq_requeue_bfqq(bfqd, bfqq);

	/*
	 * All in-service entities must have been properly deactivated
	 * or requeued before executing the next function, which
	 * resets all in-service entities as no more in service.
	 */
	__bfq_bfqd_reset_in_service(bfqd);
}

/**
 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
 * @bfqd: device data.
 * @bfqq: queue to update.
 * @reason: reason for expiration.
 *
 * Handle the feedback on @bfqq budget at queue expiration.
 * See the body for detailed comments.
 */
static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
				     struct bfq_queue *bfqq,
				     enum bfqq_expiration reason)
{
	struct request *next_rq;
	int budget, min_budget;

	budget = bfqq->max_budget;
	min_budget = bfq_min_budget(bfqd);

	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
		budget, bfq_min_budget(bfqd));
	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));

	if (bfq_bfqq_sync(bfqq)) {
		switch (reason) {
		/*
		 * Caveat: in all the following cases we trade latency
		 * for throughput.
		 */
		case BFQQE_TOO_IDLE:
			/*
			 * This is the only case where we may reduce
			 * the budget: if there is no request of the
			 * process still waiting for completion, then
			 * we assume (tentatively) that the timer has
			 * expired because the batch of requests of
			 * the process could have been served with a
			 * smaller budget. Hence, betting that
			 * process will behave in the same way when it
			 * becomes backlogged again, we reduce its
			 * next budget. As long as we guess right,
			 * this budget cut reduces the latency
			 * experienced by the process.
			 *
			 * However, if there are still outstanding
			 * requests, then the process may have not yet
			 * issued its next request just because it is
			 * still waiting for the completion of some of
			 * the still outstanding ones. So in this
			 * subcase we do not reduce its budget, on the
			 * contrary we increase it to possibly boost
			 * the throughput, as discussed in the
			 * comments to the BUDGET_TIMEOUT case.
			 */
			if (bfqq->dispatched > 0) /* still outstanding reqs */
				budget = min(budget * 2, bfqd->bfq_max_budget);
			else {
				if (budget > 5 * min_budget)
					budget -= 4 * min_budget;
				else
					budget = min_budget;
			}
			break;
		case BFQQE_BUDGET_TIMEOUT:
			/*
			 * We double the budget here because it gives
			 * the chance to boost the throughput if this
			 * is not a seeky process (and has bumped into
			 * this timeout because of, e.g., ZBR).
			 */
			budget = min(budget * 2, bfqd->bfq_max_budget);
			break;
		case BFQQE_BUDGET_EXHAUSTED:
			/*
			 * The process still has backlog, and did not
			 * let either the budget timeout or the disk
			 * idling timeout expire. Hence it is not
			 * seeky, has a short thinktime and may be
			 * happy with a higher budget too. So
			 * definitely increase the budget of this good
			 * candidate to boost the disk throughput.
			 */
			budget = min(budget * 4, bfqd->bfq_max_budget);
			break;
		case BFQQE_NO_MORE_REQUESTS:
			/*
			 * For queues that expire for this reason, it
			 * is particularly important to keep the
			 * budget close to the actual service they
			 * need. Doing so reduces the timestamp
			 * misalignment problem described in the
			 * comments in the body of
			 * __bfq_activate_entity. In fact, suppose
			 * that a queue systematically expires for
			 * BFQQE_NO_MORE_REQUESTS and presents a
			 * new request in time to enjoy timestamp
			 * back-shifting. The larger the budget of the
			 * queue is with respect to the service the
			 * queue actually requests in each service
			 * slot, the more times the queue can be
			 * reactivated with the same virtual finish
			 * time. It follows that, even if this finish
			 * time is pushed to the system virtual time
			 * to reduce the consequent timestamp
			 * misalignment, the queue unjustly enjoys for
			 * many re-activations a lower finish time
			 * than all newly activated queues.
			 *
			 * The service needed by bfqq is measured
			 * quite precisely by bfqq->entity.service.
			 * Since bfqq does not enjoy device idling,
			 * bfqq->entity.service is equal to the number
			 * of sectors that the process associated with
			 * bfqq requested to read/write before waiting
			 * for request completions, or blocking for
			 * other reasons.
			 */
			budget = max_t(int, bfqq->entity.service, min_budget);
			break;
		default:
			return;
		}
	} else {
		/*
		 * Async queues get always the maximum possible
		 * budget, as for them we do not care about latency
		 * (in addition, their ability to dispatch is limited
		 * by the charging factor).
		 */
		budget = bfqd->bfq_max_budget;
	}

	bfqq->max_budget = budget;

	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
	    !bfqd->bfq_user_max_budget)
		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);

	/*
	 * If there is still backlog, then assign a new budget, making
	 * sure that it is large enough for the next request. Since
	 * the finish time of bfqq must be kept in sync with the
	 * budget, be sure to call __bfq_bfqq_expire() *after* this
	 * update.
	 *
	 * If there is no backlog, then no need to update the budget;
	 * it will be updated on the arrival of a new request.
	 */
	next_rq = bfqq->next_rq;
	if (next_rq)
		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
					    bfq_serv_to_charge(next_rq, bfqq));

	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
			next_rq ? blk_rq_sectors(next_rq) : 0,
			bfqq->entity.budget);
}

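/*
 * Worked example of the feedback above (made-up values): with
 * bfq_max_budget = 16384 and min_budget = 512, a sync queue whose
 * max_budget is 2048 gets, depending on the expiration reason:
 *  - BFQQE_BUDGET_EXHAUSTED: min(2048 * 4, 16384) = 8192;
 *  - BFQQE_BUDGET_TIMEOUT:   min(2048 * 2, 16384) = 4096;
 *  - BFQQE_TOO_IDLE with no outstanding requests: 2048 is not greater
 *    than 5 * 512, so the budget collapses to min_budget = 512;
 *  - BFQQE_NO_MORE_REQUESTS: max(service actually received, min_budget).
 * An async queue skips the switch and always gets bfq_max_budget.
 */
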
static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
{
	unsigned long max_budget;

	/*
	 * The max_budget calculated when autotuning is equal to the
	 * amount of sectors transferred in timeout at the estimated
	 * peak rate. To get this value, peak_rate is, first,
	 * multiplied by 1000, because timeout is measured in ms,
	 * while peak_rate is measured in sectors/usecs. Then the
	 * result of this multiplication is right-shifted by
	 * BFQ_RATE_SHIFT, because peak_rate is equal to the value of
	 * the peak rate left-shifted by BFQ_RATE_SHIFT.
	 */
	max_budget = (unsigned long)(peak_rate * 1000 *
				     timeout >> BFQ_RATE_SHIFT);

	return max_budget;
}

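/*
 * Illustration of the fixed-point arithmetic above (hypothetical
 * figures): a device measured at 8 sectors/usec is stored as
 * peak_rate = 8 << BFQ_RATE_SHIFT. With a 125 ms timeout:
 *
 *   max_budget = (8 << BFQ_RATE_SHIFT) * 1000 * 125 >> BFQ_RATE_SHIFT
 *              = 8 * 125000 = 1000000 sectors,
 *
 * i.e., the number of sectors the device is expected to transfer in one
 * budget timeout at the estimated peak rate.
 */
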
/*
 * In addition to updating the peak rate, checks whether the process
 * is "slow", and returns 1 if so. This slow flag is used, in addition
 * to the budget timeout, to reduce the amount of service provided to
 * seeky processes, and hence reduce their chances to lower the
 * throughput. See the code for more details.
 */
static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
				 bool compensate)
{
	u64 bw, usecs, expected, timeout;
	ktime_t delta;
	int update = 0;

	if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
		return false;

	if (compensate)
		delta = bfqd->last_idling_start;
	else
		delta = ktime_get();
	delta = ktime_sub(delta, bfqd->last_budget_start);
	usecs = ktime_to_us(delta);

	/* don't use too short time intervals */
	if (usecs < 1000)
		return false;

	/*
	 * Calculate the bandwidth for the last slice. We use a 64 bit
	 * value to store the peak rate, in sectors per usec in fixed
	 * point math. We do so to have enough precision in the estimate
	 * and to avoid overflows.
	 */
	bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
	do_div(bw, (unsigned long)usecs);

	timeout = jiffies_to_msecs(bfqd->bfq_timeout);

	/*
	 * Use only long (> 20ms) intervals to filter out spikes for
	 * the peak rate estimation.
	 */
	if (usecs > 20000) {
		if (bw > bfqd->peak_rate) {
			bfqd->peak_rate = bw;
			update = 1;
			bfq_log(bfqd, "new peak_rate=%llu", bw);
		}

		update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;

		if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
			bfqd->peak_rate_samples++;

		if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
		    update && bfqd->bfq_user_max_budget == 0) {
			bfqd->bfq_max_budget =
				bfq_calc_max_budget(bfqd->peak_rate,
						    timeout);
			bfq_log(bfqd, "new max_budget=%d",
				bfqd->bfq_max_budget);
		}
	}

	/*
	 * A process is considered ``slow'' (i.e., seeky, so that we
	 * cannot treat it fairly in the service domain, as it would
	 * slow down too much the other processes) if, when a slice
	 * ends for whatever reason, it has received service at a
	 * rate that would not be high enough to complete the budget
	 * before the budget timeout expiration.
	 */
	expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;

	/*
	 * Caveat: processes doing IO in the slower disk zones will
	 * tend to be slow(er) even if not seeky. And the estimated
	 * peak rate will actually be an average over the disk
	 * surface. Hence, to not be too harsh with unlucky processes,
	 * we keep a budget/3 margin of safety before declaring a
	 * process slow.
	 */
	return expected > (4 * bfqq->entity.budget) / 3;
}

/*
 * Return the farthest past time instant according to jiffies
 * macros.
 */
static unsigned long bfq_smallest_from_now(void)
{
	return jiffies - MAX_JIFFY_OFFSET;
}

/**
 * bfq_bfqq_expire - expire a queue.
 * @bfqd: device owning the queue.
 * @bfqq: the queue to expire.
 * @compensate: if true, compensate for the time spent idling.
 * @reason: the reason causing the expiration.
 *
 * If the process associated with the queue is slow (i.e., seeky), or
 * in case of budget timeout, or, finally, if it is async, we
 * artificially charge it an entire budget (independently of the
 * actual service it received). As a consequence, the queue will get
 * higher timestamps than the correct ones upon reactivation, and
 * hence it will be rescheduled as if it had received more service
 * than what it actually received. In the end, this class of processes
 * will receive less service in proportion to how slowly they consume
 * their budgets (and hence how seriously they tend to lower the
 * throughput).
 *
 * In contrast, when a queue expires because it has been idling for
 * too much or because it exhausted its budget, we do not touch the
 * amount of service it has received. Hence when the queue will be
 * reactivated and its timestamps updated, the latter will be in sync
 * with the actual service received by the queue until expiration.
 *
 * Charging a full budget to the first type of queues and the exact
 * service to the others has the effect of using the WF2Q+ policy to
 * schedule the former on a timeslice basis, without violating the
 * service domain guarantees of the latter.
 */
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason)
{
	bool slow;
	int ref;

	/*
	 * Update device peak rate for autotuning and check whether the
	 * process is slow (see bfq_update_peak_rate).
	 */
	slow = bfq_update_peak_rate(bfqd, bfqq, compensate);

	/*
	 * As above explained, 'punish' slow (i.e., seeky), timed-out
	 * and async queues, to favor sequential sync workloads.
	 */
	if (slow || reason == BFQQE_BUDGET_TIMEOUT)
		bfq_bfqq_charge_full_budget(bfqq);

	if (reason == BFQQE_TOO_IDLE &&
	    bfqq->entity.service <= 2 * bfqq->entity.budget / 10)
		bfq_clear_bfqq_IO_bound(bfqq);

	bfq_log_bfqq(bfqd, bfqq,
		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));

	/*
	 * Increase, decrease or leave budget unchanged according to
	 * reason.
	 */
	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
	ref = bfqq->ref;
	__bfq_bfqq_expire(bfqd, bfqq);

	/* mark bfqq as waiting a request only if a bic still points to it */
	if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
	    reason != BFQQE_BUDGET_TIMEOUT &&
	    reason != BFQQE_BUDGET_EXHAUSTED)
		bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
}

/*
 * Budget timeout is not implemented through a dedicated timer, but
 * just checked on request arrivals and completions, as well as on
 * idle timer expirations.
 */
static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
{
	if (bfq_bfqq_budget_new(bfqq) ||
	    time_is_after_jiffies(bfqq->budget_timeout))
		return false;
	return true;
}

/*
 * If we expire a queue that is actively waiting (i.e., with the
 * device idled) for the arrival of a new request, then we may incur
 * the timestamp misalignment problem described in the body of the
 * function __bfq_activate_entity. Hence we return true only if this
 * condition does not hold, or if the queue is slow enough to deserve
 * only to be kicked off for preserving a high throughput.
 */
static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqq->bfqd, bfqq,
		"may_budget_timeout: wait_request %d left %d timeout %d",
		bfq_bfqq_wait_request(bfqq),
			bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
		bfq_bfqq_budget_timeout(bfqq));

	return (!bfq_bfqq_wait_request(bfqq) ||
		bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
		&&
		bfq_bfqq_budget_timeout(bfqq);
}

/*
 * For a queue that becomes empty, device idling is allowed only if
 * this function returns true for the queue. And this function returns
 * true only if idling is beneficial for throughput.
 */
static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	bool idling_boosts_thr;

	if (bfqd->strict_guarantees)
		return true;

	/*
	 * The value of the next variable is computed considering that
	 * idling is usually beneficial for the throughput if:
	 * (a) the device is not NCQ-capable, or
	 * (b) regardless of the presence of NCQ, the request pattern
	 *     for bfqq is I/O-bound (possible throughput losses
	 *     caused by granting idling to seeky queues are mitigated
	 *     by the fact that, in all scenarios where boosting
	 *     throughput is the best thing to do, i.e., in all
	 *     symmetric scenarios, only a minimal idle time is
	 *     allowed to seeky queues).
	 */
	idling_boosts_thr = !bfqd->hw_tag || bfq_bfqq_IO_bound(bfqq);

	/*
	 * We have now the components we need to compute the return
	 * value of the function, which is true only if both the
	 * following conditions hold:
	 * 1) bfqq is sync, because idling makes sense only for sync queues;
	 * 2) idling boosts the throughput.
	 */
	return bfq_bfqq_sync(bfqq) && idling_boosts_thr;
}

/*
 * If the in-service queue is empty but the function bfq_bfqq_may_idle
 * returns true, then:
 * 1) the queue must remain in service and cannot be expired, and
 * 2) the device must be idled to wait for the possible arrival of a new
 *    request for the queue.
 * See the comments on the function bfq_bfqq_may_idle for the reasons
 * why performing device idling is the best choice to boost the throughput
 * and preserve service guarantees when bfq_bfqq_may_idle itself
 * returns true.
 */
static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;

	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
	       bfq_bfqq_may_idle(bfqq);
}

/*
 * Select a queue for service. If we have a current queue in service,
 * check whether to continue servicing it, or retrieve and set a new one.
 */
static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq;
	struct request *next_rq;
	enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;

	bfqq = bfqd->in_service_queue;
	if (!bfqq)
		goto new_queue;

	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

	if (bfq_may_expire_for_budg_timeout(bfqq) &&
	    !bfq_bfqq_wait_request(bfqq) &&
	    !bfq_bfqq_must_idle(bfqq))
		goto expire;

check_queue:
	/*
	 * This loop is rarely executed more than once. Even when it
	 * happens, it is much more convenient to re-execute this loop
	 * than to return NULL and trigger a new dispatch to get a
	 * request served.
	 */
	next_rq = bfqq->next_rq;
	/*
	 * If bfqq has requests queued and it has enough budget left to
	 * serve them, keep the queue, otherwise expire it.
	 */
	if (next_rq) {
		if (bfq_serv_to_charge(next_rq, bfqq) >
			bfq_bfqq_budget_left(bfqq)) {
			/*
			 * Expire the queue for budget exhaustion,
			 * which makes sure that the next budget is
			 * enough to serve the next request, even if
			 * it comes from the fifo expired path.
			 */
			reason = BFQQE_BUDGET_EXHAUSTED;
			goto expire;
		} else {
			/*
			 * The idle timer may be pending because we may
			 * not disable disk idling even when a new request
			 * arrives.
			 */
			if (bfq_bfqq_wait_request(bfqq)) {
				/*
				 * If we get here: 1) at least a new request
				 * has arrived but we have not disabled the
				 * timer because the request was too small,
				 * 2) then the block layer has unplugged
				 * the device, causing the dispatch to be
				 * invoked.
				 *
				 * Since the device is unplugged, now the
				 * requests are probably large enough to
				 * provide a reasonable throughput.
				 * So we disable idling.
				 */
				bfq_clear_bfqq_wait_request(bfqq);
				hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
				bfqg_stats_update_idle_time(bfqq_group(bfqq));
			}
			goto keep_queue;
		}
	}

	/*
	 * No requests pending. However, if the in-service queue is idling
	 * for a new request, or has requests waiting for a completion and
	 * may idle after their completion, then keep it anyway.
	 */
	if (bfq_bfqq_wait_request(bfqq) ||
	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
		bfqq = NULL;
		goto keep_queue;
	}

	reason = BFQQE_NO_MORE_REQUESTS;
expire:
	bfq_bfqq_expire(bfqd, bfqq, false, reason);
new_queue:
	bfqq = bfq_set_in_service_queue(bfqd);
	if (bfqq) {
		bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
		goto check_queue;
	}
keep_queue:
	if (bfqq)
		bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
	else
		bfq_log(bfqd, "select_queue: no queue returned");

	return bfqq;
}

/*
 * Dispatch next request from bfqq.
 */
static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
						 struct bfq_queue *bfqq)
{
	struct request *rq = bfqq->next_rq;
	unsigned long service_to_charge;

	service_to_charge = bfq_serv_to_charge(rq, bfqq);

	bfq_bfqq_served(bfqq, service_to_charge);

	bfq_dispatch_remove(bfqd->queue, rq);

	if (!bfqd->in_service_bic) {
		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
		bfqd->in_service_bic = RQ_BIC(rq);
	}

	/*
	 * Expire bfqq, pretending that its budget expired, if bfqq
	 * belongs to CLASS_IDLE and other queues are waiting for
	 * service.
	 */
	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
		goto expire;

	return rq;

expire:
	bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
	return rq;
}

static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;

	/*
	 * Avoiding lock: a race on bfqd->busy_queues should cause at
	 * most a call to dispatch for nothing
	 */
	return !list_empty_careful(&bfqd->dispatch) ||
		bfqd->busy_queues > 0;
}

static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq = NULL;
	struct bfq_queue *bfqq = NULL;

	if (!list_empty(&bfqd->dispatch)) {
		rq = list_first_entry(&bfqd->dispatch, struct request,
				      queuelist);
		list_del_init(&rq->queuelist);

		bfqq = RQ_BFQQ(rq);

		if (bfqq) {
			/*
			 * Increment counters here, because this
			 * dispatch does not follow the standard
			 * dispatch flow (where counters are
			 * incremented)
			 */
			bfqq->dispatched++;

			goto inc_in_driver_start_rq;
		}

		/*
		 * We exploit the put_rq_private hook to decrement
		 * rq_in_driver, but put_rq_private will not be
		 * invoked on this request. So, to avoid unbalance,
		 * just start this request, without incrementing
		 * rq_in_driver. As a negative consequence,
		 * rq_in_driver is deceptively lower than it should be
		 * while this request is in service. This may cause
		 * bfq_schedule_dispatch to be invoked uselessly.
		 *
		 * As for implementing an exact solution, the
		 * put_request hook, if defined, is probably invoked
		 * also on this request. So, by exploiting this hook,
		 * we could 1) increment rq_in_driver here, and 2)
		 * decrement it in put_request. Such a solution would
		 * let the value of the counter be always accurate,
		 * but it would entail using an extra interface
		 * function. This cost seems higher than the benefit,
		 * being the frequency of non-elevator-private
		 * requests very low.
		 */
		goto start_rq;
	}

	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);

	if (bfqd->busy_queues == 0)
		goto exit;

	/*
	 * Force device to serve one request at a time if
	 * strict_guarantees is true. Forcing this service scheme is
	 * currently the ONLY way to guarantee that the request
	 * service order enforced by the scheduler is respected by a
	 * queueing device. Otherwise the device is free even to make
	 * some unlucky request wait for as long as the device
	 * wishes.
	 *
	 * Of course, serving one request at a time may cause loss of
	 * throughput.
	 */
	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
		goto exit;

	bfqq = bfq_select_queue(bfqd);
	if (!bfqq)
		goto exit;

	rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);

	if (rq) {
inc_in_driver_start_rq:
		bfqd->rq_in_driver++;
start_rq:
		rq->rq_flags |= RQF_STARTED;
	}
exit:
	return rq;
}

static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq;

	spin_lock_irq(&bfqd->lock);
	rq = __bfq_dispatch_request(hctx);
	spin_unlock_irq(&bfqd->lock);

	return rq;
}

/*
 * Task holds one reference to the queue, dropped when task exits. Each rq
 * in-flight on this queue also holds a reference, dropped when rq is freed.
 *
 * Scheduler lock must be held here. Recall not to use bfqq after calling
 * this function on it.
 */
static void bfq_put_queue(struct bfq_queue *bfqq)
{
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_group *bfqg = bfqq_group(bfqq);
#endif

	if (bfqq->bfqd)
		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
			     bfqq, bfqq->ref);

	bfqq->ref--;
	if (bfqq->ref)
		return;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);

	kmem_cache_free(bfq_pool, bfqq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	bfqg_put(bfqg);
#endif
}

static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	if (bfqq == bfqd->in_service_queue) {
		__bfq_bfqq_expire(bfqd, bfqq);
		bfq_schedule_dispatch(bfqd);
	}

	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);

	bfq_put_queue(bfqq); /* release process reference */
}

static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
	struct bfq_data *bfqd;

	if (bfqq)
		bfqd = bfqq->bfqd; /* NULL if scheduler already exited */

	if (bfqq && bfqd) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);
		bfq_exit_bfqq(bfqd, bfqq);
		bic_set_bfqq(bic, NULL, is_sync);
		/* restore the saved flags, matching the irqsave above */
		spin_unlock_irqrestore(&bfqd->lock, flags);
	}
}

static void bfq_exit_icq(struct io_cq *icq)
{
	struct bfq_io_cq *bic = icq_to_bic(icq);

	bfq_exit_icq_bfqq(bic, true);
	bfq_exit_icq_bfqq(bic, false);
}

/*
 * Update the entity prio values; note that the new values will not
 * be used until the next (re)activation.
 */
static void
bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
{
	struct task_struct *tsk = current;
	int ioprio_class;
	struct bfq_data *bfqd = bfqq->bfqd;

	if (!bfqd)
		return;

	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	switch (ioprio_class) {
	default:
		dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
			"bfq: bad prio class %d\n", ioprio_class);
	case IOPRIO_CLASS_NONE:
		/*
		 * No prio set, inherit CPU scheduling settings.
		 */
		bfqq->new_ioprio = task_nice_ioprio(tsk);
		bfqq->new_ioprio_class = task_nice_ioclass(tsk);
		break;
	case IOPRIO_CLASS_RT:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
		break;
	case IOPRIO_CLASS_BE:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
		break;
	case IOPRIO_CLASS_IDLE:
		bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
		bfqq->new_ioprio = 7;
		bfq_clear_bfqq_idle_window(bfqq);
		break;
	}

	if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
		pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
			bfqq->new_ioprio);
		/* clamp to the highest valid level, i.e. IOPRIO_BE_NR - 1 */
		bfqq->new_ioprio = IOPRIO_BE_NR - 1;
	}

	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
	bfqq->entity.prio_changed = 1;
}

static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
{
	struct bfq_data *bfqd = bic_to_bfqd(bic);
	struct bfq_queue *bfqq;
	int ioprio = bic->icq.ioc->ioprio;

	/*
	 * This condition may trigger on a newly created bic, be sure to
	 * drop the lock before returning.
	 */
	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
		return;

	bic->ioprio = ioprio;

	bfqq = bic_to_bfqq(bic, false);
	if (bfqq) {
		/* release process reference on this queue */
		bfq_put_queue(bfqq);
		bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
		bic_set_bfqq(bic, bfqq, false);
	}

	bfqq = bic_to_bfqq(bic, true);
	if (bfqq)
		bfq_set_next_ioprio_data(bfqq, bic);
}

static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_io_cq *bic, pid_t pid, int is_sync)
{
	RB_CLEAR_NODE(&bfqq->entity.rb_node);
	INIT_LIST_HEAD(&bfqq->fifo);

	bfqq->ref = 0;
	bfqq->bfqd = bfqd;

	if (bic)
		bfq_set_next_ioprio_data(bfqq, bic);

	if (is_sync) {
		if (!bfq_class_idle(bfqq))
			bfq_mark_bfqq_idle_window(bfqq);
		bfq_mark_bfqq_sync(bfqq);
	} else
		bfq_clear_bfqq_sync(bfqq);

	/* set end request to minus infinity from now */
	bfqq->ttime.last_end_request = ktime_get_ns() + 1;

	bfq_mark_bfqq_IO_bound(bfqq);

	bfqq->pid = pid;

	/* Tentative initial value to trade off between thr and lat */
	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
	bfqq->budget_timeout = bfq_smallest_from_now();

	/* first request is almost certainly seeky */
	bfqq->seek_history = 1;
}

static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
					       struct bfq_group *bfqg,
					       int ioprio_class, int ioprio)
{
	switch (ioprio_class) {
	case IOPRIO_CLASS_RT:
		return &bfqg->async_bfqq[0][ioprio];
	case IOPRIO_CLASS_NONE:
		ioprio = IOPRIO_NORM;
		/* fall through */
	case IOPRIO_CLASS_BE:
		return &bfqg->async_bfqq[1][ioprio];
	case IOPRIO_CLASS_IDLE:
		return &bfqg->async_idle_bfqq;
	default:
		return NULL;
	}
}

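/*
 * Example of the mapping above: a request with I/O priority class
 * IOPRIO_CLASS_BE and level 4 selects &bfqg->async_bfqq[1][4]; an
 * IOPRIO_CLASS_RT request at level 0 selects &bfqg->async_bfqq[0][0];
 * IOPRIO_CLASS_NONE falls through to the BE row at IOPRIO_NORM; and all
 * IOPRIO_CLASS_IDLE requests share the single async_idle_bfqq slot.
 */
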
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic)
{
	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	struct bfq_queue **async_bfqq = NULL;
	struct bfq_queue *bfqq;
	struct bfq_group *bfqg;

	rcu_read_lock();

	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
	if (!bfqg) {
		bfqq = &bfqd->oom_bfqq;
		goto out;
	}

	if (!is_sync) {
		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
						  ioprio);
		bfqq = *async_bfqq;
		if (bfqq)
			goto out;
	}

	bfqq = kmem_cache_alloc_node(bfq_pool,
				     GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
				     bfqd->queue->node);

	if (bfqq) {
		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
			      is_sync);
		bfq_init_entity(&bfqq->entity, bfqg);
		bfq_log_bfqq(bfqd, bfqq, "allocated");
	} else {
		bfqq = &bfqd->oom_bfqq;
		bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
		goto out;
	}

	/*
	 * Pin the queue now that it's allocated, scheduler exit will
	 * prune it.
	 */
	if (async_bfqq) {
		bfqq->ref++; /*
			      * Extra group reference, w.r.t. sync
			      * queue. This extra reference is removed
			      * only if bfqq->bfqg disappears, to
			      * guarantee that this queue is not freed
			      * until its group goes away.
			      */
		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
			     bfqq, bfqq->ref);
		*async_bfqq = bfqq;
	}

out:
	bfqq->ref++; /* get a process reference to this queue */
	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
	rcu_read_unlock();
	return bfqq;
}

static void bfq_update_io_thinktime(struct bfq_data *bfqd,
				    struct bfq_queue *bfqq)
{
	struct bfq_ttime *ttime = &bfqq->ttime;
	u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;

	elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);

	ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
	ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
				     ttime->ttime_samples);
}

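/*
 * Numerical sketch of the decayed averages above (hypothetical values):
 * starting from ttime_samples = 256 and ttime_total = 512 ms (a 2 ms
 * mean), a new 10 ms think time yields
 * ttime_total = (7 * 512 + 256 * 10) / 8 = 768 ms and
 * ttime_samples = (7 * 256 + 256) / 8 = 256, so ttime_mean moves from
 * about 2 ms to about 3 ms: each new sample contributes 1/8 of its
 * weight and older samples decay geometrically.
 */
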
static void
bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		       struct request *rq)
{
	sector_t sdist = 0;

	if (bfqq->last_request_pos) {
		if (bfqq->last_request_pos < blk_rq_pos(rq))
			sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
		else
			sdist = bfqq->last_request_pos - blk_rq_pos(rq);
	}

	bfqq->seek_history <<= 1;
	bfqq->seek_history |= sdist > BFQQ_SEEK_THR &&
			      (!blk_queue_nonrot(bfqd->queue) ||
			       blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
}

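/*
 * Example of the seek-history bookkeeping above: seek_history is a small
 * bit vector in which each enqueued request shifts in one bit, set when
 * the distance from the previous request exceeds BFQQ_SEEK_THR (and, on
 * non-rotational devices, only if the request is also small). A burst of
 * sequential requests therefore shifts in zeros and quickly clears the
 * seeky status, while scattered small requests keep it set; BFQQ_SEEKY()
 * then classifies the queue as seeky when enough of these bits are set.
 */
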
/*
 * Disable idle window if the process thinks too long or seeks so much that
 * it doesn't matter.
 */
static void bfq_update_idle_window(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq,
				   struct bfq_io_cq *bic)
{
	int enable_idle;

	/* Don't idle for async or idle io prio class. */
	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
		return;

	enable_idle = bfq_bfqq_idle_window(bfqq);

	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
	    bfqd->bfq_slice_idle == 0 ||
	    (bfqd->hw_tag && BFQQ_SEEKY(bfqq)))
		enable_idle = 0;
	else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
		if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)
			enable_idle = 0;
		else
			enable_idle = 1;
	}
	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
		enable_idle);

	if (enable_idle)
		bfq_mark_bfqq_idle_window(bfqq);
	else
		bfq_clear_bfqq_idle_window(bfqq);
}

/*
 * Called when a new fs request (rq) is added to bfqq. Check if there's
 * something we should do about it.
 */
static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			    struct request *rq)
{
	struct bfq_io_cq *bic = RQ_BIC(rq);

	if (rq->cmd_flags & REQ_META)
		bfqq->meta_pending++;

	bfq_update_io_thinktime(bfqd, bfqq);
	bfq_update_io_seektime(bfqd, bfqq, rq);
	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
	    !BFQQ_SEEKY(bfqq))
		bfq_update_idle_window(bfqd, bfqq, bic);

	bfq_log_bfqq(bfqd, bfqq,
		     "rq_enqueued: idle_window=%d (seeky %d)",
		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));

	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
		bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
				 blk_rq_sectors(rq) < 32;
		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);

		/*
		 * There is just this request queued: if the request
		 * is small and the queue is not to be expired, then
		 * just exit.
		 *
		 * In this way, if the device is being idled to wait
		 * for a new request from the in-service queue, we
		 * avoid unplugging the device and committing the
		 * device to serve just a small request. On the
		 * contrary, we wait for the block layer to decide
		 * when to unplug the device: hopefully, new requests
		 * will be merged to this one quickly, then the device
		 * will be unplugged and larger requests will be
		 * dispatched.
		 */
		if (small_req && !budget_timeout)
			return;

		/*
		 * A large enough request arrived, or the queue is to
		 * be expired: in both cases disk idling is to be
		 * stopped, so clear wait_request flag and reset
		 * timer.
		 */
		bfq_clear_bfqq_wait_request(bfqq);
		hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
		bfqg_stats_update_idle_time(bfqq_group(bfqq));

		/*
		 * The queue is not empty, because a new request just
		 * arrived. Hence we can safely expire the queue, in
		 * case of budget timeout, without risking that the
		 * timestamps of the queue are not updated correctly.
		 * See [1] for more details.
		 */
		if (budget_timeout)
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_BUDGET_TIMEOUT);
	}
}

static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);

	bfq_add_request(rq);

	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
	list_add_tail(&rq->queuelist, &bfqq->fifo);

	bfq_rq_enqueued(bfqd, bfqq, rq);
}

static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
			       bool at_head)
{
	struct request_queue *q = hctx->queue;
	struct bfq_data *bfqd = q->elevator->elevator_data;

	spin_lock_irq(&bfqd->lock);
	if (blk_mq_sched_try_insert_merge(q, rq)) {
		spin_unlock_irq(&bfqd->lock);
		return;
	}

	spin_unlock_irq(&bfqd->lock);

	blk_mq_sched_request_inserted(rq);

	spin_lock_irq(&bfqd->lock);
	if (at_head || blk_rq_is_passthrough(rq)) {
		if (at_head)
			list_add(&rq->queuelist, &bfqd->dispatch);
		else
			list_add_tail(&rq->queuelist, &bfqd->dispatch);
	} else {
		__bfq_insert_request(bfqd, rq);

		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}
	}

	spin_unlock_irq(&bfqd->lock);
}

static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
				struct list_head *list, bool at_head)
{
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		bfq_insert_request(hctx, rq, at_head);
	}
}

static void bfq_update_hw_tag(struct bfq_data *bfqd)
{
	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
				       bfqd->rq_in_driver);

	if (bfqd->hw_tag == 1)
		return;

	/*
	 * This sample is valid if the number of outstanding requests
	 * is large enough to allow a queueing behavior. Note that the
	 * sum is not exact, as it's not taking into account deactivated
	 * requests.
	 */
	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
		return;

	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
		return;

	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
	bfqd->max_rq_in_driver = 0;
	bfqd->hw_tag_samples = 0;
}

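/*
 * Note on the sampling above (illustrative): once hw_tag is 1 the device
 * is considered NCQ-capable for good and sampling stops. Until then, a
 * sample is taken only while enough requests are outstanding
 * (rq_in_driver + queued >= BFQ_HW_QUEUE_THRESHOLD); after
 * BFQ_HW_QUEUE_SAMPLES such samples, hw_tag is set according to whether
 * the driver ever held more than BFQ_HW_QUEUE_THRESHOLD requests in
 * flight, and the counters are reset for a possible new round.
 */
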
static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
{
	bfq_update_hw_tag(bfqd);

	bfqd->rq_in_driver--;
	bfqq->dispatched--;

	bfqq->ttime.last_end_request = ktime_get_ns();

	/*
	 * If this is the in-service queue, check if it needs to be expired,
	 * or if we want to idle in case it has no pending requests.
	 */
	if (bfqd->in_service_queue == bfqq) {
		if (bfq_bfqq_budget_new(bfqq))
			bfq_set_budget_timeout(bfqd);

		if (bfq_bfqq_must_idle(bfqq)) {
			bfq_arm_slice_timer(bfqd);
			return;
		} else if (bfq_may_expire_for_budg_timeout(bfqq))
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_BUDGET_TIMEOUT);
		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
			 (bfqq->dispatched == 0 ||
			  !bfq_bfqq_may_idle(bfqq)))
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_NO_MORE_REQUESTS);
	}
}

static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
{
	bfqq->allocated--;

	bfq_put_queue(bfqq);
}

static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;

	if (rq->rq_flags & RQF_STARTED)
		bfqg_stats_update_completion(bfqq_group(bfqq),
					     rq_start_time_ns(rq),
					     rq_io_start_time_ns(rq),
					     rq->cmd_flags);

	if (likely(rq->rq_flags & RQF_STARTED)) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);

		bfq_completed_request(bfqq, bfqd);
		bfq_put_rq_priv_body(bfqq);

		spin_unlock_irqrestore(&bfqd->lock, flags);
	} else {
		/*
		 * Request rq may be still/already in the scheduler,
		 * in which case we need to remove it. And we cannot
		 * defer such a check and removal, to avoid
		 * inconsistencies in the time interval from the end
		 * of this function to the start of the deferred work.
		 * This situation seems to occur only in process
		 * context, as a consequence of a merge. In the
		 * current version of the code, this implies that the
		 * lock is held.
		 */

		if (!RB_EMPTY_NODE(&rq->rb_node))
			bfq_remove_request(q, rq);
		bfq_put_rq_priv_body(bfqq);
	}

	rq->elv.priv[0] = NULL;
	rq->elv.priv[1] = NULL;
}

/*
 * Allocate bfq data structures associated with this request.
 */
static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
			      struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
	const int is_sync = rq_is_sync(rq);
	struct bfq_queue *bfqq;

	spin_lock_irq(&bfqd->lock);

	bfq_check_ioprio_change(bic, bio);

	if (!bic)
		goto queue_fail;

	bfq_bic_update_cgroup(bic, bio);

	bfqq = bic_to_bfqq(bic, is_sync);
	if (!bfqq || bfqq == &bfqd->oom_bfqq) {
		if (bfqq)
			bfq_put_queue(bfqq);
		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
		bic_set_bfqq(bic, bfqq, is_sync);
	}

	bfqq->allocated++;
	bfqq->ref++;
	bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
		     rq, bfqq, bfqq->ref);

	rq->elv.priv[0] = bic;
	rq->elv.priv[1] = bfqq;

	spin_unlock_irq(&bfqd->lock);

	return 0;

queue_fail:
	spin_unlock_irq(&bfqd->lock);

	return 1;
}

static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	enum bfqq_expiration reason;
	unsigned long flags;

	spin_lock_irqsave(&bfqd->lock, flags);
	bfq_clear_bfqq_wait_request(bfqq);

	if (bfqq != bfqd->in_service_queue) {
		spin_unlock_irqrestore(&bfqd->lock, flags);
		return;
	}

	if (bfq_bfqq_budget_timeout(bfqq))
		/*
		 * Also here the queue can be safely expired
		 * for budget timeout without wasting
		 * guarantees
		 */
		reason = BFQQE_BUDGET_TIMEOUT;
	else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
		/*
		 * The queue may not be empty upon timer expiration,
		 * because we may not disable the timer when the
		 * first request of the in-service queue arrives
		 * during disk idling.
		 */
		reason = BFQQE_TOO_IDLE;
	else
		goto schedule_dispatch;

	bfq_bfqq_expire(bfqd, bfqq, true, reason);

schedule_dispatch:
	spin_unlock_irqrestore(&bfqd->lock, flags);
	bfq_schedule_dispatch(bfqd);
}

/*
 * Handler of the expiration of the timer running if the in-service queue
 * is idling inside its time slice.
 */
static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
{
	struct bfq_data *bfqd = container_of(timer, struct bfq_data,
					     idle_slice_timer);
	struct bfq_queue *bfqq = bfqd->in_service_queue;

	/*
	 * Theoretical race here: the in-service queue can be NULL or
	 * different from the queue that was idling if a new request
	 * arrives for the current queue and there is a full dispatch
	 * cycle that changes the in-service queue. This can hardly
	 * happen, but in the worst case we just expire a queue too
	 * early.
	 */
	if (bfqq)
		bfq_idle_slice_timer_body(bfqq);

	return HRTIMER_NORESTART;
}

static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
				 struct bfq_queue **bfqq_ptr)
{
	struct bfq_queue *bfqq = *bfqq_ptr;

	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
	if (bfqq) {
		bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);

		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
			     bfqq, bfqq->ref);
		bfq_put_queue(bfqq);
		*bfqq_ptr = NULL;
	}
}

/*
 * Release all the bfqg references to its async queues. If we are
 * deallocating the group these queues may still contain requests, so
 * we reparent them to the root cgroup (i.e., the only one that will
 * exist for sure until all the requests on a device are gone).
 */
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
{
	int i, j;

	for (i = 0; i < 2; i++)
		for (j = 0; j < IOPRIO_BE_NR; j++)
			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);

	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
}

static void bfq_exit_queue(struct elevator_queue *e)
{
	struct bfq_data *bfqd = e->elevator_data;
	struct bfq_queue *bfqq, *n;

	hrtimer_cancel(&bfqd->idle_slice_timer);

	spin_lock_irq(&bfqd->lock);
	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
	spin_unlock_irq(&bfqd->lock);

	hrtimer_cancel(&bfqd->idle_slice_timer);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
#else
	spin_lock_irq(&bfqd->lock);
	bfq_put_async_queues(bfqd, bfqd->root_group);
	kfree(bfqd->root_group);
	spin_unlock_irq(&bfqd->lock);
#endif

	kfree(bfqd);
}

static void bfq_init_root_group(struct bfq_group *root_group,
				struct bfq_data *bfqd)
{
	int i;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	root_group->entity.parent = NULL;
	root_group->my_entity = NULL;
	root_group->bfqd = bfqd;
#endif
	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
	root_group->sched_data.bfq_class_idle_last_service = jiffies;
}

static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
	struct bfq_data *bfqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
	if (!bfqd) {
		kobject_put(&eq->kobj);
		return -ENOMEM;
	}
	eq->elevator_data = bfqd;

	spin_lock_irq(q->queue_lock);
	q->elevator = eq;
	spin_unlock_irq(q->queue_lock);

	/*
	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
	 * Grab a permanent reference to it, so that the normal code flow
	 * will not attempt to free it.
	 */
	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
	bfqd->oom_bfqq.ref++;
	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
	bfqd->oom_bfqq.entity.new_weight =
		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
	/*
	 * Trigger weight initialization, according to ioprio, at the
	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
	 * class won't be changed any more.
	 */
	bfqd->oom_bfqq.entity.prio_changed = 1;

	bfqd->queue = q;

	INIT_LIST_HEAD(&bfqd->dispatch);

	hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL);
	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;

	INIT_LIST_HEAD(&bfqd->active_list);
	INIT_LIST_HEAD(&bfqd->idle_list);

	bfqd->hw_tag = -1;

	bfqd->bfq_max_budget = bfq_default_max_budget;

	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
	bfqd->bfq_back_max = bfq_back_max;
	bfqd->bfq_back_penalty = bfq_back_penalty;
	bfqd->bfq_slice_idle = bfq_slice_idle;
	bfqd->bfq_timeout = bfq_timeout;

	bfqd->bfq_requests_within_timer = 120;

	spin_lock_init(&bfqd->lock);

	/*
	 * The invocation of the next bfq_create_group_hierarchy
	 * function is the head of a chain of function calls
	 * (bfq_create_group_hierarchy->blkcg_activate_policy->
	 * blk_mq_freeze_queue) that may lead to the invocation of the
	 * has_work hook function. For this reason,
	 * bfq_create_group_hierarchy is invoked only after all
	 * scheduler data has been initialized, apart from the fields
	 * that can be initialized only after invoking
	 * bfq_create_group_hierarchy. This, in particular, enables
	 * has_work to correctly return false. Of course, to avoid
	 * other inconsistencies, the blk-mq stack must then refrain
	 * from invoking further scheduler hooks before this init
	 * function is finished.
	 */
	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
	if (!bfqd->root_group)
		goto out_free;
	bfq_init_root_group(bfqd->root_group, bfqd);
	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);

	return 0;

out_free:
	kfree(bfqd);
	kobject_put(&eq->kobj);
	return -ENOMEM;
}

static void bfq_slab_kill(void)
{
	kmem_cache_destroy(bfq_pool);
}

static int __init bfq_slab_setup(void)
{
	bfq_pool = KMEM_CACHE(bfq_queue, 0);
	if (!bfq_pool)
		return -ENOMEM;
	return 0;
}

static ssize_t bfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%u\n", var);
}

static ssize_t bfq_var_store(unsigned long *var, const char *page,
			     size_t count)
{
	unsigned long new_val;
	int ret = kstrtoul(page, 10, &new_val);

	if (ret == 0)
		*var = new_val;

	return count;
}

#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	if (__CONV == 1)						\
		__data = jiffies_to_msecs(__data);			\
	else if (__CONV == 2)						\
		__data = div_u64(__data, NSEC_PER_MSEC);		\
	return bfq_var_show(__data, (page));				\
}
SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
#undef SHOW_FUNCTION

#define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	__data = div_u64(__data, NSEC_PER_USEC);			\
	return bfq_var_show(__data, (page));				\
}
USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
#undef USEC_SHOW_FUNCTION

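/*
 * For illustration, one of the lines above,
 * SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2),
 * effectively defines (after the __CONV branches are folded):
 *
 *   static ssize_t bfq_slice_idle_show(struct elevator_queue *e,
 *                                      char *page)
 *   {
 *           struct bfq_data *bfqd = e->elevator_data;
 *           u64 __data = bfqd->bfq_slice_idle;
 *           __data = div_u64(__data, NSEC_PER_MSEC);
 *           return bfq_var_show(__data, (page));
 *   }
 *
 * i.e., the internal nanosecond value is reported to sysfs in
 * milliseconds, while the _us variant above reports microseconds.
 */
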
5756 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
5758 __FUNC(struct elevator_queue *e, const char *page, size_t count) \
5760 struct bfq_data *bfqd = e->elevator_data; \
5761 unsigned long uninitialized_var(__data); \
5762 int ret = bfq_var_store(&__data, (page), count); \
5763 if (__data < (MIN)) \
5765 else if (__data > (MAX)) \
5768 *(__PTR) = msecs_to_jiffies(__data); \
5769 else if (__CONV == 2) \
5770 *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
5772 *(__PTR) = __data; \
5775 STORE_FUNCTION(bfq_fifo_expire_sync_store
, &bfqd
->bfq_fifo_expire
[1], 1,
5777 STORE_FUNCTION(bfq_fifo_expire_async_store
, &bfqd
->bfq_fifo_expire
[0], 1,
5779 STORE_FUNCTION(bfq_back_seek_max_store
, &bfqd
->bfq_back_max
, 0, INT_MAX
, 0);
5780 STORE_FUNCTION(bfq_back_seek_penalty_store
, &bfqd
->bfq_back_penalty
, 1,
5782 STORE_FUNCTION(bfq_slice_idle_store
, &bfqd
->bfq_slice_idle
, 0, INT_MAX
, 2);
5783 #undef STORE_FUNCTION
#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)			\
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned long uninitialized_var(__data);			\
	int ret = bfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	*(__PTR) = (u64)__data * NSEC_PER_USEC;				\
	return ret;							\
}
USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
		    UINT_MAX);
#undef USEC_STORE_FUNCTION

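/*
 * Illustration (not part of the build): on the store side the same
 * convention applies in reverse. For instance, writing "8" to the
 * slice_idle attribute goes through bfq_slice_idle_store, generated by
 * STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2):
 * the value is parsed by bfq_var_store, clamped to [0, INT_MAX] and,
 * since __CONV == 2, stored as 8 * NSEC_PER_MSEC nanoseconds in
 * bfqd->bfq_slice_idle. The *_us variant skips the millisecond scaling
 * and stores the value as microseconds (multiplied by NSEC_PER_USEC)
 * instead.
 */
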
static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
{
	u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout);

	if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
		return bfq_calc_max_budget(bfqd->peak_rate, timeout);
	else
		return bfq_default_max_budget;
}

static ssize_t bfq_max_budget_store(struct elevator_queue *e,
				    const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data == 0)
		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
	else {
		if (__data > INT_MAX)
			__data = INT_MAX;
		bfqd->bfq_max_budget = __data;
	}

	bfqd->bfq_user_max_budget = __data;

	return ret;
}

/*
 * Leaving this name to preserve name compatibility with cfq
 * parameters, but this timeout is used for both sync and async.
 */
static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
				      const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data < 1)
		__data = 1;
	else if (__data > INT_MAX)
		__data = INT_MAX;

	bfqd->bfq_timeout = msecs_to_jiffies(__data);
	if (bfqd->bfq_user_max_budget == 0)
		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);

	return ret;
}

static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
					   const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data > 1)
		__data = 1;
	if (!bfqd->strict_guarantees && __data == 1
	    && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
		bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;

	bfqd->strict_guarantees = __data;

	return ret;
}

#define BFQ_ATTR(name) \
	__ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)

static struct elv_fs_entry bfq_attrs[] = {
	BFQ_ATTR(fifo_expire_sync),
	BFQ_ATTR(fifo_expire_async),
	BFQ_ATTR(back_seek_max),
	BFQ_ATTR(back_seek_penalty),
	BFQ_ATTR(slice_idle),
	BFQ_ATTR(slice_idle_us),
	BFQ_ATTR(max_budget),
	BFQ_ATTR(timeout_sync),
	BFQ_ATTR(strict_guarantees),
	__ATTR_NULL
};

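/*
 * Usage note (illustrative): once bfq is the active scheduler for a
 * request queue, the attributes above are exposed by the elevator core
 * under /sys/block/<disk>/queue/iosched/, e.g. (device name
 * hypothetical):
 *
 *	# cat /sys/block/sda/queue/iosched/slice_idle
 *	# echo 0 > /sys/block/sda/queue/iosched/slice_idle
 *
 * Mode 0644 in BFQ_ATTR makes each file readable by everyone and
 * writable by root only.
 */
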
static struct elevator_type iosched_bfq_mq = {
	.ops.mq = {
		.get_rq_priv		= bfq_get_rq_private,
		.put_rq_priv		= bfq_put_rq_private,
		.exit_icq		= bfq_exit_icq,
		.insert_requests	= bfq_insert_requests,
		.dispatch_request	= bfq_dispatch_request,
		.next_request		= elv_rb_latter_request,
		.former_request		= elv_rb_former_request,
		.allow_merge		= bfq_allow_bio_merge,
		.bio_merge		= bfq_bio_merge,
		.request_merge		= bfq_request_merge,
		.requests_merged	= bfq_requests_merged,
		.request_merged		= bfq_request_merged,
		.has_work		= bfq_has_work,
		.init_sched		= bfq_init_queue,
		.exit_sched		= bfq_exit_queue,
	},

	.uses_mq		= true,
	.icq_size		= sizeof(struct bfq_io_cq),
	.icq_align		= __alignof__(struct bfq_io_cq),
	.elevator_attrs		= bfq_attrs,
	.elevator_name		= "bfq",
	.elevator_owner		= THIS_MODULE,
};

#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct blkcg_policy blkcg_policy_bfq = {
	.dfl_cftypes		= bfq_blkg_files,
	.legacy_cftypes		= bfq_blkcg_legacy_files,

	.cpd_alloc_fn		= bfq_cpd_alloc,
	.cpd_init_fn		= bfq_cpd_init,
	.cpd_bind_fn		= bfq_cpd_init,
	.cpd_free_fn		= bfq_cpd_free,

	.pd_alloc_fn		= bfq_pd_alloc,
	.pd_init_fn		= bfq_pd_init,
	.pd_offline_fn		= bfq_pd_offline,
	.pd_free_fn		= bfq_pd_free,
	.pd_reset_stats_fn	= bfq_pd_reset_stats,
};
#endif

static int __init bfq_init(void)
{
	int ret;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	ret = blkcg_policy_register(&blkcg_policy_bfq);
	if (ret)
		return ret;
#endif

	ret = -ENOMEM;
	if (bfq_slab_setup())
		goto err_pol_unreg;

	ret = elv_register(&iosched_bfq_mq);
	if (ret)
		goto err_pol_unreg;

	return 0;

err_pol_unreg:
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	return ret;
}

static void __exit bfq_exit(void)
{
	elv_unregister(&iosched_bfq_mq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	bfq_slab_kill();
}

module_init(bfq_init);
module_exit(bfq_exit);

MODULE_AUTHOR("Paolo Valente");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");