/*
 * Budget Fair Queueing (BFQ) I/O scheduler.
 *
 * Based on ideas and code from CFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini@google.com>
 *
 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BFQ is a proportional-share I/O scheduler, with some extra
 * low-latency capabilities. BFQ also supports full hierarchical
 * scheduling through cgroups. The next paragraphs provide an
 * introduction to BFQ's inner workings. Details on BFQ benefits,
 * usage and limitations can be found in
 * Documentation/block/bfq-iosched.txt.
 *
 * BFQ is a proportional-share storage-I/O scheduling algorithm based
 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
 * budgets, measured in number of sectors, to processes instead of
 * time slices. The device is not granted to the in-service process
 * for a given time slice, but until it has exhausted its assigned
 * budget. This change from the time to the service domain enables BFQ
 * to distribute the device throughput among processes as desired,
 * without any distortion due to throughput fluctuations, or to device
 * internal queueing. BFQ uses an ad hoc internal scheduler, called
 * B-WF2Q+, to schedule processes according to their budgets. More
 * precisely, BFQ schedules queues associated with processes. Each
 * process/queue is assigned a user-configurable weight, and B-WF2Q+
 * guarantees that each queue receives a fraction of the throughput
 * proportional to its weight. Thanks to the accurate policy of
 * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
 * processes issuing sequential requests (to boost the throughput),
 * and yet guarantee a low latency to interactive and soft real-time
 * applications.
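 *
 * A quick worked example of the proportional-share guarantee (added
 * here purely for illustration, with hypothetical numbers): if two
 * continuously backlogged queues are assigned weights 100 and 200,
 * then B-WF2Q+ grants them 100/(100+200) = 1/3 and 2/3 of the device
 * throughput, respectively, regardless of how the device internally
 * reorders or batches their requests.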
 *
 * In particular, to provide these low-latency guarantees, BFQ
 * explicitly privileges the I/O of two classes of time-sensitive
 * applications: interactive and soft real-time. This feature enables
 * BFQ to provide applications in these classes with a very low
 * latency. Finally, BFQ also features additional heuristics for
 * preserving both a low latency and a high throughput on NCQ-capable,
 * rotational or flash-based devices, and to get the job done quickly
 * for applications consisting of many I/O-bound processes.
 *
 * BFQ is described in [1], where also a reference to the initial,
 * more theoretical paper on BFQ can be found. The interested reader
 * can find in the latter paper full details on the main algorithm, as
 * well as formulas of the guarantees and formal proofs of all the
 * properties. With respect to the version of BFQ presented in these
 * papers, this implementation adds a few more heuristics, such as the
 * one that guarantees a low latency to soft real-time applications,
 * and a hierarchical extension based on H-WF2Q+.
 *
 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
 * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
 * with O(log N) complexity derives from the one introduced with EEVDF
 * in [3].
 *
 * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
 *     Scheduler", Proceedings of the First Workshop on Mobile System
 *     Technologies (MST-2015), May 2015.
 *     http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
 *
 * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
 *     Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
 *     Oct 1997.
 *     http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
 *
 * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
 *     First: A Flexible and Accurate Mechanism for Proportional Share
 *     Resource Allocation", technical report.
 *     http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>
#include <linux/blk-cgroup.h>
#define BFQ_IOPRIO_CLASSES	3
#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

#define BFQ_MIN_WEIGHT			1
#define BFQ_MAX_WEIGHT			1000
#define BFQ_WEIGHT_CONVERSION_COEFF	10

#define BFQ_DEFAULT_QUEUE_IOPRIO	4

#define BFQ_WEIGHT_LEGACY_DFL	100
#define BFQ_DEFAULT_GRP_IOPRIO	0
#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE
/**
 * struct bfq_service_tree - per ioprio_class service tree.
 *
 * Each service tree represents a B-WF2Q+ scheduler on its own. Each
 * ioprio_class has its own independent scheduler, and so its own
 * bfq_service_tree. All the fields are protected by the queue lock
 * of the containing bfqd.
 */
struct bfq_service_tree {
	/* tree for active entities (i.e., those backlogged) */
	struct rb_root active;
	/* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
	struct rb_root idle;

	/* idle entity with minimum F_i */
	struct bfq_entity *first_idle;
	/* idle entity with maximum F_i */
	struct bfq_entity *last_idle;

	/* scheduler virtual time */
	u64 vtime;
	/* scheduler weight sum; active and idle entities contribute to it */
	unsigned long wsum;
};
/**
 * struct bfq_sched_data - multi-class scheduler.
 *
 * bfq_sched_data is the basic scheduler queue. It supports three
 * ioprio_classes, and can be used either as a toplevel queue or as an
 * intermediate queue in a hierarchical setup. @next_in_service
 * points to the active entity of the sched_data service trees that
 * will be scheduled next. It is used to reduce the number of steps
 * needed for each hierarchical-schedule update.
 *
 * The supported ioprio_classes are the same as in CFQ, in descending
 * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
 * Requests from higher priority queues are served before all the
 * requests from lower priority queues; among requests of the same
 * queue, requests are served according to B-WF2Q+.
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_sched_data {
	/* entity in service */
	struct bfq_entity *in_service_entity;
	/* head-of-line entity (see comments above) */
	struct bfq_entity *next_in_service;
	/* array of service trees, one per ioprio_class */
	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
	/* last time CLASS_IDLE was served */
	unsigned long bfq_class_idle_last_service;
};
/**
 * struct bfq_entity - schedulable entity.
 *
 * A bfq_entity is used to represent either a bfq_queue (leaf node in the
 * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
 * entity belongs to the sched_data of the parent group in the cgroup
 * hierarchy. Non-leaf entities have also their own sched_data, stored
 * in @my_sched_data.
 *
 * Each entity stores independently its priority values; this would
 * allow different weights on different devices, but this
 * functionality is not exported to userspace by now. Priorities and
 * weights are updated lazily, first storing the new values into the
 * new_* fields, then setting the @prio_changed flag. As soon as
 * there is a transition in the entity state that allows the priority
 * update to take place the effective and the requested priority
 * values are synchronized.
 *
 * Unless cgroups are used, the weight value is calculated from the
 * ioprio to export the same interface as CFQ. When dealing with
 * ``well-behaved'' queues (i.e., queues that do not spend too much
 * time to consume their budget and have true sequential behavior, and
 * when there are no external factors breaking anticipation) the
 * relative weights at each level of the cgroups hierarchy should be
 * guaranteed. All the fields are protected by the queue lock of the
 * containing bfqd.
 */
struct bfq_entity {
	/* service_tree member */
	struct rb_node rb_node;

	/*
	 * Flag, true if the entity is on a tree (either the active or
	 * the idle one of its service_tree) or is in service.
	 */
	bool on_st;

	/* B-WF2Q+ start and finish timestamps [sectors/weight] */
	u64 start, finish;

	/* tree the entity is enqueued into; %NULL if not on a tree */
	struct rb_root *tree;

	/*
	 * minimum start time of the (active) subtree rooted at this
	 * entity; used for O(log N) lookups into active trees
	 */
	u64 min_start;

	/* amount of service received during the last service slot */
	int service;

	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
	int budget;

	/* weight of the queue */
	int weight;
	/* next weight if a change is in progress */
	int new_weight;

	/* original weight, used to implement weight boosting */
	int orig_weight;

	/* parent entity, for hierarchical scheduling */
	struct bfq_entity *parent;

	/*
	 * For non-leaf nodes in the hierarchy, the associated
	 * scheduler queue, %NULL on leaf nodes.
	 */
	struct bfq_sched_data *my_sched_data;
	/* the scheduler queue this entity belongs to */
	struct bfq_sched_data *sched_data;

	/* flag, set to request a weight, ioprio or ioprio_class change */
	int prio_changed;
};
/**
 * struct bfq_ttime - per process thinktime stats.
 */
struct bfq_ttime {
	/* completion time of the last request */
	u64 last_end_request;

	/* total process thinktime */
	u64 ttime_total;
	/* number of thinktime samples */
	unsigned long ttime_samples;
	/* average process thinktime */
	u64 ttime_mean;
};
/**
 * struct bfq_queue - leaf schedulable entity.
 *
 * A bfq_queue is a leaf request queue; it can be associated with an
 * io_context or more, if it is async. @cgroup holds a reference to
 * the cgroup, to be sure that it does not disappear while a bfqq
 * still references it (mostly to avoid races between request issuing
 * and task migration followed by cgroup destruction). All the fields
 * are protected by the queue lock of the containing bfqd.
 */
struct bfq_queue {
	/* reference counter */
	int ref;
	/* parent bfq_data */
	struct bfq_data *bfqd;

	/* current ioprio and ioprio class */
	unsigned short ioprio, ioprio_class;
	/* next ioprio and ioprio class if a change is in progress */
	unsigned short new_ioprio, new_ioprio_class;

	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* number of sync and async requests queued */
	int queued[2];
	/* number of requests currently allocated */
	int allocated;
	/* number of pending metadata requests */
	int meta_pending;
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* entity representing this queue in the scheduler */
	struct bfq_entity entity;

	/* maximum budget allowed from the feedback mechanism */
	int max_budget;
	/* budget expiration (in jiffies) */
	unsigned long budget_timeout;

	/* number of requests on the dispatch list or inside driver */
	int dispatched;

	/* status flags */
	unsigned long flags;

	/* node for active/idle bfqq list inside parent bfqd */
	struct list_head bfqq_list;

	/* associated @bfq_ttime struct */
	struct bfq_ttime ttime;

	/* bit vector: a 1 for each seeky request in history */
	u32 seek_history;
	/* position of the last request enqueued */
	sector_t last_request_pos;

	/* Number of consecutive pairs of request completion and
	 * arrival, such that the queue becomes idle after the
	 * completion, but the next request arrives within an idle
	 * time slice; used only if the queue's IO_bound flag has been
	 * cleared.
	 */
	unsigned int requests_within_timer;

	/* pid of the process owning the queue, used for logging purposes */
	pid_t pid;
};
/**
 * struct bfq_io_cq - per (request_queue, io_context) structure.
 */
struct bfq_io_cq {
	/* associated io_cq structure */
	struct io_cq icq; /* must be the first member */
	/* array of two process queues, the sync and the async */
	struct bfq_queue *bfqq[2];
	/* per (request_queue, blkcg) ioprio */
	int ioprio;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	uint64_t blkcg_serial_nr; /* the current blkcg serial */
#endif
};
/**
 * struct bfq_data - per-device data structure.
 *
 * All the fields are protected by @lock.
 */
struct bfq_data {
	/* device request queue */
	struct request_queue *queue;

	struct list_head dispatch;

	/* root bfq_group for the device */
	struct bfq_group *root_group;

	/*
	 * Number of bfq_queues containing requests (including the
	 * queue in service, even if it is idling).
	 */
	int busy_queues;
	/* number of queued requests */
	int queued;
	/* number of requests dispatched and waiting for completion */
	int rq_in_driver;

	/*
	 * Maximum number of requests in driver in the last
	 * @hw_tag_samples completed requests.
	 */
	int max_rq_in_driver;
	/* number of samples used to calculate hw_tag */
	int hw_tag_samples;
	/* flag set to one if the driver is showing a queueing behavior */
	int hw_tag;

	/* number of budgets assigned */
	int budgets_assigned;

	/*
	 * Timer set when idling (waiting) for the next request from
	 * the queue in service.
	 */
	struct hrtimer idle_slice_timer;

	/* bfq_queue in service */
	struct bfq_queue *in_service_queue;
	/* bfq_io_cq (bic) associated with the @in_service_queue */
	struct bfq_io_cq *in_service_bic;

	/* on-disk position of the last served request */
	sector_t last_position;

	/* time of last request completion (ns) */
	u64 last_completion;

	/* time of first rq dispatch in current observation interval (ns) */
	u64 first_dispatch;
	/* time of last rq dispatch in current observation interval (ns) */
	u64 last_dispatch;

	/* beginning of the last budget */
	ktime_t last_budget_start;
	/* beginning of the last idle slice */
	ktime_t last_idling_start;

	/* number of samples in current observation interval */
	int peak_rate_samples;
	/* num of samples of seq dispatches in current observation interval */
	u32 sequential_samples;
	/* total num of sectors transferred in current observation interval */
	u64 tot_sectors_dispatched;
	/* max rq size seen during current observation interval (sectors) */
	u32 last_rq_max_size;
	/* time elapsed from first dispatch in current observ. interval (us) */
	u64 delta_from_first;
	/*
	 * Current estimate of the device peak rate, measured in
	 * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
	 * BFQ_RATE_SHIFT is performed to increase precision in
	 * fixed-point calculations.
	 */
	u32 peak_rate;

	/* maximum budget allotted to a bfq_queue before rescheduling */
	int bfq_max_budget;

	/* list of all the bfq_queues active on the device */
	struct list_head active_list;
	/* list of all the bfq_queues idle on the device */
	struct list_head idle_list;

	/*
	 * Timeout for async/sync requests; when it fires, requests
	 * are served in fifo order.
	 */
	u64 bfq_fifo_expire[2];
	/* weight of backward seeks wrt forward ones */
	unsigned int bfq_back_penalty;
	/* maximum allowed backward seek */
	unsigned int bfq_back_max;
	/* maximum idling time */
	u32 bfq_slice_idle;

	/* user-configured max budget value (0 for auto-tuning) */
	int bfq_user_max_budget;
	/*
	 * Timeout for bfq_queues to consume their budget; used to
	 * prevent seeky queues from imposing long latencies to
	 * sequential or quasi-sequential ones (this also implies that
	 * seeky queues cannot receive guarantees in the service
	 * domain; after a timeout they are charged for the time they
	 * have been in service, to preserve fairness among them, but
	 * without service-domain guarantees).
	 */
	unsigned int bfq_timeout;

	/*
	 * Number of consecutive requests that must be issued within
	 * the idle time slice to set again idling to a queue which
	 * was marked as non-I/O-bound (see the definition of the
	 * IO_bound flag for further details).
	 */
	unsigned int bfq_requests_within_timer;

	/*
	 * Force device idling whenever needed to provide accurate
	 * service guarantees, without caring about throughput
	 * issues. CAVEAT: this may even increase latencies, in case
	 * of useless idling for processes that did stop doing I/O.
	 */
	bool strict_guarantees;

	/* fallback dummy bfqq for extreme OOM conditions */
	struct bfq_queue oom_bfqq;

	spinlock_t lock;

	/*
	 * bic associated with the task issuing current bio for
	 * merging. This and the next field are used as a support to
	 * be able to perform the bic lookup, needed by bio-merge
	 * functions, before the scheduler lock is taken, and thus
	 * avoid taking the request-queue lock while the scheduler
	 * lock is being held.
	 */
	struct bfq_io_cq *bio_bic;
	/* bfqq associated with the task issuing current bio for merging */
	struct bfq_queue *bio_bfqq;
};
enum bfqq_state_flags {
	BFQQF_busy = 0,		/* has requests or is in service */
	BFQQF_wait_request,	/* waiting for a request */
	BFQQF_non_blocking_wait_rq, /*
				     * waiting for a request
				     * without idling the device
				     */
	BFQQF_fifo_expire,	/* FIFO checked in this slice */
	BFQQF_idle_window,	/* slice idling enabled */
	BFQQF_sync,		/* synchronous queue */
	BFQQF_budget_new,	/* no completion with this budget */
	BFQQF_IO_bound,		/*
				 * bfqq has timed-out at least once
				 * having consumed at most 2/10 of
				 * its budget
				 */
};
#define BFQ_BFQQ_FNS(name)						\
static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__set_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__clear_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
{									\
	return test_bit(BFQQF_##name, &(bfqq)->flags);			\
}

BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(non_blocking_wait_rq);
BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(idle_window);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(budget_new);
BFQ_BFQQ_FNS(IO_bound);
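
/*
 * Usage note added for illustration (not part of the original file):
 * each BFQ_BFQQ_FNS(foo) invocation above generates three helpers,
 * bfq_mark_bfqq_foo(), bfq_clear_bfqq_foo() and bfq_bfqq_foo(), which
 * set, clear and test BFQQF_foo in bfqq->flags. A hypothetical caller
 * could thus write, e.g.:
 *
 *	if (bfq_bfqq_sync(bfqq))
 *		bfq_mark_bfqq_idle_window(bfqq);
 */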
/* Logging facilities. */
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
			  __pbuf, ##args);				\
} while (0)

#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
} while (0)

#else /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\
			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)

#endif /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log(bfqd, fmt, args...) \
	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
/* Expiration reasons. */
enum bfqq_expiration {
	BFQQE_TOO_IDLE = 0,		/*
					 * queue has been idling for
					 * too long
					 */
	BFQQE_BUDGET_TIMEOUT,	/* budget took too long to be used */
	BFQQE_BUDGET_EXHAUSTED,	/* budget consumed */
	BFQQE_NO_MORE_REQUESTS,	/* the queue has no more requests */
	BFQQE_PREEMPTED		/* preemption in progress */
};
struct bfqg_stats {
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	/* number of ios merged */
	struct blkg_rwstat merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat queued;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	uint64_t start_group_wait_time;
	uint64_t start_idle_time;
	uint64_t start_empty_time;
#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED

/*
 * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
 *
 * @ps: @blkcg_policy_storage that this structure inherits
 * @weight: weight of the bfq_group
 */
struct bfq_group_data {
	/* must be the first member */
	struct blkcg_policy_data pd;

	unsigned short weight;
};
/**
 * struct bfq_group - per (device, cgroup) data structure.
 * @entity: schedulable entity to insert into the parent group sched_data.
 * @sched_data: own sched_data, to contain child entities (they may be
 *              both bfq_queues and bfq_groups).
 * @bfqd: the bfq_data for the device this group acts upon.
 * @async_bfqq: array of async queues for all the tasks belonging to
 *              the group, one queue per ioprio value per ioprio_class,
 *              except for the idle class that has only one queue.
 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
 *             to avoid too many special cases during group creation/
 *             migration.
 * @stats: stats for this bfqg.
 *
 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
 * there is a set of bfq_groups, each one collecting the lower-level
 * entities belonging to the group that are acting on the same device.
 *
 * Locking works as follows:
 *    o @bfqd is protected by the queue lock, RCU is used to access it
 *      from the readers.
 *    o All the other fields are protected by the @bfqd queue lock.
 */
struct bfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	struct bfq_entity entity;
	struct bfq_sched_data sched_data;

	void *bfqd;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct bfq_entity *my_entity;

	struct bfqg_stats stats;
};

#else /* CONFIG_BFQ_GROUP_IOSCHED */

struct bfq_group {
	struct bfq_sched_data sched_data;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct rb_root rq_pos_tree;
};
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);

static unsigned int bfq_class_idx(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	return bfqq ? bfqq->ioprio_class - 1 :
		BFQ_DEFAULT_GRP_CLASS - 1;
}

static struct bfq_service_tree *
bfq_entity_service_tree(struct bfq_entity *entity)
{
	struct bfq_sched_data *sched_data = entity->sched_data;
	unsigned int idx = bfq_class_idx(entity);

	return sched_data->service_tree + idx;
}

static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	return bic->bfqq[is_sync];
}

static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
			 bool is_sync)
{
	bic->bfqq[is_sync] = bfqq;
}

static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
{
	return bic->icq.q->elevator->elevator_data;
}

static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
static void bfq_put_queue(struct bfq_queue *bfqq);
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic);
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
/* Expiration time of sync (0) and async (1) requests, in ns. */
static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };

/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
static const int bfq_back_max = 16 * 1024;

/* Penalty of a backwards seek, in number of sectors. */
static const int bfq_back_penalty = 2;

/* Idling period duration, in ns. */
static u64 bfq_slice_idle = NSEC_PER_SEC / 125;

/* Minimum number of assigned budgets for which stats are safe to compute. */
static const int bfq_stats_min_budgets = 194;

/* Default maximum budget values, in sectors and number of requests. */
static const int bfq_default_max_budget = 16 * 1024;

/*
 * Async to sync throughput distribution is controlled as follows:
 * when an async request is served, the entity is charged the number
 * of sectors of the request, multiplied by the factor below
 */
static const int bfq_async_charge_factor = 10;

/* Default timeout values, in jiffies, approximating CFQ defaults. */
static const int bfq_timeout = HZ / 8;

static struct kmem_cache *bfq_pool;
768 /* Below this threshold (in ns), we consider thinktime immediate. */
769 #define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
771 /* hw_tag detection: parallel requests threshold and min samples needed. */
772 #define BFQ_HW_QUEUE_THRESHOLD 4
773 #define BFQ_HW_QUEUE_SAMPLES 32
775 #define BFQQ_SEEK_THR (sector_t)(8 * 100)
776 #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
777 #define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
778 #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
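
/*
 * Illustration added by the editor (not part of the original file):
 * seek_history keeps one bit per recently enqueued request, with a 1
 * for each request deemed seeky. With the macro above, a queue is
 * therefore classified as seeky as soon as more than 32/8 = 4 of the
 * last 32 tracked requests were seeky.
 */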
/* Min number of samples required to perform peak-rate update */
#define BFQ_RATE_MIN_SAMPLES	32
/* Min observation time interval required to perform a peak-rate update (ns) */
#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
/* Target observation time interval for a peak-rate update (ns) */
#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC

/* Shift used for peak rate fixed precision calculations. */
#define BFQ_RATE_SHIFT		16
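
/*
 * Worked example for the fixed-point encoding above (added for
 * illustration, with hypothetical rates): with BFQ_RATE_SHIFT == 16,
 * a measured peak rate of 0.5 sectors/usec is stored as
 * 0.5 * 2^16 = 32768, while 2 sectors/usec is stored as
 * 2 * 2^16 = 131072.
 */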
#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
/**
 * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
 * @icq: the iocontext queue.
 */
static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
{
	/* bic->icq is the first member, %NULL will convert to %NULL */
	return container_of(icq, struct bfq_io_cq, icq);
}
/**
 * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
 * @bfqd: the lookup key.
 * @ioc: the io_context of the process doing I/O.
 * @q: the request queue.
 */
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
					struct io_context *ioc,
					struct request_queue *q)
{
	if (ioc) {
		unsigned long flags;
		struct bfq_io_cq *icq;

		spin_lock_irqsave(q->queue_lock, flags);
		icq = icq_to_bic(ioc_lookup_icq(ioc, q));
		spin_unlock_irqrestore(q->queue_lock, flags);

		return icq;
	}

	return NULL;
}
/*
 * Scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing.
 */
static void bfq_schedule_dispatch(struct bfq_data *bfqd)
{
	if (bfqd->queued != 0) {
		bfq_log(bfqd, "schedule dispatch");
		blk_mq_run_hw_queues(bfqd->queue, true);
	}
}
/**
 * bfq_gt - compare two timestamps.
 * @a: first timestamp.
 * @b: second timestamp.
 *
 * Return @a > @b, dealing with wrapping correctly.
 */
static int bfq_gt(u64 a, u64 b)
{
	return (s64)(a - b) > 0;
}
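
/*
 * Illustration added by the editor (not in the original file): the
 * signed-difference trick above keeps working across a wraparound of
 * the 64-bit virtual clock. E.g. with a = 5 and b = ULLONG_MAX - 2,
 * a - b wraps to 8, which is positive as an s64, so bfq_gt() correctly
 * reports that a is "after" b.
 */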
static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
{
	struct rb_node *node = tree->rb_node;

	return rb_entry(node, struct bfq_entity, rb_node);
}

static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
/*
 * bfq_update_next_in_service - update sd->next_in_service
 * @sd: sched_data for which to perform the update.
 * @new_entity: if not NULL, pointer to the entity whose activation,
 *		requeueing or repositioning triggered the invocation of
 *		this function.
 *
 * This function is called to update sd->next_in_service, which, in
 * its turn, may change as a consequence of the insertion or
 * extraction of an entity into/from one of the active trees of
 * sd. These insertions/extractions occur as a consequence of
 * activations/deactivations of entities, with some activations being
 * 'true' activations, and other activations being requeueings (i.e.,
 * implementing the second, requeueing phase of the mechanism used to
 * reposition an entity in its active tree; see comments on
 * __bfq_activate_entity and __bfq_requeue_entity for details). In
 * both the last two activation sub-cases, new_entity points to the
 * just activated or requeued entity.
 *
 * Returns true if sd->next_in_service changes in such a way that
 * entity->parent may become the next_in_service for its parent
 * entity.
 */
static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
				       struct bfq_entity *new_entity)
{
	struct bfq_entity *next_in_service = sd->next_in_service;
	bool parent_sched_may_change = false;

	/*
	 * If this update is triggered by the activation, requeueing
	 * or repositioning of an entity that does not coincide with
	 * sd->next_in_service, then a full lookup in the active tree
	 * can be avoided. In fact, it is enough to check whether the
	 * just-modified entity has a higher priority than
	 * sd->next_in_service, or, even if it has the same priority
	 * as sd->next_in_service, is eligible and has a lower virtual
	 * finish time than sd->next_in_service. If this compound
	 * condition holds, then the new entity becomes the new
	 * next_in_service. Otherwise no change is needed.
	 */
	if (new_entity && new_entity != sd->next_in_service) {
		/*
		 * Flag used to decide whether to replace
		 * sd->next_in_service with new_entity. Tentatively
		 * set to true, and left as true if
		 * sd->next_in_service is NULL.
		 */
		bool replace_next = true;

		/*
		 * If there is already a next_in_service candidate
		 * entity, then compare class priorities or timestamps
		 * to decide whether to replace sd->service_tree with
		 * new_entity.
		 */
		if (next_in_service) {
			unsigned int new_entity_class_idx =
				bfq_class_idx(new_entity);
			struct bfq_service_tree *st =
				sd->service_tree + new_entity_class_idx;

			/*
			 * For efficiency, evaluate the most likely
			 * sub-condition first.
			 */
			replace_next =
				(new_entity_class_idx ==
				 bfq_class_idx(next_in_service)
				 &&
				 !bfq_gt(new_entity->start, st->vtime)
				 &&
				 bfq_gt(next_in_service->finish,
					new_entity->finish))
				||
				new_entity_class_idx <
				bfq_class_idx(next_in_service);
		}

		if (replace_next)
			next_in_service = new_entity;
	} else /* invoked because of a deactivation: lookup needed */
		next_in_service = bfq_lookup_next_entity(sd);

	if (next_in_service) {
		parent_sched_may_change = !sd->next_in_service ||
			bfq_update_parent_budget(next_in_service);
	}

	sd->next_in_service = next_in_service;

	if (!next_in_service)
		return parent_sched_may_change;

	return parent_sched_may_change;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED
/* both next loops stop at one of the child entities of the root group */
#define for_each_entity(entity)	\
	for (; entity ; entity = entity->parent)

/*
 * For each iteration, compute parent in advance, so as to be safe if
 * entity is deallocated during the iteration. Such a deallocation may
 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
 * containing entity.
 */
#define for_each_entity_safe(entity, parent) \
	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)

/*
 * Returns true if this budget change may let next_in_service->parent
 * become the next_in_service entity for its parent entity.
 */
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	struct bfq_entity *bfqg_entity;
	struct bfq_group *bfqg;
	struct bfq_sched_data *group_sd;
	bool ret = false;

	group_sd = next_in_service->sched_data;

	bfqg = container_of(group_sd, struct bfq_group, sched_data);
	/*
	 * bfq_group's my_entity field is not NULL only if the group
	 * is not the root group. We must not touch the root entity
	 * as it must never become an in-service entity.
	 */
	bfqg_entity = bfqg->my_entity;
	if (bfqg_entity) {
		if (bfqg_entity->budget > next_in_service->budget)
			ret = true;
		bfqg_entity->budget = next_in_service->budget;
	}

	return ret;
}

/*
 * This function tells whether entity stops being a candidate for next
 * service, according to the following logic.
 *
 * This function is invoked for an entity that is about to be set in
 * service. If such an entity is a queue, then the entity is no longer
 * a candidate for next service (i.e, a candidate entity to serve
 * after the in-service entity is expired). The function then returns
 * true.
 */
static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	if (bfq_entity_to_bfqq(entity))
		return true;

	return false;
}

#else /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Next two macros are fake loops when cgroups support is not
 * enabled. In fact, in such a case, there is only one level to go up
 * (to reach the root group).
 */
#define for_each_entity(entity)	\
	for (; entity ; entity = NULL)

#define for_each_entity_safe(entity, parent) \
	for (parent = NULL; entity ; entity = parent)

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	return false;
}

static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	return true;
}

#endif /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Shift for timestamp calculations. This actually limits the maximum
 * service allowed in one timestamp delta (small shift values increase it),
 * the maximum total weight that can be used for the queues in the system
 * (big shift values increase it), and the period of virtual time
 * wraparounds.
 */
#define WFQ_SERVICE_SHIFT	22

static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = NULL;

	if (!entity->my_sched_data)
		bfqq = container_of(entity, struct bfq_queue, entity);

	return bfqq;
}
/**
 * bfq_delta - map service into the virtual time domain.
 * @service: amount of service.
 * @weight: scale factor (weight of an entity or weight sum).
 */
static u64 bfq_delta(unsigned long service, unsigned long weight)
{
	u64 d = (u64)service << WFQ_SERVICE_SHIFT;

	do_div(d, weight);
	return d;
}
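
/*
 * Worked example for bfq_delta() (added for illustration, with
 * hypothetical numbers): with WFQ_SERVICE_SHIFT == 22, charging 8
 * sectors of service against a weight sum of 10 advances the virtual
 * time by (8 << 22) / 10 = 3355443 units, i.e. the virtual clock moves
 * in proportion to service/weight, as B-WF2Q+ requires.
 */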
/**
 * bfq_calc_finish - assign the finish time to an entity.
 * @entity: the entity to act upon.
 * @service: the service to be charged to the entity.
 */
static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->finish = entity->start +
		bfq_delta(service, entity->weight);

	if (bfqq) {
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: serv %lu, w %d",
			service, entity->weight);
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: start %llu, finish %llu, delta %llu",
			entity->start, entity->finish,
			bfq_delta(service, entity->weight));
	}
}
/**
 * bfq_entity_of - get an entity from a node.
 * @node: the node field of the entity.
 *
 * Convert a node pointer to the relative entity. This is used only
 * to simplify the logic of some functions and not as the generic
 * conversion mechanism because, e.g., in the tree walking functions,
 * the check for a %NULL value would be redundant.
 */
static struct bfq_entity *bfq_entity_of(struct rb_node *node)
{
	struct bfq_entity *entity = NULL;

	if (node)
		entity = rb_entry(node, struct bfq_entity, rb_node);

	return entity;
}
/**
 * bfq_extract - remove an entity from a tree.
 * @root: the tree root.
 * @entity: the entity to remove.
 */
static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
{
	entity->tree = NULL;
	rb_erase(&entity->rb_node, root);
}
/**
 * bfq_idle_extract - extract an entity from the idle tree.
 * @st: the service tree of the owning @entity.
 * @entity: the entity being removed.
 */
static void bfq_idle_extract(struct bfq_service_tree *st,
			     struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *next;

	if (entity == st->first_idle) {
		next = rb_next(&entity->rb_node);
		st->first_idle = bfq_entity_of(next);
	}

	if (entity == st->last_idle) {
		next = rb_prev(&entity->rb_node);
		st->last_idle = bfq_entity_of(next);
	}

	bfq_extract(&st->idle, entity);

	if (bfqq)
		list_del(&bfqq->bfqq_list);
}
/**
 * bfq_insert - generic tree insertion.
 * @root: the tree root.
 * @entity: entity to insert.
 *
 * This is used for the idle and the active tree, since they are both
 * ordered by finish time.
 */
static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
{
	struct bfq_entity *entry;
	struct rb_node **node = &root->rb_node;
	struct rb_node *parent = NULL;

	while (*node) {
		parent = *node;
		entry = rb_entry(parent, struct bfq_entity, rb_node);

		if (bfq_gt(entry->finish, entity->finish))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}

	rb_link_node(&entity->rb_node, parent, node);
	rb_insert_color(&entity->rb_node, root);

	entity->tree = root;
}
/**
 * bfq_update_min - update the min_start field of an entity.
 * @entity: the entity to update.
 * @node: one of its children.
 *
 * This function is called when @entity may store an invalid value for
 * min_start due to updates to the active tree. The function assumes
 * that the subtree rooted at @node (which may be its left or its right
 * child) has a valid min_start value.
 */
static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
{
	struct bfq_entity *child;

	if (node) {
		child = rb_entry(node, struct bfq_entity, rb_node);
		if (bfq_gt(entity->min_start, child->min_start))
			entity->min_start = child->min_start;
	}
}
/**
 * bfq_update_active_node - recalculate min_start.
 * @node: the node to update.
 *
 * @node may have changed position or one of its children may have moved,
 * this function updates its min_start value. The left and right subtrees
 * are assumed to hold a correct min_start value.
 */
static void bfq_update_active_node(struct rb_node *node)
{
	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);

	entity->min_start = entity->start;
	bfq_update_min(entity, node->rb_right);
	bfq_update_min(entity, node->rb_left);
}
/**
 * bfq_update_active_tree - update min_start for the whole active tree.
 * @node: the starting node.
 *
 * @node must be the deepest modified node after an update. This function
 * updates its min_start using the values held by its children, assuming
 * that they did not change, and then updates all the nodes that may have
 * changed in the path to the root. The only nodes that may have changed
 * are the ones in the path or their siblings.
 */
static void bfq_update_active_tree(struct rb_node *node)
{
	struct rb_node *parent;

up:
	bfq_update_active_node(node);

	parent = rb_parent(node);
	if (!parent)
		return;

	if (node == parent->rb_left && parent->rb_right)
		bfq_update_active_node(parent->rb_right);
	else if (parent->rb_left)
		bfq_update_active_node(parent->rb_left);

	node = parent;
	goto up;
}
/**
 * bfq_active_insert - insert an entity in the active tree of its
 *                     group/device.
 * @st: the service tree of the entity.
 * @entity: the entity being inserted.
 *
 * The active tree is ordered by finish time, but an extra key is kept
 * per each node, containing the minimum value for the start times of
 * its children (and the node itself), so it's possible to search for
 * the eligible node with the lowest finish time in logarithmic time.
 */
static void bfq_active_insert(struct bfq_service_tree *st,
			      struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node = &entity->rb_node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	bfq_insert(&st->active, entity);

	if (node->rb_left)
		node = node->rb_left;
	else if (node->rb_right)
		node = node->rb_right;

	bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
}
/**
 * bfq_ioprio_to_weight - calc a weight from an ioprio.
 * @ioprio: the ioprio value to convert.
 */
static unsigned short bfq_ioprio_to_weight(int ioprio)
{
	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
}

/**
 * bfq_weight_to_ioprio - calc an ioprio from a weight.
 * @weight: the weight value to convert.
 *
 * To preserve as much as possible the old only-ioprio user interface,
 * 0 is used as an escape ioprio value for weights (numerically) equal or
 * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
 */
static unsigned short bfq_weight_to_ioprio(int weight)
{
	return max_t(int, 0,
		     IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
}
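
/*
 * Worked example for the conversion above (added for illustration, not
 * part of the original file): with IOPRIO_BE_NR == 8 and
 * BFQ_WEIGHT_CONVERSION_COEFF == 10, the default ioprio 4 maps to
 * weight (8 - 4) * 10 = 40, the highest best-effort ioprio 0 maps to
 * weight 80, and the lowest ioprio 7 maps to weight 10.
 */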
static void bfq_get_entity(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	if (bfqq) {
		bfqq->ref++;
		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
			     bfqq, bfqq->ref);
	}
}
/**
 * bfq_find_deepest - find the deepest node that an extraction can modify.
 * @node: the node being removed.
 *
 * Do the first step of an extraction in an rb tree, looking for the
 * node that will replace @node, and returning the deepest node that
 * the following modifications to the tree can touch. If @node is the
 * last node in the tree return %NULL.
 */
static struct rb_node *bfq_find_deepest(struct rb_node *node)
{
	struct rb_node *deepest;

	if (!node->rb_right && !node->rb_left)
		deepest = rb_parent(node);
	else if (!node->rb_right)
		deepest = node->rb_left;
	else if (!node->rb_left)
		deepest = node->rb_right;
	else {
		deepest = rb_next(node);
		if (deepest->rb_right)
			deepest = deepest->rb_right;
		else if (rb_parent(deepest) != node)
			deepest = rb_parent(deepest);
	}

	return deepest;
}
/**
 * bfq_active_extract - remove an entity from the active tree.
 * @st: the service_tree containing the tree.
 * @entity: the entity being removed.
 */
static void bfq_active_extract(struct bfq_service_tree *st,
			       struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	node = bfq_find_deepest(&entity->rb_node);
	bfq_extract(&st->active, entity);

	if (node)
		bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_del(&bfqq->bfqq_list);
}
/**
 * bfq_idle_insert - insert an entity into the idle tree.
 * @st: the service tree containing the tree.
 * @entity: the entity to insert.
 */
static void bfq_idle_insert(struct bfq_service_tree *st,
			    struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
		st->first_idle = entity;
	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
		st->last_idle = entity;

	bfq_insert(&st->idle, entity);

	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
}
/**
 * bfq_forget_entity - do not consider entity any longer for scheduling
 * @st: the service tree.
 * @entity: the entity being removed.
 * @is_in_service: true if entity is currently the in-service entity.
 *
 * Forget everything about @entity. In addition, if entity represents
 * a queue, and the latter is not in service, then release the service
 * reference to the queue (the one taken through bfq_get_entity). In
 * fact, in this case, there is really no more service reference to
 * the queue, as the latter is also outside any service tree. If,
 * instead, the queue is in service, then __bfq_bfqd_reset_in_service
 * will take care of putting the reference when the queue finally
 * stops being served.
 */
static void bfq_forget_entity(struct bfq_service_tree *st,
			      struct bfq_entity *entity,
			      bool is_in_service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->on_st = false;
	st->wsum -= entity->weight;
	if (bfqq && !is_in_service)
		bfq_put_queue(bfqq);
}
/**
 * bfq_put_idle_entity - release the idle tree ref of an entity.
 * @st: service tree for the entity.
 * @entity: the entity being released.
 */
static void bfq_put_idle_entity(struct bfq_service_tree *st,
				struct bfq_entity *entity)
{
	bfq_idle_extract(st, entity);
	bfq_forget_entity(st, entity,
			  entity == entity->sched_data->in_service_entity);
}
/**
 * bfq_forget_idle - update the idle tree if necessary.
 * @st: the service tree to act upon.
 *
 * To preserve the global O(log N) complexity we only remove one entry here;
 * as the idle tree will not grow indefinitely this can be done safely.
 */
static void bfq_forget_idle(struct bfq_service_tree *st)
{
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
	    !bfq_gt(last_idle->finish, st->vtime)) {
		/*
		 * Forget the whole idle tree, increasing the vtime past
		 * the last finish time of idle entities.
		 */
		st->vtime = last_idle->finish;
	}

	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
		bfq_put_idle_entity(st, first_idle);
}
static struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
				struct bfq_entity *entity)
{
	struct bfq_service_tree *new_st = old_st;

	if (entity->prio_changed) {
		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
		unsigned short prev_weight, new_weight;
		struct bfq_data *bfqd = NULL;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		struct bfq_sched_data *sd;
		struct bfq_group *bfqg;
#endif

		if (bfqq)
			bfqd = bfqq->bfqd;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		else {
			sd = entity->my_sched_data;
			bfqg = container_of(sd, struct bfq_group, sched_data);
			bfqd = (struct bfq_data *)bfqg->bfqd;
		}
#endif

		old_st->wsum -= entity->weight;

		if (entity->new_weight != entity->orig_weight) {
			if (entity->new_weight < BFQ_MIN_WEIGHT ||
			    entity->new_weight > BFQ_MAX_WEIGHT) {
				pr_crit("update_weight_prio: new_weight %d\n",
					entity->new_weight);
				if (entity->new_weight < BFQ_MIN_WEIGHT)
					entity->new_weight = BFQ_MIN_WEIGHT;
				else
					entity->new_weight = BFQ_MAX_WEIGHT;
			}
			entity->orig_weight = entity->new_weight;
			if (bfqq)
				bfqq->ioprio =
				  bfq_weight_to_ioprio(entity->orig_weight);
		}

		if (bfqq)
			bfqq->ioprio_class = bfqq->new_ioprio_class;
		entity->prio_changed = 0;

		/*
		 * NOTE: here we may be changing the weight too early,
		 * this will cause unfairness. The correct approach
		 * would have required additional complexity to defer
		 * weight changes to the proper time instants (i.e.,
		 * when entity->finish <= old_st->vtime).
		 */
		new_st = bfq_entity_service_tree(entity);

		prev_weight = entity->weight;
		new_weight = entity->orig_weight;
		entity->weight = new_weight;

		new_st->wsum += entity->weight;

		if (new_st != old_st)
			entity->start = new_st->vtime;
	}

	return new_st;
}
static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);

/**
 * bfq_bfqq_served - update the scheduler status after selection for
 *		     service.
 * @bfqq: the queue being served.
 * @served: bytes to transfer.
 *
 * NOTE: this can be optimized, as the timestamps of upper level entities
 * are synchronized every time a new bfqq is selected for service. By now,
 * we keep it to better check consistency.
 */
static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
{
	struct bfq_entity *entity = &bfqq->entity;
	struct bfq_service_tree *st;

	for_each_entity(entity) {
		st = bfq_entity_service_tree(entity);

		entity->service += served;

		st->vtime += bfq_delta(served, st->wsum);
		bfq_forget_idle(st);
	}
	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
}
/**
 * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
 *			  of the time interval during which bfqq has been in
 *			  service.
 * @bfqq: the queue that needs a service update.
 * @time_ms: the amount of time during which the queue has received service
 *
 * If a queue does not consume its budget fast enough, then providing
 * the queue with service fairness may impair throughput, more or less
 * severely. For this reason, queues that consume their budget slowly
 * are provided with time fairness instead of service fairness. This
 * goal is achieved through the BFQ scheduling engine, even if such an
 * engine works in the service, and not in the time domain. The trick
 * is charging these queues with an inflated amount of service, equal
 * to the amount of service that they would have received during their
 * service slot if they had been fast, i.e., if their requests had
 * been dispatched at a rate equal to the estimated peak rate.
 *
 * It is worth noting that time fairness can cause important
 * distortions in terms of bandwidth distribution, on devices with
 * internal queueing. The reason is that I/O requests dispatched
 * during the service slot of a queue may be served after that service
 * slot is finished, and may have a total processing time loosely
 * correlated with the duration of the service slot. This is
 * especially true for short service slots.
 */
static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
				 unsigned long time_ms)
{
	struct bfq_entity *entity = &bfqq->entity;
	int tot_serv_to_charge = entity->service;
	unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);

	if (time_ms > 0 && time_ms < timeout_ms)
		tot_serv_to_charge =
			(bfqd->bfq_max_budget * time_ms) / timeout_ms;

	if (tot_serv_to_charge < entity->service)
		tot_serv_to_charge = entity->service;

	/* Increase budget to avoid inconsistencies */
	if (tot_serv_to_charge > entity->budget)
		entity->budget = tot_serv_to_charge;

	bfq_bfqq_served(bfqq,
			max_t(int, 0, tot_serv_to_charge - entity->service));
}
static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
					struct bfq_service_tree *st,
					bool backshifted)
{
	st = __bfq_entity_update_weight_prio(st, entity);
	bfq_calc_finish(entity, entity->budget);

	/*
	 * If some queues enjoy backshifting for a while, then their
	 * (virtual) finish timestamps may happen to become lower and
	 * lower than the system virtual time. In particular, if
	 * these queues often happen to be idle for short time
	 * periods, and during such time periods other queues with
	 * higher timestamps happen to be busy, then the backshifted
	 * timestamps of the former queues can become much lower than
	 * the system virtual time. In fact, to serve the queues with
	 * higher timestamps while the ones with lower timestamps are
	 * idle, the system virtual time may be pushed-up to much
	 * higher values than the finish timestamps of the idle
	 * queues. As a consequence, the finish timestamps of all new
	 * or newly activated queues may end up being much larger than
	 * those of lucky queues with backshifted timestamps. The
	 * latter queues may then monopolize the device for a lot of
	 * time. This would simply break service guarantees.
	 *
	 * To reduce this problem, push up a little bit the
	 * backshifted timestamps of the queue associated with this
	 * entity (only a queue can happen to have the backshifted
	 * flag set): just enough to let the finish timestamp of the
	 * queue be equal to the current value of the system virtual
	 * time. This may introduce a little unfairness among queues
	 * with backshifted timestamps, but it does not break
	 * worst-case fairness guarantees.
	 */
	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
		unsigned long delta = st->vtime - entity->finish;

		entity->start += delta;
		entity->finish += delta;
	}

	bfq_active_insert(st, entity);
}
/**
 * __bfq_activate_entity - handle activation of entity.
 * @entity: the entity being activated.
 * @non_blocking_wait_rq: true if entity was waiting for a request
 *
 * Called for a 'true' activation, i.e., if entity is not active and
 * one of its children receives a new request.
 *
 * Basically, this function updates the timestamps of entity and
 * inserts entity into its active tree, after possibly extracting it
 * from its idle tree.
 */
static void __bfq_activate_entity(struct bfq_entity *entity,
				  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	bool backshifted = false;
	unsigned long long min_vstart;

	/* See comments on bfq_bfqq_update_budg_for_activation */
	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
		backshifted = true;
		min_vstart = entity->finish;
	} else
		min_vstart = st->vtime;

	if (entity->tree == &st->idle) {
		/*
		 * Must be on the idle tree, bfq_idle_extract() will
		 * check for that.
		 */
		bfq_idle_extract(st, entity);
		entity->start = bfq_gt(min_vstart, entity->finish) ?
			min_vstart : entity->finish;
	} else {
		/*
		 * The finish time of the entity may be invalid, and
		 * it is in the past for sure, otherwise the queue
		 * would have been on the idle tree.
		 */
		entity->start = min_vstart;
		st->wsum += entity->weight;
		/*
		 * entity is about to be inserted into a service tree,
		 * and then set in service: get a reference to make
		 * sure entity does not disappear until it is no
		 * longer in service or scheduled for service.
		 */
		bfq_get_entity(entity);

		entity->on_st = true;
	}

	bfq_update_fin_time_enqueue(entity, st, backshifted);
}
/**
 * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
 * @entity: the entity being requeued or repositioned.
 *
 * Requeueing is needed if this entity stops being served, which
 * happens if a leaf descendant entity has expired. On the other hand,
 * repositioning is needed if the next_in_service entity for the child
 * entity has changed. See the comments inside the function for
 * details.
 *
 * Basically, this function: 1) removes entity from its active tree if
 * present there, 2) updates the timestamps of entity and 3) inserts
 * entity back into its active tree (in the new, right position for
 * the new values of the timestamps).
 */
static void __bfq_requeue_entity(struct bfq_entity *entity)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (entity == sd->in_service_entity) {
		/*
		 * We are requeueing the current in-service entity,
		 * which may have to be done for one of the following
		 * reasons:
		 * - entity represents the in-service queue, and the
		 *   in-service queue is being requeued after an
		 *   expiration;
		 * - entity represents a group, and its budget has
		 *   changed because one of its child entities has
		 *   just been either activated or requeued for some
		 *   reason; the timestamps of the entity need then to
		 *   be updated, and the entity needs to be enqueued
		 *   or repositioned accordingly.
		 *
		 * In particular, before requeueing, the start time of
		 * the entity must be moved forward to account for the
		 * service that the entity has received while in
		 * service. This is done by the next instructions. The
		 * finish time will then be updated according to this
		 * new value of the start time, and to the budget of
		 * the entity.
		 */
		bfq_calc_finish(entity, entity->service);
		entity->start = entity->finish;
		/*
		 * In addition, if the entity had more than one child
		 * when set in service, then was not extracted from
		 * the active tree. This implies that the position of
		 * the entity in the active tree may need to be
		 * changed now, because we have just updated the start
		 * time of the entity, and we will update its finish
		 * time in a moment (the requeueing is then, more
		 * precisely, a repositioning in this case). To
		 * implement this repositioning, we: 1) dequeue the
		 * entity here, 2) update the finish time and
		 * requeue the entity according to the new
		 * timestamps below.
		 */
		if (entity->tree)
			bfq_active_extract(st, entity);
	} else { /* The entity is already active, and not in service */
		/*
		 * In this case, this function gets called only if the
		 * next_in_service entity below this entity has
		 * changed, and this change has caused the budget of
		 * this entity to change, which, finally implies that
		 * the finish time of this entity must be
		 * updated. Such an update may cause the scheduling,
		 * i.e., the position in the active tree, of this
		 * entity to change. We handle this change by: 1)
		 * dequeueing the entity here, 2) updating the finish
		 * time and requeueing the entity according to the new
		 * timestamps below. This is the same approach as the
		 * non-extracted-entity sub-case above.
		 */
		bfq_active_extract(st, entity);
	}

	bfq_update_fin_time_enqueue(entity, st, false);
}
static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
					  struct bfq_sched_data *sd,
					  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (sd->in_service_entity == entity || entity->tree == &st->active)
		/*
		 * in service or already queued on the active tree,
		 * requeue or reposition
		 */
		__bfq_requeue_entity(entity);
	else
		/*
		 * Not in service and not queued on its active tree:
		 * the activity is idle and this is a true activation.
		 */
		__bfq_activate_entity(entity, non_blocking_wait_rq);
}
/**
 * bfq_activate_requeue_entity - activate or requeue an entity representing a
 *				 bfq_queue, and activate, requeue or reposition
 *				 all ancestors for which such an update becomes
 *				 necessary.
 * @entity: the entity to activate.
 * @non_blocking_wait_rq: true if this entity was waiting for a request
 * @requeue: true if this is a requeue, which implies that bfqq is
 *	     being expired; thus ALL its ancestors stop being served and must
 *	     therefore be requeued
 */
static void bfq_activate_requeue_entity(struct bfq_entity *entity,
					bool non_blocking_wait_rq,
					bool requeue)
{
	struct bfq_sched_data *sd;

	for_each_entity(entity) {
		sd = entity->sched_data;
		__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);

		if (!bfq_update_next_in_service(sd, entity) && !requeue)
			break;
	}
}
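/*
 * Illustrative note (not in the original sources): the loop above walks
 * from the leaf entity up through its ancestors. At each level it
 * activates/requeues the entity and then recomputes next_in_service;
 * if that recomputation changes nothing and this is not a requeue due
 * to an expiration, the upper levels cannot be affected either, so the
 * walk stops early instead of touching the whole path to the root.
 */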
/**
 * __bfq_deactivate_entity - deactivate an entity from its service tree.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: if false, the entity will not be put into the
 *			idle tree.
 *
 * Deactivates an entity, independently of its previous state. Must
 * be invoked only if entity is on a service tree. Extracts the entity
 * from that tree, and if necessary and allowed, puts it into the idle
 * tree.
 */
static bool __bfq_deactivate_entity(struct bfq_entity *entity,
				    bool ins_into_idle_tree)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	int is_in_service = entity == sd->in_service_entity;

	if (!entity->on_st) /* entity never activated, or already inactive */
		return false;

	bfq_calc_finish(entity, entity->service);

	if (entity->tree == &st->active)
		bfq_active_extract(st, entity);
	else if (!is_in_service && entity->tree == &st->idle)
		bfq_idle_extract(st, entity);

	if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
		bfq_forget_entity(st, entity, is_in_service);
	else
		bfq_idle_insert(st, entity);

	return true;
}
/**
 * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: true if the entity can be put into the idle tree
 */
static void bfq_deactivate_entity(struct bfq_entity *entity,
				  bool ins_into_idle_tree,
				  bool expiration)
{
	struct bfq_sched_data *sd;
	struct bfq_entity *parent = NULL;

	for_each_entity_safe(entity, parent) {
		sd = entity->sched_data;

		if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
			/*
			 * entity is not in any tree any more, so
			 * this deactivation is a no-op, and there is
			 * nothing to change for upper-level entities
			 * (in case of expiration, this can never
			 * happen).
			 */
			return;
		}

		if (sd->next_in_service == entity)
			/*
			 * entity was the next_in_service entity;
			 * then, since entity has just been
			 * deactivated, a new one must be found.
			 */
			bfq_update_next_in_service(sd, NULL);

		if (sd->next_in_service)
			/*
			 * The parent entity is still backlogged,
			 * because next_in_service is not NULL. So, no
			 * further upwards deactivation must be
			 * performed. Yet, next_in_service has
			 * changed. Then the schedule does need to be
			 * updated upwards.
			 */
			break;

		/*
		 * If we get here, then the parent is no more
		 * backlogged and we need to propagate the
		 * deactivation upwards. Thus let the loop go on.
		 */

		/*
		 * Also let parent be queued into the idle tree on
		 * deactivation, to preserve service guarantees, and
		 * assuming that who invoked this function does not
		 * need parent entities too to be removed completely.
		 */
		ins_into_idle_tree = true;
	}

	/*
	 * If the deactivation loop is fully executed, then there are
	 * no more entities to touch and next loop is not executed at
	 * all. Otherwise, requeue remaining entities if they are
	 * about to stop receiving service, or reposition them if this
	 * is not the case.
	 */
	entity = parent;
	for_each_entity(entity) {
		/*
		 * Invoke __bfq_requeue_entity on entity, even if
		 * already active, to requeue/reposition it in the
		 * active tree (because sd->next_in_service has
		 * changed)
		 */
		__bfq_requeue_entity(entity);

		sd = entity->sched_data;
		if (!bfq_update_next_in_service(sd, entity) &&
		    !expiration)
			/*
			 * next_in_service unchanged or not causing
			 * any change in entity->parent->sd, and no
			 * requeueing needed for expiration: stop
			 * here.
			 */
			break;
	}
}
/**
 * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
 *                       if needed, to have at least one entity eligible.
 * @st: the service tree to act upon.
 *
 * Assumes that st is not empty.
 */
static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
{
	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);

	if (bfq_gt(root_entity->min_start, st->vtime))
		return root_entity->min_start;

	return st->vtime;
}
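/*
 * Illustrative example (made-up numbers, not in the original sources):
 * if st->vtime is 1000 but the minimum start time over the active tree
 * (root_entity->min_start) is 1024, then no active entity is eligible
 * yet; the function returns 1024, and bfq_update_vtime() below may push
 * the virtual time forward to that value so that at least one entity
 * becomes eligible. If min_start were 980 instead, the vtime would
 * simply be kept at 1000.
 */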
static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
{
	if (new_value > st->vtime) {
		st->vtime = new_value;

		bfq_forget_idle(st);
	}
}
/**
 * bfq_first_active_entity - find the eligible entity with
 *                           the smallest finish time
 * @st: the service tree to select from.
 * @vtime: the system virtual time to use as a reference for eligibility
 *
 * This function searches the first schedulable entity, starting from the
 * root of the tree and going on the left every time on this side there is
 * a subtree with at least one eligible (start <= vtime) entity. The path on
 * the right is followed only if a) the left subtree contains no eligible
 * entities and b) no eligible entity has been found yet.
 */
static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
						  u64 vtime)
{
	struct bfq_entity *entry, *first = NULL;
	struct rb_node *node = st->active.rb_node;

	while (node) {
		entry = rb_entry(node, struct bfq_entity, rb_node);
left:
		if (!bfq_gt(entry->start, vtime))
			first = entry;

		if (node->rb_left) {
			entry = rb_entry(node->rb_left,
					 struct bfq_entity, rb_node);
			if (!bfq_gt(entry->min_start, vtime)) {
				node = node->rb_left;
				goto left;
			}
		}
		if (first)
			break;
		node = node->rb_right;
	}

	return first;
}
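/*
 * Illustrative note (not in the original sources): the search above
 * relies on the min_start field, which caches the minimum start time
 * over the subtree rooted at each node. A left subtree is descended
 * only when its min_start is not greater than vtime, i.e., only when
 * it is guaranteed to contain at least one eligible entity; otherwise
 * the search moves to the right child. Since the tree is ordered by
 * finish time, the first eligible entity found this way is the
 * eligible entity with the smallest finish time.
 */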
/**
 * __bfq_lookup_next_entity - return the first eligible entity in @st.
 * @st: the service tree.
 *
 * If there is no in-service entity for the sched_data st belongs to,
 * then return the entity that will be set in service if:
 * 1) the parent entity this st belongs to is set in service;
 * 2) no entity belonging to such parent entity undergoes a state change
 * that would influence the timestamps of the entity (e.g., becomes idle,
 * becomes backlogged, changes its budget, ...).
 *
 * In this first case, update the virtual time in @st too (see the
 * comments on this update inside the function).
 *
 * In contrast, if there is an in-service entity, then return the
 * entity that would be set in service if not only the above
 * conditions, but also the next one held true: the currently
 * in-service entity, on expiration,
 * 1) gets a finish time equal to the current one, or
 * 2) is not eligible any more, or
 * 3) is idle.
 */
static struct bfq_entity *
__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
{
	struct bfq_entity *entity;
	u64 new_vtime;

	if (RB_EMPTY_ROOT(&st->active))
		return NULL;

	/*
	 * Get the value of the system virtual time for which at
	 * least one entity is eligible.
	 */
	new_vtime = bfq_calc_vtime_jump(st);

	/*
	 * If there is no in-service entity for the sched_data this
	 * active tree belongs to, then push the system virtual time
	 * up to the value that guarantees that at least one entity is
	 * eligible. If, instead, there is an in-service entity, then
	 * do not make any such update, because there is already an
	 * eligible entity, namely the in-service one (even if the
	 * entity is not on st, because it was extracted when set in
	 * service).
	 */
	if (!in_service)
		bfq_update_vtime(st, new_vtime);

	entity = bfq_first_active_entity(st, new_vtime);

	return entity;
}
/**
 * bfq_lookup_next_entity - return the first eligible entity in @sd.
 * @sd: the sched_data.
 *
 * This function is invoked when there has been a change in the trees
 * for sd, and we need to know what is the new next entity after this
 * change.
 */
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
{
	struct bfq_service_tree *st = sd->service_tree;
	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
	struct bfq_entity *entity = NULL;
	int class_idx = 0;

	/*
	 * Choose from idle class, if needed to guarantee a minimum
	 * bandwidth to this class (and if there is some active entity
	 * in idle class). This should also mitigate
	 * priority-inversion problems in case a low priority task is
	 * holding file system resources.
	 */
	if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
				   BFQ_CL_IDLE_TIMEOUT)) {
		if (!RB_EMPTY_ROOT(&idle_class_st->active))
			class_idx = BFQ_IOPRIO_CLASSES - 1;
		/* About to be served if backlogged, or not yet backlogged */
		sd->bfq_class_idle_last_service = jiffies;
	}

	/*
	 * Find the next entity to serve for the highest-priority
	 * class, unless the idle class needs to be served.
	 */
	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
		entity = __bfq_lookup_next_entity(st + class_idx,
						  sd->in_service_entity);

		if (entity)
			break;
	}

	return entity;
}
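/*
 * Illustrative note (not in the original sources): service trees are
 * indexed by I/O-priority class (RT first, then BE, then IDLE, i.e.,
 * index BFQ_IOPRIO_CLASSES - 1). The loop above therefore returns the
 * next entity of the highest-priority class that has one, except when
 * the timestamp check before the loop detects that the idle class has
 * been without service for longer than BFQ_CL_IDLE_TIMEOUT, in which
 * case the idle-class tree is scanned directly, so that this class is
 * guaranteed a minimum share of the bandwidth.
 */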
static bool next_queue_may_preempt(struct bfq_data *bfqd)
{
	struct bfq_sched_data *sd = &bfqd->root_group->sched_data;

	return sd->next_in_service != sd->in_service_entity;
}
/*
 * Get next queue for service.
 */
static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
{
	struct bfq_entity *entity = NULL;
	struct bfq_sched_data *sd;
	struct bfq_queue *bfqq;

	if (bfqd->busy_queues == 0)
		return NULL;

	/*
	 * Traverse the path from the root to the leaf entity to
	 * serve. Set in service all the entities visited along the
	 * way.
	 */
	sd = &bfqd->root_group->sched_data;
	for (; sd ; sd = entity->my_sched_data) {
		/*
		 * WARNING. We are about to set the in-service entity
		 * to sd->next_in_service, i.e., to the (cached) value
		 * returned by bfq_lookup_next_entity(sd) the last
		 * time it was invoked, i.e., the last time when the
		 * service order in sd changed as a consequence of the
		 * activation or deactivation of an entity. In this
		 * respect, if we execute bfq_lookup_next_entity(sd)
		 * in this very moment, it may, although with low
		 * probability, yield a different entity than that
		 * pointed to by sd->next_in_service. This rare event
		 * happens in case there was no CLASS_IDLE entity to
		 * serve for sd when bfq_lookup_next_entity(sd) was
		 * invoked for the last time, while there is now one
		 * such entity.
		 *
		 * If the above event happens, then the scheduling of
		 * such entity in CLASS_IDLE is postponed until the
		 * service of the sd->next_in_service entity
		 * finishes. In fact, when the latter is expired,
		 * bfq_lookup_next_entity(sd) gets called again,
		 * exactly to update sd->next_in_service.
		 */

		/* Make next_in_service entity become in_service_entity */
		entity = sd->next_in_service;
		sd->in_service_entity = entity;

		/*
		 * Reset the accumulator of the amount of service that
		 * the entity is about to receive.
		 */
		entity->service = 0;

		/*
		 * If entity is no longer a candidate for next
		 * service, then we extract it from its active tree,
		 * for the following reason. To further boost the
		 * throughput in some special case, BFQ needs to know
		 * which is the next candidate entity to serve, while
		 * there is already an entity in service. In this
		 * respect, to make it easy to compute/update the next
		 * candidate entity to serve after the current
		 * candidate has been set in service, there is a case
		 * where it is necessary to extract the current
		 * candidate from its service tree. Such a case is
		 * when the entity just set in service cannot be also
		 * a candidate for next service. Details about when
		 * this condition holds are reported in the comments
		 * on the function bfq_no_longer_next_in_service()
		 * invoked below.
		 */
		if (bfq_no_longer_next_in_service(entity))
			bfq_active_extract(bfq_entity_service_tree(entity),
					   entity);

		/*
		 * For the same reason why we may have just extracted
		 * entity from its active tree, we may need to update
		 * next_in_service for the sched_data of entity too,
		 * regardless of whether entity has been extracted.
		 * In fact, even if entity has not been extracted, a
		 * descendant entity may get extracted. Such an event
		 * would cause a change in next_in_service for the
		 * level of the descendant entity, and thus possibly
		 * back to upper levels.
		 *
		 * We cannot perform the resulting needed update
		 * before the end of this loop, because, to know which
		 * is the correct next-to-serve candidate entity for
		 * each level, we need first to find the leaf entity
		 * to set in service. In fact, only after we know
		 * which is the next-to-serve leaf entity, we can
		 * discover whether the parent entity of the leaf
		 * entity becomes the next-to-serve, and so on.
		 */
	}

	bfqq = bfq_entity_to_bfqq(entity);

	/*
	 * We can finally update all next-to-serve entities along the
	 * path from the leaf entity just set in service to the root.
	 */
	for_each_entity(entity) {
		struct bfq_sched_data *sd = entity->sched_data;

		if (!bfq_update_next_in_service(sd, NULL))
			break;
	}

	return bfqq;
}
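/*
 * Illustrative note (not in the original sources): the function above
 * works in two passes. The first loop walks top-down from the root
 * group to a leaf bfq_queue, turning each level's cached
 * next_in_service into the in_service_entity. Only once the leaf is
 * known can the second loop walk bottom-up and recompute
 * next_in_service at each level, because the next candidate of a
 * parent depends on which child has just been chosen.
 */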
static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
{
	struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
	struct bfq_entity *entity = in_serv_entity;

	if (bfqd->in_service_bic) {
		put_io_context(bfqd->in_service_bic->icq.ioc);
		bfqd->in_service_bic = NULL;
	}

	bfq_clear_bfqq_wait_request(in_serv_bfqq);
	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
	bfqd->in_service_queue = NULL;

	/*
	 * When this function is called, all in-service entities have
	 * been properly deactivated or requeued, so we can safely
	 * execute the final step: reset in_service_entity along the
	 * path from entity to the root.
	 */
	for_each_entity(entity)
		entity->sched_data->in_service_entity = NULL;

	/*
	 * in_serv_entity is no longer in service, so, if it is in no
	 * service tree either, then release the service reference to
	 * the queue it represents (taken with bfq_get_entity).
	 */
	if (!in_serv_entity->on_st)
		bfq_put_queue(in_serv_bfqq);
}
static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
				bool ins_into_idle_tree, bool expiration)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
}

static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
				    false);
	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
}

static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_activate_requeue_entity(entity, false,
				    bfqq == bfqd->in_service_queue);
}
static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);

/*
 * Called when the bfqq no longer has requests pending, remove it from
 * the service tree. As a special case, it can be invoked during an
 * expiration.
 */
static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			      bool expiration)
{
	bfq_log_bfqq(bfqd, bfqq, "del from busy");

	bfq_clear_bfqq_busy(bfqq);

	bfqd->busy_queues--;

	bfqg_stats_update_dequeue(bfqq_group(bfqq));

	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
}

/*
 * Called when an inactive queue receives a new request.
 */
static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqd, bfqq, "add to busy");

	bfq_activate_bfqq(bfqd, bfqq);

	bfq_mark_bfqq_busy(bfqq);
	bfqd->busy_queues++;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED

/* bfqg stats flags */
enum bfqg_stats_flags {
	BFQG_stats_waiting = 0,
	BFQG_stats_idling,
	BFQG_stats_empty,
};

#define BFQG_FLAG_FNS(name)						\
static void bfqg_stats_mark_##name(struct bfqg_stats *stats)		\
{									\
	stats->flags |= (1 << BFQG_stats_##name);			\
}									\
static void bfqg_stats_clear_##name(struct bfqg_stats *stats)		\
{									\
	stats->flags &= ~(1 << BFQG_stats_##name);			\
}									\
static int bfqg_stats_##name(struct bfqg_stats *stats)			\
{									\
	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\
}									\

BFQG_FLAG_FNS(waiting)
BFQG_FLAG_FNS(idling)
BFQG_FLAG_FNS(empty)
#undef BFQG_FLAG_FNS
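/*
 * Illustrative note (not in the original sources): for name == waiting,
 * the BFQG_FLAG_FNS macro above expands to three helpers operating on
 * stats->flags, roughly:
 *
 *	static void bfqg_stats_mark_waiting(struct bfqg_stats *stats)
 *	{ stats->flags |= (1 << BFQG_stats_waiting); }
 *	static void bfqg_stats_clear_waiting(struct bfqg_stats *stats)
 *	{ stats->flags &= ~(1 << BFQG_stats_waiting); }
 *	static int bfqg_stats_waiting(struct bfqg_stats *stats)
 *	{ return (stats->flags & (1 << BFQG_stats_waiting)) != 0; }
 *
 * and likewise for idling and empty.
 */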
/* This should be called with the queue_lock held. */
static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
{
	unsigned long long now;

	if (!bfqg_stats_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		blkg_stat_add(&stats->group_wait_time,
			      now - stats->start_group_wait_time);
	bfqg_stats_clear_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
						 struct bfq_group *curr_bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (bfqg_stats_waiting(stats))
		return;
	if (bfqg == curr_bfqg)
		return;
	stats->start_group_wait_time = sched_clock();
	bfqg_stats_mark_waiting(stats);
}
/* This should be called with the queue_lock held. */
static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
{
	unsigned long long now;

	if (!bfqg_stats_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		blkg_stat_add(&stats->empty_time,
			      now - stats->start_empty_time);
	bfqg_stats_clear_empty(stats);
}

static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
{
	blkg_stat_add(&bfqg->stats.dequeue, 1);
}

static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (blkg_rwstat_total(&stats->queued))
		return;

	/*
	 * group is already marked empty. This can happen if bfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (bfqg_stats_empty(stats))
		return;

	stats->start_empty_time = sched_clock();
	bfqg_stats_mark_empty(stats);
}

static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (bfqg_stats_idling(stats)) {
		unsigned long long now = sched_clock();

		if (time_after64(now, stats->start_idle_time))
			blkg_stat_add(&stats->idle_time,
				      now - stats->start_idle_time);
		bfqg_stats_clear_idling(stats);
	}
}

static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	stats->start_idle_time = sched_clock();
	bfqg_stats_mark_idling(stats);
}

static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	blkg_stat_add(&stats->avg_queue_size_sum,
		      blkg_rwstat_total(&stats->queued));
	blkg_stat_add(&stats->avg_queue_size_samples, 1);
	bfqg_stats_update_group_wait_time(stats);
}
/*
 * blk-cgroup policy-related handlers
 * The following functions help in converting between blk-cgroup
 * internal structures and BFQ-specific structures.
 */

static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
}

static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
{
	return pd_to_blkg(&bfqg->pd);
}

static struct blkcg_policy blkcg_policy_bfq;

static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
{
	return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
}
/*
 * bfq_group handlers
 * The following functions help in navigating the bfq_group hierarchy
 * by allowing to find the parent of a bfq_group or the bfq_group
 * associated to a bfq_queue.
 */

static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
{
	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;

	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
}

static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
{
	struct bfq_entity *group_entity = bfqq->entity.parent;

	return group_entity ? container_of(group_entity, struct bfq_group,
					   entity) :
			      bfqq->bfqd->root_group;
}
/*
 * The following two functions handle get and put of a bfq_group by
 * wrapping the related blk-cgroup hooks.
 */

static void bfqg_get(struct bfq_group *bfqg)
{
	return blkg_get(bfqg_to_blkg(bfqg));
}

static void bfqg_put(struct bfq_group *bfqg)
{
	return blkg_put(bfqg_to_blkg(bfqg));
}
static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
				     struct bfq_queue *bfqq,
				     unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.queued, op, 1);
	bfqg_stats_end_empty_time(&bfqg->stats);
	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
}

static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.queued, op, -1);
}

static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.merged, op, 1);
}

static void bfqg_stats_update_completion(struct bfq_group *bfqg,
					 uint64_t start_time, uint64_t io_start_time,
					 unsigned int op)
{
	struct bfqg_stats *stats = &bfqg->stats;
	unsigned long long now = sched_clock();

	if (time_after64(now, io_start_time))
		blkg_rwstat_add(&stats->service_time, op,
				now - io_start_time);
	if (time_after64(io_start_time, start_time))
		blkg_rwstat_add(&stats->wait_time, op,
				io_start_time - start_time);
}
static void bfqg_stats_reset(struct bfqg_stats *stats)
{
	/* queued stats shouldn't be cleared */
	blkg_rwstat_reset(&stats->merged);
	blkg_rwstat_reset(&stats->service_time);
	blkg_rwstat_reset(&stats->wait_time);
	blkg_stat_reset(&stats->time);
	blkg_stat_reset(&stats->avg_queue_size_sum);
	blkg_stat_reset(&stats->avg_queue_size_samples);
	blkg_stat_reset(&stats->dequeue);
	blkg_stat_reset(&stats->group_wait_time);
	blkg_stat_reset(&stats->idle_time);
	blkg_stat_reset(&stats->empty_time);
}
static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
{
	if (!to || !from)
		return;

	/* queued stats shouldn't be cleared */
	blkg_rwstat_add_aux(&to->merged, &from->merged);
	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
	blkg_stat_add_aux(&to->time, &from->time);
	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
	blkg_stat_add_aux(&to->avg_queue_size_samples,
			  &from->avg_queue_size_samples);
	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
}
/*
 * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
 * recursive stats can still account for the amount used by this bfqg after
 * it's gone.
 */
static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
{
	struct bfq_group *parent;

	if (!bfqg) /* root_group */
		return;

	parent = bfqg_parent(bfqg);

	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);

	if (unlikely(!parent))
		return;

	bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
	bfqg_stats_reset(&bfqg->stats);
}
static void bfq_init_entity(struct bfq_entity *entity,
			    struct bfq_group *bfqg)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->weight = entity->new_weight;
	entity->orig_weight = entity->new_weight;
	if (bfqq) {
		bfqq->ioprio = bfqq->new_ioprio;
		bfqq->ioprio_class = bfqq->new_ioprio_class;
		bfqg_get(bfqg);
	}
	entity->parent = bfqg->my_entity; /* NULL for root group */
	entity->sched_data = &bfqg->sched_data;
}
static void bfqg_stats_exit(struct bfqg_stats *stats)
{
	blkg_rwstat_exit(&stats->merged);
	blkg_rwstat_exit(&stats->service_time);
	blkg_rwstat_exit(&stats->wait_time);
	blkg_rwstat_exit(&stats->queued);
	blkg_stat_exit(&stats->time);
	blkg_stat_exit(&stats->avg_queue_size_sum);
	blkg_stat_exit(&stats->avg_queue_size_samples);
	blkg_stat_exit(&stats->dequeue);
	blkg_stat_exit(&stats->group_wait_time);
	blkg_stat_exit(&stats->idle_time);
	blkg_stat_exit(&stats->empty_time);
}
static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
	if (blkg_rwstat_init(&stats->merged, gfp) ||
	    blkg_rwstat_init(&stats->service_time, gfp) ||
	    blkg_rwstat_init(&stats->wait_time, gfp) ||
	    blkg_rwstat_init(&stats->queued, gfp) ||
	    blkg_stat_init(&stats->time, gfp) ||
	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
	    blkg_stat_init(&stats->dequeue, gfp) ||
	    blkg_stat_init(&stats->group_wait_time, gfp) ||
	    blkg_stat_init(&stats->idle_time, gfp) ||
	    blkg_stat_init(&stats->empty_time, gfp)) {
		bfqg_stats_exit(stats);
		return -ENOMEM;
	}

	return 0;
}
static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
{
	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
}

static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
{
	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
}

static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
{
	struct bfq_group_data *bgd;

	bgd = kzalloc(sizeof(*bgd), gfp);
	if (!bgd)
		return NULL;
	return &bgd->pd;
}

static void bfq_cpd_init(struct blkcg_policy_data *cpd)
{
	struct bfq_group_data *d = cpd_to_bfqgd(cpd);

	d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
		CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
}

static void bfq_cpd_free(struct blkcg_policy_data *cpd)
{
	kfree(cpd_to_bfqgd(cpd));
}
static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
{
	struct bfq_group *bfqg;

	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
	if (!bfqg)
		return NULL;

	if (bfqg_stats_init(&bfqg->stats, gfp)) {
		kfree(bfqg);
		return NULL;
	}

	return &bfqg->pd;
}

static void bfq_pd_init(struct blkg_policy_data *pd)
{
	struct blkcg_gq *blkg = pd_to_blkg(pd);
	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
	struct bfq_entity *entity = &bfqg->entity;
	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);

	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
	entity->my_sched_data = &bfqg->sched_data;
	bfqg->my_entity = entity; /*
				   * the root_group's will be set to NULL
				   * in bfq_init_queue()
				   */
	bfqg->bfqd = bfqd;
}

static void bfq_pd_free(struct blkg_policy_data *pd)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);

	bfqg_stats_exit(&bfqg->stats);
	return kfree(bfqg);
}

static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);

	bfqg_stats_reset(&bfqg->stats);
}
static void bfq_group_set_parent(struct bfq_group *bfqg,
				 struct bfq_group *parent)
{
	struct bfq_entity *entity;

	entity = &bfqg->entity;
	entity->parent = parent->my_entity;
	entity->sched_data = &parent->sched_data;
}

static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
					 struct blkcg *blkcg)
{
	struct blkcg_gq *blkg;

	blkg = blkg_lookup(blkcg, bfqd->queue);
	if (likely(blkg))
		return blkg_to_bfqg(blkg);
	return NULL;
}
static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
					    struct blkcg *blkcg)
{
	struct bfq_group *bfqg, *parent;
	struct bfq_entity *entity;

	bfqg = bfq_lookup_bfqg(bfqd, blkcg);

	if (unlikely(!bfqg))
		return NULL;

	/*
	 * Update chain of bfq_groups as we might be handling a leaf group
	 * which, along with some of its relatives, has not been hooked yet
	 * to the private hierarchy of BFQ.
	 */
	entity = &bfqg->entity;
	for_each_entity(entity) {
		bfqg = container_of(entity, struct bfq_group, entity);
		if (bfqg != bfqd->root_group) {
			parent = bfqg_parent(bfqg);
			if (!parent)
				parent = bfqd->root_group;
			bfq_group_set_parent(bfqg, parent);
		}
	}

	return bfqg;
}
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason);

/**
 * bfq_bfqq_move - migrate @bfqq to @bfqg.
 * @bfqd: queue descriptor.
 * @bfqq: the queue to move.
 * @bfqg: the group to move to.
 *
 * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
 * it on the new one. Avoid putting the entity on the old group idle tree.
 *
 * Must be called under the queue lock; the cgroup owning @bfqg must
 * not disappear (by now this just means that we are called under
 * rcu_read_lock()).
 */
static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_group *bfqg)
{
	struct bfq_entity *entity = &bfqq->entity;

	/* If bfqq is empty, then bfq_bfqq_expire also invokes
	 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
	 * from data structures related to current group. Otherwise we
	 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
	 * we do below.
	 */
	if (bfqq == bfqd->in_service_queue)
		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
				false, BFQQE_PREEMPTED);

	if (bfq_bfqq_busy(bfqq))
		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
	else if (entity->on_st)
		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
	bfqg_put(bfqq_group(bfqq));

	/*
	 * Here we use a reference to bfqg. We don't need a refcounter
	 * as the cgroup reference will not be dropped, so that its
	 * destroy() callback will not be invoked.
	 */
	entity->parent = bfqg->my_entity;
	entity->sched_data = &bfqg->sched_data;
	bfqg_get(bfqg);

	if (bfq_bfqq_busy(bfqq))
		bfq_activate_bfqq(bfqd, bfqq);

	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
		bfq_schedule_dispatch(bfqd);
}
/**
 * __bfq_bic_change_cgroup - move @bic to @cgroup.
 * @bfqd: the queue descriptor.
 * @bic: the bic to move.
 * @blkcg: the blk-cgroup to move to.
 *
 * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
 * has to make sure that the reference to cgroup is valid across the call.
 *
 * NOTE: an alternative approach might have been to store the current
 * cgroup in bfqq and getting a reference to it, reducing the lookup
 * time here, at the price of slightly more complex code.
 */
static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
						 struct bfq_io_cq *bic,
						 struct blkcg *blkcg)
{
	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
	struct bfq_group *bfqg;
	struct bfq_entity *entity;

	bfqg = bfq_find_set_group(bfqd, blkcg);

	if (unlikely(!bfqg))
		bfqg = bfqd->root_group;

	if (async_bfqq) {
		entity = &async_bfqq->entity;

		if (entity->sched_data != &bfqg->sched_data) {
			bic_set_bfqq(bic, NULL, 0);
			bfq_log_bfqq(bfqd, async_bfqq,
				     "bic_change_group: %p %d",
				     async_bfqq, async_bfqq->ref);
			bfq_put_queue(async_bfqq);
		}
	}

	if (sync_bfqq) {
		entity = &sync_bfqq->entity;
		if (entity->sched_data != &bfqg->sched_data)
			bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
	}

	return bfqg;
}
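/*
 * Illustrative note (not in the original sources): the two bfq_queues
 * attached to a bfq_io_cq are handled differently on a cgroup change.
 * The async queue is simply dropped from the bic (and its reference
 * released), so that a new async queue will be picked up in the new
 * group on the next request; the sync queue, instead, is migrated
 * explicitly with bfq_bfqq_move(), because it may be busy or even in
 * service.
 */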
static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
{
	struct bfq_data *bfqd = bic_to_bfqd(bic);
	struct bfq_group *bfqg = NULL;
	uint64_t serial_nr;

	rcu_read_lock();
	serial_nr = bio_blkcg(bio)->css.serial_nr;

	/*
	 * Check whether blkcg has changed. The condition may trigger
	 * spuriously on a newly created cic but there's no harm.
	 */
	if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
		goto out;

	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
	bic->blkcg_serial_nr = serial_nr;
out:
	rcu_read_unlock();
}
/**
 * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
 * @st: the service tree being flushed.
 */
static void bfq_flush_idle_tree(struct bfq_service_tree *st)
{
	struct bfq_entity *entity = st->first_idle;

	for (; entity ; entity = st->first_idle)
		__bfq_deactivate_entity(entity, false);
}
/**
 * bfq_reparent_leaf_entity - move leaf entity to the root_group.
 * @bfqd: the device data structure with the root group.
 * @entity: the entity to move.
 */
static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
				     struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
}
/**
 * bfq_reparent_active_entities - move to the root group all active
 *                                entities.
 * @bfqd: the device data structure with the root group.
 * @bfqg: the group to move from.
 * @st: the service tree with the entities.
 *
 * Needs queue_lock to be taken and reference to be valid over the call.
 */
static void bfq_reparent_active_entities(struct bfq_data *bfqd,
					 struct bfq_group *bfqg,
					 struct bfq_service_tree *st)
{
	struct rb_root *active = &st->active;
	struct bfq_entity *entity = NULL;

	if (!RB_EMPTY_ROOT(&st->active))
		entity = bfq_entity_of(rb_first(active));

	for (; entity ; entity = bfq_entity_of(rb_first(active)))
		bfq_reparent_leaf_entity(bfqd, entity);

	if (bfqg->sched_data.in_service_entity)
		bfq_reparent_leaf_entity(bfqd,
					 bfqg->sched_data.in_service_entity);
}
/**
 * bfq_pd_offline - deactivate the entity associated with @pd,
 *                  and reparent its children entities.
 * @pd: descriptor of the policy going offline.
 *
 * blkio already grabs the queue_lock for us, so no need to use
 * RCU-based magic
 */
static void bfq_pd_offline(struct blkg_policy_data *pd)
{
	struct bfq_service_tree *st;
	struct bfq_group *bfqg = pd_to_bfqg(pd);
	struct bfq_data *bfqd = bfqg->bfqd;
	struct bfq_entity *entity = bfqg->my_entity;
	unsigned long flags;
	int i;

	if (!entity) /* root group */
		return;

	spin_lock_irqsave(&bfqd->lock, flags);
	/*
	 * Empty all service_trees belonging to this group before
	 * deactivating the group itself.
	 */
	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
		st = bfqg->sched_data.service_tree + i;

		/*
		 * The idle tree may still contain bfq_queues belonging
		 * to exited task because they never migrated to a different
		 * cgroup from the one being destroyed now. No one else
		 * can access them so it's safe to act without any lock.
		 */
		bfq_flush_idle_tree(st);

		/*
		 * It may happen that some queues are still active
		 * (busy) upon group destruction (if the corresponding
		 * processes have been forced to terminate). We move
		 * all the leaf entities corresponding to these queues
		 * to the root_group.
		 * Also, it may happen that the group has an entity
		 * in service, which is disconnected from the active
		 * tree: it must be moved, too.
		 * There is no need to put the sync queues, as the
		 * scheduler has taken no reference.
		 */
		bfq_reparent_active_entities(bfqd, bfqg, st);
	}

	__bfq_deactivate_entity(entity, false);
	bfq_put_async_queues(bfqd, bfqg);

	spin_unlock_irqrestore(&bfqd->lock, flags);
	/*
	 * @blkg is going offline and will be ignored by
	 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
	 * that they don't get lost. If IOs complete after this point, the
	 * stats for them will be lost. Oh well...
	 */
	bfqg_stats_xfer_dead(bfqg);
}
static int bfq_io_show_weight(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
	unsigned int val = 0;

	if (bfqgd)
		val = bfqgd->weight;

	seq_printf(sf, "%u\n", val);

	return 0;
}
static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
				    struct cftype *cftype,
				    u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
	struct blkcg_gq *blkg;
	int ret = -ERANGE;

	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
		return ret;

	ret = 0;
	spin_lock_irq(&blkcg->lock);
	bfqgd->weight = (unsigned short)val;
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		struct bfq_group *bfqg = blkg_to_bfqg(blkg);

		if (!bfqg)
			continue;
		/*
		 * Setting the prio_changed flag of the entity
		 * to 1 with new_weight == weight would re-set
		 * the value of the weight to its ioprio mapping.
		 * Set the flag only if necessary.
		 */
		if ((unsigned short)val != bfqg->entity.new_weight) {
			bfqg->entity.new_weight = (unsigned short)val;
			/*
			 * Make sure that the above new value has been
			 * stored in bfqg->entity.new_weight before
			 * setting the prio_changed flag. In fact,
			 * this flag may be read asynchronously (in
			 * critical sections protected by a different
			 * lock than that held here), and finding this
			 * flag set may cause the execution of the code
			 * for updating parameters whose value may
			 * depend also on bfqg->entity.new_weight (in
			 * __bfq_entity_update_weight_prio).
			 * This barrier makes sure that the new value
			 * of bfqg->entity.new_weight is correctly
			 * seen in that code.
			 */
			smp_wmb();
			bfqg->entity.prio_changed = 1;
		}
	}
	spin_unlock_irq(&blkcg->lock);

	return ret;
}
static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
				 char *buf, size_t nbytes,
				 loff_t off)
{
	u64 weight;
	/* First unsigned long found in the file is used */
	int ret = kstrtoull(strim(buf), 0, &weight);

	if (ret)
		return ret;

	return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
}
static int bfqg_print_stat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
			  &blkcg_policy_bfq, seq_cft(sf)->private, false);
	return 0;
}

static int bfqg_print_rwstat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
			  &blkcg_policy_bfq, seq_cft(sf)->private, true);
	return 0;
}
static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
					  &blkcg_policy_bfq, off);
	return __blkg_prfill_u64(sf, pd, sum);
}

static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
					struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
							   &blkcg_policy_bfq,
							   off);
	return __blkg_prfill_rwstat(sf, pd, &sum);
}

static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
			  seq_cft(sf)->private, false);
	return 0;
}

static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
			  seq_cft(sf)->private, true);
	return 0;
}
static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
			       int off)
{
	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);

	return __blkg_prfill_u64(sf, pd, sum >> 9);
}

static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
	return 0;
}

static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
					 struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
					offsetof(struct blkcg_gq, stat_bytes));
	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);

	return __blkg_prfill_u64(sf, pd, sum >> 9);
}

static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
			  false);
	return 0;
}
static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);
	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
	u64 v = 0;

	if (samples) {
		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
		v = div64_u64(v, samples);
	}
	__blkg_prfill_u64(sf, pd, v);
	return 0;
}

/* print avg_queue_size */
static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
			  0, false);
	return 0;
}
static struct bfq_group *
bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
{
	int ret;

	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
	if (ret)
		return NULL;

	return blkg_to_bfqg(bfqd->queue->root_blkg);
}
static struct cftype bfq_blkcg_legacy_files[] = {
	{
		.name = "bfq.weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = bfq_io_show_weight,
		.write_u64 = bfq_io_set_weight_legacy,
	},

	/* statistics, covers only the tasks in the bfqg */
	{
		.name = "bfq.time",
		.private = offsetof(struct bfq_group, stats.time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.sectors",
		.seq_show = bfqg_print_stat_sectors,
	},
	{
		.name = "bfq.io_service_bytes",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_bytes,
	},
	{
		.name = "bfq.io_serviced",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_ios,
	},
	{
		.name = "bfq.io_service_time",
		.private = offsetof(struct bfq_group, stats.service_time),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_wait_time",
		.private = offsetof(struct bfq_group, stats.wait_time),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_merged",
		.private = offsetof(struct bfq_group, stats.merged),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_queued",
		.private = offsetof(struct bfq_group, stats.queued),
		.seq_show = bfqg_print_rwstat,
	},

	/* the same statistics which cover the bfqg and its descendants */
	{
		.name = "bfq.time_recursive",
		.private = offsetof(struct bfq_group, stats.time),
		.seq_show = bfqg_print_stat_recursive,
	},
	{
		.name = "bfq.sectors_recursive",
		.seq_show = bfqg_print_stat_sectors_recursive,
	},
	{
		.name = "bfq.io_service_bytes_recursive",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_bytes_recursive,
	},
	{
		.name = "bfq.io_serviced_recursive",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_ios_recursive,
	},
	{
		.name = "bfq.io_service_time_recursive",
		.private = offsetof(struct bfq_group, stats.service_time),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_wait_time_recursive",
		.private = offsetof(struct bfq_group, stats.wait_time),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_merged_recursive",
		.private = offsetof(struct bfq_group, stats.merged),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_queued_recursive",
		.private = offsetof(struct bfq_group, stats.queued),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.avg_queue_size",
		.seq_show = bfqg_print_avg_queue_size,
	},
	{
		.name = "bfq.group_wait_time",
		.private = offsetof(struct bfq_group, stats.group_wait_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.idle_time",
		.private = offsetof(struct bfq_group, stats.idle_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.empty_time",
		.private = offsetof(struct bfq_group, stats.empty_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.dequeue",
		.private = offsetof(struct bfq_group, stats.dequeue),
		.seq_show = bfqg_print_stat,
	},
	{ }	/* terminate */
};
static struct cftype bfq_blkg_files[] = {
	{
		.name = "bfq.weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = bfq_io_show_weight,
		.write = bfq_io_set_weight,
	},
	{ } /* terminate */
};

#else	/* CONFIG_BFQ_GROUP_IOSCHED */
static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
			struct bfq_queue *bfqq, unsigned int op) { }
static inline void
bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
static inline void
bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
			uint64_t start_time, uint64_t io_start_time,
			unsigned int op) { }
static inline void
bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
				     struct bfq_group *curr_bfqg) { }
static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }

static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_group *bfqg) {}

static void bfq_init_entity(struct bfq_entity *entity,
			    struct bfq_group *bfqg)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->weight = entity->new_weight;
	entity->orig_weight = entity->new_weight;
	if (bfqq) {
		bfqq->ioprio = bfqq->new_ioprio;
		bfqq->ioprio_class = bfqq->new_ioprio_class;
	}
	entity->sched_data = &bfqg->sched_data;
}

static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}

static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
					    struct blkcg *blkcg)
{
	return bfqd->root_group;
}

static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
{
	return bfqq->bfqd->root_group;
}

static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd,
						    int node)
{
	struct bfq_group *bfqg;
	int i;

	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
	if (!bfqg)
		return NULL;

	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

	return bfqg;
}
#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define bfq_sample_valid(samples)	((samples) > 80)
/*
 * Lifted from AS - choose which of rq1 and rq2 is best served now.
 * We choose the request that is closest to the head right now. Distance
 * behind the head is penalized and only allowed to a certain extent.
 */
static struct request *bfq_choose_req(struct bfq_data *bfqd,
				      struct request *rq1,
				      struct request *rq2,
				      sector_t last)
{
	sector_t s1, s2, d1 = 0, d2 = 0;
	unsigned long back_max;
#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */
#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */
	unsigned int wrap = 0; /* bit mask: requests behind the disk head? */

	if (!rq1 || rq1 == rq2)
		return rq2;
	if (!rq2)
		return rq1;

	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
		return rq1;
	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
		return rq2;
	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
		return rq1;
	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
		return rq2;

	s1 = blk_rq_pos(rq1);
	s2 = blk_rq_pos(rq2);

	/*
	 * By definition, 1KiB is 2 sectors.
	 */
	back_max = bfqd->bfq_back_max * 2;

	/*
	 * Strict one way elevator _except_ in the case where we allow
	 * short backward seeks which are biased as twice the cost of a
	 * similar forward seek.
	 */
	if (s1 >= last)
		d1 = s1 - last;
	else if (s1 + back_max >= last)
		d1 = (last - s1) * bfqd->bfq_back_penalty;
	else
		wrap |= BFQ_RQ1_WRAP;

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2 + back_max >= last)
		d2 = (last - s2) * bfqd->bfq_back_penalty;
	else
		wrap |= BFQ_RQ2_WRAP;

	/* Found required data */

	/*
	 * By doing switch() on the bit mask "wrap" we avoid having to
	 * check two variables for all permutations: --> faster!
	 */
	switch (wrap) {
	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
		if (d1 < d2)
			return rq1;
		else if (d2 < d1)
			return rq2;

		if (s1 >= s2)
			return rq1;
		else
			return rq2;

	case BFQ_RQ2_WRAP:
		return rq1;
	case BFQ_RQ1_WRAP:
		return rq2;
	case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */
	default:
		/*
		 * Since both rqs are wrapped,
		 * start with the one that's further behind head
		 * (--> only *one* back seek required),
		 * since back seek takes more time than forward.
		 */
		if (s1 <= s2)
			return rq1;
		else
			return rq2;
	}
}
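/*
 * Illustrative example (made-up numbers, not in the original sources):
 * with last = 1000 and back_max = 64 sectors, a request rq1 at sector
 * 1010 gets d1 = 10 (plain forward distance). A request rq2 at sector
 * 990 lies behind the head but within back_max, so it gets
 * d2 = (1000 - 990) * bfqd->bfq_back_penalty, i.e., the short backward
 * seek is allowed but charged a penalized distance. A request further
 * behind than back_max would instead be marked as wrapped and be
 * considered only as a last resort in the switch above.
 */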
/*
 * Return expired entry, or NULL to just start from scratch in rbtree.
 */
static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
				      struct request *last)
{
	struct request *rq;

	if (bfq_bfqq_fifo_expire(bfqq))
		return NULL;

	bfq_mark_bfqq_fifo_expire(bfqq);

	rq = rq_entry_fifo(bfqq->fifo.next);

	if (rq == last || ktime_get_ns() < rq->fifo_time)
		return NULL;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
	return rq;
}
static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
					struct bfq_queue *bfqq,
					struct request *last)
{
	struct rb_node *rbnext = rb_next(&last->rb_node);
	struct rb_node *rbprev = rb_prev(&last->rb_node);
	struct request *next, *prev = NULL;

	/* Follow expired path, else get first next available. */
	next = bfq_check_fifo(bfqq, last);
	if (next)
		return next;

	if (rbprev)
		prev = rb_entry_rq(rbprev);

	if (rbnext)
		next = rb_entry_rq(rbnext);
	else {
		rbnext = rb_first(&bfqq->sort_list);
		if (rbnext && rbnext != &last->rb_node)
			next = rb_entry_rq(rbnext);
	}

	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
}
/* see the definition of bfq_async_charge_factor for details */
static unsigned long bfq_serv_to_charge(struct request *rq,
					struct bfq_queue *bfqq)
{
	if (bfq_bfqq_sync(bfqq))
		return blk_rq_sectors(rq);

	return blk_rq_sectors(rq) * bfq_async_charge_factor;
}
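/*
 * Illustrative note (not in the original sources): the service charged
 * for a request is expressed in sectors, consistently with budgets. A
 * sync queue is charged exactly blk_rq_sectors(rq), while an async
 * queue is charged that amount inflated by bfq_async_charge_factor,
 * which makes async queues consume their budgets faster and thus
 * receive a smaller fraction of the throughput for equal weights.
 */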
/**
 * bfq_updated_next_req - update the queue after a new next_rq selection.
 * @bfqd: the device data the queue belongs to.
 * @bfqq: the queue to update.
 *
 * If the first request of a queue changes we make sure that the queue
 * has enough budget to serve at least its first request (if the
 * request has grown). We do this because if the queue has not enough
 * budget for its first request, it has to go through two dispatch
 * rounds to actually get it dispatched.
 */
static void bfq_updated_next_req(struct bfq_data *bfqd,
				 struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;
	struct request *next_rq = bfqq->next_rq;
	unsigned long new_budget;

	if (!next_rq)
		return;

	if (bfqq == bfqd->in_service_queue)
		/*
		 * In order not to break guarantees, budgets cannot be
		 * changed after an entity has been selected.
		 */
		return;

	new_budget = max_t(unsigned long, bfqq->max_budget,
			   bfq_serv_to_charge(next_rq, bfqq));
	if (entity->budget != new_budget) {
		entity->budget = new_budget;
		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
					 new_budget);
		bfq_requeue_bfqq(bfqd, bfqq);
	}
}
static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	return entity->budget - entity->service;
}
/*
 * If enough samples have been computed, return the current max budget
 * stored in bfqd, which is dynamically updated according to the
 * estimated disk peak rate; otherwise return the default max budget
 */
static int bfq_max_budget(struct bfq_data *bfqd)
{
	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
		return bfq_default_max_budget;
	else
		return bfqd->bfq_max_budget;
}

/*
 * Return min budget, which is a fraction of the current or default
 * max budget (trying with 1/32)
 */
static int bfq_min_budget(struct bfq_data *bfqd)
{
	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
		return bfq_default_max_budget / 32;
	else
		return bfqd->bfq_max_budget / 32;
}
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason);
/*
 * The next function, invoked after the input queue bfqq switches from
 * idle to busy, updates the budget of bfqq. The function also tells
 * whether the in-service queue should be expired, by returning
 * true. The purpose of expiring the in-service queue is to give bfqq
 * the chance to possibly preempt the in-service queue, and the reason
 * for preempting the in-service queue is to achieve the following
 * goal: guarantee to bfqq its reserved bandwidth even if bfqq has
 * expired because it has remained idle.
 *
 * In particular, bfqq may have expired for one of the following two
 * reasons:
 *
 * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
 *   and did not make it to issue a new request before its last
 *   request was served;
 *
 * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
 *   a new request before the expiration of the idling-time.
 *
 * Even if bfqq has expired for one of the above reasons, the process
 * associated with the queue may be however issuing requests greedily,
 * and thus be sensitive to the bandwidth it receives (bfqq may have
 * remained idle for other reasons: CPU high load, bfqq not enjoying
 * idling, I/O throttling somewhere in the path from the process to
 * the I/O scheduler, ...). But if, after every expiration for one of
 * the above two reasons, bfqq has to wait for the service of at least
 * one full budget of another queue before being served again, then
 * bfqq is likely to get a much lower bandwidth or resource time than
 * its reserved ones. To address this issue, two countermeasures need
 * to be taken.
 *
 * First, the budget and the timestamps of bfqq need to be updated in
 * a special way on bfqq reactivation: they need to be updated as if
 * bfqq did not remain idle and did not expire. In fact, if they are
 * computed as if bfqq expired and remained idle until reactivation,
 * then the process associated with bfqq is treated as if, instead of
 * being greedy, it stopped issuing requests when bfqq remained idle,
 * and restarts issuing requests only on this reactivation. In other
 * words, the scheduler does not help the process recover the "service
 * hole" between bfqq expiration and reactivation. As a consequence,
 * the process receives a lower bandwidth than its reserved one. In
 * contrast, to recover this hole, the budget must be updated as if
 * bfqq was not expired at all before this reactivation, i.e., it must
 * be set to the value of the remaining budget when bfqq was
 * expired. Along the same line, timestamps need to be assigned the
 * value they had the last time bfqq was selected for service, i.e.,
 * before last expiration. Thus timestamps need to be back-shifted
 * with respect to their normal computation (see [1] for more details
 * on this tricky aspect).
 *
 * Secondly, to allow the process to recover the hole, the in-service
 * queue must be expired too, to give bfqq the chance to preempt it
 * immediately. In fact, if bfqq has to wait for a full budget of the
 * in-service queue to be completed, then it may become impossible to
 * let the process recover the hole, even if the back-shifted
 * timestamps of bfqq are lower than those of the in-service queue. If
 * this happens for most or all of the holes, then the process may not
 * receive its reserved bandwidth. In this respect, it is worth noting
 * that, being the service of outstanding requests unpreemptible, a
 * little fraction of the holes may however be unrecoverable, thereby
 * causing a little loss of bandwidth.
 *
 * The last important point is detecting whether bfqq does need this
 * bandwidth recovery. In this respect, the next function deems the
 * process associated with bfqq greedy, and thus allows it to recover
 * the hole, if: 1) the process is waiting for the arrival of a new
 * request (which implies that bfqq expired for one of the above two
 * reasons), and 2) such a request has arrived soon. The first
 * condition is controlled through the flag non_blocking_wait_rq,
 * while the second through the flag arrived_in_time. If both
 * conditions hold, then the function computes the budget in the
 * above-described special way, and signals that the in-service queue
 * should be expired. Timestamp back-shifting is done later in
 * __bfq_activate_entity.
 */
static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
						struct bfq_queue *bfqq,
						bool arrived_in_time)
{
	struct bfq_entity *entity = &bfqq->entity;

	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
		/*
		 * We do not clear the flag non_blocking_wait_rq here, as
		 * the latter is used in bfq_activate_bfqq to signal
		 * that timestamps need to be back-shifted (and is
		 * cleared right after).
		 */

		/*
		 * In the next assignment we rely on the fact that neither
		 * entity->service nor entity->budget is updated
		 * on expiration if bfqq is empty (see
		 * __bfq_bfqq_recalc_budget). Thus both quantities
		 * remain unchanged after such an expiration, and the
		 * following statement therefore assigns to
		 * entity->budget the remaining budget on such an
		 * expiration. For clarity, entity->service is not
		 * updated on expiration in any case, and, in normal
		 * operation, is reset only when bfqq is selected for
		 * service (see bfq_get_next_queue).
		 */
		entity->budget = min_t(unsigned long,
				       bfq_bfqq_budget_left(bfqq),
				       bfqq->max_budget);

		return true;
	}

	entity->budget = max_t(unsigned long, bfqq->max_budget,
			       bfq_serv_to_charge(bfqq->next_rq, bfqq));
	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
	return false;
}
static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
					     struct bfq_queue *bfqq,
					     struct request *rq)
{
	bool bfqq_wants_to_preempt,
		/*
		 * See the comments on
		 * bfq_bfqq_update_budg_for_activation for
		 * details on the usage of the next variable.
		 */
		arrived_in_time = ktime_get_ns() <=
			bfqq->ttime.last_end_request +
			bfqd->bfq_slice_idle * 3;

	bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);

	/*
	 * Update budget and check whether bfqq may want to preempt
	 * the in-service queue.
	 */
	bfqq_wants_to_preempt =
		bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
						    arrived_in_time);

	if (!bfq_bfqq_IO_bound(bfqq)) {
		if (arrived_in_time) {
			bfqq->requests_within_timer++;
			if (bfqq->requests_within_timer >=
			    bfqd->bfq_requests_within_timer)
				bfq_mark_bfqq_IO_bound(bfqq);
		} else
			bfqq->requests_within_timer = 0;
	}

	bfq_add_bfqq_busy(bfqd, bfqq);

	/*
	 * Expire in-service queue only if preemption may be needed
	 * for guarantees. In this respect, the function
	 * next_queue_may_preempt just checks a simple, necessary
	 * condition, and not a sufficient condition based on
	 * timestamps. In fact, for the latter condition to be
	 * evaluated, timestamps would need first to be updated, and
	 * this operation is quite costly (see the comments on the
	 * function bfq_bfqq_update_budg_for_activation).
	 */
	if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
	    next_queue_may_preempt(bfqd))
		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
				false, BFQQE_PREEMPTED);
}
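/*
 * Illustrative example, not part of BFQ (numbers assumed): with a
 * slice_idle of 8 ms, the arrived_in_time window used above is
 * 3 * 8 ms = 24 ms after the last request completion; a queue whose new
 * request arrives later than that is not granted the preemption and
 * timestamp back-shifting treatment described above.
 */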
static void bfq_add_request(struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;
	struct request *next_rq, *prev;

	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
	bfqq->queued[rq_is_sync(rq)]++;

	elv_rb_add(&bfqq->sort_list, rq);

	/*
	 * Check if this request is a better next-serve candidate.
	 */
	prev = bfqq->next_rq;
	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
	bfqq->next_rq = next_rq;

	if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
		bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, rq);
	else if (prev != bfqq->next_rq)
		bfq_updated_next_req(bfqd, bfqq);
}
static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
					  struct bio *bio,
					  struct request_queue *q)
{
	struct bfq_queue *bfqq = bfqd->bio_bfqq;

	if (bfqq)
		return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));

	return NULL;
}
static sector_t get_sdist(sector_t last_pos, struct request *rq)
{
	if (last_pos)
		return abs(blk_rq_pos(rq) - last_pos);

	return 0;
}
#if 0 /* Still not clear if we can do without next two functions */
static void bfq_activate_request(struct request_queue *q, struct request *rq)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;

	bfqd->rq_in_driver++;
}

static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;

	bfqd->rq_in_driver--;
}
#endif
static void bfq_remove_request(struct request_queue *q,
			       struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;
	const int sync = rq_is_sync(rq);

	if (bfqq->next_rq == rq) {
		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
		bfq_updated_next_req(bfqd, bfqq);
	}

	if (rq->queuelist.prev != &rq->queuelist)
		list_del_init(&rq->queuelist);
	bfqq->queued[sync]--;
	elv_rb_del(&bfqq->sort_list, rq);

	elv_rqhash_del(q, rq);
	if (q->last_merge == rq)
		q->last_merge = NULL;

	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
		bfqq->next_rq = NULL;

		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
			bfq_del_bfqq_busy(bfqd, bfqq, false);
			/*
			 * bfqq emptied. In normal operation, when
			 * bfqq is empty, bfqq->entity.service and
			 * bfqq->entity.budget must contain,
			 * respectively, the service received and the
			 * budget used last time bfqq emptied. These
			 * facts do not hold in this case, as at least
			 * this last removal occurred while bfqq is
			 * not in service. To avoid inconsistencies,
			 * reset both bfqq->entity.service and
			 * bfqq->entity.budget, if bfqq has still a
			 * process that may issue I/O requests to it.
			 */
			bfqq->entity.budget = bfqq->entity.service = 0;
		}
	}

	if (rq->cmd_flags & REQ_META)
		bfqq->meta_pending--;

	bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
}
static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
{
	struct request_queue *q = hctx->queue;
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct request *free = NULL;
	/*
	 * bfq_bic_lookup grabs the queue_lock: invoke it now and
	 * store its return value for later use, to avoid nesting
	 * queue_lock inside the bfqd->lock. We assume that the bic
	 * returned by bfq_bic_lookup does not go away before
	 * bfqd->lock is taken.
	 */
	struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
	bool ret;

	spin_lock_irq(&bfqd->lock);

	if (bic)
		bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
	else
		bfqd->bio_bfqq = NULL;
	bfqd->bio_bic = bic;

	ret = blk_mq_sched_try_merge(q, bio, &free);

	if (free)
		blk_mq_free_request(free);
	spin_unlock_irq(&bfqd->lock);

	return ret;
}
static int bfq_request_merge(struct request_queue *q, struct request **req,
			     struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct request *__rq;

	__rq = bfq_find_rq_fmerge(bfqd, bio, q);
	if (__rq && elv_bio_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_FRONT_MERGE;
	}

	return ELEVATOR_NO_MERGE;
}
static void bfq_request_merged(struct request_queue *q, struct request *req,
			       enum elv_merge type)
{
	if (type == ELEVATOR_FRONT_MERGE &&
	    rb_prev(&req->rb_node) &&
	    blk_rq_pos(req) <
	    blk_rq_pos(container_of(rb_prev(&req->rb_node),
				    struct request, rb_node))) {
		struct bfq_queue *bfqq = RQ_BFQQ(req);
		struct bfq_data *bfqd = bfqq->bfqd;
		struct request *prev, *next_rq;

		/* Reposition request in its sort_list */
		elv_rb_del(&bfqq->sort_list, req);
		elv_rb_add(&bfqq->sort_list, req);

		/* Choose next request to be served for bfqq */
		prev = bfqq->next_rq;
		next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
					 bfqd->last_position);
		bfqq->next_rq = next_rq;

		/*
		 * If next_rq changes, update the queue's budget to fit
		 * the new request.
		 */
		if (prev != bfqq->next_rq)
			bfq_updated_next_req(bfqd, bfqq);
	}
}
static void bfq_requests_merged(struct request_queue *q, struct request *rq,
				struct request *next)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);

	if (!RB_EMPTY_NODE(&rq->rb_node))
		return;
	spin_lock_irq(&bfqq->bfqd->lock);

	/*
	 * If next and rq belong to the same bfq_queue and next is older
	 * than rq, then reposition rq in the fifo (by substituting next
	 * with rq). Otherwise, if next and rq belong to different
	 * bfq_queues, never reposition rq: in fact, we would have to
	 * reposition it with respect to next's position in its own fifo,
	 * which would most certainly be too expensive with respect to
	 * the benefits.
	 */
	if (bfqq == next_bfqq &&
	    !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
	    next->fifo_time < rq->fifo_time) {
		list_del_init(&rq->queuelist);
		list_replace_init(&next->queuelist, &rq->queuelist);
		rq->fifo_time = next->fifo_time;
	}

	if (bfqq->next_rq == next)
		bfqq->next_rq = rq;

	bfq_remove_request(q, next);

	spin_unlock_irq(&bfqq->bfqd->lock);

	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
}
static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
				struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	bool is_sync = op_is_sync(bio->bi_opf);
	struct bfq_queue *bfqq = bfqd->bio_bfqq;

	/*
	 * Disallow merge of a sync bio into an async request.
	 */
	if (is_sync && !rq_is_sync(rq))
		return false;

	/*
	 * Lookup the bfqq that this bio will be queued with. Allow
	 * merge only if rq is queued there.
	 */
	if (!bfqq)
		return false;

	return bfqq == RQ_BFQQ(rq);
}
static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
				       struct bfq_queue *bfqq)
{
	if (bfqq) {
		bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
		bfq_mark_bfqq_budget_new(bfqq);
		bfq_clear_bfqq_fifo_expire(bfqq);

		bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;

		bfq_log_bfqq(bfqd, bfqq,
			     "set_in_service_queue, cur-budget = %d",
			     bfqq->entity.budget);
	}

	bfqd->in_service_queue = bfqq;
}
/*
 * Get and set a new queue for service.
 */
static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);

	__bfq_set_in_service_queue(bfqd, bfqq);
	return bfqq;
}
static void bfq_arm_slice_timer(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfqd->in_service_queue;
	struct bfq_io_cq *bic;
	u64 sl;

	/* Processes have exited, don't wait. */
	bic = bfqd->in_service_bic;
	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
		return;

	bfq_mark_bfqq_wait_request(bfqq);

	/*
	 * We don't want to idle for seeks, but we do want to allow
	 * fair distribution of slice time for a process doing back-to-back
	 * seeks. So allow a little bit of time for it to submit a new rq.
	 */
	sl = bfqd->bfq_slice_idle;
	/*
	 * Grant only minimum idle time if the queue is seeky.
	 */
	if (BFQQ_SEEKY(bfqq))
		sl = min_t(u64, sl, BFQ_MIN_TT);

	bfqd->last_idling_start = ktime_get();
	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
		      HRTIMER_MODE_REL);
	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
}
/*
 * Set the maximum time for the in-service queue to consume its
 * budget. This prevents seeky processes from lowering the disk
 * throughput (always guaranteed with a time slice scheme as in CFQ).
 */
static void bfq_set_budget_timeout(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfqd->in_service_queue;
	unsigned int timeout_coeff = bfqq->entity.weight /
				     bfqq->entity.orig_weight;

	bfqd->last_budget_start = ktime_get();

	bfq_clear_bfqq_budget_new(bfqq);
	bfqq->budget_timeout = jiffies +
		bfqd->bfq_timeout * timeout_coeff;

	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
		jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff));
}
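/*
 * Illustrative example, not part of BFQ: timeout_coeff above is the
 * integer ratio entity.weight / entity.orig_weight, so a queue whose
 * current weight is, say, three times its original weight gets
 * timeout_coeff = 3 and thus three times the base budget timeout
 * bfqd->bfq_timeout.
 */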
/*
 * In autotuning mode, max_budget is dynamically recomputed as the
 * amount of sectors transferred in timeout at the estimated peak
 * rate. This enables BFQ to utilize a full timeslice with a full
 * budget, even if the in-service queue is served at peak rate. And
 * this maximises throughput with sequential workloads.
 */
static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
{
	return (u64)bfqd->peak_rate * USEC_PER_MSEC *
		jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
}
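/*
 * Illustrative example, not part of BFQ (numbers assumed): if the
 * estimated peak rate corresponds to about 400000 sectors/s (~200 MB/s
 * with 512-byte sectors) and the budget timeout is 125 ms, the autotuned
 * max_budget is about 400000 * 0.125 = 50000 sectors, i.e., the amount
 * of service transferable in one full timeout at peak rate (the code
 * keeps the rate left-shifted by BFQ_RATE_SHIFT for precision).
 */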
static void bfq_reset_rate_computation(struct bfq_data *bfqd,
				       struct request *rq)
{
	if (rq != NULL) { /* new rq dispatch now, reset accordingly */
		bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
		bfqd->peak_rate_samples = 1;
		bfqd->sequential_samples = 0;
		bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
			blk_rq_sectors(rq);
	} else /* no new rq dispatched, just reset the number of samples */
		bfqd->peak_rate_samples = 0; /* full re-init on next disp. */

	bfq_log(bfqd,
		"reset_rate_computation at end, sample %u/%u tot_sects %llu",
		bfqd->peak_rate_samples, bfqd->sequential_samples,
		bfqd->tot_sectors_dispatched);
}
static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
{
	u32 rate, weight, divisor;

	/*
	 * For the convergence property to hold (see comments on
	 * bfq_update_peak_rate()) and for the assessment to be
	 * reliable, a minimum number of samples must be present, and
	 * a minimum amount of time must have elapsed. If not so, do
	 * not compute new rate. Just reset parameters, to get ready
	 * for a new evaluation attempt.
	 */
	if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
	    bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
		goto reset_computation;

	/*
	 * If a new request completion has occurred after last
	 * dispatch, then, to approximate the rate at which requests
	 * have been served by the device, it is more precise to
	 * extend the observation interval to the last completion.
	 */
	bfqd->delta_from_first =
		max_t(u64, bfqd->delta_from_first,
		      bfqd->last_completion - bfqd->first_dispatch);

	/*
	 * Rate computed in sects/usec, and not sects/nsec, for
	 * precision issues.
	 */
	rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
			div_u64(bfqd->delta_from_first, NSEC_PER_USEC));

	/*
	 * Peak rate not updated if:
	 * - the percentage of sequential dispatches is below 3/4 of the
	 *   total, and rate is below the current estimated peak rate
	 * - rate is unreasonably high (> 20M sectors/sec)
	 */
	if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
	     rate <= bfqd->peak_rate) ||
	    rate > 20<<BFQ_RATE_SHIFT)
		goto reset_computation;

	/*
	 * We have to update the peak rate, at last! To this purpose,
	 * we use a low-pass filter. We compute the smoothing constant
	 * of the filter as a function of the 'weight' of the new
	 * measured rate.
	 *
	 * As can be seen in next formulas, we define this weight as a
	 * quantity proportional to how sequential the workload is,
	 * and to how long the observation time interval is.
	 *
	 * The weight runs from 0 to 8. The maximum value of the
	 * weight, 8, yields the minimum value for the smoothing
	 * constant. At this minimum value for the smoothing constant,
	 * the measured rate contributes for half of the next value of
	 * the estimated peak rate.
	 *
	 * So, the first step is to compute the weight as a function
	 * of how sequential the workload is. Note that the weight
	 * cannot reach 9, because bfqd->sequential_samples cannot
	 * become equal to bfqd->peak_rate_samples, which, in its
	 * turn, holds true because bfqd->sequential_samples is not
	 * incremented for the first sample.
	 */
	weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;

	/*
	 * Second step: further refine the weight as a function of the
	 * duration of the observation interval.
	 */
	weight = min_t(u32, 8,
		       div_u64(weight * bfqd->delta_from_first,
			       BFQ_RATE_REF_INTERVAL));

	/*
	 * Divisor ranging from 10, for minimum weight, to 2, for
	 * maximum weight.
	 */
	divisor = 10 - weight;

	/*
	 * Finally, update peak rate:
	 *
	 * peak_rate = peak_rate * (divisor-1) / divisor  +  rate / divisor
	 */
	bfqd->peak_rate *= divisor-1;
	bfqd->peak_rate /= divisor;
	rate /= divisor; /* smoothing constant alpha = 1/divisor */

	bfqd->peak_rate += rate;
	if (bfqd->bfq_user_max_budget == 0)
		bfqd->bfq_max_budget =
			bfq_calc_max_budget(bfqd);

reset_computation:
	bfq_reset_rate_computation(bfqd, rq);
}
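/*
 * Illustrative sketch, not part of BFQ: the final update above is an
 * exponential moving average
 *
 *   peak_rate = peak_rate * (divisor - 1) / divisor + rate / divisor
 *
 * with smoothing constant 1/divisor and divisor in [2, 10]. The helper
 * below restates it on plain integers; its name and parameters are made
 * up for the example.
 */
#if 0
static unsigned long long ewma_update(unsigned long long old_estimate,
				      unsigned long long new_sample,
				      unsigned int divisor)
{
	/* weight (divisor-1)/divisor for history, 1/divisor for the sample */
	return old_estimate * (divisor - 1) / divisor + new_sample / divisor;
}
#endif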
/*
 * Update the read/write peak rate (the main quantity used for
 * auto-tuning, see update_thr_responsiveness_params()).
 *
 * It is not trivial to estimate the peak rate (correctly): because of
 * the presence of sw and hw queues between the scheduler and the
 * device components that finally serve I/O requests, it is hard to
 * say exactly when a given dispatched request is served inside the
 * device, and for how long. As a consequence, it is hard to know
 * precisely at what rate a given set of requests is actually served
 * by the device.
 *
 * On the opposite end, the dispatch time of any request is trivially
 * available, and, from this piece of information, the "dispatch rate"
 * of requests can be immediately computed. So, the idea in the next
 * function is to use what is known, namely request dispatch times
 * (plus, when useful, request completion times), to estimate what is
 * unknown, namely in-device request service rate.
 *
 * The main issue is that, because of the above facts, the rate at
 * which a certain set of requests is dispatched over a certain time
 * interval can vary greatly with respect to the rate at which the
 * same requests are then served. But, since the size of any
 * intermediate queue is limited, and the service scheme is lossless
 * (no request is silently dropped), the following obvious convergence
 * property holds: the number of requests dispatched MUST become
 * closer and closer to the number of requests completed as the
 * observation interval grows. This is the key property used in
 * the next function to estimate the peak service rate as a function
 * of the observed dispatch rate. The function assumes to be invoked
 * on every request dispatch.
 */
static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
{
	u64 now_ns = ktime_get_ns();

	if (bfqd->peak_rate_samples == 0) { /* first dispatch */
		bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
			bfqd->peak_rate_samples);
		bfq_reset_rate_computation(bfqd, rq);
		goto update_last_values; /* will add one sample */
	}

	/*
	 * Device idle for very long: the observation interval lasting
	 * up to this dispatch cannot be a valid observation interval
	 * for computing a new peak rate (similarly to the late-
	 * completion event in bfq_completed_request()). Go to
	 * update_rate_and_reset to have the following three steps
	 * taken:
	 * - close the observation interval at the last (previous)
	 *   request dispatch or completion
	 * - compute rate, if possible, for that observation interval
	 * - start a new observation interval with this dispatch
	 */
	if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
	    bfqd->rq_in_driver == 0)
		goto update_rate_and_reset;

	/* Update sampling information */
	bfqd->peak_rate_samples++;

	if ((bfqd->rq_in_driver > 0 ||
	     now_ns - bfqd->last_completion < BFQ_MIN_TT)
	     && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
		bfqd->sequential_samples++;

	bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);

	/* Reset max observed rq size every 32 dispatches */
	if (likely(bfqd->peak_rate_samples % 32))
		bfqd->last_rq_max_size =
			max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
	else
		bfqd->last_rq_max_size = blk_rq_sectors(rq);

	bfqd->delta_from_first = now_ns - bfqd->first_dispatch;

	/* Target observation interval not yet reached, go on sampling */
	if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
		goto update_last_values;

update_rate_and_reset:
	bfq_update_rate_reset(bfqd, rq);
update_last_values:
	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
	bfqd->last_dispatch = now_ns;
}
/*
 * Remove request from internal lists.
 */
static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);

	/*
	 * For consistency, the next instruction should have been
	 * executed after removing the request from the queue and
	 * dispatching it. We execute instead this instruction before
	 * bfq_remove_request() (and hence introduce a temporary
	 * inconsistency), for efficiency. In fact, should this
	 * dispatch occur for a non in-service bfqq, this anticipated
	 * increment prevents two counters related to bfqq->dispatched
	 * from risking to be, first, uselessly decremented, and then
	 * incremented again when the (new) value of bfqq->dispatched
	 * happens to be taken into account.
	 */
	bfqq->dispatched++;
	bfq_update_peak_rate(q->elevator->elevator_data, rq);

	bfq_remove_request(q, rq);
}
static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	if (RB_EMPTY_ROOT(&bfqq->sort_list))
		bfq_del_bfqq_busy(bfqd, bfqq, true);
	else
		bfq_requeue_bfqq(bfqd, bfqq);

	/*
	 * All in-service entities must have been properly deactivated
	 * or requeued before executing the next function, which
	 * resets all in-service entities as no more in service.
	 */
	__bfq_bfqd_reset_in_service(bfqd);
}
/**
 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
 * @bfqd: device data.
 * @bfqq: queue to update.
 * @reason: reason for expiration.
 *
 * Handle the feedback on @bfqq budget at queue expiration.
 * See the body for detailed comments.
 */
static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
				     struct bfq_queue *bfqq,
				     enum bfqq_expiration reason)
{
	struct request *next_rq;
	int budget, min_budget;

	budget = bfqq->max_budget;
	min_budget = bfq_min_budget(bfqd);

	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
		budget, bfq_min_budget(bfqd));
	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));

	if (bfq_bfqq_sync(bfqq)) {
		switch (reason) {
			/*
			 * Caveat: in all the following cases we trade latency
			 * for throughput.
			 */
		case BFQQE_TOO_IDLE:
			/*
			 * This is the only case where we may reduce
			 * the budget: if there is no request of the
			 * process still waiting for completion, then
			 * we assume (tentatively) that the timer has
			 * expired because the batch of requests of
			 * the process could have been served with a
			 * smaller budget. Hence, betting that
			 * process will behave in the same way when it
			 * becomes backlogged again, we reduce its
			 * next budget. As long as we guess right,
			 * this budget cut reduces the latency
			 * experienced by the process.
			 *
			 * However, if there are still outstanding
			 * requests, then the process may have not yet
			 * issued its next request just because it is
			 * still waiting for the completion of some of
			 * the still outstanding ones. So in this
			 * subcase we do not reduce its budget, on the
			 * contrary we increase it to possibly boost
			 * the throughput, as discussed in the
			 * comments to the BUDGET_TIMEOUT case.
			 */
			if (bfqq->dispatched > 0) /* still outstanding reqs */
				budget = min(budget * 2, bfqd->bfq_max_budget);
			else {
				if (budget > 5 * min_budget)
					budget -= 4 * min_budget;
				else
					budget = min_budget;
			}
			break;
		case BFQQE_BUDGET_TIMEOUT:
			/*
			 * We double the budget here because it gives
			 * the chance to boost the throughput if this
			 * is not a seeky process (and has bumped into
			 * this timeout because of, e.g., ZBR).
			 */
			budget = min(budget * 2, bfqd->bfq_max_budget);
			break;
		case BFQQE_BUDGET_EXHAUSTED:
			/*
			 * The process still has backlog, and did not
			 * let either the budget timeout or the disk
			 * idling timeout expire. Hence it is not
			 * seeky, has a short thinktime and may be
			 * happy with a higher budget too. So
			 * definitely increase the budget of this good
			 * candidate to boost the disk throughput.
			 */
			budget = min(budget * 4, bfqd->bfq_max_budget);
			break;
		case BFQQE_NO_MORE_REQUESTS:
			/*
			 * For queues that expire for this reason, it
			 * is particularly important to keep the
			 * budget close to the actual service they
			 * need. Doing so reduces the timestamp
			 * misalignment problem described in the
			 * comments in the body of
			 * __bfq_activate_entity. In fact, suppose
			 * that a queue systematically expires for
			 * BFQQE_NO_MORE_REQUESTS and presents a
			 * new request in time to enjoy timestamp
			 * back-shifting. The larger the budget of the
			 * queue is with respect to the service the
			 * queue actually requests in each service
			 * slot, the more times the queue can be
			 * reactivated with the same virtual finish
			 * time. It follows that, even if this finish
			 * time is pushed to the system virtual time
			 * to reduce the consequent timestamp
			 * misalignment, the queue unjustly enjoys for
			 * many re-activations a lower finish time
			 * than all newly activated queues.
			 *
			 * The service needed by bfqq is measured
			 * quite precisely by bfqq->entity.service.
			 * Since bfqq does not enjoy device idling,
			 * bfqq->entity.service is equal to the number
			 * of sectors that the process associated with
			 * bfqq requested to read/write before waiting
			 * for request completions, or blocking for
			 * other reasons.
			 */
			budget = max_t(int, bfqq->entity.service, min_budget);
			break;
		default:
			return;
		}
	} else {
		/*
		 * Async queues get always the maximum possible
		 * budget, as for them we do not care about latency
		 * (in addition, their ability to dispatch is limited
		 * by the charging factor).
		 */
		budget = bfqd->bfq_max_budget;
	}

	bfqq->max_budget = budget;

	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
	    !bfqd->bfq_user_max_budget)
		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);

	/*
	 * If there is still backlog, then assign a new budget, making
	 * sure that it is large enough for the next request. Since
	 * the finish time of bfqq must be kept in sync with the
	 * budget, be sure to call __bfq_bfqq_expire() *after* this
	 * update.
	 *
	 * If there is no backlog, then no need to update the budget;
	 * it will be updated on the arrival of a new request.
	 */
	next_rq = bfqq->next_rq;
	if (next_rq)
		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
					    bfq_serv_to_charge(next_rq, bfqq));

	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
			next_rq ? blk_rq_sectors(next_rq) : 0,
			bfqq->entity.budget);
}
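/*
 * Illustrative example, not part of BFQ (numbers assumed): with
 * bfq_max_budget = 16384 sectors, min_budget = 256 sectors and a current
 * budget of 4096 sectors, the feedback above yields 8192 sectors on
 * BFQQE_BUDGET_TIMEOUT (doubling), 16384 sectors on
 * BFQQE_BUDGET_EXHAUSTED (quadrupling, capped at bfq_max_budget) and, on
 * BFQQE_TOO_IDLE with no outstanding requests, a cut to
 * 4096 - 4 * 256 = 3072 sectors.
 */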
/*
 * Return true if the process associated with bfqq is "slow". The slow
 * flag is used, in addition to the budget timeout, to reduce the
 * amount of service provided to seeky processes, and thus reduce
 * their chances to lower the throughput. More details in the comments
 * on the function bfq_bfqq_expire().
 *
 * An important observation is in order: as discussed in the comments
 * on the function bfq_update_peak_rate(), with devices with internal
 * queues, it is hard if ever possible to know when and for how long
 * an I/O request is processed by the device (apart from the trivial
 * I/O pattern where a new request is dispatched only after the
 * previous one has been completed). This makes it hard to evaluate
 * the real rate at which the I/O requests of each bfq_queue are
 * served. In fact, for an I/O scheduler like BFQ, serving a
 * bfq_queue means just dispatching its requests during its service
 * slot (i.e., until the budget of the queue is exhausted, or the
 * queue remains idle, or, finally, a timeout fires). But, during the
 * service slot of a bfq_queue, around 100 ms at most, the device may
 * be even still processing requests of bfq_queues served in previous
 * service slots. On the opposite end, the requests of the in-service
 * bfq_queue may be completed after the service slot of the queue
 * finishes.
 *
 * Anyway, unless more sophisticated solutions are used
 * (where possible), the sum of the sizes of the requests dispatched
 * during the service slot of a bfq_queue is probably the only
 * approximation available for the service received by the bfq_queue
 * during its service slot. And this sum is the quantity used in this
 * function to evaluate the I/O speed of a process.
 */
static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			     bool compensate, enum bfqq_expiration reason,
			     unsigned long *delta_ms)
{
	ktime_t delta_ktime;
	u32 delta_usecs;
	bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */

	if (!bfq_bfqq_sync(bfqq))
		return false;

	if (compensate)
		delta_ktime = bfqd->last_idling_start;
	else
		delta_ktime = ktime_get();
	delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
	delta_usecs = ktime_to_us(delta_ktime);

	/* don't use too short time intervals */
	if (delta_usecs < 1000) {
		if (blk_queue_nonrot(bfqd->queue))
			 /*
			  * give same worst-case guarantees as idling
			  * for seeky
			  */
			*delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
		else /* charge at least one seek */
			*delta_ms = bfq_slice_idle / NSEC_PER_MSEC;

		return slow;
	}

	*delta_ms = delta_usecs / USEC_PER_MSEC;

	/*
	 * Use only long (> 20ms) intervals to filter out excessive
	 * spikes in service rate estimation.
	 */
	if (delta_usecs > 20000) {
		/*
		 * Caveat for rotational devices: processes doing I/O
		 * in the slower disk zones tend to be slow(er) even
		 * if not seeky. In this respect, the estimated peak
		 * rate is likely to be an average over the disk
		 * surface. Accordingly, to not be too harsh with
		 * unlucky processes, a process is deemed slow only if
		 * its rate has been lower than half of the estimated
		 * peak rate.
		 */
		slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
	}

	bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);

	return slow;
}
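/*
 * Illustrative example, not part of BFQ (numbers assumed): with
 * bfq_max_budget = 16384 sectors, a sync queue observed over a
 * long-enough interval (> 20 ms) is flagged as slow only if it received
 * less than 16384 / 2 = 8192 sectors of service over that interval.
 */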
/*
 * Return the farthest past time instant according to jiffies
 * macros.
 */
static unsigned long bfq_smallest_from_now(void)
{
	return jiffies - MAX_JIFFY_OFFSET;
}
/**
 * bfq_bfqq_expire - expire a queue.
 * @bfqd: device owning the queue.
 * @bfqq: the queue to expire.
 * @compensate: if true, compensate for the time spent idling.
 * @reason: the reason causing the expiration.
 *
 * If the process associated with bfqq does slow I/O (e.g., because it
 * issues random requests), we charge bfqq with the time it has been
 * in service instead of the service it has received (see
 * bfq_bfqq_charge_time for details on how this goal is achieved). As
 * a consequence, bfqq will typically get higher timestamps upon
 * reactivation, and hence it will be rescheduled as if it had
 * received more service than what it has actually received. In the
 * end, bfqq receives less service in proportion to how slowly its
 * associated process consumes its budgets (and hence how seriously it
 * tends to lower the throughput). In addition, this time-charging
 * strategy guarantees time fairness among slow processes. In
 * contrast, if the process associated with bfqq is not slow, we
 * charge bfqq exactly with the service it has received.
 *
 * Charging time to the first type of queues and the exact service to
 * the other has the effect of using the WF2Q+ policy to schedule the
 * former on a timeslice basis, without violating service domain
 * guarantees among the latter.
 */
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason)
{
	bool slow;
	unsigned long delta = 0;
	struct bfq_entity *entity = &bfqq->entity;
	int ref;

	/*
	 * Check whether the process is slow (see bfq_bfqq_is_slow).
	 */
	slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);

	/*
	 * As above explained, charge slow (typically seeky) and
	 * timed-out queues with the time and not the service
	 * received, to favor sequential workloads.
	 *
	 * Processes doing I/O in the slower disk zones will tend to
	 * be slow(er) even if not seeky. Therefore, since the
	 * estimated peak rate is actually an average over the disk
	 * surface, these processes may timeout just for bad luck. To
	 * avoid punishing them, do not charge time to processes that
	 * succeeded in consuming at least 2/3 of their budget. This
	 * allows BFQ to preserve enough elasticity to still perform
	 * bandwidth, and not time, distribution with little unlucky
	 * or quasi-sequential processes.
	 */
	if (slow ||
	    (reason == BFQQE_BUDGET_TIMEOUT &&
	     bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))
		bfq_bfqq_charge_time(bfqd, bfqq, delta);

	if (reason == BFQQE_TOO_IDLE &&
	    entity->service <= 2 * entity->budget / 10)
		bfq_clear_bfqq_IO_bound(bfqq);

	bfq_log_bfqq(bfqd, bfqq,
		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));

	/*
	 * Increase, decrease or leave budget unchanged according to
	 * reason.
	 */
	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
	ref = bfqq->ref;
	__bfq_bfqq_expire(bfqd, bfqq);

	/* mark bfqq as waiting a request only if a bic still points to it */
	if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
	    reason != BFQQE_BUDGET_TIMEOUT &&
	    reason != BFQQE_BUDGET_EXHAUSTED)
		bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
}
/*
 * Budget timeout is not implemented through a dedicated timer, but
 * just checked on request arrivals and completions, as well as on
 * idle timer expirations.
 */
static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
{
	if (bfq_bfqq_budget_new(bfqq) ||
	    time_is_after_jiffies(bfqq->budget_timeout))
		return false;
	return true;
}
/*
 * If we expire a queue that is actively waiting (i.e., with the
 * device idled) for the arrival of a new request, then we may incur
 * the timestamp misalignment problem described in the body of the
 * function __bfq_activate_entity. Hence we return true only if this
 * condition does not hold, or if the queue is slow enough to deserve
 * only to be kicked off for preserving a high throughput.
 */
static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqq->bfqd, bfqq,
		"may_budget_timeout: wait_request %d left %d timeout %d",
			bfq_bfqq_wait_request(bfqq),
			bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
			bfq_bfqq_budget_timeout(bfqq));

	return (!bfq_bfqq_wait_request(bfqq) ||
		bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
		&&
		bfq_bfqq_budget_timeout(bfqq);
}
/*
 * For a queue that becomes empty, device idling is allowed only if
 * this function returns true for the queue. And this function returns
 * true only if idling is beneficial for throughput.
 */
static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	bool idling_boosts_thr;

	if (bfqd->strict_guarantees)
		return true;

	/*
	 * The value of the next variable is computed considering that
	 * idling is usually beneficial for the throughput if:
	 * (a) the device is not NCQ-capable, or
	 * (b) regardless of the presence of NCQ, the request pattern
	 *     for bfqq is I/O-bound (possible throughput losses
	 *     caused by granting idling to seeky queues are mitigated
	 *     by the fact that, in all scenarios where boosting
	 *     throughput is the best thing to do, i.e., in all
	 *     symmetric scenarios, only a minimal idle time is
	 *     allowed to seeky queues).
	 */
	idling_boosts_thr = !bfqd->hw_tag || bfq_bfqq_IO_bound(bfqq);

	/*
	 * We have now the components we need to compute the return
	 * value of the function, which is true only if both the
	 * following conditions hold:
	 * 1) bfqq is sync, because idling makes sense only for sync queues;
	 * 2) idling boosts the throughput.
	 */
	return bfq_bfqq_sync(bfqq) && idling_boosts_thr;
}
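/*
 * Illustrative sketch, not part of BFQ: the decision above boils down to
 * the boolean expression below; the helper and its parameter names are
 * made up for the example.
 */
#if 0
static bool may_idle(bool strict_guarantees, bool hw_tag, bool io_bound,
		     bool sync)
{
	if (strict_guarantees)
		return true;

	/* idling boosts throughput if there is no NCQ or bfqq is I/O-bound */
	return sync && (!hw_tag || io_bound);
}
#endif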
/*
 * If the in-service queue is empty but the function bfq_bfqq_may_idle
 * returns true, then:
 * 1) the queue must remain in service and cannot be expired, and
 * 2) the device must be idled to wait for the possible arrival of a new
 *    request for the queue.
 * See the comments on the function bfq_bfqq_may_idle for the reasons
 * why performing device idling is the best choice to boost the throughput
 * and preserve service guarantees when bfq_bfqq_may_idle itself
 * returns true.
 */
static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;

	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
	       bfq_bfqq_may_idle(bfqq);
}
/*
 * Select a queue for service. If we have a current queue in service,
 * check whether to continue servicing it, or retrieve and set a new one.
 */
static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq;
	struct request *next_rq;
	enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;

	bfqq = bfqd->in_service_queue;
	if (!bfqq)
		goto new_queue;

	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

	if (bfq_may_expire_for_budg_timeout(bfqq) &&
	    !bfq_bfqq_wait_request(bfqq) &&
	    !bfq_bfqq_must_idle(bfqq))
		goto expire;

check_queue:
	/*
	 * This loop is rarely executed more than once. Even when it
	 * happens, it is much more convenient to re-execute this loop
	 * than to return NULL and trigger a new dispatch to get a
	 * request served.
	 */
	next_rq = bfqq->next_rq;
	/*
	 * If bfqq has requests queued and it has enough budget left to
	 * serve them, keep the queue, otherwise expire it.
	 */
	if (next_rq) {
		if (bfq_serv_to_charge(next_rq, bfqq) >
			bfq_bfqq_budget_left(bfqq)) {
			/*
			 * Expire the queue for budget exhaustion,
			 * which makes sure that the next budget is
			 * enough to serve the next request, even if
			 * it comes from the fifo expired path.
			 */
			reason = BFQQE_BUDGET_EXHAUSTED;
			goto expire;
		} else {
			/*
			 * The idle timer may be pending because we may
			 * not disable disk idling even when a new request
			 * arrives.
			 */
			if (bfq_bfqq_wait_request(bfqq)) {
				/*
				 * If we get here: 1) at least a new request
				 * has arrived but we have not disabled the
				 * timer because the request was too small,
				 * 2) then the block layer has unplugged
				 * the device, causing the dispatch to be
				 * invoked.
				 *
				 * Since the device is unplugged, now the
				 * requests are probably large enough to
				 * provide a reasonable throughput.
				 * So we disable idling.
				 */
				bfq_clear_bfqq_wait_request(bfqq);
				hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
				bfqg_stats_update_idle_time(bfqq_group(bfqq));
			}
			goto keep_queue;
		}
	}

	/*
	 * No requests pending. However, if the in-service queue is idling
	 * for a new request, or has requests waiting for a completion and
	 * may idle after their completion, then keep it anyway.
	 */
	if (bfq_bfqq_wait_request(bfqq) ||
	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
		bfqq = NULL;
		goto keep_queue;
	}

	reason = BFQQE_NO_MORE_REQUESTS;
expire:
	bfq_bfqq_expire(bfqd, bfqq, false, reason);
new_queue:
	bfqq = bfq_set_in_service_queue(bfqd);
	if (bfqq) {
		bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
		goto check_queue;
	}
keep_queue:
	if (bfqq)
		bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
	else
		bfq_log(bfqd, "select_queue: no queue returned");

	return bfqq;
}
/*
 * Dispatch next request from bfqq.
 */
static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
						 struct bfq_queue *bfqq)
{
	struct request *rq = bfqq->next_rq;
	unsigned long service_to_charge;

	service_to_charge = bfq_serv_to_charge(rq, bfqq);

	bfq_bfqq_served(bfqq, service_to_charge);

	bfq_dispatch_remove(bfqd->queue, rq);

	if (!bfqd->in_service_bic) {
		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
		bfqd->in_service_bic = RQ_BIC(rq);
	}

	/*
	 * Expire bfqq, pretending that its budget expired, if bfqq
	 * belongs to CLASS_IDLE and other queues are waiting for
	 * service.
	 */
	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
		bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);

	return rq;
}
static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;

	/*
	 * Avoiding lock: a race on bfqd->busy_queues should cause at
	 * most a call to dispatch for nothing
	 */
	return !list_empty_careful(&bfqd->dispatch) ||
		bfqd->busy_queues > 0;
}
static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq = NULL;
	struct bfq_queue *bfqq = NULL;

	if (!list_empty(&bfqd->dispatch)) {
		rq = list_first_entry(&bfqd->dispatch, struct request,
				      queuelist);
		list_del_init(&rq->queuelist);

		bfqq = RQ_BFQQ(rq);

		if (bfqq) {
			/*
			 * Increment counters here, because this
			 * dispatch does not follow the standard
			 * dispatch flow (where counters are
			 * incremented)
			 */
			bfqq->dispatched++;

			goto inc_in_driver_start_rq;
		}

		/*
		 * We exploit the put_rq_private hook to decrement
		 * rq_in_driver, but put_rq_private will not be
		 * invoked on this request. So, to avoid unbalance,
		 * just start this request, without incrementing
		 * rq_in_driver. As a negative consequence,
		 * rq_in_driver is deceptively lower than it should be
		 * while this request is in service. This may cause
		 * bfq_schedule_dispatch to be invoked uselessly.
		 *
		 * As for implementing an exact solution, the
		 * put_request hook, if defined, is probably invoked
		 * also on this request. So, by exploiting this hook,
		 * we could 1) increment rq_in_driver here, and 2)
		 * decrement it in put_request. Such a solution would
		 * let the value of the counter be always accurate,
		 * but it would entail using an extra interface
		 * function. This cost seems higher than the benefit,
		 * being the frequency of non-elevator-private
		 * requests very low.
		 */
		goto start_rq;
	}

	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);

	if (bfqd->busy_queues == 0)
		goto exit;

	/*
	 * Force device to serve one request at a time if
	 * strict_guarantees is true. Forcing this service scheme is
	 * currently the ONLY way to guarantee that the request
	 * service order enforced by the scheduler is respected by a
	 * queueing device. Otherwise the device is free even to make
	 * some unlucky request wait for as long as the device
	 * wishes.
	 *
	 * Of course, serving one request at a time may cause loss of
	 * throughput.
	 */
	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
		goto exit;

	bfqq = bfq_select_queue(bfqd);
	if (!bfqq)
		goto exit;

	rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);

	if (rq) {
inc_in_driver_start_rq:
		bfqd->rq_in_driver++;
start_rq:
		rq->rq_flags |= RQF_STARTED;
	}
exit:
	return rq;
}
static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq;

	spin_lock_irq(&bfqd->lock);
	rq = __bfq_dispatch_request(hctx);
	spin_unlock_irq(&bfqd->lock);

	return rq;
}
/*
 * Task holds one reference to the queue, dropped when task exits. Each rq
 * in-flight on this queue also holds a reference, dropped when rq is freed.
 *
 * Scheduler lock must be held here. Recall not to use bfqq after calling
 * this function on it.
 */
static void bfq_put_queue(struct bfq_queue *bfqq)
{
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_group *bfqg = bfqq_group(bfqq);
#endif

	if (bfqq->bfqd)
		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
			     bfqq, bfqq->ref);

	bfqq->ref--;
	if (bfqq->ref)
		return;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);

	kmem_cache_free(bfq_pool, bfqq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	bfqg_put(bfqg);
#endif
}
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	if (bfqq == bfqd->in_service_queue) {
		__bfq_bfqq_expire(bfqd, bfqq);
		bfq_schedule_dispatch(bfqd);
	}

	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);

	bfq_put_queue(bfqq); /* release process reference */
}
static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
	struct bfq_data *bfqd;

	if (bfqq)
		bfqd = bfqq->bfqd; /* NULL if scheduler already exited */

	if (bfqq && bfqd) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);
		bfq_exit_bfqq(bfqd, bfqq);
		bic_set_bfqq(bic, NULL, is_sync);
		spin_unlock_irqrestore(&bfqd->lock, flags);
	}
}
static void bfq_exit_icq(struct io_cq *icq)
{
	struct bfq_io_cq *bic = icq_to_bic(icq);

	bfq_exit_icq_bfqq(bic, true);
	bfq_exit_icq_bfqq(bic, false);
}
/*
 * Update the entity prio values; note that the new values will not
 * be used until the next (re)activation.
 */
static void
bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
{
	struct task_struct *tsk = current;
	int ioprio_class;
	struct bfq_data *bfqd = bfqq->bfqd;

	if (!bfqd)
		return;

	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	switch (ioprio_class) {
	default:
		dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
			"bfq: bad prio class %d\n", ioprio_class);
	case IOPRIO_CLASS_NONE:
		/*
		 * No prio set, inherit CPU scheduling settings.
		 */
		bfqq->new_ioprio = task_nice_ioprio(tsk);
		bfqq->new_ioprio_class = task_nice_ioclass(tsk);
		break;
	case IOPRIO_CLASS_RT:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
		break;
	case IOPRIO_CLASS_BE:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
		break;
	case IOPRIO_CLASS_IDLE:
		bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
		bfqq->new_ioprio = 7;
		bfq_clear_bfqq_idle_window(bfqq);
		break;
	}

	if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
		pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
			bfqq->new_ioprio);
		bfqq->new_ioprio = IOPRIO_BE_NR - 1;
	}

	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
	bfqq->entity.prio_changed = 1;
}
static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
{
	struct bfq_data *bfqd = bic_to_bfqd(bic);
	struct bfq_queue *bfqq;
	int ioprio = bic->icq.ioc->ioprio;

	/*
	 * This condition may trigger on a newly created bic, be sure to
	 * drop the lock before returning.
	 */
	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
		return;

	bic->ioprio = ioprio;

	bfqq = bic_to_bfqq(bic, false);
	if (bfqq) {
		/* release process reference on this queue */
		bfq_put_queue(bfqq);
		bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
		bic_set_bfqq(bic, bfqq, false);
	}

	bfqq = bic_to_bfqq(bic, true);
	if (bfqq)
		bfq_set_next_ioprio_data(bfqq, bic);
}
static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_io_cq *bic, pid_t pid, int is_sync)
{
	RB_CLEAR_NODE(&bfqq->entity.rb_node);
	INIT_LIST_HEAD(&bfqq->fifo);

	if (bic)
		bfq_set_next_ioprio_data(bfqq, bic);

	if (is_sync) {
		if (!bfq_class_idle(bfqq))
			bfq_mark_bfqq_idle_window(bfqq);
		bfq_mark_bfqq_sync(bfqq);
	} else
		bfq_clear_bfqq_sync(bfqq);

	/* set end request to minus infinity from now */
	bfqq->ttime.last_end_request = ktime_get_ns() + 1;

	bfq_mark_bfqq_IO_bound(bfqq);

	/* Tentative initial value to trade off between thr and lat */
	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
	bfqq->budget_timeout = bfq_smallest_from_now();

	/* first request is almost certainly seeky */
	bfqq->seek_history = 1;
}
static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
					       struct bfq_group *bfqg,
					       int ioprio_class, int ioprio)
{
	switch (ioprio_class) {
	case IOPRIO_CLASS_RT:
		return &bfqg->async_bfqq[0][ioprio];
	case IOPRIO_CLASS_NONE:
		ioprio = IOPRIO_NORM;
		/* fall through */
	case IOPRIO_CLASS_BE:
		return &bfqg->async_bfqq[1][ioprio];
	case IOPRIO_CLASS_IDLE:
		return &bfqg->async_idle_bfqq;
	default:
		return NULL;
	}
}
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic)
{
	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	struct bfq_queue **async_bfqq = NULL;
	struct bfq_queue *bfqq;
	struct bfq_group *bfqg;

	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
	if (!bfqg) {
		bfqq = &bfqd->oom_bfqq;
		goto out;
	}

	if (!is_sync) {
		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
						  ioprio);
		bfqq = *async_bfqq;
		if (bfqq)
			goto out;
	}

	bfqq = kmem_cache_alloc_node(bfq_pool,
				     GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
				     bfqd->queue->node);

	if (bfqq) {
		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
			      is_sync);
		bfq_init_entity(&bfqq->entity, bfqg);
		bfq_log_bfqq(bfqd, bfqq, "allocated");
	} else {
		bfqq = &bfqd->oom_bfqq;
		bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
		goto out;
	}

	/*
	 * Pin the queue now that it's allocated, scheduler exit will
	 * prune it.
	 */
	if (async_bfqq) {
		bfqq->ref++; /*
			      * Extra group reference, w.r.t. sync
			      * queue. This extra reference is removed
			      * only if bfqq->bfqg disappears, to
			      * guarantee that this queue is not freed
			      * until its group goes away.
			      */
		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
			     bfqq, bfqq->ref);
		*async_bfqq = bfqq;
	}

out:
	bfqq->ref++; /* get a process reference to this queue */
	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
	return bfqq;
}
static void bfq_update_io_thinktime(struct bfq_data *bfqd,
				    struct bfq_queue *bfqq)
{
	struct bfq_ttime *ttime = &bfqq->ttime;
	u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;

	elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);

	ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
	ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
				     ttime->ttime_samples);
}
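/*
 * Illustrative example, not part of BFQ: the updates above keep
 * fixed-point exponential averages with weight 7/8 for history and 1/8
 * for the new sample (ttime_samples saturates at 256), and ttime_mean is
 * then total/samples. Feeding a constant think time of, say, 2 ms makes
 * the mean converge geometrically to 2 ms, reducing the residual error
 * by 1/8 per sample.
 */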
static void
bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		       struct request *rq)
{
	bfqq->seek_history <<= 1;
	bfqq->seek_history |=
		get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
		(!blk_queue_nonrot(bfqd->queue) ||
		 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
}
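/*
 * Illustrative sketch, not part of BFQ: seek_history is a small shift
 * register of per-request "seeky" bits, and BFQQ_SEEKY() then checks how
 * many recent bits are set. The helper below shows the same shift-and-or
 * update on a plain integer; its names are made up for the example.
 */
#if 0
static unsigned int push_seek_sample(unsigned int history, bool seeky)
{
	return (history << 1) | (seeky ? 1 : 0);
}
#endif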
/*
 * Disable idle window if the process thinks too long or seeks so much that
 * it doesn't matter.
 */
static void bfq_update_idle_window(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq,
				   struct bfq_io_cq *bic)
{
	int enable_idle;

	/* Don't idle for async or idle io prio class. */
	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
		return;

	enable_idle = bfq_bfqq_idle_window(bfqq);

	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
	    bfqd->bfq_slice_idle == 0 ||
	    (bfqd->hw_tag && BFQQ_SEEKY(bfqq)))
		enable_idle = 0;
	else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
		if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)
			enable_idle = 0;
		else
			enable_idle = 1;
	}
	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
		enable_idle);

	if (enable_idle)
		bfq_mark_bfqq_idle_window(bfqq);
	else
		bfq_clear_bfqq_idle_window(bfqq);
}
/*
 * Called when a new fs request (rq) is added to bfqq. Check if there's
 * something we should do about it.
 */
static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			    struct request *rq)
{
	struct bfq_io_cq *bic = RQ_BIC(rq);

	if (rq->cmd_flags & REQ_META)
		bfqq->meta_pending++;

	bfq_update_io_thinktime(bfqd, bfqq);
	bfq_update_io_seektime(bfqd, bfqq, rq);
	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
	    !BFQQ_SEEKY(bfqq))
		bfq_update_idle_window(bfqd, bfqq, bic);

	bfq_log_bfqq(bfqd, bfqq,
		     "rq_enqueued: idle_window=%d (seeky %d)",
		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));

	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
		bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
				 blk_rq_sectors(rq) < 32;
		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);

		/*
		 * There is just this request queued: if the request
		 * is small and the queue is not to be expired, then
		 * just exit.
		 *
		 * In this way, if the device is being idled to wait
		 * for a new request from the in-service queue, we
		 * avoid unplugging the device and committing the
		 * device to serve just a small request. On the
		 * contrary, we wait for the block layer to decide
		 * when to unplug the device: hopefully, new requests
		 * will be merged to this one quickly, then the device
		 * will be unplugged and larger requests will be
		 * dispatched.
		 */
		if (small_req && !budget_timeout)
			return;

		/*
		 * A large enough request arrived, or the queue is to
		 * be expired: in both cases disk idling is to be
		 * stopped, so clear wait_request flag and reset
		 * timer.
		 */
		bfq_clear_bfqq_wait_request(bfqq);
		hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
		bfqg_stats_update_idle_time(bfqq_group(bfqq));

		/*
		 * The queue is not empty, because a new request just
		 * arrived. Hence we can safely expire the queue, in
		 * case of budget timeout, without risking that the
		 * timestamps of the queue are not updated correctly.
		 * See [1] for more details.
		 */
		if (budget_timeout)
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_BUDGET_TIMEOUT);
	}
}
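/*
 * Illustrative example, not part of BFQ: "small" above means that the
 * just-arrived request is the only one queued and spans fewer than 32
 * sectors, i.e., less than 16 KiB with 512-byte sectors. For such a
 * request the function returns early and keeps idling, instead of
 * unplugging the device, unless the budget timeout has expired.
 */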
static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);

	bfq_add_request(rq);

	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
	list_add_tail(&rq->queuelist, &bfqq->fifo);

	bfq_rq_enqueued(bfqd, bfqq, rq);
}
static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
			       bool at_head)
{
	struct request_queue *q = hctx->queue;
	struct bfq_data *bfqd = q->elevator->elevator_data;

	spin_lock_irq(&bfqd->lock);
	if (blk_mq_sched_try_insert_merge(q, rq)) {
		spin_unlock_irq(&bfqd->lock);
		return;
	}

	spin_unlock_irq(&bfqd->lock);

	blk_mq_sched_request_inserted(rq);

	spin_lock_irq(&bfqd->lock);
	if (at_head || blk_rq_is_passthrough(rq)) {
		if (at_head)
			list_add(&rq->queuelist, &bfqd->dispatch);
		else
			list_add_tail(&rq->queuelist, &bfqd->dispatch);
	} else {
		__bfq_insert_request(bfqd, rq);

		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}
	}

	spin_unlock_irq(&bfqd->lock);
}
static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
				struct list_head *list, bool at_head)
{
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		bfq_insert_request(hctx, rq, at_head);
	}
}
static void bfq_update_hw_tag(struct bfq_data *bfqd)
{
	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
				       bfqd->rq_in_driver);

	if (bfqd->hw_tag == 1)
		return;

	/*
	 * This sample is valid if the number of outstanding requests
	 * is large enough to allow a queueing behavior. Note that the
	 * sum is not exact, as it's not taking into account deactivated
	 * requests.
	 */
	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
		return;

	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
		return;

	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
	bfqd->max_rq_in_driver = 0;
	bfqd->hw_tag_samples = 0;
}
static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
{
	u64 now_ns;
	u32 delta_us;

	bfq_update_hw_tag(bfqd);

	bfqd->rq_in_driver--;
	bfqq->dispatched--;

	now_ns = ktime_get_ns();

	bfqq->ttime.last_end_request = now_ns;

	/*
	 * Using us instead of ns, to get a reasonable precision in
	 * computing rate in next check.
	 */
	delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);

	/*
	 * If the request took rather long to complete, and, according
	 * to the maximum request size recorded, this completion latency
	 * implies that the request was certainly served at a very low
	 * rate (less than 1M sectors/sec), then the whole observation
	 * interval that lasts up to this time instant cannot be a
	 * valid time interval for computing a new peak rate. Invoke
	 * bfq_update_rate_reset to have the following three steps
	 * taken:
	 * - close the observation interval at the last (previous)
	 *   request dispatch or completion
	 * - compute rate, if possible, for that observation interval
	 * - reset to zero samples, which will trigger a proper
	 *   re-initialization of the observation interval on next
	 *   dispatch
	 */
	if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
	   (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
			1UL<<(BFQ_RATE_SHIFT - 10))
		bfq_update_rate_reset(bfqd, NULL);
	bfqd->last_completion = now_ns;

	/*
	 * If this is the in-service queue, check if it needs to be expired,
	 * or if we want to idle in case it has no pending requests.
	 */
	if (bfqd->in_service_queue == bfqq) {
		if (bfq_bfqq_budget_new(bfqq))
			bfq_set_budget_timeout(bfqd);

		if (bfq_bfqq_must_idle(bfqq)) {
			bfq_arm_slice_timer(bfqd);
			return;
		} else if (bfq_may_expire_for_budg_timeout(bfqq))
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_BUDGET_TIMEOUT);
		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
			 (bfqq->dispatched == 0 ||
			  !bfq_bfqq_may_idle(bfqq)))
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_NO_MORE_REQUESTS);
	}
}

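/*
 * Release of the scheduler-private data attached to a request:
 * bfq_put_rq_priv_body() drops the queue reference taken at request
 * allocation, while bfq_put_rq_private() below also performs the
 * completion accounting for requests that actually reached the
 * driver.
 */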
static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
{
	bfq_put_queue(bfqq);
}

static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;

	if (rq->rq_flags & RQF_STARTED)
		bfqg_stats_update_completion(bfqq_group(bfqq),
					     rq_start_time_ns(rq),
					     rq_io_start_time_ns(rq),
					     rq->cmd_flags);

	if (likely(rq->rq_flags & RQF_STARTED)) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);

		bfq_completed_request(bfqq, bfqd);
		bfq_put_rq_priv_body(bfqq);

		spin_unlock_irqrestore(&bfqd->lock, flags);
	} else {
		/*
		 * Request rq may be still/already in the scheduler,
		 * in which case we need to remove it. And we cannot
		 * defer such a check and removal, to avoid
		 * inconsistencies in the time interval from the end
		 * of this function to the start of the deferred work.
		 * This situation seems to occur only in process
		 * context, as a consequence of a merge. In the
		 * current version of the code, this implies that the
		 * lock is held.
		 */

		if (!RB_EMPTY_NODE(&rq->rb_node))
			bfq_remove_request(q, rq);
		bfq_put_rq_priv_body(bfqq);
	}

	rq->elv.priv[0] = NULL;
	rq->elv.priv[1] = NULL;
}

/*
 * Allocate bfq data structures associated with this request.
 */
static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
			      struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
	const int is_sync = rq_is_sync(rq);
	struct bfq_queue *bfqq;

	spin_lock_irq(&bfqd->lock);

	bfq_check_ioprio_change(bic, bio);

	if (!bic)
		goto queue_fail;

	bfq_bic_update_cgroup(bic, bio);

	bfqq = bic_to_bfqq(bic, is_sync);
	if (!bfqq || bfqq == &bfqd->oom_bfqq) {
		if (bfqq)
			bfq_put_queue(bfqq);
		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
		bic_set_bfqq(bic, bfqq, is_sync);
	}

	bfqq->ref++;

	bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
		     rq, bfqq, bfqq->ref);

	rq->elv.priv[0] = bic;
	rq->elv.priv[1] = bfqq;

	spin_unlock_irq(&bfqd->lock);

	return 0;

queue_fail:
	spin_unlock_irq(&bfqd->lock);

	return 1;
}

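/*
 * Body of the idling timer handler: runs once the device has been kept
 * idle for the whole slice_idle period without the in-service queue
 * issuing a new request. It picks the proper expiration reason,
 * expires the queue and then kicks a new dispatch round.
 */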
static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	enum bfqq_expiration reason;
	unsigned long flags;

	spin_lock_irqsave(&bfqd->lock, flags);
	bfq_clear_bfqq_wait_request(bfqq);

	if (bfqq != bfqd->in_service_queue) {
		spin_unlock_irqrestore(&bfqd->lock, flags);
		return;
	}

	if (bfq_bfqq_budget_timeout(bfqq))
		/*
		 * Also here the queue can be safely expired
		 * for budget timeout without wasting
		 * guarantees
		 */
		reason = BFQQE_BUDGET_TIMEOUT;
	else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
		/*
		 * The queue may not be empty upon timer expiration,
		 * because we may not disable the timer when the
		 * first request of the in-service queue arrives
		 * during disk idling.
		 */
		reason = BFQQE_TOO_IDLE;
	else
		goto schedule_dispatch;

	bfq_bfqq_expire(bfqd, bfqq, true, reason);

schedule_dispatch:
	spin_unlock_irqrestore(&bfqd->lock, flags);
	bfq_schedule_dispatch(bfqd);
}

/*
 * Handler of the expiration of the timer running if the in-service queue
 * is idling inside its time slice.
 */
static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
{
	struct bfq_data *bfqd = container_of(timer, struct bfq_data,
					     idle_slice_timer);
	struct bfq_queue *bfqq = bfqd->in_service_queue;

	/*
	 * Theoretical race here: the in-service queue can be NULL or
	 * different from the queue that was idling if a new request
	 * arrives for the current queue and there is a full dispatch
	 * cycle that changes the in-service queue. This can hardly
	 * happen, but in the worst case we just expire a queue too
	 * early.
	 */
	if (bfqq)
		bfq_idle_slice_timer_body(bfqq);

	return HRTIMER_NORESTART;
}

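/*
 * Drop one async queue pointed to by *bfqq_ptr: the queue is first
 * reparented to the root group, so that any requests it still holds
 * keep being served, and then its reference is released.
 */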
static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
				 struct bfq_queue **bfqq_ptr)
{
	struct bfq_queue *bfqq = *bfqq_ptr;

	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
	if (bfqq) {
		bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);

		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
			     bfqq, bfqq->ref);
		bfq_put_queue(bfqq);
		*bfqq_ptr = NULL;
	}
}

/*
 * Release all the bfqg references to its async queues. If we are
 * deallocating the group these queues may still contain requests, so
 * we reparent them to the root cgroup (i.e., the only one that will
 * exist for sure until all the requests on a device are gone).
 */
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
{
	int i, j;

	for (i = 0; i < 2; i++)
		for (j = 0; j < IOPRIO_BE_NR; j++)
			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);

	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
}

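/*
 * Elevator teardown: cancel the idling timer, deactivate any queues
 * still on the idle list and, depending on whether group scheduling is
 * compiled in, either let the blkcg policy tear the groups down or
 * release the async queues and the root group directly.
 */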
static void bfq_exit_queue(struct elevator_queue *e)
{
	struct bfq_data *bfqd = e->elevator_data;
	struct bfq_queue *bfqq, *n;

	hrtimer_cancel(&bfqd->idle_slice_timer);

	spin_lock_irq(&bfqd->lock);
	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
	spin_unlock_irq(&bfqd->lock);

	hrtimer_cancel(&bfqd->idle_slice_timer);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
#else
	spin_lock_irq(&bfqd->lock);
	bfq_put_async_queues(bfqd, bfqd->root_group);
	kfree(bfqd->root_group);
	spin_unlock_irq(&bfqd->lock);
#endif

	kfree(bfqd);
}

static void bfq_init_root_group(struct bfq_group *root_group,
				struct bfq_data *bfqd)
{
	int i;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	root_group->entity.parent = NULL;
	root_group->my_entity = NULL;
	root_group->bfqd = bfqd;
#endif
	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
	root_group->sched_data.bfq_class_idle_last_service = jiffies;
}

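/*
 * Elevator init: allocate the per-device bfq_data, set up the
 * permanently referenced oom_bfqq fallback queue, initialize timers,
 * lists and tunables with their defaults, and create the group
 * hierarchy last (see the comment before bfq_create_group_hierarchy
 * inside the function for why the ordering matters).
 */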
static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
	struct bfq_data *bfqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
	if (!bfqd) {
		kobject_put(&eq->kobj);
		return -ENOMEM;
	}
	eq->elevator_data = bfqd;

	spin_lock_irq(q->queue_lock);
	q->elevator = eq;
	spin_unlock_irq(q->queue_lock);

	/*
	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
	 * Grab a permanent reference to it, so that the normal code flow
	 * will not attempt to free it.
	 */
	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
	bfqd->oom_bfqq.ref++;
	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
	bfqd->oom_bfqq.entity.new_weight =
		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
	/*
	 * Trigger weight initialization, according to ioprio, at the
	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
	 * class won't be changed any more.
	 */
	bfqd->oom_bfqq.entity.prio_changed = 1;

	bfqd->queue = q;

	INIT_LIST_HEAD(&bfqd->dispatch);

	hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL);
	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;

	INIT_LIST_HEAD(&bfqd->active_list);
	INIT_LIST_HEAD(&bfqd->idle_list);

	bfqd->bfq_max_budget = bfq_default_max_budget;

	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
	bfqd->bfq_back_max = bfq_back_max;
	bfqd->bfq_back_penalty = bfq_back_penalty;
	bfqd->bfq_slice_idle = bfq_slice_idle;
	bfqd->bfq_timeout = bfq_timeout;

	bfqd->bfq_requests_within_timer = 120;

	spin_lock_init(&bfqd->lock);

	/*
	 * The invocation of the next bfq_create_group_hierarchy
	 * function is the head of a chain of function calls
	 * (bfq_create_group_hierarchy->blkcg_activate_policy->
	 * blk_mq_freeze_queue) that may lead to the invocation of the
	 * has_work hook function. For this reason,
	 * bfq_create_group_hierarchy is invoked only after all
	 * scheduler data has been initialized, apart from the fields
	 * that can be initialized only after invoking
	 * bfq_create_group_hierarchy. This, in particular, enables
	 * has_work to correctly return false. Of course, to avoid
	 * other inconsistencies, the blk-mq stack must then refrain
	 * from invoking further scheduler hooks before this init
	 * function is finished.
	 */
	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
	if (!bfqd->root_group)
		goto out_free;
	bfq_init_root_group(bfqd->root_group, bfqd);
	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);

	return 0;

out_free:
	kfree(bfqd);
	kobject_put(&eq->kobj);
	return -ENOMEM;
}

static void bfq_slab_kill(void)
{
	kmem_cache_destroy(bfq_pool);
}

static int __init bfq_slab_setup(void)
{
	bfq_pool = KMEM_CACHE(bfq_queue, 0);
	if (!bfq_pool)
		return -ENOMEM;
	return 0;
}

static ssize_t bfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%u\n", var);
}

static ssize_t bfq_var_store(unsigned long *var, const char *page,
			     size_t count)
{
	unsigned long new_val;
	int ret = kstrtoul(page, 10, &new_val);

	if (ret == 0)
		*var = new_val;

	return count;
}

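/*
 * sysfs plumbing for the tunables below. The __CONV argument selects
 * the unit conversion applied when showing or storing a value: 0
 * leaves it untouched, 1 converts between jiffies and milliseconds,
 * 2 between nanoseconds and milliseconds.
 */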
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	if (__CONV == 1)						\
		__data = jiffies_to_msecs(__data);			\
	else if (__CONV == 2)						\
		__data = div_u64(__data, NSEC_PER_MSEC);		\
	return bfq_var_show(__data, (page));				\
}
SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
#undef SHOW_FUNCTION

#define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	__data = div_u64(__data, NSEC_PER_USEC);			\
	return bfq_var_show(__data, (page));				\
}
USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
#undef USEC_SHOW_FUNCTION

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
static ssize_t								\
__FUNC(struct elevator_queue *e, const char *page, size_t count)	\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned long uninitialized_var(__data);			\
	int ret = bfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	if (__CONV == 1)						\
		*(__PTR) = msecs_to_jiffies(__data);			\
	else if (__CONV == 2)						\
		*(__PTR) = (u64)__data * NSEC_PER_MSEC;			\
	else								\
		*(__PTR) = __data;					\
	return ret;							\
}
STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
		INT_MAX, 2);
STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
		INT_MAX, 2);
STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
		INT_MAX, 0);
STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
#undef STORE_FUNCTION

#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)			\
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned long uninitialized_var(__data);			\
	int ret = bfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	*(__PTR) = (u64)__data * NSEC_PER_USEC;				\
	return ret;							\
}
USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
		    UINT_MAX);
#undef USEC_STORE_FUNCTION

static ssize_t bfq_max_budget_store(struct elevator_queue *e,
				    const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data == 0)
		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
	else {
		if (__data > INT_MAX)
			__data = INT_MAX;
		bfqd->bfq_max_budget = __data;
	}

	bfqd->bfq_user_max_budget = __data;

	return ret;
}

/*
 * Leaving this name to preserve name compatibility with cfq
 * parameters, but this timeout is used for both sync and async.
 */
static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
				      const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data < 1)
		__data = 1;
	else if (__data > INT_MAX)
		__data = INT_MAX;

	bfqd->bfq_timeout = msecs_to_jiffies(__data);
	if (bfqd->bfq_user_max_budget == 0)
		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);

	return ret;
}

static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
					   const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data > 1)
		__data = 1;
	if (!bfqd->strict_guarantees && __data == 1
	    && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
		bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;

	bfqd->strict_guarantees = __data;

	return ret;
}

#define BFQ_ATTR(name) \
	__ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)

static struct elv_fs_entry bfq_attrs[] = {
	BFQ_ATTR(fifo_expire_sync),
	BFQ_ATTR(fifo_expire_async),
	BFQ_ATTR(back_seek_max),
	BFQ_ATTR(back_seek_penalty),
	BFQ_ATTR(slice_idle),
	BFQ_ATTR(slice_idle_us),
	BFQ_ATTR(max_budget),
	BFQ_ATTR(timeout_sync),
	BFQ_ATTR(strict_guarantees),
	__ATTR_NULL
};

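/*
 * The attributes above are exposed under /sys/block/<dev>/queue/iosched/
 * once bfq is the active scheduler for <dev>. A typical tuning session
 * (the device name below is just an example) looks like:
 *
 *   echo bfq > /sys/block/sda/queue/scheduler
 *   cat /sys/block/sda/queue/iosched/slice_idle       # value in ms
 *   echo 0 > /sys/block/sda/queue/iosched/slice_idle  # disable idling
 */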
static struct elevator_type iosched_bfq_mq = {
	.ops.mq = {
		.get_rq_priv		= bfq_get_rq_private,
		.put_rq_priv		= bfq_put_rq_private,
		.exit_icq		= bfq_exit_icq,
		.insert_requests	= bfq_insert_requests,
		.dispatch_request	= bfq_dispatch_request,
		.next_request		= elv_rb_latter_request,
		.former_request		= elv_rb_former_request,
		.allow_merge		= bfq_allow_bio_merge,
		.bio_merge		= bfq_bio_merge,
		.request_merge		= bfq_request_merge,
		.requests_merged	= bfq_requests_merged,
		.request_merged		= bfq_request_merged,
		.has_work		= bfq_has_work,
		.init_sched		= bfq_init_queue,
		.exit_sched		= bfq_exit_queue,
	},

	.uses_mq		= true,
	.icq_size		= sizeof(struct bfq_io_cq),
	.icq_align		= __alignof__(struct bfq_io_cq),
	.elevator_attrs		= bfq_attrs,
	.elevator_name		= "bfq",
	.elevator_owner		= THIS_MODULE,
};

#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct blkcg_policy blkcg_policy_bfq = {
	.dfl_cftypes		= bfq_blkg_files,
	.legacy_cftypes		= bfq_blkcg_legacy_files,

	.cpd_alloc_fn		= bfq_cpd_alloc,
	.cpd_init_fn		= bfq_cpd_init,
	.cpd_bind_fn		= bfq_cpd_init,
	.cpd_free_fn		= bfq_cpd_free,

	.pd_alloc_fn		= bfq_pd_alloc,
	.pd_init_fn		= bfq_pd_init,
	.pd_offline_fn		= bfq_pd_offline,
	.pd_free_fn		= bfq_pd_free,
	.pd_reset_stats_fn	= bfq_pd_reset_stats,
};
#endif

static int __init bfq_init(void)
{
	int ret;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	ret = blkcg_policy_register(&blkcg_policy_bfq);
	if (ret)
		return ret;
#endif

	ret = -ENOMEM;
	if (bfq_slab_setup())
		goto err_pol_unreg;

	ret = elv_register(&iosched_bfq_mq);
	if (ret)
		goto err_pol_unreg;

	return 0;

err_pol_unreg:
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	return ret;
}

static void __exit bfq_exit(void)
{
	elv_unregister(&iosched_bfq_mq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	bfq_slab_kill();
}

module_init(bfq_init);
module_exit(bfq_exit);

MODULE_AUTHOR("Paolo Valente");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");