/*
 * Budget Fair Queueing (BFQ) I/O scheduler.
 *
 * Based on ideas and code from CFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini@google.com>
 *
 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License as
 *  published by the Free Software Foundation; either version 2 of the
 *  License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 * BFQ is a proportional-share I/O scheduler, with some extra
 * low-latency capabilities. BFQ also supports full hierarchical
 * scheduling through cgroups. The next paragraphs provide an
 * introduction to BFQ's inner workings. Details on BFQ's benefits,
 * usage and limitations can be found in
 * Documentation/block/bfq-iosched.txt.
 *
 * BFQ is a proportional-share storage-I/O scheduling algorithm based
 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
 * budgets, measured in number of sectors, to processes instead of
 * time slices. The device is not granted to the in-service process
 * for a given time slice, but until it has exhausted its assigned
 * budget. This change from the time to the service domain enables BFQ
 * to distribute the device throughput among processes as desired,
 * without any distortion due to throughput fluctuations, or to device
 * internal queueing. BFQ uses an ad hoc internal scheduler, called
 * B-WF2Q+, to schedule processes according to their budgets. More
 * precisely, BFQ schedules queues associated with processes. Each
 * process/queue is assigned a user-configurable weight, and B-WF2Q+
 * guarantees that each queue receives a fraction of the throughput
 * proportional to its weight. Thanks to the accurate policy of
 * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
 * processes issuing sequential requests (to boost the throughput),
 * and yet guarantee a low latency to interactive and soft real-time
 * applications.
 *
 * In particular, to provide these low-latency guarantees, BFQ
 * explicitly privileges the I/O of two classes of time-sensitive
 * applications: interactive and soft real-time. This feature enables
 * BFQ to provide applications in these classes with a very low
 * latency. Finally, BFQ also features additional heuristics for
 * preserving both a low latency and a high throughput on NCQ-capable,
 * rotational or flash-based devices, and for getting the job done
 * quickly for applications consisting of many I/O-bound processes.
 *
 * BFQ is described in [1], where also a reference to the initial,
 * more theoretical paper on BFQ can be found. The interested reader
 * can find in the latter paper full details on the main algorithm, as
 * well as formulas of the guarantees and formal proofs of all the
 * properties. With respect to the version of BFQ presented in these
 * papers, this implementation adds a few more heuristics, such as the
 * one that guarantees a low latency to soft real-time applications,
 * and a hierarchical extension based on H-WF2Q+.
 *
 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
 * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
 * with O(log N) complexity derives from the one introduced with EEVDF
 * in [3].
 *
 * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
 *     Scheduler", Proceedings of the First Workshop on Mobile System
 *     Technologies (MST-2015), May 2015.
 *     http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
 *
 * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
 *     Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
 *     Oct 1997.
 *     http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
 *
 * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
 *     First: A Flexible and Accurate Mechanism for Proportional Share
 *     Resource Allocation", technical report.
 *     http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
 */
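
/*
 * Illustrative example of the weight-proportional sharing described
 * above: under continuous backlog, two queues with weights 100 and
 * 300 are granted budgets such that, over time, they receive about
 * 1/4 and 3/4 of the sectors transferred by the device, independently
 * of how the device throughput fluctuates while each queue is served.
 */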
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>
#include <linux/blk-cgroup.h>
#define BFQ_IOPRIO_CLASSES	3
#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

#define BFQ_MIN_WEIGHT			1
#define BFQ_MAX_WEIGHT			1000
#define BFQ_WEIGHT_CONVERSION_COEFF	10

#define BFQ_DEFAULT_QUEUE_IOPRIO	4

#define BFQ_WEIGHT_LEGACY_DFL	100
#define BFQ_DEFAULT_GRP_IOPRIO	0
#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

struct bfq_entity;

/**
 * struct bfq_service_tree - per ioprio_class service tree.
 *
 * Each service tree represents a B-WF2Q+ scheduler on its own. Each
 * ioprio_class has its own independent scheduler, and so its own
 * bfq_service_tree. All the fields are protected by the queue lock
 * of the containing bfqd.
 */
struct bfq_service_tree {
	/* tree for active entities (i.e., those backlogged) */
	struct rb_root active;
	/* tree for idle entities (i.e., not backlogged, with V <= F_i) */
	struct rb_root idle;

	/* idle entity with minimum F_i */
	struct bfq_entity *first_idle;
	/* idle entity with maximum F_i */
	struct bfq_entity *last_idle;

	/* scheduler virtual time */
	u64 vtime;
	/* scheduler weight sum; active and idle entities contribute to it */
	unsigned long wsum;
};

/**
 * struct bfq_sched_data - multi-class scheduler.
 *
 * bfq_sched_data is the basic scheduler queue. It supports three
 * ioprio_classes, and can be used either as a toplevel queue or as an
 * intermediate queue in a hierarchical setup. @next_in_service
 * points to the active entity of the sched_data service trees that
 * will be scheduled next. It is used to reduce the number of steps
 * needed for each hierarchical-schedule update.
 *
 * The supported ioprio_classes are the same as in CFQ, in descending
 * priority order: IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
 * Requests from higher-priority queues are served before all the
 * requests from lower-priority queues; among requests of the same
 * queue, requests are served according to B-WF2Q+.
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_sched_data {
	/* entity in service */
	struct bfq_entity *in_service_entity;
	/* head-of-line entity (see comments above) */
	struct bfq_entity *next_in_service;
	/* array of service trees, one per ioprio_class */
	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
	/* last time CLASS_IDLE was served */
	unsigned long bfq_class_idle_last_service;
};

/**
 * struct bfq_entity - schedulable entity.
 *
 * A bfq_entity is used to represent either a bfq_queue (leaf node in the
 * cgroup hierarchy) or a bfq_group in the upper-level scheduler. Each
 * entity belongs to the sched_data of the parent group in the cgroup
 * hierarchy. Non-leaf entities also have their own sched_data, stored
 * in @my_sched_data.
 *
 * Each entity stores independently its priority values; this would
 * allow different weights on different devices, but this
 * functionality is not exported to userspace for now. Priorities and
 * weights are updated lazily, first storing the new values into the
 * new_* fields, then setting the @prio_changed flag. As soon as
 * there is a transition in the entity state that allows the priority
 * update to take place, the effective and the requested priority
 * values are synchronized.
 *
 * Unless cgroups are used, the weight value is calculated from the
 * ioprio to export the same interface as CFQ. When dealing with
 * "well-behaved" queues (i.e., queues that do not spend too much
 * time to consume their budget and have true sequential behavior, and
 * when there are no external factors breaking anticipation), the
 * relative weights at each level of the cgroups hierarchy should be
 * guaranteed. All the fields are protected by the queue lock of the
 * containing bfqd.
 */
struct bfq_entity {
	/* service_tree member */
	struct rb_node rb_node;

	/*
	 * Flag, true if the entity is on a tree (either the active or
	 * the idle one of its service_tree) or is in service.
	 */
	bool on_st;

	/* B-WF2Q+ start and finish timestamps [sectors/weight] */
	u64 start, finish;

	/* tree the entity is enqueued into; %NULL if not on a tree */
	struct rb_root *tree;

	/*
	 * minimum start time of the (active) subtree rooted at this
	 * entity; used for O(log N) lookups into active trees
	 */
	u64 min_start;

	/* amount of service received during the last service slot */
	int service;

	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
	int budget;

	/* weight of the queue */
	int weight;
	/* next weight if a change is in progress */
	int new_weight;

	/* original weight, used to implement weight boosting */
	int orig_weight;

	/* parent entity, for hierarchical scheduling */
	struct bfq_entity *parent;

	/*
	 * For non-leaf nodes in the hierarchy, the associated
	 * scheduler queue, %NULL on leaf nodes.
	 */
	struct bfq_sched_data *my_sched_data;
	/* the scheduler queue this entity belongs to */
	struct bfq_sched_data *sched_data;

	/* flag, set to request a weight, ioprio or ioprio_class change */
	int prio_changed;
};

/**
 * struct bfq_ttime - per process thinktime stats.
 */
struct bfq_ttime {
	/* completion time of the last request */
	u64 last_end_request;

	/* total process thinktime */
	u64 ttime_total;
	/* number of thinktime samples */
	unsigned long ttime_samples;
	/* average process thinktime */
	u64 ttime_mean;
};

/**
 * struct bfq_queue - leaf schedulable entity.
 *
 * A bfq_queue is a leaf request queue; it can be associated with an
 * io_context or more, if it is async. @cgroup holds a reference to
 * the cgroup, to be sure that it does not disappear while a bfqq
 * still references it (mostly to avoid races between request issuing
 * and task migration followed by cgroup destruction).
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_queue {
	/* reference counter */
	int ref;
	/* parent bfq_data */
	struct bfq_data *bfqd;

	/* current ioprio and ioprio class */
	unsigned short ioprio, ioprio_class;
	/* next ioprio and ioprio class if a change is in progress */
	unsigned short new_ioprio, new_ioprio_class;

	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* number of sync and async requests queued */
	int queued[2];
	/* number of requests currently allocated */
	int allocated;
	/* number of pending metadata requests */
	int meta_pending;
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* entity representing this queue in the scheduler */
	struct bfq_entity entity;

	/* maximum budget allowed from the feedback mechanism */
	int max_budget;
	/* budget expiration (in jiffies) */
	unsigned long budget_timeout;

	/* number of requests on the dispatch list or inside the driver */
	int dispatched;

	/* status flags */
	unsigned long flags;

	/* node for active/idle bfqq list inside parent bfqd */
	struct list_head bfqq_list;

	/* associated @bfq_ttime struct */
	struct bfq_ttime ttime;

	/* bit vector: a 1 for each seeky request in history */
	u32 seek_history;
	/* position of the last request enqueued */
	sector_t last_request_pos;

	/*
	 * Number of consecutive pairs of request completion and
	 * arrival, such that the queue becomes idle after the
	 * completion, but the next request arrives within an idle
	 * time slice; used only if the queue's IO_bound flag has been
	 * set.
	 */
	unsigned int requests_within_timer;

	/* pid of the process owning the queue, used for logging purposes */
	pid_t pid;
};

/**
 * struct bfq_io_cq - per (request_queue, io_context) structure.
 */
struct bfq_io_cq {
	/* associated io_cq structure */
	struct io_cq icq; /* must be the first member */
	/* array of two process queues, the sync and the async */
	struct bfq_queue *bfqq[2];
	/* per (request_queue, blkcg) ioprio */
	int ioprio;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	uint64_t blkcg_serial_nr; /* the current blkcg serial */
#endif
};

/**
 * struct bfq_data - per-device data structure.
 *
 * All the fields are protected by @lock.
 */
struct bfq_data {
	/* device request queue */
	struct request_queue *queue;
	/* dispatch queue */
	struct list_head dispatch;

	/* root bfq_group for the device */
	struct bfq_group *root_group;

	/*
	 * Number of bfq_queues containing requests (including the
	 * queue in service, even if it is idling).
	 */
	int busy_queues;
	/* number of queued requests */
	int queued;
	/* number of requests dispatched and waiting for completion */
	int rq_in_driver;

	/*
	 * Maximum number of requests in driver in the last
	 * @hw_tag_samples completed requests.
	 */
	int max_rq_in_driver;
	/* number of samples used to calculate hw_tag */
	int hw_tag_samples;
	/* flag set to one if the driver is showing queueing behavior */
	int hw_tag;

	/* number of budgets assigned */
	int budgets_assigned;

	/*
	 * Timer set when idling (waiting) for the next request from
	 * the queue in service.
	 */
	struct hrtimer idle_slice_timer;

	/* bfq_queue in service */
	struct bfq_queue *in_service_queue;
	/* bfq_io_cq (bic) associated with the @in_service_queue */
	struct bfq_io_cq *in_service_bic;

	/* on-disk position of the last served request */
	sector_t last_position;

	/* time of last request completion (ns) */
	u64 last_completion;

	/* time of first rq dispatch in current observation interval (ns) */
	u64 first_dispatch;
	/* time of last rq dispatch in current observation interval (ns) */
	u64 last_dispatch;

	/* beginning of the last budget */
	ktime_t last_budget_start;
	/* beginning of the last idle slice */
	ktime_t last_idling_start;

	/* number of samples in current observation interval */
	int peak_rate_samples;
	/* num of samples of seq dispatches in current observation interval */
	u32 sequential_samples;
	/* total num of sectors transferred in current observation interval */
	u64 tot_sectors_dispatched;
	/* max rq size seen during current observation interval (sectors) */
	u32 last_rq_max_size;
	/* time elapsed from first dispatch in current observ. interval (us) */
	u64 delta_from_first;
	/*
	 * Current estimate of the device peak rate, measured in
	 * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
	 * BFQ_RATE_SHIFT is performed to increase precision in
	 * fixed-point calculations.
	 */
	u32 peak_rate;

	/* maximum budget allotted to a bfq_queue before rescheduling */
	int bfq_max_budget;

	/* list of all the bfq_queues active on the device */
	struct list_head active_list;
	/* list of all the bfq_queues idle on the device */
	struct list_head idle_list;

	/*
	 * Timeout for async/sync requests; when it fires, requests
	 * are served in fifo order.
	 */
	u64 bfq_fifo_expire[2];
	/* weight of backward seeks wrt forward ones */
	unsigned int bfq_back_penalty;
	/* maximum allowed backward seek */
	unsigned int bfq_back_max;
	/* maximum idling time */
	u32 bfq_slice_idle;

	/* user-configured max budget value (0 for auto-tuning) */
	int bfq_user_max_budget;
	/*
	 * Timeout for bfq_queues to consume their budget; used to
	 * prevent seeky queues from imposing long latencies to
	 * sequential or quasi-sequential ones (this also implies that
	 * seeky queues cannot receive guarantees in the service
	 * domain; after a timeout they are charged for the time they
	 * have been in service, to preserve fairness among them, but
	 * without service-domain guarantees).
	 */
	unsigned int bfq_timeout;

	/*
	 * Number of consecutive requests that must be issued within
	 * the idle time slice to re-enable idling for a queue that
	 * was marked as non-I/O-bound (see the definition of the
	 * IO_bound flag for further details).
	 */
	unsigned int bfq_requests_within_timer;

	/*
	 * Force device idling whenever needed to provide accurate
	 * service guarantees, without caring about throughput
	 * issues. CAVEAT: this may even increase latencies, in case
	 * of useless idling for processes that did stop doing I/O.
	 */
	bool strict_guarantees;

	/* fallback dummy bfqq for extreme OOM conditions */
	struct bfq_queue oom_bfqq;

	spinlock_t lock;

	/*
	 * bic associated with the task issuing current bio for
	 * merging. This and the next field are used as a support to
	 * be able to perform the bic lookup, needed by bio-merge
	 * functions, before the scheduler lock is taken, and thus
	 * avoid taking the request-queue lock while the scheduler
	 * lock is being held.
	 */
	struct bfq_io_cq *bio_bic;
	/* bfqq associated with the task issuing current bio for merging */
	struct bfq_queue *bio_bfqq;
};

enum bfqq_state_flags {
	BFQQF_busy = 0,		/* has requests or is in service */
	BFQQF_wait_request,	/* waiting for a request */
	BFQQF_non_blocking_wait_rq, /*
				     * waiting for a request
				     * without idling the device
				     */
	BFQQF_fifo_expire,	/* FIFO checked in this slice */
	BFQQF_idle_window,	/* slice idling enabled */
	BFQQF_sync,		/* synchronous queue */
	BFQQF_budget_new,	/* no completion with this budget */
	BFQQF_IO_bound,		/*
				 * bfqq has timed-out at least once
				 * having consumed at most 2/10 of
				 * its budget
				 */
};

#define BFQ_BFQQ_FNS(name)						\
static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__set_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__clear_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
{									\
	return test_bit(BFQQF_##name, &(bfqq)->flags);			\
}

BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(non_blocking_wait_rq);
BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(idle_window);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(budget_new);
BFQ_BFQQ_FNS(IO_bound);
#undef BFQ_BFQQ_FNS
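
/*
 * For illustration, each invocation above generates three small
 * helpers operating on one bit of bfqq->flags: e.g. BFQ_BFQQ_FNS(sync)
 * expands to bfq_mark_bfqq_sync(bfqq), bfq_clear_bfqq_sync(bfqq) and
 * bfq_bfqq_sync(bfqq), the last one returning whether BFQQF_sync is
 * currently set.
 */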

/* Logging facilities. */
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
			  __pbuf, ##args);				\
} while (0)

#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
} while (0)

#else /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\
			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
			  ##args)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)

#endif /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log(bfqd, fmt, args...) \
	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)

/* Expiration reasons. */
enum bfqq_expiration {
	BFQQE_TOO_IDLE = 0,		/*
					 * queue has been idling for
					 * too long
					 */
	BFQQE_BUDGET_TIMEOUT,	/* budget took too long to be used */
	BFQQE_BUDGET_EXHAUSTED,	/* budget consumed */
	BFQQE_NO_MORE_REQUESTS,	/* the queue has no more requests */
	BFQQE_PREEMPTED		/* preemption in progress */
};

struct bfqg_stats {
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	/* number of ios merged */
	struct blkg_rwstat merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat queued;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	uint64_t start_group_wait_time;
	uint64_t start_idle_time;
	uint64_t start_empty_time;
	uint16_t flags;
#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
};

#ifdef CONFIG_BFQ_GROUP_IOSCHED

/**
 * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
 *
 * @ps: @blkcg_policy_storage that this structure inherits
 * @weight: weight of the bfq_group
 */
struct bfq_group_data {
	/* must be the first member */
	struct blkcg_policy_data pd;

	unsigned short weight;
};

/**
 * struct bfq_group - per (device, cgroup) data structure.
 * @entity: schedulable entity to insert into the parent group sched_data.
 * @sched_data: own sched_data, to contain child entities (they may be
 *              both bfq_queues and bfq_groups).
 * @bfqd: the bfq_data for the device this group acts upon.
 * @async_bfqq: array of async queues for all the tasks belonging to
 *              the group, one queue per ioprio value per ioprio_class,
 *              except for the idle class that has only one queue.
 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
 *             to avoid too many special cases during group creation/
 *             migration.
 * @stats: stats for this bfqg.
 *
 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
 * there is a set of bfq_groups, each one collecting the lower-level
 * entities belonging to the group that are acting on the same device.
 *
 * Locking works as follows:
 *    o @bfqd is protected by the queue lock, RCU is used to access it
 *      from the readers.
 *    o All the other fields are protected by the @bfqd queue lock.
 */
struct bfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	struct bfq_entity entity;
	struct bfq_sched_data sched_data;

	void *bfqd;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct bfq_entity *my_entity;

	struct bfqg_stats stats;
};

#else /* CONFIG_BFQ_GROUP_IOSCHED */

struct bfq_group {
	struct bfq_sched_data sched_data;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct rb_root rq_pos_tree;
};
#endif

static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);

static unsigned int bfq_class_idx(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	return bfqq ? bfqq->ioprio_class - 1 :
		BFQ_DEFAULT_GRP_CLASS - 1;
}

static struct bfq_service_tree *
bfq_entity_service_tree(struct bfq_entity *entity)
{
	struct bfq_sched_data *sched_data = entity->sched_data;
	unsigned int idx = bfq_class_idx(entity);

	return sched_data->service_tree + idx;
}

static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	return bic->bfqq[is_sync];
}

static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
			 bool is_sync)
{
	bic->bfqq[is_sync] = bfqq;
}

static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
{
	return bic->icq.q->elevator->elevator_data;
}

static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
static void bfq_put_queue(struct bfq_queue *bfqq);
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic);
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);

/* Expiration time of sync (0) and async (1) requests, in ns. */
static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };

/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
static const int bfq_back_max = 16 * 1024;

/* Penalty of a backwards seek, in number of sectors. */
static const int bfq_back_penalty = 2;

/* Idling period duration, in ns. */
static u64 bfq_slice_idle = NSEC_PER_SEC / 125;

/* Minimum number of assigned budgets for which stats are safe to compute. */
static const int bfq_stats_min_budgets = 194;

/* Default maximum budget values, in sectors and number of requests. */
static const int bfq_default_max_budget = 16 * 1024;

/* Default timeout values, in jiffies, approximating CFQ defaults. */
static const int bfq_timeout = HZ / 8;

static struct kmem_cache *bfq_pool;

/* Below this threshold (in ns), we consider thinktime immediate. */
#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)

/* hw_tag detection: parallel requests threshold and min samples needed. */
#define BFQ_HW_QUEUE_THRESHOLD	4
#define BFQ_HW_QUEUE_SAMPLES	32

#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)
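
/*
 * For illustration: seek_history is a 32-bit shift register with one
 * bit per recent request, set when that request was seeky. With the
 * threshold above, a queue is flagged as seeky when more than
 * 32/8 == 4 of its last 32 requests were seeky.
 */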

/* Min number of samples required to perform peak-rate update */
#define BFQ_RATE_MIN_SAMPLES	32
/* Min observation time interval required to perform a peak-rate update (ns) */
#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
/* Target observation time interval for a peak-rate update (ns) */
#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC

/* Shift used for peak rate fixed precision calculations. */
#define BFQ_RATE_SHIFT		16

#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
#define RQ_BFQQ(rq)		((rq)->elv.priv[1])

/**
 * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
 * @icq: the iocontext queue.
 */
static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
{
	/* bic->icq is the first member, %NULL will convert to %NULL */
	return container_of(icq, struct bfq_io_cq, icq);
}

/**
 * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
 * @bfqd: the lookup key.
 * @ioc: the io_context of the process doing I/O.
 * @q: the request queue.
 */
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
					struct io_context *ioc,
					struct request_queue *q)
{
	if (ioc) {
		unsigned long flags;
		struct bfq_io_cq *icq;

		spin_lock_irqsave(q->queue_lock, flags);
		icq = icq_to_bic(ioc_lookup_icq(ioc, q));
		spin_unlock_irqrestore(q->queue_lock, flags);

		return icq;
	}

	return NULL;
}

/*
 * Scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing.
 */
static void bfq_schedule_dispatch(struct bfq_data *bfqd)
{
	if (bfqd->queued != 0) {
		bfq_log(bfqd, "schedule dispatch");
		blk_mq_run_hw_queues(bfqd->queue, true);
	}
}

/**
 * bfq_gt - compare two timestamps.
 * @a: first ts.
 * @b: second ts.
 *
 * Return @a > @b, dealing with wrapping correctly.
 */
static int bfq_gt(u64 a, u64 b)
{
	return (s64)(a - b) > 0;
}
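
/*
 * For illustration of the wraparound handling: if a == 2 and
 * b == 0xfffffffffffffffe, then a - b == 4, which is positive when
 * interpreted as s64, so bfq_gt() correctly reports a > b even though
 * a is numerically smaller.
 */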

static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
{
	struct rb_node *node = tree->rb_node;

	return rb_entry(node, struct bfq_entity, rb_node);
}

static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);

/**
 * bfq_update_next_in_service - update sd->next_in_service
 * @sd: sched_data for which to perform the update.
 * @new_entity: if not NULL, pointer to the entity whose activation,
 *		requeueing or repositioning triggered the invocation of
 *		this function.
 *
 * This function is called to update sd->next_in_service, which, in
 * its turn, may change as a consequence of the insertion or
 * extraction of an entity into/from one of the active trees of
 * sd. These insertions/extractions occur as a consequence of
 * activations/deactivations of entities, with some activations being
 * 'true' activations, and other activations being requeueings (i.e.,
 * implementing the second, requeueing phase of the mechanism used to
 * reposition an entity in its active tree; see comments on
 * __bfq_activate_entity and __bfq_requeue_entity for details). In
 * both the last two activation sub-cases, new_entity points to the
 * just activated or requeued entity.
 *
 * Returns true if sd->next_in_service changes in such a way that
 * entity->parent may become the next_in_service for its parent
 * entity.
 */
static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
				       struct bfq_entity *new_entity)
{
	struct bfq_entity *next_in_service = sd->next_in_service;
	bool parent_sched_may_change = false;

	/*
	 * If this update is triggered by the activation, requeueing
	 * or repositioning of an entity that does not coincide with
	 * sd->next_in_service, then a full lookup in the active tree
	 * can be avoided. In fact, it is enough to check whether the
	 * just-modified entity has a higher priority than
	 * sd->next_in_service, or, even if it has the same priority
	 * as sd->next_in_service, is eligible and has a lower virtual
	 * finish time than sd->next_in_service. If this compound
	 * condition holds, then the new entity becomes the new
	 * next_in_service. Otherwise no change is needed.
	 */
	if (new_entity && new_entity != sd->next_in_service) {
		/*
		 * Flag used to decide whether to replace
		 * sd->next_in_service with new_entity. Tentatively
		 * set to true, and left as true if
		 * sd->next_in_service is NULL.
		 */
		bool replace_next = true;

		/*
		 * If there is already a next_in_service candidate
		 * entity, then compare class priorities or timestamps
		 * to decide whether to replace sd->next_in_service
		 * with new_entity.
		 */
		if (next_in_service) {
			unsigned int new_entity_class_idx =
				bfq_class_idx(new_entity);
			struct bfq_service_tree *st =
				sd->service_tree + new_entity_class_idx;

			/*
			 * For efficiency, evaluate the most likely
			 * sub-condition first.
			 */
			replace_next =
				(new_entity_class_idx ==
				 bfq_class_idx(next_in_service)
				 &&
				 !bfq_gt(new_entity->start, st->vtime)
				 &&
				 bfq_gt(next_in_service->finish,
					new_entity->finish))
				||
				new_entity_class_idx <
				bfq_class_idx(next_in_service);
		}

		if (replace_next)
			next_in_service = new_entity;
	} else /* invoked because of a deactivation: lookup needed */
		next_in_service = bfq_lookup_next_entity(sd);

	if (next_in_service)
		parent_sched_may_change = !sd->next_in_service ||
			bfq_update_parent_budget(next_in_service);

	sd->next_in_service = next_in_service;

	if (!next_in_service)
		return parent_sched_may_change;

	return parent_sched_may_change;
}

#ifdef CONFIG_BFQ_GROUP_IOSCHED
/* both next loops stop at one of the child entities of the root group */
#define for_each_entity(entity)	\
	for (; entity ; entity = entity->parent)

/*
 * For each iteration, compute parent in advance, so as to be safe if
 * entity is deallocated during the iteration. Such a deallocation may
 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
 * the entity belongs to.
 */
#define for_each_entity_safe(entity, parent) \
	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
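
/*
 * Illustrative usage of the loops above (do_something() stands for any
 * per-entity operation):
 *
 *	struct bfq_entity *entity = &bfqq->entity;
 *
 *	for_each_entity(entity)
 *		do_something(entity);
 *
 * visits the queue's own entity first and then each ancestor group
 * entity, stopping below the root group.
 */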

/*
 * Returns true if this budget change may let next_in_service->parent
 * become the next_in_service entity for its parent entity.
 */
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	struct bfq_entity *bfqg_entity;
	struct bfq_group *bfqg;
	struct bfq_sched_data *group_sd;
	bool ret = false;

	group_sd = next_in_service->sched_data;

	bfqg = container_of(group_sd, struct bfq_group, sched_data);
	/*
	 * bfq_group's my_entity field is not NULL only if the group
	 * is not the root group. We must not touch the root entity
	 * as it must never become an in-service entity.
	 */
	bfqg_entity = bfqg->my_entity;
	if (bfqg_entity) {
		if (bfqg_entity->budget > next_in_service->budget)
			ret = true;
		bfqg_entity->budget = next_in_service->budget;
	}

	return ret;
}

/*
 * This function tells whether entity stops being a candidate for next
 * service, according to the following logic.
 *
 * This function is invoked for an entity that is about to be set in
 * service. If such an entity is a queue, then the entity is no longer
 * a candidate for next service (i.e., a candidate entity to serve
 * after the in-service entity is expired). The function then returns
 * true.
 */
static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	if (bfq_entity_to_bfqq(entity))
		return true;

	return false;
}

#else /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Next two macros are fake loops when cgroups support is not
 * enabled. In fact, in such a case, there is only one level to go up
 * (to reach the root group).
 */
#define for_each_entity(entity)	\
	for (; entity ; entity = NULL)

#define for_each_entity_safe(entity, parent) \
	for (parent = NULL; entity ; entity = parent)

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	return false;
}

static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	return true;
}

#endif /* CONFIG_BFQ_GROUP_IOSCHED */

/*
 * Shift for timestamp calculations. This actually limits the maximum
 * service allowed in one timestamp delta (small shift values increase it),
 * the maximum total weight that can be used for the queues in the system
 * (big shift values increase it), and the period of virtual time
 * wraparounds.
 */
#define WFQ_SERVICE_SHIFT	22

static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = NULL;

	if (!entity->my_sched_data)
		bfqq = container_of(entity, struct bfq_queue, entity);

	return bfqq;
}

/**
 * bfq_delta - map service into the virtual time domain.
 * @service: amount of service.
 * @weight: scale factor (weight of an entity or weight sum).
 */
static u64 bfq_delta(unsigned long service, unsigned long weight)
{
	u64 d = (u64)service << WFQ_SERVICE_SHIFT;

	do_div(d, weight);
	return d;
}
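
/*
 * Worked example: with WFQ_SERVICE_SHIFT == 22, charging 8 sectors of
 * service against a weight of 10 yields a virtual-time delta of
 * (8 << 22) / 10 == 3355443 units, i.e., virtual time advances as
 * service / weight, just scaled up for fixed-point precision.
 */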

/**
 * bfq_calc_finish - assign the finish time to an entity.
 * @entity: the entity to act upon.
 * @service: the service to be charged to the entity.
 */
static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->finish = entity->start +
		bfq_delta(service, entity->weight);

	if (bfqq) {
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: serv %lu, w %d",
			service, entity->weight);
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: start %llu, finish %llu, delta %llu",
			entity->start, entity->finish,
			bfq_delta(service, entity->weight));
	}
}
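
/*
 * For illustration: an entity with weight 10 activated at virtual
 * start time S and charged its full budget of 800 sectors gets
 * F = S + bfq_delta(800, 10), i.e., its finish timestamp grows by
 * budget/weight in the virtual time domain, which is what makes the
 * throughput share of each queue proportional to its weight.
 */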

/**
 * bfq_entity_of - get an entity from a node.
 * @node: the node field of the entity.
 *
 * Convert a node pointer to the relative entity. This is used only
 * to simplify the logic of some functions and not as the generic
 * conversion mechanism because, e.g., in the tree walking functions,
 * the check for a %NULL value would be redundant.
 */
static struct bfq_entity *bfq_entity_of(struct rb_node *node)
{
	struct bfq_entity *entity = NULL;

	if (node)
		entity = rb_entry(node, struct bfq_entity, rb_node);

	return entity;
}

/**
 * bfq_extract - remove an entity from a tree.
 * @root: the tree root.
 * @entity: the entity to remove.
 */
static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
{
	entity->tree = NULL;
	rb_erase(&entity->rb_node, root);
}

/**
 * bfq_idle_extract - extract an entity from the idle tree.
 * @st: the service tree of the owning @entity.
 * @entity: the entity being removed.
 */
static void bfq_idle_extract(struct bfq_service_tree *st,
			     struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *next;

	if (entity == st->first_idle) {
		next = rb_next(&entity->rb_node);
		st->first_idle = bfq_entity_of(next);
	}

	if (entity == st->last_idle) {
		next = rb_prev(&entity->rb_node);
		st->last_idle = bfq_entity_of(next);
	}

	bfq_extract(&st->idle, entity);

	if (bfqq)
		list_del(&bfqq->bfqq_list);
}

/**
 * bfq_insert - generic tree insertion.
 * @root: tree root.
 * @entity: entity to insert.
 *
 * This is used for the idle and the active tree, since they are both
 * ordered by finish time.
 */
static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
{
	struct bfq_entity *entry;
	struct rb_node **node = &root->rb_node;
	struct rb_node *parent = NULL;

	while (*node) {
		parent = *node;
		entry = rb_entry(parent, struct bfq_entity, rb_node);

		if (bfq_gt(entry->finish, entity->finish))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}

	rb_link_node(&entity->rb_node, parent, node);
	rb_insert_color(&entity->rb_node, root);

	entity->tree = root;
}

/**
 * bfq_update_min - update the min_start field of an entity.
 * @entity: the entity to update.
 * @node: one of its children.
 *
 * This function is called when @entity may store an invalid value for
 * min_start due to updates to the active tree. The function assumes
 * that the subtree rooted at @node (which may be its left or its right
 * child) has a valid min_start value.
 */
static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
{
	struct bfq_entity *child;

	if (node) {
		child = rb_entry(node, struct bfq_entity, rb_node);
		if (bfq_gt(entity->min_start, child->min_start))
			entity->min_start = child->min_start;
	}
}

/**
 * bfq_update_active_node - recalculate min_start.
 * @node: the node to update.
 *
 * @node may have changed position or one of its children may have moved,
 * this function updates its min_start value. The left and right subtrees
 * are assumed to hold a correct min_start value.
 */
static void bfq_update_active_node(struct rb_node *node)
{
	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);

	entity->min_start = entity->start;
	bfq_update_min(entity, node->rb_right);
	bfq_update_min(entity, node->rb_left);
}

/**
 * bfq_update_active_tree - update min_start for the whole active tree.
 * @node: the starting node.
 *
 * @node must be the deepest modified node after an update. This function
 * updates its min_start using the values held by its children, assuming
 * that they did not change, and then updates all the nodes that may have
 * changed in the path to the root. The only nodes that may have changed
 * are the ones in the path or their siblings.
 */
static void bfq_update_active_tree(struct rb_node *node)
{
	struct rb_node *parent;

up:
	bfq_update_active_node(node);

	parent = rb_parent(node);
	if (!parent)
		return;

	if (node == parent->rb_left && parent->rb_right)
		bfq_update_active_node(parent->rb_right);
	else if (parent->rb_left)
		bfq_update_active_node(parent->rb_left);

	node = parent;
	goto up;
}

/**
 * bfq_active_insert - insert an entity in the active tree of its
 *                     group/device.
 * @st: the service tree of the entity.
 * @entity: the entity being inserted.
 *
 * The active tree is ordered by finish time, but an extra key is kept
 * per each node, containing the minimum value for the start times of
 * its children (and the node itself), so it's possible to search for
 * the eligible node with the lowest finish time in logarithmic time.
 */
static void bfq_active_insert(struct bfq_service_tree *st,
			      struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node = &entity->rb_node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	bfq_insert(&st->active, entity);

	if (node->rb_left)
		node = node->rb_left;
	else if (node->rb_right)
		node = node->rb_right;

	bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
}

/**
 * bfq_ioprio_to_weight - calc a weight from an ioprio.
 * @ioprio: the ioprio value to convert.
 */
static unsigned short bfq_ioprio_to_weight(int ioprio)
{
	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
}
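
/*
 * Worked example: with IOPRIO_BE_NR == 8 and
 * BFQ_WEIGHT_CONVERSION_COEFF == 10, the default ioprio 4 maps to
 * weight (8 - 4) * 10 == 40, while the highest best-effort priority
 * (ioprio 0) maps to weight 80.
 */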

/**
 * bfq_weight_to_ioprio - calc an ioprio from a weight.
 * @weight: the weight value to convert.
 *
 * To preserve as much as possible the old only-ioprio user interface,
 * 0 is used as an escape ioprio value for weights (numerically) equal or
 * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
 */
static unsigned short bfq_weight_to_ioprio(int weight)
{
	return max_t(int, 0,
		     IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
}
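
/*
 * For illustration: with IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF
 * == 80, any weight of 80 or more maps to the escape ioprio value 0,
 * as described above, while smaller weights map to 80 - weight.
 */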

static void bfq_get_entity(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	if (bfqq) {
		bfqq->ref++;
		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
			     bfqq, bfqq->ref);
	}
}

/**
 * bfq_find_deepest - find the deepest node that an extraction can modify.
 * @node: the node being removed.
 *
 * Do the first step of an extraction in an rb tree, looking for the
 * node that will replace @node, and returning the deepest node that
 * the following modifications to the tree can touch. If @node is the
 * last node in the tree return %NULL.
 */
static struct rb_node *bfq_find_deepest(struct rb_node *node)
{
	struct rb_node *deepest;

	if (!node->rb_right && !node->rb_left)
		deepest = rb_parent(node);
	else if (!node->rb_right)
		deepest = node->rb_left;
	else if (!node->rb_left)
		deepest = node->rb_right;
	else {
		deepest = rb_next(node);
		if (deepest->rb_right)
			deepest = deepest->rb_right;
		else if (rb_parent(deepest) != node)
			deepest = rb_parent(deepest);
	}

	return deepest;
}

/**
 * bfq_active_extract - remove an entity from the active tree.
 * @st: the service_tree containing the tree.
 * @entity: the entity being removed.
 */
static void bfq_active_extract(struct bfq_service_tree *st,
			       struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	node = bfq_find_deepest(&entity->rb_node);
	bfq_extract(&st->active, entity);

	if (node)
		bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_del(&bfqq->bfqq_list);
}

/**
 * bfq_idle_insert - insert an entity into the idle tree.
 * @st: the service tree containing the tree.
 * @entity: the entity to insert.
 */
static void bfq_idle_insert(struct bfq_service_tree *st,
			    struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
		st->first_idle = entity;
	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
		st->last_idle = entity;

	bfq_insert(&st->idle, entity);

	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
}

/**
 * bfq_forget_entity - do not consider entity any longer for scheduling
 * @st: the service tree.
 * @entity: the entity being removed.
 * @is_in_service: true if entity is currently the in-service entity.
 *
 * Forget everything about @entity. In addition, if entity represents
 * a queue, and the latter is not in service, then release the service
 * reference to the queue (the one taken through bfq_get_entity). In
 * fact, in this case, there is really no more service reference to
 * the queue, as the latter is also outside any service tree. If,
 * instead, the queue is in service, then __bfq_bfqd_reset_in_service
 * will take care of putting the reference when the queue finally
 * stops being served.
 */
static void bfq_forget_entity(struct bfq_service_tree *st,
			      struct bfq_entity *entity,
			      bool is_in_service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->on_st = false;
	st->wsum -= entity->weight;
	if (bfqq && !is_in_service)
		bfq_put_queue(bfqq);
}

/**
 * bfq_put_idle_entity - release the idle tree ref of an entity.
 * @st: service tree for the entity.
 * @entity: the entity being released.
 */
static void bfq_put_idle_entity(struct bfq_service_tree *st,
				struct bfq_entity *entity)
{
	bfq_idle_extract(st, entity);
	bfq_forget_entity(st, entity,
			  entity == entity->sched_data->in_service_entity);
}

/**
 * bfq_forget_idle - update the idle tree if necessary.
 * @st: the service tree to act upon.
 *
 * To preserve the global O(log N) complexity we only remove one entry here;
 * as the idle tree will not grow indefinitely this can be done safely.
 */
static void bfq_forget_idle(struct bfq_service_tree *st)
{
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
	    !bfq_gt(last_idle->finish, st->vtime)) {
		/*
		 * Forget the whole idle tree, increasing the vtime past
		 * the last finish time of idle entities.
		 */
		st->vtime = last_idle->finish;
	}

	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
		bfq_put_idle_entity(st, first_idle);
}

static struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
				struct bfq_entity *entity)
{
	struct bfq_service_tree *new_st = old_st;

	if (entity->prio_changed) {
		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
		unsigned short prev_weight, new_weight;
		struct bfq_data *bfqd = NULL;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		struct bfq_sched_data *sd;
		struct bfq_group *bfqg;
#endif

		if (bfqq)
			bfqd = bfqq->bfqd;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		else {
			sd = entity->my_sched_data;
			bfqg = container_of(sd, struct bfq_group, sched_data);
			bfqd = (struct bfq_data *)bfqg->bfqd;
		}
#endif

		old_st->wsum -= entity->weight;

		if (entity->new_weight != entity->orig_weight) {
			if (entity->new_weight < BFQ_MIN_WEIGHT ||
			    entity->new_weight > BFQ_MAX_WEIGHT) {
				pr_crit("update_weight_prio: new_weight %d\n",
					entity->new_weight);
				if (entity->new_weight < BFQ_MIN_WEIGHT)
					entity->new_weight = BFQ_MIN_WEIGHT;
				else
					entity->new_weight = BFQ_MAX_WEIGHT;
			}
			entity->orig_weight = entity->new_weight;
			if (bfqq)
				bfqq->ioprio =
				  bfq_weight_to_ioprio(entity->orig_weight);
		}

		if (bfqq)
			bfqq->ioprio_class = bfqq->new_ioprio_class;
		entity->prio_changed = 0;

		/*
		 * NOTE: here we may be changing the weight too early,
		 * this will cause unfairness. The correct approach
		 * would have required additional complexity to defer
		 * weight changes to the proper time instants (i.e.,
		 * when entity->finish <= old_st->vtime).
		 */
		new_st = bfq_entity_service_tree(entity);

		prev_weight = entity->weight;
		new_weight = entity->orig_weight;
		entity->weight = new_weight;

		new_st->wsum += entity->weight;

		if (new_st != old_st)
			entity->start = new_st->vtime;
	}

	return new_st;
}

static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);

/**
 * bfq_bfqq_served - update the scheduler status after selection for
 *		     service.
 * @bfqq: the queue being served.
 * @served: bytes to transfer.
 *
 * NOTE: this can be optimized, as the timestamps of upper level entities
 * are synchronized every time a new bfqq is selected for service. By now,
 * we keep it to better check consistency.
 */
static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
{
	struct bfq_entity *entity = &bfqq->entity;
	struct bfq_service_tree *st;

	for_each_entity(entity) {
		st = bfq_entity_service_tree(entity);

		entity->service += served;

		st->vtime += bfq_delta(served, st->wsum);
		bfq_forget_idle(st);
	}
	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
}

/**
 * bfq_bfqq_charge_full_budget - set the service to the entity budget.
 * @bfqq: the queue that needs a service update.
 *
 * When it's not possible to be fair in the service domain, because
 * a queue is not consuming its budget fast enough (the meaning of
 * fast depends on the timeout parameter), we charge it a full
 * budget. In this way we should obtain a sort of time-domain
 * fairness among all the seeky/slow queues.
 */
static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");

	bfq_bfqq_served(bfqq, entity->budget - entity->service);
}

static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
					struct bfq_service_tree *st,
					bool backshifted)
{
	st = __bfq_entity_update_weight_prio(st, entity);
	bfq_calc_finish(entity, entity->budget);

	/*
	 * If some queues enjoy backshifting for a while, then their
	 * (virtual) finish timestamps may happen to become lower and
	 * lower than the system virtual time. In particular, if
	 * these queues often happen to be idle for short time
	 * periods, and during such time periods other queues with
	 * higher timestamps happen to be busy, then the backshifted
	 * timestamps of the former queues can become much lower than
	 * the system virtual time. In fact, to serve the queues with
	 * higher timestamps while the ones with lower timestamps are
	 * idle, the system virtual time may be pushed-up to much
	 * higher values than the finish timestamps of the idle
	 * queues. As a consequence, the finish timestamps of all new
	 * or newly activated queues may end up being much larger than
	 * those of lucky queues with backshifted timestamps. The
	 * latter queues may then monopolize the device for a lot of
	 * time. This would simply break service guarantees.
	 *
	 * To reduce this problem, push up a little bit the
	 * backshifted timestamps of the queue associated with this
	 * entity (only a queue can happen to have the backshifted
	 * flag set): just enough to let the finish timestamp of the
	 * queue be equal to the current value of the system virtual
	 * time. This may introduce a little unfairness among queues
	 * with backshifted timestamps, but it does not break
	 * worst-case fairness guarantees.
	 */
	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
		unsigned long delta = st->vtime - entity->finish;

		entity->start += delta;
		entity->finish += delta;
	}

	bfq_active_insert(st, entity);
}

/**
 * __bfq_activate_entity - handle activation of entity.
 * @entity: the entity being activated.
 * @non_blocking_wait_rq: true if entity was waiting for a request
 *
 * Called for a 'true' activation, i.e., if entity is not active and
 * one of its children receives a new request.
 *
 * Basically, this function updates the timestamps of entity and
 * inserts entity into its active tree, after possibly extracting it
 * from its idle tree.
 */
static void __bfq_activate_entity(struct bfq_entity *entity,
				  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	bool backshifted = false;
	unsigned long long min_vstart;

	/* See comments on bfq_bfqq_update_budg_for_activation */
	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
		backshifted = true;
		min_vstart = entity->finish;
	} else
		min_vstart = st->vtime;

	if (entity->tree == &st->idle) {
		/*
		 * Must be on the idle tree, bfq_idle_extract() will
		 * check for that.
		 */
		bfq_idle_extract(st, entity);
		entity->start = bfq_gt(min_vstart, entity->finish) ?
			min_vstart : entity->finish;
	} else {
		/*
		 * The finish time of the entity may be invalid, and
		 * it is in the past for sure, otherwise the queue
		 * would have been on the idle tree.
		 */
		entity->start = min_vstart;
		st->wsum += entity->weight;
		/*
		 * entity is about to be inserted into a service tree,
		 * and then set in service: get a reference to make
		 * sure entity does not disappear until it is no
		 * longer in service or scheduled for service.
		 */
		bfq_get_entity(entity);

		entity->on_st = true;
	}

	bfq_update_fin_time_enqueue(entity, st, backshifted);
}

/**
 * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
 * @entity: the entity being requeued or repositioned.
 *
 * Requeueing is needed if this entity stops being served, which
 * happens if a leaf descendant entity has expired. On the other hand,
 * repositioning is needed if the next_in_service entity for the child
 * entity has changed. See the comments inside the function for
 * more details.
 *
 * Basically, this function: 1) removes entity from its active tree if
 * present there, 2) updates the timestamps of entity and 3) inserts
 * entity back into its active tree (in the new, right position for
 * the new values of the timestamps).
 */
static void __bfq_requeue_entity(struct bfq_entity *entity)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (entity == sd->in_service_entity) {
		/*
		 * We are requeueing the current in-service entity,
		 * which may have to be done for one of the following
		 * reasons:
		 * - entity represents the in-service queue, and the
		 *   in-service queue is being requeued after an
		 *   expiration;
		 * - entity represents a group, and its budget has
		 *   changed because one of its child entities has
		 *   just been either activated or requeued for some
		 *   reason; the timestamps of the entity need then to
		 *   be updated, and the entity needs to be enqueued
		 *   or repositioned accordingly.
		 *
		 * In particular, before requeueing, the start time of
		 * the entity must be moved forward to account for the
		 * service that the entity has received while in
		 * service. This is done by the next instructions. The
		 * finish time will then be updated according to this
		 * new value of the start time, and to the budget of
		 * the entity.
		 */
		bfq_calc_finish(entity, entity->service);
		entity->start = entity->finish;
		/*
		 * In addition, if the entity had more than one child
		 * when set in service, then it was not extracted from
		 * the active tree. This implies that the position of
		 * the entity in the active tree may need to be
		 * changed now, because we have just updated the start
		 * time of the entity, and we will update its finish
		 * time in a moment (the requeueing is then, more
		 * precisely, a repositioning in this case). To
		 * implement this repositioning, we: 1) dequeue the
		 * entity here, 2) update the finish time and
		 * requeue the entity according to the new
		 * timestamps below.
		 */
		if (entity->tree)
			bfq_active_extract(st, entity);
	} else { /* The entity is already active, and not in service */
		/*
		 * In this case, this function gets called only if the
		 * next_in_service entity below this entity has
		 * changed, and this change has caused the budget of
		 * this entity to change, which, finally, implies that
		 * the finish time of this entity must be
		 * updated. Such an update may cause the scheduling,
		 * i.e., the position in the active tree, of this
		 * entity to change. We handle this change by: 1)
		 * dequeueing the entity here, 2) updating the finish
		 * time and requeueing the entity according to the new
		 * timestamps below. This is the same approach as the
		 * non-extracted-entity sub-case above.
		 */
		bfq_active_extract(st, entity);
	}

	bfq_update_fin_time_enqueue(entity, st, false);
}

static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
					  struct bfq_sched_data *sd,
					  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (sd->in_service_entity == entity || entity->tree == &st->active)
		/*
		 * in service or already queued on the active tree,
		 * requeue or reposition
		 */
		__bfq_requeue_entity(entity);
	else
		/*
		 * Not in service and not queued on its active tree:
		 * the activity is idle and this is a true activation.
		 */
		__bfq_activate_entity(entity, non_blocking_wait_rq);
}

/**
 * bfq_activate_requeue_entity - activate or requeue an entity representing a
 *			bfq_queue, and activate, requeue or reposition all
 *			ancestors for which such an update becomes necessary.
 * @entity: the entity to activate.
 * @non_blocking_wait_rq: true if this entity was waiting for a request
 * @requeue: true if this is a requeue, which implies that bfqq is
 *	     being expired; thus ALL its ancestors stop being served and must
 *	     therefore be requeued
 */
static void bfq_activate_requeue_entity(struct bfq_entity *entity,
					bool non_blocking_wait_rq,
					bool requeue)
{
	struct bfq_sched_data *sd;

	for_each_entity(entity) {
		sd = entity->sched_data;
		__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);

		if (!bfq_update_next_in_service(sd, entity) && !requeue)
			break;
	}
}
/**
 * __bfq_deactivate_entity - deactivate an entity from its service tree.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: if false, the entity will not be put into the
 *			idle tree.
 *
 * Deactivates an entity, independently of its previous state. Must
 * be invoked only if entity is on a service tree. Extracts the entity
 * from that tree, and if necessary and allowed, puts it into the idle
 * tree.
 */
static bool __bfq_deactivate_entity(struct bfq_entity *entity,
				    bool ins_into_idle_tree)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	int is_in_service = entity == sd->in_service_entity;

	if (!entity->on_st) /* entity never activated, or already inactive */
		return false;

	if (is_in_service)
		bfq_calc_finish(entity, entity->service);

	if (entity->tree == &st->active)
		bfq_active_extract(st, entity);
	else if (!is_in_service && entity->tree == &st->idle)
		bfq_idle_extract(st, entity);

	if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
		bfq_forget_entity(st, entity, is_in_service);
	else
		bfq_idle_insert(st, entity);

	return true;
}

/**
 * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: true if the entity can be put on the idle tree
 * @expiration: true if this deactivation is caused by an expiration
 */
static void bfq_deactivate_entity(struct bfq_entity *entity,
				  bool ins_into_idle_tree,
				  bool expiration)
{
	struct bfq_sched_data *sd;
	struct bfq_entity *parent = NULL;

	for_each_entity_safe(entity, parent) {
		sd = entity->sched_data;

		if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
			/*
			 * entity is not in any tree any more, so
			 * this deactivation is a no-op, and there is
			 * nothing to change for upper-level entities
			 * (in case of expiration, this can never
			 * happen).
			 */
			return;
		}

		if (sd->next_in_service == entity)
			/*
			 * entity was the next_in_service entity,
			 * then, since entity has just been
			 * deactivated, a new one must be found.
			 */
			bfq_update_next_in_service(sd, NULL);

		if (sd->next_in_service)
			/*
			 * The parent entity is still backlogged,
			 * because next_in_service is not NULL. So, no
			 * further upwards deactivation must be
			 * performed. Yet, next_in_service has
			 * changed. Then the schedule does need to be
			 * updated upwards.
			 */
			break;

		/*
		 * If we get here, then the parent is no more
		 * backlogged and we need to propagate the
		 * deactivation upwards. Thus let the loop go on.
		 */

		/*
		 * Also let parent be queued into the idle tree on
		 * deactivation, to preserve service guarantees, and
		 * assuming that who invoked this function does not
		 * need parent entities too to be removed completely.
		 */
		ins_into_idle_tree = true;
	}

	/*
	 * If the deactivation loop is fully executed, then there are
	 * no more entities to touch and next loop is not executed at
	 * all. Otherwise, requeue remaining entities if they are
	 * about to stop receiving service, or reposition them if this
	 * is not the case.
	 */
	entity = parent;
	for_each_entity(entity) {
		/*
		 * Invoke __bfq_requeue_entity on entity, even if
		 * already active, to requeue/reposition it in the
		 * active tree (because sd->next_in_service has
		 * changed)
		 */
		__bfq_requeue_entity(entity);

		sd = entity->sched_data;
		if (!bfq_update_next_in_service(sd, entity) &&
		    !expiration)
			/*
			 * next_in_service unchanged or not causing
			 * any change in entity->parent->sd, and no
			 * requeueing needed for expiration: stop
			 * here.
			 */
			break;
	}
}
/**
 * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
 *                       if needed, to have at least one entity eligible.
 * @st: the service tree to act upon.
 *
 * Assumes that st is not empty.
 */
static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
{
	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);

	if (bfq_gt(root_entity->min_start, st->vtime))
		return root_entity->min_start;

	return st->vtime;
}
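
/*
 * Illustrative note on the jump above (made-up numbers, not taken
 * from the code): if the smallest start time among the active
 * entities is min_start = 1200 while st->vtime = 1000, then no
 * entity is eligible (an entity is eligible only if its start time
 * does not exceed the virtual time), so 1200 is returned and the
 * caller may push the virtual time forward to that value. If instead
 * min_start = 900 <= 1000, the current vtime is already high enough
 * and is returned unchanged.
 */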
static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
{
	if (new_value > st->vtime) {
		st->vtime = new_value;
		bfq_forget_idle(st);
	}
}
/**
 * bfq_first_active_entity - find the eligible entity with
 *                           the smallest finish time
 * @st: the service tree to select from.
 * @vtime: the system virtual time to use as a reference for eligibility
 *
 * This function searches the first schedulable entity, starting from the
 * root of the tree and going on the left every time on this side there is
 * a subtree with at least one eligible (start <= vtime) entity. The path on
 * the right is followed only if a) the left subtree contains no eligible
 * entities and b) no eligible entity has been found yet.
 */
static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
						  u64 vtime)
{
	struct bfq_entity *entry, *first = NULL;
	struct rb_node *node = st->active.rb_node;

	while (node) {
		entry = rb_entry(node, struct bfq_entity, rb_node);
left:
		if (!bfq_gt(entry->start, vtime))
			first = entry;

		if (node->rb_left) {
			entry = rb_entry(node->rb_left,
					 struct bfq_entity, rb_node);
			if (!bfq_gt(entry->min_start, vtime)) {
				node = node->rb_left;
				goto left;
			}
		}
		if (first)
			break;
		node = node->rb_right;
	}

	return first;
}
/**
 * __bfq_lookup_next_entity - return the first eligible entity in @st.
 * @st: the service tree.
 *
 * If there is no in-service entity for the sched_data st belongs to,
 * then return the entity that will be set in service if:
 * 1) the parent entity this st belongs to is set in service;
 * 2) no entity belonging to such parent entity undergoes a state change
 * that would influence the timestamps of the entity (e.g., becomes idle,
 * becomes backlogged, changes its budget, ...).
 *
 * In this first case, update the virtual time in @st too (see the
 * comments on this update inside the function).
 *
 * In contrast, if there is an in-service entity, then return the
 * entity that would be set in service if not only the above
 * conditions, but also the next one held true: the currently
 * in-service entity, on expiration,
 * 1) gets a finish time equal to the current one, or
 * 2) is not eligible any more, or
 * 3) is idle.
 */
static struct bfq_entity *
__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
{
	struct bfq_entity *entity;
	u64 new_vtime;

	if (RB_EMPTY_ROOT(&st->active))
		return NULL;

	/*
	 * Get the value of the system virtual time for which at
	 * least one entity is eligible.
	 */
	new_vtime = bfq_calc_vtime_jump(st);

	/*
	 * If there is no in-service entity for the sched_data this
	 * active tree belongs to, then push the system virtual time
	 * up to the value that guarantees that at least one entity is
	 * eligible. If, instead, there is an in-service entity, then
	 * do not make any such update, because there is already an
	 * eligible entity, namely the in-service one (even if the
	 * entity is not on st, because it was extracted when set in
	 * service).
	 */
	if (!in_service)
		bfq_update_vtime(st, new_vtime);

	entity = bfq_first_active_entity(st, new_vtime);

	return entity;
}
/**
 * bfq_lookup_next_entity - return the first eligible entity in @sd.
 * @sd: the sched_data.
 *
 * This function is invoked when there has been a change in the trees
 * for sd, and we need to know what is the new next entity after this
 * change.
 */
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
{
	struct bfq_service_tree *st = sd->service_tree;
	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
	struct bfq_entity *entity = NULL;
	int class_idx = 0;

	/*
	 * Choose from idle class, if needed to guarantee a minimum
	 * bandwidth to this class (and if there is some active entity
	 * in idle class). This should also mitigate
	 * priority-inversion problems in case a low priority task is
	 * holding file system resources.
	 */
	if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
				   BFQ_CL_IDLE_TIMEOUT)) {
		if (!RB_EMPTY_ROOT(&idle_class_st->active))
			class_idx = BFQ_IOPRIO_CLASSES - 1;
		/* About to be served if backlogged, or not yet backlogged */
		sd->bfq_class_idle_last_service = jiffies;
	}

	/*
	 * Find the next entity to serve for the highest-priority
	 * class, unless the idle class needs to be served.
	 */
	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
		entity = __bfq_lookup_next_entity(st + class_idx,
						  sd->in_service_entity);

		if (entity)
			break;
	}

	return entity;
}
static bool next_queue_may_preempt(struct bfq_data *bfqd)
{
	struct bfq_sched_data *sd = &bfqd->root_group->sched_data;

	return sd->next_in_service != sd->in_service_entity;
}
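
/*
 * How the check above is meant to be read (hypothetical scenario, not
 * lifted from the code): if the root-level next_in_service still
 * points to the entity currently in service, then no newly activated
 * queue has become a better candidate, and expiring the in-service
 * queue for preemption would be pointless; only when the two pointers
 * differ can a preemption possibly pay off.
 */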
/*
 * Get next queue for service.
 */
static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
{
	struct bfq_entity *entity = NULL;
	struct bfq_sched_data *sd;
	struct bfq_queue *bfqq;

	if (bfqd->busy_queues == 0)
		return NULL;

	/*
	 * Traverse the path from the root to the leaf entity to
	 * serve. Set in service all the entities visited along the
	 * way.
	 */
	sd = &bfqd->root_group->sched_data;
	for (; sd ; sd = entity->my_sched_data) {
		/*
		 * WARNING. We are about to set the in-service entity
		 * to sd->next_in_service, i.e., to the (cached) value
		 * returned by bfq_lookup_next_entity(sd) the last
		 * time it was invoked, i.e., the last time when the
		 * service order in sd changed as a consequence of the
		 * activation or deactivation of an entity. In this
		 * respect, if we execute bfq_lookup_next_entity(sd)
		 * in this very moment, it may, although with low
		 * probability, yield a different entity than that
		 * pointed to by sd->next_in_service. This rare event
		 * happens in case there was no CLASS_IDLE entity to
		 * serve for sd when bfq_lookup_next_entity(sd) was
		 * invoked for the last time, while there is now one
		 * such entity.
		 *
		 * If the above event happens, then the scheduling of
		 * such entity in CLASS_IDLE is postponed until the
		 * service of the sd->next_in_service entity
		 * finishes. In fact, when the latter is expired,
		 * bfq_lookup_next_entity(sd) gets called again,
		 * exactly to update sd->next_in_service.
		 */

		/* Make next_in_service entity become in_service_entity */
		entity = sd->next_in_service;
		sd->in_service_entity = entity;

		/*
		 * Reset the accumulator of the amount of service that
		 * the entity is about to receive.
		 */
		entity->service = 0;

		/*
		 * If entity is no longer a candidate for next
		 * service, then we extract it from its active tree,
		 * for the following reason. To further boost the
		 * throughput in some special case, BFQ needs to know
		 * which is the next candidate entity to serve, while
		 * there is already an entity in service. In this
		 * respect, to make it easy to compute/update the next
		 * candidate entity to serve after the current
		 * candidate has been set in service, there is a case
		 * where it is necessary to extract the current
		 * candidate from its service tree. Such a case is
		 * when the entity just set in service cannot be also
		 * a candidate for next service. Details about when
		 * this condition holds are reported in the comments
		 * on the function bfq_no_longer_next_in_service().
		 */
		if (bfq_no_longer_next_in_service(entity))
			bfq_active_extract(bfq_entity_service_tree(entity),
					   entity);

		/*
		 * For the same reason why we may have just extracted
		 * entity from its active tree, we may need to update
		 * next_in_service for the sched_data of entity too,
		 * regardless of whether entity has been extracted.
		 * In fact, even if entity has not been extracted, a
		 * descendant entity may get extracted. Such an event
		 * would cause a change in next_in_service for the
		 * level of the descendant entity, and thus possibly
		 * back to upper levels.
		 *
		 * We cannot perform the resulting needed update
		 * before the end of this loop, because, to know which
		 * is the correct next-to-serve candidate entity for
		 * each level, we need first to find the leaf entity
		 * to set in service. In fact, only after we know
		 * which is the next-to-serve leaf entity, we can
		 * discover whether the parent entity of the leaf
		 * entity becomes the next-to-serve, and so on.
		 */
	}

	bfqq = bfq_entity_to_bfqq(entity);

	/*
	 * We can finally update all next-to-serve entities along the
	 * path from the leaf entity just set in service to the root.
	 */
	for_each_entity(entity) {
		struct bfq_sched_data *sd = entity->sched_data;

		if (!bfq_update_next_in_service(sd, NULL))
			break;
	}

	return bfqq;
}
static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
{
	struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
	struct bfq_entity *entity = in_serv_entity;

	if (bfqd->in_service_bic) {
		put_io_context(bfqd->in_service_bic->icq.ioc);
		bfqd->in_service_bic = NULL;
	}

	bfq_clear_bfqq_wait_request(in_serv_bfqq);
	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
	bfqd->in_service_queue = NULL;

	/*
	 * When this function is called, all in-service entities have
	 * been properly deactivated or requeued, so we can safely
	 * execute the final step: reset in_service_entity along the
	 * path from entity to the root.
	 */
	for_each_entity(entity)
		entity->sched_data->in_service_entity = NULL;

	/*
	 * in_serv_entity is no longer in service, so, if it is in no
	 * service tree either, then release the service reference to
	 * the queue it represents (taken with bfq_get_entity).
	 */
	if (!in_serv_entity->on_st)
		bfq_put_queue(in_serv_bfqq);
}
static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
				bool ins_into_idle_tree, bool expiration)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
}

static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
				    false);
	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
}

static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_activate_requeue_entity(entity, false,
				    bfqq == bfqd->in_service_queue);
}

static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);

/*
 * Called when the bfqq no longer has requests pending, remove it from
 * the service tree. As a special case, it can be invoked during an
 * expiration.
 */
static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			      bool expiration)
{
	bfq_log_bfqq(bfqd, bfqq, "del from busy");

	bfq_clear_bfqq_busy(bfqq);

	bfqd->busy_queues--;

	bfqg_stats_update_dequeue(bfqq_group(bfqq));

	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
}

/*
 * Called when an inactive queue receives a new request.
 */
static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqd, bfqq, "add to busy");

	bfq_activate_bfqq(bfqd, bfqq);

	bfq_mark_bfqq_busy(bfqq);
	bfqd->busy_queues++;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED

/* bfqg stats flags */
enum bfqg_stats_flags {
	BFQG_stats_waiting = 0,
	BFQG_stats_idling,
	BFQG_stats_empty,
};

#define BFQG_FLAG_FNS(name)						\
static void bfqg_stats_mark_##name(struct bfqg_stats *stats)		\
{									\
	stats->flags |= (1 << BFQG_stats_##name);			\
}									\
static void bfqg_stats_clear_##name(struct bfqg_stats *stats)		\
{									\
	stats->flags &= ~(1 << BFQG_stats_##name);			\
}									\
static int bfqg_stats_##name(struct bfqg_stats *stats)			\
{									\
	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\
}									\

BFQG_FLAG_FNS(waiting)
BFQG_FLAG_FNS(idling)
BFQG_FLAG_FNS(empty)
#undef BFQG_FLAG_FNS
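
/*
 * For reference, BFQG_FLAG_FNS(waiting) above expands (roughly) to:
 *
 *	static void bfqg_stats_mark_waiting(struct bfqg_stats *stats)
 *	{ stats->flags |= (1 << BFQG_stats_waiting); }
 *	static void bfqg_stats_clear_waiting(struct bfqg_stats *stats)
 *	{ stats->flags &= ~(1 << BFQG_stats_waiting); }
 *	static int bfqg_stats_waiting(struct bfqg_stats *stats)
 *	{ return (stats->flags & (1 << BFQG_stats_waiting)) != 0; }
 */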
/* This should be called with the queue_lock held. */
static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
{
	unsigned long long now;

	if (!bfqg_stats_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		blkg_stat_add(&stats->group_wait_time,
			      now - stats->start_group_wait_time);
	bfqg_stats_clear_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
						 struct bfq_group *curr_bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (bfqg_stats_waiting(stats))
		return;
	if (bfqg == curr_bfqg)
		return;
	stats->start_group_wait_time = sched_clock();
	bfqg_stats_mark_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
{
	unsigned long long now;

	if (!bfqg_stats_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		blkg_stat_add(&stats->empty_time,
			      now - stats->start_empty_time);
	bfqg_stats_clear_empty(stats);
}

static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
{
	blkg_stat_add(&bfqg->stats.dequeue, 1);
}

static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (blkg_rwstat_total(&stats->queued))
		return;

	/*
	 * group is already marked empty. This can happen if bfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (bfqg_stats_empty(stats))
		return;

	stats->start_empty_time = sched_clock();
	bfqg_stats_mark_empty(stats);
}

static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (bfqg_stats_idling(stats)) {
		unsigned long long now = sched_clock();

		if (time_after64(now, stats->start_idle_time))
			blkg_stat_add(&stats->idle_time,
				      now - stats->start_idle_time);
		bfqg_stats_clear_idling(stats);
	}
}

static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	stats->start_idle_time = sched_clock();
	bfqg_stats_mark_idling(stats);
}

static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	blkg_stat_add(&stats->avg_queue_size_sum,
		      blkg_rwstat_total(&stats->queued));
	blkg_stat_add(&stats->avg_queue_size_samples, 1);
	bfqg_stats_update_group_wait_time(stats);
}
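
/*
 * The two counters updated above are consumed later by
 * bfqg_prfill_avg_queue_size(), which reports
 * avg_queue_size_sum / avg_queue_size_samples. For example (made-up
 * numbers), three samples taken with 4, 2 and 3 queued requests give
 * sum = 9 and samples = 3, i.e. an average queue size of 3.
 */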
/*
 * blk-cgroup policy-related handlers
 * The following functions help in converting between blk-cgroup
 * internal structures and BFQ-specific structures.
 */
static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
}

static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
{
	return pd_to_blkg(&bfqg->pd);
}

static struct blkcg_policy blkcg_policy_bfq;

static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
{
	return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
}

/*
 * bfq_group handlers
 * The following functions help in navigating the bfq_group hierarchy
 * by allowing to find the parent of a bfq_group or the bfq_group
 * associated to a bfq_queue.
 */
static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
{
	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;

	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
}

static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
{
	struct bfq_entity *group_entity = bfqq->entity.parent;

	return group_entity ? container_of(group_entity, struct bfq_group,
					   entity) :
			      bfqq->bfqd->root_group;
}

/*
 * The following two functions handle get and put of a bfq_group by
 * wrapping the related blk-cgroup hooks.
 */
static void bfqg_get(struct bfq_group *bfqg)
{
	return blkg_get(bfqg_to_blkg(bfqg));
}

static void bfqg_put(struct bfq_group *bfqg)
{
	return blkg_put(bfqg_to_blkg(bfqg));
}
static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
				     struct bfq_queue *bfqq,
				     unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.queued, op, 1);
	bfqg_stats_end_empty_time(&bfqg->stats);
	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
}

static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.queued, op, -1);
}

static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.merged, op, 1);
}

static void bfqg_stats_update_completion(struct bfq_group *bfqg,
					 uint64_t start_time,
					 uint64_t io_start_time,
					 unsigned int op)
{
	struct bfqg_stats *stats = &bfqg->stats;
	unsigned long long now = sched_clock();

	if (time_after64(now, io_start_time))
		blkg_rwstat_add(&stats->service_time, op,
				now - io_start_time);
	if (time_after64(io_start_time, start_time))
		blkg_rwstat_add(&stats->wait_time, op,
				io_start_time - start_time);
}
static void bfqg_stats_reset(struct bfqg_stats *stats)
{
	/* queued stats shouldn't be cleared */
	blkg_rwstat_reset(&stats->merged);
	blkg_rwstat_reset(&stats->service_time);
	blkg_rwstat_reset(&stats->wait_time);
	blkg_stat_reset(&stats->time);
	blkg_stat_reset(&stats->avg_queue_size_sum);
	blkg_stat_reset(&stats->avg_queue_size_samples);
	blkg_stat_reset(&stats->dequeue);
	blkg_stat_reset(&stats->group_wait_time);
	blkg_stat_reset(&stats->idle_time);
	blkg_stat_reset(&stats->empty_time);
}

static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
{
	if (!to || !from)
		return;

	/* queued stats shouldn't be cleared */
	blkg_rwstat_add_aux(&to->merged, &from->merged);
	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
	blkg_stat_add_aux(&to->time, &from->time);
	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
	blkg_stat_add_aux(&to->avg_queue_size_samples,
			  &from->avg_queue_size_samples);
	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
}
/*
 * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
 * recursive stats can still account for the amount used by this bfqg after
 * it's gone.
 */
static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
{
	struct bfq_group *parent;

	if (!bfqg) /* root_group */
		return;

	parent = bfqg_parent(bfqg);

	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);

	if (unlikely(!parent))
		return;

	bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
	bfqg_stats_reset(&bfqg->stats);
}
static void bfq_init_entity(struct bfq_entity *entity,
			    struct bfq_group *bfqg)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->weight = entity->new_weight;
	entity->orig_weight = entity->new_weight;

	bfqq->ioprio = bfqq->new_ioprio;
	bfqq->ioprio_class = bfqq->new_ioprio_class;

	entity->parent = bfqg->my_entity; /* NULL for root group */
	entity->sched_data = &bfqg->sched_data;
}
static void bfqg_stats_exit(struct bfqg_stats *stats)
{
	blkg_rwstat_exit(&stats->merged);
	blkg_rwstat_exit(&stats->service_time);
	blkg_rwstat_exit(&stats->wait_time);
	blkg_rwstat_exit(&stats->queued);
	blkg_stat_exit(&stats->time);
	blkg_stat_exit(&stats->avg_queue_size_sum);
	blkg_stat_exit(&stats->avg_queue_size_samples);
	blkg_stat_exit(&stats->dequeue);
	blkg_stat_exit(&stats->group_wait_time);
	blkg_stat_exit(&stats->idle_time);
	blkg_stat_exit(&stats->empty_time);
}

static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
	if (blkg_rwstat_init(&stats->merged, gfp) ||
	    blkg_rwstat_init(&stats->service_time, gfp) ||
	    blkg_rwstat_init(&stats->wait_time, gfp) ||
	    blkg_rwstat_init(&stats->queued, gfp) ||
	    blkg_stat_init(&stats->time, gfp) ||
	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
	    blkg_stat_init(&stats->dequeue, gfp) ||
	    blkg_stat_init(&stats->group_wait_time, gfp) ||
	    blkg_stat_init(&stats->idle_time, gfp) ||
	    blkg_stat_init(&stats->empty_time, gfp)) {
		bfqg_stats_exit(stats);
		return -ENOMEM;
	}

	return 0;
}
static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
{
	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
}

static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
{
	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
}

static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
{
	struct bfq_group_data *bgd;

	bgd = kzalloc(sizeof(*bgd), gfp);
	if (!bgd)
		return NULL;
	return &bgd->pd;
}

static void bfq_cpd_init(struct blkcg_policy_data *cpd)
{
	struct bfq_group_data *d = cpd_to_bfqgd(cpd);

	d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
		CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
}

static void bfq_cpd_free(struct blkcg_policy_data *cpd)
{
	kfree(cpd_to_bfqgd(cpd));
}
static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
{
	struct bfq_group *bfqg;

	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
	if (!bfqg)
		return NULL;

	if (bfqg_stats_init(&bfqg->stats, gfp)) {
		kfree(bfqg);
		return NULL;
	}

	return &bfqg->pd;
}

static void bfq_pd_init(struct blkg_policy_data *pd)
{
	struct blkcg_gq *blkg = pd_to_blkg(pd);
	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
	struct bfq_entity *entity = &bfqg->entity;
	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);

	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
	entity->my_sched_data = &bfqg->sched_data;
	bfqg->my_entity = entity; /*
				   * the root_group's will be set to NULL
				   * in bfq_init_queue()
				   */
	bfqg->bfqd = bfqd;
}

static void bfq_pd_free(struct blkg_policy_data *pd)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);

	bfqg_stats_exit(&bfqg->stats);
	kfree(bfqg);
}

static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);

	bfqg_stats_reset(&bfqg->stats);
}
static void bfq_group_set_parent(struct bfq_group *bfqg,
				 struct bfq_group *parent)
{
	struct bfq_entity *entity;

	entity = &bfqg->entity;
	entity->parent = parent->my_entity;
	entity->sched_data = &parent->sched_data;
}

static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
					 struct blkcg *blkcg)
{
	struct blkcg_gq *blkg;

	blkg = blkg_lookup(blkcg, bfqd->queue);
	if (likely(blkg))
		return blkg_to_bfqg(blkg);
	return NULL;
}
static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
					    struct blkcg *blkcg)
{
	struct bfq_group *bfqg, *parent;
	struct bfq_entity *entity;

	bfqg = bfq_lookup_bfqg(bfqd, blkcg);

	if (unlikely(!bfqg))
		return NULL;

	/*
	 * Update chain of bfq_groups as we might be handling a leaf group
	 * which, along with some of its relatives, has not been hooked yet
	 * to the private hierarchy of BFQ.
	 */
	entity = &bfqg->entity;
	for_each_entity(entity) {
		bfqg = container_of(entity, struct bfq_group, entity);
		if (bfqg != bfqd->root_group) {
			parent = bfqg_parent(bfqg);
			if (!parent)
				parent = bfqd->root_group;
			bfq_group_set_parent(bfqg, parent);
		}
	}

	return bfqg;
}

static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason);
/**
 * bfq_bfqq_move - migrate @bfqq to @bfqg.
 * @bfqd: queue descriptor.
 * @bfqq: the queue to move.
 * @bfqg: the group to move to.
 *
 * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
 * it on the new one. Avoid putting the entity on the old group idle tree.
 *
 * Must be called under the queue lock; the cgroup owning @bfqg must
 * not disappear (by now this just means that we are called under
 * rcu_read_lock()).
 */
static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_group *bfqg)
{
	struct bfq_entity *entity = &bfqq->entity;

	/* If bfqq is empty, then bfq_bfqq_expire also invokes
	 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
	 * from data structures related to current group. Otherwise we
	 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
	 * we do below.
	 */
	if (bfqq == bfqd->in_service_queue)
		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
				false, BFQQE_PREEMPTED);

	if (bfq_bfqq_busy(bfqq))
		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
	else if (entity->on_st)
		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
	bfqg_put(bfqq_group(bfqq));

	/*
	 * Here we use a reference to bfqg. We don't need a refcounter
	 * as the cgroup reference will not be dropped, so that its
	 * destroy() callback will not be invoked.
	 */
	entity->parent = bfqg->my_entity;
	entity->sched_data = &bfqg->sched_data;
	bfqg_get(bfqg);

	if (bfq_bfqq_busy(bfqq))
		bfq_activate_bfqq(bfqd, bfqq);

	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
		bfq_schedule_dispatch(bfqd);
}
/**
 * __bfq_bic_change_cgroup - move @bic to @cgroup.
 * @bfqd: the queue descriptor.
 * @bic: the bic to move.
 * @blkcg: the blk-cgroup to move to.
 *
 * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
 * has to make sure that the reference to cgroup is valid across the call.
 *
 * NOTE: an alternative approach might have been to store the current
 * cgroup in bfqq and getting a reference to it, reducing the lookup
 * time here, at the price of slightly more complex code.
 */
static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
						 struct bfq_io_cq *bic,
						 struct blkcg *blkcg)
{
	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
	struct bfq_group *bfqg;
	struct bfq_entity *entity;

	bfqg = bfq_find_set_group(bfqd, blkcg);

	if (unlikely(!bfqg))
		bfqg = bfqd->root_group;

	if (async_bfqq) {
		entity = &async_bfqq->entity;

		if (entity->sched_data != &bfqg->sched_data) {
			bic_set_bfqq(bic, NULL, 0);
			bfq_log_bfqq(bfqd, async_bfqq,
				     "bic_change_group: %p %d",
				     async_bfqq, async_bfqq->ref);
			bfq_put_queue(async_bfqq);
		}
	}

	if (sync_bfqq) {
		entity = &sync_bfqq->entity;
		if (entity->sched_data != &bfqg->sched_data)
			bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
	}

	return bfqg;
}

static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
{
	struct bfq_data *bfqd = bic_to_bfqd(bic);
	struct bfq_group *bfqg = NULL;
	uint64_t serial_nr;

	rcu_read_lock();
	serial_nr = bio_blkcg(bio)->css.serial_nr;

	/*
	 * Check whether blkcg has changed. The condition may trigger
	 * spuriously on a newly created cic but there's no harm.
	 */
	if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
		goto out;

	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
	bic->blkcg_serial_nr = serial_nr;
out:
	rcu_read_unlock();
}
/**
 * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
 * @st: the service tree being flushed.
 */
static void bfq_flush_idle_tree(struct bfq_service_tree *st)
{
	struct bfq_entity *entity = st->first_idle;

	for (; entity ; entity = st->first_idle)
		__bfq_deactivate_entity(entity, false);
}

/**
 * bfq_reparent_leaf_entity - move leaf entity to the root_group.
 * @bfqd: the device data structure with the root group.
 * @entity: the entity to move.
 */
static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
				     struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
}

/**
 * bfq_reparent_active_entities - move to the root group all active
 *                                entities.
 * @bfqd: the device data structure with the root group.
 * @bfqg: the group to move from.
 * @st: the service tree with the entities.
 *
 * Needs queue_lock to be taken and reference to be valid over the call.
 */
static void bfq_reparent_active_entities(struct bfq_data *bfqd,
					 struct bfq_group *bfqg,
					 struct bfq_service_tree *st)
{
	struct rb_root *active = &st->active;
	struct bfq_entity *entity = NULL;

	if (!RB_EMPTY_ROOT(&st->active))
		entity = bfq_entity_of(rb_first(active));

	for (; entity ; entity = bfq_entity_of(rb_first(active)))
		bfq_reparent_leaf_entity(bfqd, entity);

	if (bfqg->sched_data.in_service_entity)
		bfq_reparent_leaf_entity(bfqd,
					 bfqg->sched_data.in_service_entity);
}
/**
 * bfq_pd_offline - deactivate the entity associated with @pd,
 *		    and reparent its children entities.
 * @pd: descriptor of the policy going offline.
 *
 * blkio already grabs the queue_lock for us, so no need to use
 * RCU-based magic
 */
static void bfq_pd_offline(struct blkg_policy_data *pd)
{
	struct bfq_service_tree *st;
	struct bfq_group *bfqg = pd_to_bfqg(pd);
	struct bfq_data *bfqd = bfqg->bfqd;
	struct bfq_entity *entity = bfqg->my_entity;
	unsigned long flags;
	int i;

	if (!entity) /* root group */
		return;

	spin_lock_irqsave(&bfqd->lock, flags);
	/*
	 * Empty all service_trees belonging to this group before
	 * deactivating the group itself.
	 */
	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
		st = bfqg->sched_data.service_tree + i;

		/*
		 * The idle tree may still contain bfq_queues belonging
		 * to exited tasks because they never migrated to a
		 * different cgroup from the one being destroyed now.
		 * No one else can access them so it's safe to act
		 * without any lock.
		 */
		bfq_flush_idle_tree(st);

		/*
		 * It may happen that some queues are still active
		 * (busy) upon group destruction (if the corresponding
		 * processes have been forced to terminate). We move
		 * all the leaf entities corresponding to these queues
		 * to the root_group.
		 * Also, it may happen that the group has an entity
		 * in service, which is disconnected from the active
		 * tree: it must be moved, too.
		 * There is no need to put the sync queues, as the
		 * scheduler has taken no reference.
		 */
		bfq_reparent_active_entities(bfqd, bfqg, st);
	}

	__bfq_deactivate_entity(entity, false);
	bfq_put_async_queues(bfqd, bfqg);

	spin_unlock_irqrestore(&bfqd->lock, flags);
	/*
	 * @blkg is going offline and will be ignored by
	 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
	 * that they don't get lost. If IOs complete after this point, the
	 * stats for them will be lost. Oh well...
	 */
	bfqg_stats_xfer_dead(bfqg);
}
static int bfq_io_show_weight(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
	unsigned int val = 0;

	if (bfqgd)
		val = bfqgd->weight;

	seq_printf(sf, "%u\n", val);

	return 0;
}

static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
				    struct cftype *cftype,
				    u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
	struct blkcg_gq *blkg;
	int ret = -ERANGE;

	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
		return ret;

	ret = 0;
	spin_lock_irq(&blkcg->lock);
	bfqgd->weight = (unsigned short)val;
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		struct bfq_group *bfqg = blkg_to_bfqg(blkg);

		if (!bfqg)
			continue;
		/*
		 * Setting the prio_changed flag of the entity
		 * to 1 with new_weight == weight would re-set
		 * the value of the weight to its ioprio mapping.
		 * Set the flag only if necessary.
		 */
		if ((unsigned short)val != bfqg->entity.new_weight) {
			bfqg->entity.new_weight = (unsigned short)val;
			/*
			 * Make sure that the above new value has been
			 * stored in bfqg->entity.new_weight before
			 * setting the prio_changed flag. In fact,
			 * this flag may be read asynchronously (in
			 * critical sections protected by a different
			 * lock than that held here), and finding this
			 * flag set may cause the execution of the code
			 * for updating parameters whose value may
			 * depend also on bfqg->entity.new_weight (in
			 * __bfq_entity_update_weight_prio).
			 * This barrier makes sure that the new value
			 * of bfqg->entity.new_weight is correctly
			 * seen in that code.
			 */
			smp_wmb();
			bfqg->entity.prio_changed = 1;
		}
	}
	spin_unlock_irq(&blkcg->lock);

	return ret;
}

static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
				 char *buf, size_t nbytes,
				 loff_t off)
{
	u64 weight;
	/* First unsigned long found in the file is used */
	int ret = kstrtoull(strim(buf), 0, &weight);

	if (ret)
		return ret;

	return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
}
static int bfqg_print_stat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
			  &blkcg_policy_bfq, seq_cft(sf)->private, false);
	return 0;
}

static int bfqg_print_rwstat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
			  &blkcg_policy_bfq, seq_cft(sf)->private, true);
	return 0;
}

static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
					  &blkcg_policy_bfq, off);
	return __blkg_prfill_u64(sf, pd, sum);
}

static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
					struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
							   &blkcg_policy_bfq,
							   off);
	return __blkg_prfill_rwstat(sf, pd, &sum);
}

static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
			  seq_cft(sf)->private, false);
	return 0;
}

static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
			  seq_cft(sf)->private, true);
	return 0;
}

static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
			       int off)
{
	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);

	return __blkg_prfill_u64(sf, pd, sum >> 9);
}

static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
	return 0;
}

static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
					 struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
					offsetof(struct blkcg_gq, stat_bytes));
	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);

	return __blkg_prfill_u64(sf, pd, sum >> 9);
}

static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
			  false);
	return 0;
}
static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);
	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
	u64 v = 0;

	if (samples) {
		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
		v = div64_u64(v, samples);
	}
	__blkg_prfill_u64(sf, pd, v);
	return 0;
}

/* print avg_queue_size */
static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
			  0, false);
	return 0;
}

static struct bfq_group *
bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
{
	int ret;

	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
	if (ret)
		return NULL;

	return blkg_to_bfqg(bfqd->queue->root_blkg);
}
static struct cftype bfq_blkcg_legacy_files[] = {
	{
		.name = "bfq.weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = bfq_io_show_weight,
		.write_u64 = bfq_io_set_weight_legacy,
	},

	/* statistics, covers only the tasks in the bfqg */
	{
		.name = "bfq.time",
		.private = offsetof(struct bfq_group, stats.time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.sectors",
		.seq_show = bfqg_print_stat_sectors,
	},
	{
		.name = "bfq.io_service_bytes",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_bytes,
	},
	{
		.name = "bfq.io_serviced",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_ios,
	},
	{
		.name = "bfq.io_service_time",
		.private = offsetof(struct bfq_group, stats.service_time),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_wait_time",
		.private = offsetof(struct bfq_group, stats.wait_time),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_merged",
		.private = offsetof(struct bfq_group, stats.merged),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_queued",
		.private = offsetof(struct bfq_group, stats.queued),
		.seq_show = bfqg_print_rwstat,
	},

	/* the same statistics which cover the bfqg and its descendants */
	{
		.name = "bfq.time_recursive",
		.private = offsetof(struct bfq_group, stats.time),
		.seq_show = bfqg_print_stat_recursive,
	},
	{
		.name = "bfq.sectors_recursive",
		.seq_show = bfqg_print_stat_sectors_recursive,
	},
	{
		.name = "bfq.io_service_bytes_recursive",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_bytes_recursive,
	},
	{
		.name = "bfq.io_serviced_recursive",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_ios_recursive,
	},
	{
		.name = "bfq.io_service_time_recursive",
		.private = offsetof(struct bfq_group, stats.service_time),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_wait_time_recursive",
		.private = offsetof(struct bfq_group, stats.wait_time),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_merged_recursive",
		.private = offsetof(struct bfq_group, stats.merged),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_queued_recursive",
		.private = offsetof(struct bfq_group, stats.queued),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.avg_queue_size",
		.seq_show = bfqg_print_avg_queue_size,
	},
	{
		.name = "bfq.group_wait_time",
		.private = offsetof(struct bfq_group, stats.group_wait_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.idle_time",
		.private = offsetof(struct bfq_group, stats.idle_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.empty_time",
		.private = offsetof(struct bfq_group, stats.empty_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.dequeue",
		.private = offsetof(struct bfq_group, stats.dequeue),
		.seq_show = bfqg_print_stat,
	},
	{ }	/* terminate */
};

static struct cftype bfq_blkg_files[] = {
	{
		.name = "bfq.weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = bfq_io_show_weight,
		.write = bfq_io_set_weight,
	},
	{ }	/* terminate */
};
#else	/* CONFIG_BFQ_GROUP_IOSCHED */

static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
			struct bfq_queue *bfqq, unsigned int op) { }
static inline void
bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
static inline void
bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
			uint64_t start_time, uint64_t io_start_time,
			unsigned int op) { }
static inline void
bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
				     struct bfq_group *curr_bfqg) { }
static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }

static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_group *bfqg) {}

static void bfq_init_entity(struct bfq_entity *entity,
			    struct bfq_group *bfqg)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->weight = entity->new_weight;
	entity->orig_weight = entity->new_weight;

	bfqq->ioprio = bfqq->new_ioprio;
	bfqq->ioprio_class = bfqq->new_ioprio_class;

	entity->sched_data = &bfqg->sched_data;
}

static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}

static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
					    struct blkcg *blkcg)
{
	return bfqd->root_group;
}

static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
{
	return bfqq->bfqd->root_group;
}

static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd,
						    int node)
{
	struct bfq_group *bfqg;
	int i;

	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
	if (!bfqg)
		return NULL;

	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

	return bfqg;
}
#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define bfq_sample_valid(samples)	((samples) > 80)

/*
 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
 * We choose the request that is closer to the head right now. Distance
 * behind the head is penalized and only allowed to a certain extent.
 */
static struct request *bfq_choose_req(struct bfq_data *bfqd,
				      struct request *rq1,
				      struct request *rq2,
				      sector_t last)
{
	sector_t s1, s2, d1 = 0, d2 = 0;
	unsigned long back_max;
#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */
#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */
	unsigned int wrap = 0; /* bit mask: requests behind the disk head? */

	if (!rq1 || rq1 == rq2)
		return rq2;
	if (!rq2)
		return rq1;

	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
		return rq1;
	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
		return rq2;
	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
		return rq1;
	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
		return rq2;

	s1 = blk_rq_pos(rq1);
	s2 = blk_rq_pos(rq2);

	/*
	 * By definition, 1KiB is 2 sectors.
	 */
	back_max = bfqd->bfq_back_max * 2;

	/*
	 * Strict one way elevator _except_ in the case where we allow
	 * short backward seeks which are biased as twice the cost of a
	 * similar forward seek.
	 */
	if (s1 >= last)
		d1 = s1 - last;
	else if (s1 + back_max >= last)
		d1 = (last - s1) * bfqd->bfq_back_penalty;
	else
		wrap |= BFQ_RQ1_WRAP;

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2 + back_max >= last)
		d2 = (last - s2) * bfqd->bfq_back_penalty;
	else
		wrap |= BFQ_RQ2_WRAP;

	/* Found required data */

	/*
	 * By doing switch() on the bit mask "wrap" we avoid having to
	 * check two variables for all permutations: --> faster!
	 */
	switch (wrap) {
	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
		if (d1 < d2)
			return rq1;
		else if (d2 < d1)
			return rq2;

		if (s1 >= s2)
			return rq1;
		else
			return rq2;

	case BFQ_RQ2_WRAP:
		return rq1;
	case BFQ_RQ1_WRAP:
		return rq2;
	case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */
	default:
		/*
		 * Since both rqs are wrapped,
		 * start with the one that's further behind head
		 * (--> only *one* back seek required),
		 * since back seek takes more time than forward.
		 */
		if (s1 <= s2)
			return rq1;
		else
			return rq2;
	}
}
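
/*
 * Worked example for the distances computed above (illustrative
 * numbers only): with last = 10000 and bfq_back_penalty = 2, a
 * request at s1 = 10100 lies ahead of the head, so d1 = 100; a
 * request at s2 = 9900 lies behind the head but, assuming it is
 * within back_max, it gets the backward penalty:
 * d2 = (10000 - 9900) * 2 = 200. In the "case 0" comparison rq1 then
 * wins, as expected for a mostly one-way elevator that only
 * grudgingly seeks backwards.
 */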
/*
 * Return expired entry, or NULL to just start from scratch in rbtree.
 */
static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
				      struct request *last)
{
	struct request *rq;

	if (bfq_bfqq_fifo_expire(bfqq))
		return NULL;

	bfq_mark_bfqq_fifo_expire(bfqq);

	rq = rq_entry_fifo(bfqq->fifo.next);

	if (rq == last || ktime_get_ns() < rq->fifo_time)
		return NULL;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
	return rq;
}

static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
					struct bfq_queue *bfqq,
					struct request *last)
{
	struct rb_node *rbnext = rb_next(&last->rb_node);
	struct rb_node *rbprev = rb_prev(&last->rb_node);
	struct request *next, *prev = NULL;

	/* Follow expired path, else get first next available. */
	next = bfq_check_fifo(bfqq, last);
	if (next)
		return next;

	if (rbprev)
		prev = rb_entry_rq(rbprev);

	if (rbnext)
		next = rb_entry_rq(rbnext);
	else {
		rbnext = rb_first(&bfqq->sort_list);
		if (rbnext && rbnext != &last->rb_node)
			next = rb_entry_rq(rbnext);
	}

	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
}

static unsigned long bfq_serv_to_charge(struct request *rq,
					struct bfq_queue *bfqq)
{
	return blk_rq_sectors(rq);
}
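
/*
 * bfq_serv_to_charge() above charges service in sectors: e.g. a
 * 128 KiB request amounts to 256 sectors of 512 bytes (purely
 * illustrative arithmetic).
 */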
/**
 * bfq_updated_next_req - update the queue after a new next_rq selection.
 * @bfqd: the device data the queue belongs to.
 * @bfqq: the queue to update.
 *
 * If the first request of a queue changes we make sure that the queue
 * has enough budget to serve at least its first request (if the
 * request has grown). We do this because if the queue has not enough
 * budget for its first request, it has to go through two dispatch
 * rounds to actually get it dispatched.
 */
static void bfq_updated_next_req(struct bfq_data *bfqd,
				 struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;
	struct request *next_rq = bfqq->next_rq;
	unsigned long new_budget;

	if (!next_rq)
		return;

	if (bfqq == bfqd->in_service_queue)
		/*
		 * In order not to break guarantees, budgets cannot be
		 * changed after an entity has been selected.
		 */
		return;

	new_budget = max_t(unsigned long, bfqq->max_budget,
			   bfq_serv_to_charge(next_rq, bfqq));
	if (entity->budget != new_budget) {
		entity->budget = new_budget;
		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
					 new_budget);
		bfq_requeue_bfqq(bfqd, bfqq);
	}
}

static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	return entity->budget - entity->service;
}
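
/*
 * Example (made-up numbers): with entity->budget = 8192 sectors and
 * entity->service = 3072 sectors already received, the queue has
 * 8192 - 3072 = 5120 sectors of budget left.
 */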
/*
 * If enough samples have been computed, return the current max budget
 * stored in bfqd, which is dynamically updated according to the
 * estimated disk peak rate; otherwise return the default max budget
 */
static int bfq_max_budget(struct bfq_data *bfqd)
{
	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
		return bfq_default_max_budget;
	else
		return bfqd->bfq_max_budget;
}

/*
 * Return min budget, which is a fraction of the current or default
 * max budget (trying with 1/32)
 */
static int bfq_min_budget(struct bfq_data *bfqd)
{
	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
		return bfq_default_max_budget / 32;
	else
		return bfqd->bfq_max_budget / 32;
}
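
/*
 * Example (illustrative numbers): with a current max budget of 16384
 * sectors, the min budget returned above is 16384 / 32 = 512 sectors.
 */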
3651 static void bfq_bfqq_expire(struct bfq_data
*bfqd
,
3652 struct bfq_queue
*bfqq
,
3654 enum bfqq_expiration reason
);
3657 * The next function, invoked after the input queue bfqq switches from
3658 * idle to busy, updates the budget of bfqq. The function also tells
3659 * whether the in-service queue should be expired, by returning
3660 * true. The purpose of expiring the in-service queue is to give bfqq
3661 * the chance to possibly preempt the in-service queue, and the reason
3662 * for preempting the in-service queue is to achieve the following
3663 * goal: guarantee to bfqq its reserved bandwidth even if bfqq has
3664 * expired because it has remained idle.
3666 * In particular, bfqq may have expired for one of the following two
3669 * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
3670 * and did not make it to issue a new request before its last
3671 * request was served;
3673 * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
3674 * a new request before the expiration of the idling-time.
3676 * Even if bfqq has expired for one of the above reasons, the process
3677 * associated with the queue may be however issuing requests greedily,
3678 * and thus be sensitive to the bandwidth it receives (bfqq may have
3679 * remained idle for other reasons: CPU high load, bfqq not enjoying
3680 * idling, I/O throttling somewhere in the path from the process to
3681 * the I/O scheduler, ...). But if, after every expiration for one of
3682 * the above two reasons, bfqq has to wait for the service of at least
3683 * one full budget of another queue before being served again, then
3684 * bfqq is likely to get a much lower bandwidth or resource time than
3685 * its reserved ones. To address this issue, two countermeasures need
3688 * First, the budget and the timestamps of bfqq need to be updated in
3689 * a special way on bfqq reactivation: they need to be updated as if
3690 * bfqq did not remain idle and did not expire. In fact, if they are
3691 * computed as if bfqq expired and remained idle until reactivation,
3692 * then the process associated with bfqq is treated as if, instead of
3693 * being greedy, it stopped issuing requests when bfqq remained idle,
3694 * and restarts issuing requests only on this reactivation. In other
3695 * words, the scheduler does not help the process recover the "service
3696 * hole" between bfqq expiration and reactivation. As a consequence,
3697 * the process receives a lower bandwidth than its reserved one. In
3698 * contrast, to recover this hole, the budget must be updated as if
3699 * bfqq was not expired at all before this reactivation, i.e., it must
3700 * be set to the value of the remaining budget when bfqq was
3701 * expired. Along the same line, timestamps need to be assigned the
3702 * value they had the last time bfqq was selected for service, i.e.,
3703 * before last expiration. Thus timestamps need to be back-shifted
3704 * with respect to their normal computation (see [1] for more details
3705 * on this tricky aspect).
3707 * Secondly, to allow the process to recover the hole, the in-service
3708 * queue must be expired too, to give bfqq the chance to preempt it
3709 * immediately. In fact, if bfqq has to wait for a full budget of the
3710 * in-service queue to be completed, then it may become impossible to
3711 * let the process recover the hole, even if the back-shifted
3712 * timestamps of bfqq are lower than those of the in-service queue. If
3713 * this happens for most or all of the holes, then the process may not
3714 * receive its reserved bandwidth. In this respect, it is worth noting
3715 * that, being the service of outstanding requests unpreemptible, a
3716 * little fraction of the holes may however be unrecoverable, thereby
3717 * causing a little loss of bandwidth.
3719 * The last important point is detecting whether bfqq does need this
3720 * bandwidth recovery. In this respect, the next function deems the
3721 * process associated with bfqq greedy, and thus allows it to recover
3722 * the hole, if: 1) the process is waiting for the arrival of a new
3723 * request (which implies that bfqq expired for one of the above two
3724 * reasons), and 2) such a request has arrived soon. The first
3725 * condition is controlled through the flag non_blocking_wait_rq,
3726 * while the second through the flag arrived_in_time. If both
3727 * conditions hold, then the function computes the budget in the
3728 * above-described special way, and signals that the in-service queue
3729 * should be expired. Timestamp back-shifting is done later in
3730 * __bfq_activate_entity.
static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
						struct bfq_queue *bfqq,
						bool arrived_in_time)
{
	struct bfq_entity *entity = &bfqq->entity;

	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
		/*
		 * We do not clear the flag non_blocking_wait_rq here, as
		 * the latter is used in bfq_activate_bfqq to signal
		 * that timestamps need to be back-shifted (and is
		 * cleared right after).
		 */

		/*
		 * In next assignment we rely on that either
		 * entity->service or entity->budget are not updated
		 * on expiration if bfqq is empty (see
		 * __bfq_bfqq_recalc_budget). Thus both quantities
		 * remain unchanged after such an expiration, and the
		 * following statement therefore assigns to
		 * entity->budget the remaining budget on such an
		 * expiration. For clarity, entity->service is not
		 * updated on expiration in any case, and, in normal
		 * operation, is reset only when bfqq is selected for
		 * service (see bfq_get_next_queue).
		 */
		entity->budget = min_t(unsigned long,
				       bfq_bfqq_budget_left(bfqq),
				       bfqq->max_budget);

		return true;
	}

	entity->budget = max_t(unsigned long, bfqq->max_budget,
			       bfq_serv_to_charge(bfqq->next_rq, bfqq));
	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
	return false;
}
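
/*
 * Minimal user-space sketch (illustration only, kept under #if 0 so it
 * is never compiled) of the budget-recovery decision implemented just
 * above: if the queue was actively waiting for a request and the request
 * arrived within the time window, keep the remaining budget so the
 * "service hole" can be recovered and ask for preemption of the
 * in-service queue; otherwise restart from a budget large enough for the
 * next request. All names and types below are hypothetical.
 */
#if 0
#include <stdbool.h>

struct budg_sketch {
	unsigned long budget_left;	/* budget left at last expiration */
	unsigned long max_budget;	/* per-queue maximum budget */
	unsigned long next_rq_charge;	/* service to charge for next rq */
};

static bool sketch_update_budget_for_activation(const struct budg_sketch *q,
						bool was_waiting_for_rq,
						bool arrived_in_time,
						unsigned long *new_budget)
{
	if (was_waiting_for_rq && arrived_in_time) {
		/* recover the hole: act as if the queue never expired */
		*new_budget = q->budget_left < q->max_budget ?
			q->budget_left : q->max_budget;
		return true;	/* in-service queue should be preempted */
	}

	/* normal reactivation: fresh budget, large enough for next_rq */
	*new_budget = q->max_budget > q->next_rq_charge ?
		q->max_budget : q->next_rq_charge;
	return false;
}
#endif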
static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
					     struct bfq_queue *bfqq,
					     struct request *rq)
{
	bool bfqq_wants_to_preempt,
	     /*
	      * See the comments on
	      * bfq_bfqq_update_budg_for_activation for
	      * details on the usage of the next variable.
	      */
	     arrived_in_time = ktime_get_ns() <=
		     bfqq->ttime.last_end_request +
		     bfqd->bfq_slice_idle * 3;

	bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);

	/*
	 * Update budget and check whether bfqq may want to preempt
	 * the in-service queue.
	 */
	bfqq_wants_to_preempt =
		bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
						    arrived_in_time);

	if (!bfq_bfqq_IO_bound(bfqq)) {
		if (arrived_in_time) {
			bfqq->requests_within_timer++;
			if (bfqq->requests_within_timer >=
			    bfqd->bfq_requests_within_timer)
				bfq_mark_bfqq_IO_bound(bfqq);
		} else
			bfqq->requests_within_timer = 0;
	}

	bfq_add_bfqq_busy(bfqd, bfqq);

	/*
	 * Expire in-service queue only if preemption may be needed
	 * for guarantees. In this respect, the function
	 * next_queue_may_preempt just checks a simple, necessary
	 * condition, and not a sufficient condition based on
	 * timestamps. In fact, for the latter condition to be
	 * evaluated, timestamps would need first to be updated, and
	 * this operation is quite costly (see the comments on the
	 * function bfq_bfqq_update_budg_for_activation).
	 */
	if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
	    next_queue_may_preempt(bfqd))
		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
				false, BFQQE_PREEMPTED);
}
static void bfq_add_request(struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;
	struct request *next_rq, *prev;

	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
	bfqq->queued[rq_is_sync(rq)]++;
	bfqd->queued++;

	elv_rb_add(&bfqq->sort_list, rq);

	/*
	 * Check if this request is a better next-serve candidate.
	 */
	prev = bfqq->next_rq;
	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
	bfqq->next_rq = next_rq;

	if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
		bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, rq);
	else if (prev != bfqq->next_rq)
		bfq_updated_next_req(bfqd, bfqq);
}
static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
					  struct bio *bio,
					  struct request_queue *q)
{
	struct bfq_queue *bfqq = bfqd->bio_bfqq;

	if (bfqq)
		return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));

	return NULL;
}

static sector_t get_sdist(sector_t last_pos, struct request *rq)
{
	if (last_pos)
		return abs(blk_rq_pos(rq) - last_pos);

	return 0;
}
#if 0 /* Still not clear if we can do without next two functions */
static void bfq_activate_request(struct request_queue *q, struct request *rq)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;

	bfqd->rq_in_driver++;
}

static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;

	bfqd->rq_in_driver--;
}
#endif
static void bfq_remove_request(struct request_queue *q,
			       struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;
	const int sync = rq_is_sync(rq);

	if (bfqq->next_rq == rq) {
		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
		bfq_updated_next_req(bfqd, bfqq);
	}

	if (rq->queuelist.prev != &rq->queuelist)
		list_del_init(&rq->queuelist);
	bfqq->queued[sync]--;
	bfqd->queued--;
	elv_rb_del(&bfqq->sort_list, rq);

	elv_rqhash_del(q, rq);
	if (q->last_merge == rq)
		q->last_merge = NULL;

	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
		bfqq->next_rq = NULL;

		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
			bfq_del_bfqq_busy(bfqd, bfqq, false);
			/*
			 * bfqq emptied. In normal operation, when
			 * bfqq is empty, bfqq->entity.service and
			 * bfqq->entity.budget must contain,
			 * respectively, the service received and the
			 * budget used last time bfqq emptied. These
			 * facts do not hold in this case, as at least
			 * this last removal occurred while bfqq is
			 * not in service. To avoid inconsistencies,
			 * reset both bfqq->entity.service and
			 * bfqq->entity.budget, if bfqq has still a
			 * process that may issue I/O requests to it.
			 */
			bfqq->entity.budget = bfqq->entity.service = 0;
		}
	}

	if (rq->cmd_flags & REQ_META)
		bfqq->meta_pending--;

	bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
}
static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
{
	struct request_queue *q = hctx->queue;
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct request *free = NULL;
	/*
	 * bfq_bic_lookup grabs the queue_lock: invoke it now and
	 * store its return value for later use, to avoid nesting
	 * queue_lock inside the bfqd->lock. We assume that the bic
	 * returned by bfq_bic_lookup does not go away before
	 * bfqd->lock is taken.
	 */
	struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
	bool ret;

	spin_lock_irq(&bfqd->lock);

	if (bic)
		bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
	else
		bfqd->bio_bfqq = NULL;
	bfqd->bio_bic = bic;

	ret = blk_mq_sched_try_merge(q, bio, &free);

	if (free)
		blk_mq_free_request(free);
	spin_unlock_irq(&bfqd->lock);

	return ret;
}
static int bfq_request_merge(struct request_queue *q, struct request **req,
			     struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct request *__rq;

	__rq = bfq_find_rq_fmerge(bfqd, bio, q);
	if (__rq && elv_bio_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_FRONT_MERGE;
	}

	return ELEVATOR_NO_MERGE;
}
static void bfq_request_merged(struct request_queue *q, struct request *req,
			       enum elv_merge type)
{
	if (type == ELEVATOR_FRONT_MERGE &&
	    rb_prev(&req->rb_node) &&
	    blk_rq_pos(req) <
	    blk_rq_pos(container_of(rb_prev(&req->rb_node),
				    struct request, rb_node))) {
		struct bfq_queue *bfqq = RQ_BFQQ(req);
		struct bfq_data *bfqd = bfqq->bfqd;
		struct request *prev, *next_rq;

		/* Reposition request in its sort_list */
		elv_rb_del(&bfqq->sort_list, req);
		elv_rb_add(&bfqq->sort_list, req);

		/* Choose next request to be served for bfqq */
		prev = bfqq->next_rq;
		next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
					 bfqd->last_position);
		bfqq->next_rq = next_rq;
		/*
		 * If next_rq changes, update the queue's budget to fit
		 * the new request.
		 */
		if (prev != bfqq->next_rq)
			bfq_updated_next_req(bfqd, bfqq);
	}
}
static void bfq_requests_merged(struct request_queue *q, struct request *rq,
				struct request *next)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);

	if (!RB_EMPTY_NODE(&rq->rb_node))
		return;
	spin_lock_irq(&bfqq->bfqd->lock);

	/*
	 * If next and rq belong to the same bfq_queue and next is older
	 * than rq, then reposition rq in the fifo (by substituting next
	 * with rq). Otherwise, if next and rq belong to different
	 * bfq_queues, never reposition rq: in fact, we would have to
	 * reposition it with respect to next's position in its own fifo,
	 * which would most certainly be too expensive with respect to
	 * the benefits.
	 */
	if (bfqq == next_bfqq &&
	    !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
	    next->fifo_time < rq->fifo_time) {
		list_del_init(&rq->queuelist);
		list_replace_init(&next->queuelist, &rq->queuelist);
		rq->fifo_time = next->fifo_time;
	}

	if (bfqq->next_rq == next)
		bfqq->next_rq = rq;

	bfq_remove_request(q, next);

	spin_unlock_irq(&bfqq->bfqd->lock);

	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
}
static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
				struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	bool is_sync = op_is_sync(bio->bi_opf);
	struct bfq_queue *bfqq = bfqd->bio_bfqq;

	/*
	 * Disallow merge of a sync bio into an async request.
	 */
	if (is_sync && !rq_is_sync(rq))
		return false;

	/*
	 * Lookup the bfqq that this bio will be queued with. Allow
	 * merge only if rq is queued there.
	 */
	if (!bfqq)
		return false;

	return bfqq == RQ_BFQQ(rq);
}
static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
				       struct bfq_queue *bfqq)
{
	if (bfqq) {
		bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
		bfq_mark_bfqq_budget_new(bfqq);
		bfq_clear_bfqq_fifo_expire(bfqq);

		bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;

		bfq_log_bfqq(bfqd, bfqq,
			     "set_in_service_queue, cur-budget = %d",
			     bfqq->entity.budget);
	}

	bfqd->in_service_queue = bfqq;
}

/*
 * Get and set a new queue for service.
 */
static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);

	__bfq_set_in_service_queue(bfqd, bfqq);
	return bfqq;
}
static void bfq_arm_slice_timer(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfqd->in_service_queue;
	struct bfq_io_cq *bic;
	u64 sl;

	/* Processes have exited, don't wait. */
	bic = bfqd->in_service_bic;
	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
		return;

	bfq_mark_bfqq_wait_request(bfqq);

	/*
	 * We don't want to idle for seeks, but we do want to allow
	 * fair distribution of slice time for a process doing back-to-back
	 * seeks. So allow a little bit of time for him to submit a new rq.
	 */
	sl = bfqd->bfq_slice_idle;
	/*
	 * Grant only minimum idle time if the queue is seeky.
	 */
	if (BFQQ_SEEKY(bfqq))
		sl = min_t(u64, sl, BFQ_MIN_TT);

	bfqd->last_idling_start = ktime_get();
	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
		      HRTIMER_MODE_REL);
	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
}
/*
 * Set the maximum time for the in-service queue to consume its
 * budget. This prevents seeky processes from lowering the disk
 * throughput (always guaranteed with a time slice scheme as in CFQ).
 */
static void bfq_set_budget_timeout(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfqd->in_service_queue;
	unsigned int timeout_coeff = bfqq->entity.weight /
				     bfqq->entity.orig_weight;

	bfqd->last_budget_start = ktime_get();

	bfq_clear_bfqq_budget_new(bfqq);
	bfqq->budget_timeout = jiffies +
		bfqd->bfq_timeout * timeout_coeff;

	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
		     jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff));
}
/*
 * In autotuning mode, max_budget is dynamically recomputed as the
 * amount of sectors transferred in timeout at the estimated peak
 * rate. This enables BFQ to utilize a full timeslice with a full
 * budget, even if the in-service queue is served at peak rate. And
 * this maximises throughput with sequential workloads.
 */
static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
{
	return (u64)bfqd->peak_rate * USEC_PER_MSEC *
		jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
}
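
/*
 * Illustrative sketch (not compiled) of the autotuning rule above:
 * max_budget is the number of sectors the device is expected to
 * transfer, at the estimated peak rate, over one budget timeout. The
 * peak rate is assumed to be stored as (sectors/usec) << rate_shift,
 * consistently with the rate computation below; the names and the
 * numeric values in the example comment are made up.
 */
#if 0
#include <stdint.h>

static unsigned long sketch_calc_max_budget(uint64_t peak_rate_shifted,
					    unsigned int timeout_ms,
					    unsigned int rate_shift)
{
	/*
	 * E.g. a device doing ~100 MB/s moves ~200 sectors per ms, so
	 * a 100 ms timeout would yield a budget of ~20000 sectors.
	 */
	return (peak_rate_shifted * 1000ULL /* usec per msec */
		* timeout_ms) >> rate_shift;
}
#endif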
static void bfq_reset_rate_computation(struct bfq_data *bfqd,
				       struct request *rq)
{
	if (rq != NULL) { /* new rq dispatch now, reset accordingly */
		bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
		bfqd->peak_rate_samples = 1;
		bfqd->sequential_samples = 0;
		bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
			blk_rq_sectors(rq);
	} else /* no new rq dispatched, just reset the number of samples */
		bfqd->peak_rate_samples = 0; /* full re-init on next disp. */

	bfq_log(bfqd,
		"reset_rate_computation at end, sample %u/%u tot_sects %llu",
		bfqd->peak_rate_samples, bfqd->sequential_samples,
		bfqd->tot_sectors_dispatched);
}
static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
{
	u32 rate, weight, divisor;

	/*
	 * For the convergence property to hold (see comments on
	 * bfq_update_peak_rate()) and for the assessment to be
	 * reliable, a minimum number of samples must be present, and
	 * a minimum amount of time must have elapsed. If not so, do
	 * not compute new rate. Just reset parameters, to get ready
	 * for a new evaluation attempt.
	 */
	if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
	    bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
		goto reset_computation;

	/*
	 * If a new request completion has occurred after last
	 * dispatch, then, to approximate the rate at which requests
	 * have been served by the device, it is more precise to
	 * extend the observation interval to the last completion.
	 */
	bfqd->delta_from_first =
		max_t(u64, bfqd->delta_from_first,
		      bfqd->last_completion - bfqd->first_dispatch);

	/*
	 * Rate computed in sects/usec, and not sects/nsec, for
	 * precision issues.
	 */
	rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
			div_u64(bfqd->delta_from_first, NSEC_PER_USEC));

	/*
	 * Peak rate not updated if:
	 * - the percentage of sequential dispatches is below 3/4 of the
	 *   total, and rate is below the current estimated peak rate
	 * - rate is unreasonably high (> 20M sectors/sec)
	 */
	if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
	     rate <= bfqd->peak_rate) ||
	    rate > 20<<BFQ_RATE_SHIFT)
		goto reset_computation;

	/*
	 * We have to update the peak rate, at last! To this purpose,
	 * we use a low-pass filter. We compute the smoothing constant
	 * of the filter as a function of the 'weight' of the new
	 * measured rate.
	 *
	 * As can be seen in next formulas, we define this weight as a
	 * quantity proportional to how sequential the workload is,
	 * and to how long the observation time interval is.
	 *
	 * The weight runs from 0 to 8. The maximum value of the
	 * weight, 8, yields the minimum value for the smoothing
	 * constant. At this minimum value for the smoothing constant,
	 * the measured rate contributes for half of the next value of
	 * the estimated peak rate.
	 *
	 * So, the first step is to compute the weight as a function
	 * of how sequential the workload is. Note that the weight
	 * cannot reach 9, because bfqd->sequential_samples cannot
	 * become equal to bfqd->peak_rate_samples, which, in its
	 * turn, holds true because bfqd->sequential_samples is not
	 * incremented for the first sample.
	 */
	weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;

	/*
	 * Second step: further refine the weight as a function of the
	 * duration of the observation interval.
	 */
	weight = min_t(u32, 8,
		       div_u64(weight * bfqd->delta_from_first,
			       BFQ_RATE_REF_INTERVAL));

	/*
	 * Divisor ranging from 10, for minimum weight, to 2, for
	 * maximum weight.
	 */
	divisor = 10 - weight;

	/*
	 * Finally, update peak rate:
	 *
	 * peak_rate = peak_rate * (divisor-1) / divisor  +  rate / divisor
	 */
	bfqd->peak_rate *= divisor-1;
	bfqd->peak_rate /= divisor;
	rate /= divisor; /* smoothing constant alpha = 1/divisor */

	bfqd->peak_rate += rate;
	if (bfqd->bfq_user_max_budget == 0)
		bfqd->bfq_max_budget =
			bfq_calc_max_budget(bfqd);

reset_computation:
	bfq_reset_rate_computation(bfqd, rq);
}
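
/*
 * Illustrative sketch (not compiled) of the low-pass filter applied
 * above: with divisor d in [2, 10], the update is
 *   peak_rate = peak_rate * (d - 1) / d + rate / d,
 * i.e. an exponential moving average with smoothing constant 1/d.
 * d = 2 (maximum weight) lets the new sample contribute half of the
 * next estimate; d = 10 (minimum weight) only a tenth. User-space
 * types and hypothetical names.
 */
#if 0
#include <stdint.h>

static uint32_t sketch_lowpass_update(uint32_t peak_rate, uint32_t rate,
				      uint32_t divisor)
{
	/* same integer arithmetic and order of operations as above */
	peak_rate *= divisor - 1;
	peak_rate /= divisor;
	return peak_rate + rate / divisor;
}
#endif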
/*
 * Update the read/write peak rate (the main quantity used for
 * auto-tuning, see update_thr_responsiveness_params()).
 *
 * It is not trivial to estimate the peak rate (correctly): because of
 * the presence of sw and hw queues between the scheduler and the
 * device components that finally serve I/O requests, it is hard to
 * say exactly when a given dispatched request is served inside the
 * device, and for how long. As a consequence, it is hard to know
 * precisely at what rate a given set of requests is actually served
 * by the device.
 *
 * On the opposite end, the dispatch time of any request is trivially
 * available, and, from this piece of information, the "dispatch rate"
 * of requests can be immediately computed. So, the idea in the next
 * function is to use what is known, namely request dispatch times
 * (plus, when useful, request completion times), to estimate what is
 * unknown, namely in-device request service rate.
 *
 * The main issue is that, because of the above facts, the rate at
 * which a certain set of requests is dispatched over a certain time
 * interval can vary greatly with respect to the rate at which the
 * same requests are then served. But, since the size of any
 * intermediate queue is limited, and the service scheme is lossless
 * (no request is silently dropped), the following obvious convergence
 * property holds: the number of requests dispatched MUST become
 * closer and closer to the number of requests completed as the
 * observation interval grows. This is the key property used in
 * the next function to estimate the peak service rate as a function
 * of the observed dispatch rate. The function assumes to be invoked
 * on every request dispatch.
 */
static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
{
	u64 now_ns = ktime_get_ns();

	if (bfqd->peak_rate_samples == 0) { /* first dispatch */
		bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
			bfqd->peak_rate_samples);
		bfq_reset_rate_computation(bfqd, rq);
		goto update_last_values; /* will add one sample */
	}

	/*
	 * Device idle for very long: the observation interval lasting
	 * up to this dispatch cannot be a valid observation interval
	 * for computing a new peak rate (similarly to the late-
	 * completion event in bfq_completed_request()). Go to
	 * update_rate_and_reset to have the following three steps
	 * taken:
	 * - close the observation interval at the last (previous)
	 *   request dispatch or completion
	 * - compute rate, if possible, for that observation interval
	 * - start a new observation interval with this dispatch
	 */
	if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
	    bfqd->rq_in_driver == 0)
		goto update_rate_and_reset;

	/* Update sampling information */
	bfqd->peak_rate_samples++;

	if ((bfqd->rq_in_driver > 0 ||
	     now_ns - bfqd->last_completion < BFQ_MIN_TT)
	    && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
		bfqd->sequential_samples++;

	bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);

	/* Reset max observed rq size every 32 dispatches */
	if (likely(bfqd->peak_rate_samples % 32))
		bfqd->last_rq_max_size =
			max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
	else
		bfqd->last_rq_max_size = blk_rq_sectors(rq);

	bfqd->delta_from_first = now_ns - bfqd->first_dispatch;

	/* Target observation interval not yet reached, go on sampling */
	if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
		goto update_last_values;

update_rate_and_reset:
	bfq_update_rate_reset(bfqd, rq);
update_last_values:
	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
	bfqd->last_dispatch = now_ns;
}
/*
 * Remove request from internal lists.
 */
static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);

	/*
	 * For consistency, the next instruction should have been
	 * executed after removing the request from the queue and
	 * dispatching it. We execute instead this instruction before
	 * bfq_remove_request() (and hence introduce a temporary
	 * inconsistency), for efficiency. In fact, should this
	 * dispatch occur for a non in-service bfqq, this anticipated
	 * increment prevents two counters related to bfqq->dispatched
	 * from risking to be, first, uselessly decremented, and then
	 * incremented again when the (new) value of bfqq->dispatched
	 * happens to be taken into account.
	 */
	bfqq->dispatched++;
	bfq_update_peak_rate(q->elevator->elevator_data, rq);

	bfq_remove_request(q, rq);
}
static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	if (RB_EMPTY_ROOT(&bfqq->sort_list))
		bfq_del_bfqq_busy(bfqd, bfqq, true);
	else
		bfq_requeue_bfqq(bfqd, bfqq);

	/*
	 * All in-service entities must have been properly deactivated
	 * or requeued before executing the next function, which
	 * resets all in-service entities as no more in service.
	 */
	__bfq_bfqd_reset_in_service(bfqd);
}
/**
 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
 * @bfqd: device data.
 * @bfqq: queue to update.
 * @reason: reason for expiration.
 *
 * Handle the feedback on @bfqq budget at queue expiration.
 * See the body for detailed comments.
 */
static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
				     struct bfq_queue *bfqq,
				     enum bfqq_expiration reason)
{
	struct request *next_rq;
	int budget, min_budget;

	budget = bfqq->max_budget;
	min_budget = bfq_min_budget(bfqd);

	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
		     bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
		     budget, bfq_min_budget(bfqd));
	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
		     bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));

	if (bfq_bfqq_sync(bfqq)) {
		switch (reason) {
		/*
		 * Caveat: in all the following cases we trade latency
		 * for throughput.
		 */
		case BFQQE_TOO_IDLE:
			/*
			 * This is the only case where we may reduce
			 * the budget: if there is no request of the
			 * process still waiting for completion, then
			 * we assume (tentatively) that the timer has
			 * expired because the batch of requests of
			 * the process could have been served with a
			 * smaller budget. Hence, betting that
			 * process will behave in the same way when it
			 * becomes backlogged again, we reduce its
			 * next budget. As long as we guess right,
			 * this budget cut reduces the latency
			 * experienced by the process.
			 *
			 * However, if there are still outstanding
			 * requests, then the process may have not yet
			 * issued its next request just because it is
			 * still waiting for the completion of some of
			 * the still outstanding ones. So in this
			 * subcase we do not reduce its budget, on the
			 * contrary we increase it to possibly boost
			 * the throughput, as discussed in the
			 * comments to the BUDGET_TIMEOUT case.
			 */
			if (bfqq->dispatched > 0) /* still outstanding reqs */
				budget = min(budget * 2, bfqd->bfq_max_budget);
			else {
				if (budget > 5 * min_budget)
					budget -= 4 * min_budget;
				else
					budget = min_budget;
			}
			break;
		case BFQQE_BUDGET_TIMEOUT:
			/*
			 * We double the budget here because it gives
			 * the chance to boost the throughput if this
			 * is not a seeky process (and has bumped into
			 * this timeout because of, e.g., ZBR).
			 */
			budget = min(budget * 2, bfqd->bfq_max_budget);
			break;
		case BFQQE_BUDGET_EXHAUSTED:
			/*
			 * The process still has backlog, and did not
			 * let either the budget timeout or the disk
			 * idling timeout expire. Hence it is not
			 * seeky, has a short thinktime and may be
			 * happy with a higher budget too. So
			 * definitely increase the budget of this good
			 * candidate to boost the disk throughput.
			 */
			budget = min(budget * 4, bfqd->bfq_max_budget);
			break;
		case BFQQE_NO_MORE_REQUESTS:
			/*
			 * For queues that expire for this reason, it
			 * is particularly important to keep the
			 * budget close to the actual service they
			 * need. Doing so reduces the timestamp
			 * misalignment problem described in the
			 * comments in the body of
			 * __bfq_activate_entity. In fact, suppose
			 * that a queue systematically expires for
			 * BFQQE_NO_MORE_REQUESTS and presents a
			 * new request in time to enjoy timestamp
			 * back-shifting. The larger the budget of the
			 * queue is with respect to the service the
			 * queue actually requests in each service
			 * slot, the more times the queue can be
			 * reactivated with the same virtual finish
			 * time. It follows that, even if this finish
			 * time is pushed to the system virtual time
			 * to reduce the consequent timestamp
			 * misalignment, the queue unjustly enjoys for
			 * many re-activations a lower finish time
			 * than all newly activated queues.
			 *
			 * The service needed by bfqq is measured
			 * quite precisely by bfqq->entity.service.
			 * Since bfqq does not enjoy device idling,
			 * bfqq->entity.service is equal to the number
			 * of sectors that the process associated with
			 * bfqq requested to read/write before waiting
			 * for request completions, or blocking for
			 * other reasons.
			 */
			budget = max_t(int, bfqq->entity.service, min_budget);
			break;
		default:
			return;
		}
	} else {
		/*
		 * Async queues get always the maximum possible
		 * budget, as for them we do not care about latency
		 * (in addition, their ability to dispatch is limited
		 * by the charging factor).
		 */
		budget = bfqd->bfq_max_budget;
	}

	bfqq->max_budget = budget;

	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
	    !bfqd->bfq_user_max_budget)
		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);

	/*
	 * If there is still backlog, then assign a new budget, making
	 * sure that it is large enough for the next request. Since
	 * the finish time of bfqq must be kept in sync with the
	 * budget, be sure to call __bfq_bfqq_expire() *after* this
	 * update.
	 *
	 * If there is no backlog, then no need to update the budget;
	 * it will be updated on the arrival of a new request.
	 */
	next_rq = bfqq->next_rq;
	if (next_rq)
		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
					    bfq_serv_to_charge(next_rq, bfqq));

	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
		     next_rq ? blk_rq_sectors(next_rq) : 0,
		     bfqq->entity.budget);
}
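
/*
 * Illustrative sketch (not compiled) of the feedback policy applied
 * above to sync queues, with the expiration reasons reduced to an enum
 * of their own. The multiplicative and additive constants mirror the
 * ones above; the cap on the device-wide maximum budget is applied
 * once at the end for brevity. Hypothetical names throughout.
 */
#if 0
enum sketch_expiration {
	SKETCH_TOO_IDLE,
	SKETCH_BUDGET_TIMEOUT,
	SKETCH_BUDGET_EXHAUSTED,
	SKETCH_NO_MORE_REQUESTS,
};

static int sketch_next_budget(enum sketch_expiration reason, int budget,
			      int min_budget, int max_budget, int service,
			      int dispatched)
{
	switch (reason) {
	case SKETCH_TOO_IDLE:
		if (dispatched > 0)		/* still outstanding reqs */
			budget = budget * 2;
		else if (budget > 5 * min_budget)
			budget -= 4 * min_budget;
		else
			budget = min_budget;
		break;
	case SKETCH_BUDGET_TIMEOUT:
		budget = budget * 2;
		break;
	case SKETCH_BUDGET_EXHAUSTED:
		budget = budget * 4;
		break;
	case SKETCH_NO_MORE_REQUESTS:
		/* keep the budget close to the service actually needed */
		budget = service > min_budget ? service : min_budget;
		break;
	}

	return budget < max_budget ? budget : max_budget;
}
#endif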
/*
 * Return true if the process associated with bfqq is "slow". The slow
 * flag is used, in addition to the budget timeout, to reduce the
 * amount of service provided to seeky processes, and thus reduce
 * their chances to lower the throughput. More details in the comments
 * on the function bfq_bfqq_expire().
 *
 * An important observation is in order: as discussed in the comments
 * on the function bfq_update_peak_rate(), with devices with internal
 * queues, it is hard if ever possible to know when and for how long
 * an I/O request is processed by the device (apart from the trivial
 * I/O pattern where a new request is dispatched only after the
 * previous one has been completed). This makes it hard to evaluate
 * the real rate at which the I/O requests of each bfq_queue are
 * served. In fact, for an I/O scheduler like BFQ, serving a
 * bfq_queue means just dispatching its requests during its service
 * slot (i.e., until the budget of the queue is exhausted, or the
 * queue remains idle, or, finally, a timeout fires). But, during the
 * service slot of a bfq_queue, around 100 ms at most, the device may
 * be even still processing requests of bfq_queues served in previous
 * service slots. On the opposite end, the requests of the in-service
 * bfq_queue may be completed after the service slot of the queue
 * finishes.
 *
 * Anyway, unless more sophisticated solutions are used
 * (where possible), the sum of the sizes of the requests dispatched
 * during the service slot of a bfq_queue is probably the only
 * approximation available for the service received by the bfq_queue
 * during its service slot. And this sum is the quantity used in this
 * function to evaluate the I/O speed of a process.
 */
static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			     bool compensate, enum bfqq_expiration reason,
			     unsigned long *delta_ms)
{
	ktime_t delta_ktime;
	u32 delta_usecs;
	bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */

	if (!bfq_bfqq_sync(bfqq))
		return false;

	if (compensate)
		delta_ktime = bfqd->last_idling_start;
	else
		delta_ktime = ktime_get();
	delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
	delta_usecs = ktime_to_us(delta_ktime);

	/* don't use too short time intervals */
	if (delta_usecs < 1000) {
		if (blk_queue_nonrot(bfqd->queue))
			/*
			 * give same worst-case guarantees as idling
			 * for seeky
			 */
			*delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
		else /* charge at least one seek */
			*delta_ms = bfq_slice_idle / NSEC_PER_MSEC;

		return slow;
	}

	*delta_ms = delta_usecs / USEC_PER_MSEC;

	/*
	 * Use only long (> 20ms) intervals to filter out excessive
	 * spikes in service rate estimation.
	 */
	if (delta_usecs > 20000) {
		/*
		 * Caveat for rotational devices: processes doing I/O
		 * in the slower disk zones tend to be slow(er) even
		 * if not seeky. In this respect, the estimated peak
		 * rate is likely to be an average over the disk
		 * surface. Accordingly, to not be too harsh with
		 * unlucky processes, a process is deemed slow only if
		 * its rate has been lower than half of the estimated
		 * peak rate.
		 */
		slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
	}

	bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);

	return slow;
}
/*
 * Return the farthest past time instant according to jiffies
 * macros.
 */
static unsigned long bfq_smallest_from_now(void)
{
	return jiffies - MAX_JIFFY_OFFSET;
}
/**
 * bfq_bfqq_expire - expire a queue.
 * @bfqd: device owning the queue.
 * @bfqq: the queue to expire.
 * @compensate: if true, compensate for the time spent idling.
 * @reason: the reason causing the expiration.
 *
 * If the process associated with the queue is slow (i.e., seeky), or
 * in case of budget timeout, or, finally, if it is async, we
 * artificially charge it an entire budget (independently of the
 * actual service it received). As a consequence, the queue will get
 * higher timestamps than the correct ones upon reactivation, and
 * hence it will be rescheduled as if it had received more service
 * than what it actually received. In the end, this class of processes
 * will receive less service in proportion to how slowly they consume
 * their budgets (and hence how seriously they tend to lower the
 * throughput).
 *
 * In contrast, when a queue expires because it has been idling for
 * too much or because it exhausted its budget, we do not touch the
 * amount of service it has received. Hence when the queue will be
 * reactivated and its timestamps updated, the latter will be in sync
 * with the actual service received by the queue until expiration.
 *
 * Charging a full budget to the first type of queues and the exact
 * service to the others has the effect of using the WF2Q+ policy to
 * schedule the former on a timeslice basis, without violating the
 * service domain guarantees of the latter.
 */
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason)
{
	bool slow;
	unsigned long delta = 0;
	struct bfq_entity *entity = &bfqq->entity;
	int ref;

	/*
	 * Check whether the process is slow (see bfq_bfqq_is_slow).
	 */
	slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);

	/*
	 * As above explained, 'punish' slow (i.e., seeky), timed-out
	 * and async queues, to favor sequential sync workloads.
	 */
	if (slow || reason == BFQQE_BUDGET_TIMEOUT)
		bfq_bfqq_charge_full_budget(bfqq);

	if (reason == BFQQE_TOO_IDLE &&
	    entity->service <= 2 * entity->budget / 10)
		bfq_clear_bfqq_IO_bound(bfqq);

	bfq_log_bfqq(bfqd, bfqq,
		     "expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
		     slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));

	/*
	 * Increase, decrease or leave budget unchanged according to
	 * reason.
	 */
	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
	ref = bfqq->ref;
	__bfq_bfqq_expire(bfqd, bfqq);

	/* mark bfqq as waiting a request only if a bic still points to it */
	if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
	    reason != BFQQE_BUDGET_TIMEOUT &&
	    reason != BFQQE_BUDGET_EXHAUSTED)
		bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
}
/*
 * Budget timeout is not implemented through a dedicated timer, but
 * just checked on request arrivals and completions, as well as on
 * idle timer expirations.
 */
static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
{
	if (bfq_bfqq_budget_new(bfqq) ||
	    time_is_after_jiffies(bfqq->budget_timeout))
		return false;
	return true;
}

/*
 * If we expire a queue that is actively waiting (i.e., with the
 * device idled) for the arrival of a new request, then we may incur
 * the timestamp misalignment problem described in the body of the
 * function __bfq_activate_entity. Hence we return true only if this
 * condition does not hold, or if the queue is slow enough to deserve
 * only to be kicked off for preserving a high throughput.
 */
static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqq->bfqd, bfqq,
		     "may_budget_timeout: wait_request %d left %d timeout %d",
		     bfq_bfqq_wait_request(bfqq),
		     bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
		     bfq_bfqq_budget_timeout(bfqq));

	return (!bfq_bfqq_wait_request(bfqq) ||
		bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
		&&
		bfq_bfqq_budget_timeout(bfqq);
}
/*
 * For a queue that becomes empty, device idling is allowed only if
 * this function returns true for the queue. And this function returns
 * true only if idling is beneficial for throughput.
 */
static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	bool idling_boosts_thr;

	if (bfqd->strict_guarantees)
		return true;

	/*
	 * The value of the next variable is computed considering that
	 * idling is usually beneficial for the throughput if:
	 * (a) the device is not NCQ-capable, or
	 * (b) regardless of the presence of NCQ, the request pattern
	 *     for bfqq is I/O-bound (possible throughput losses
	 *     caused by granting idling to seeky queues are mitigated
	 *     by the fact that, in all scenarios where boosting
	 *     throughput is the best thing to do, i.e., in all
	 *     symmetric scenarios, only a minimal idle time is
	 *     allowed to seeky queues).
	 */
	idling_boosts_thr = !bfqd->hw_tag || bfq_bfqq_IO_bound(bfqq);

	/*
	 * We have now the components we need to compute the return
	 * value of the function, which is true only if both the
	 * following conditions hold:
	 * 1) bfqq is sync, because idling makes sense only for sync queues;
	 * 2) idling boosts the throughput.
	 */
	return bfq_bfqq_sync(bfqq) && idling_boosts_thr;
}
/*
 * If the in-service queue is empty but the function bfq_bfqq_may_idle
 * returns true, then:
 * 1) the queue must remain in service and cannot be expired, and
 * 2) the device must be idled to wait for the possible arrival of a new
 *    request for the queue.
 * See the comments on the function bfq_bfqq_may_idle for the reasons
 * why performing device idling is the best choice to boost the throughput
 * and preserve service guarantees when bfq_bfqq_may_idle itself
 * returns true.
 */
static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;

	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
	       bfq_bfqq_may_idle(bfqq);
}
/*
 * Select a queue for service. If we have a current queue in service,
 * check whether to continue servicing it, or retrieve and set a new one.
 */
static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq;
	struct request *next_rq;
	enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;

	bfqq = bfqd->in_service_queue;
	if (!bfqq)
		goto new_queue;

	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

	if (bfq_may_expire_for_budg_timeout(bfqq) &&
	    !bfq_bfqq_wait_request(bfqq) &&
	    !bfq_bfqq_must_idle(bfqq))
		goto expire;

check_queue:
	/*
	 * This loop is rarely executed more than once. Even when it
	 * happens, it is much more convenient to re-execute this loop
	 * than to return NULL and trigger a new dispatch to get a
	 * request served.
	 */
	next_rq = bfqq->next_rq;
	/*
	 * If bfqq has requests queued and it has enough budget left to
	 * serve them, keep the queue, otherwise expire it.
	 */
	if (next_rq) {
		if (bfq_serv_to_charge(next_rq, bfqq) >
			bfq_bfqq_budget_left(bfqq)) {
			/*
			 * Expire the queue for budget exhaustion,
			 * which makes sure that the next budget is
			 * enough to serve the next request, even if
			 * it comes from the fifo expired path.
			 */
			reason = BFQQE_BUDGET_EXHAUSTED;
			goto expire;
		} else {
			/*
			 * The idle timer may be pending because we may
			 * not disable disk idling even when a new request
			 * arrives.
			 */
			if (bfq_bfqq_wait_request(bfqq)) {
				/*
				 * If we get here: 1) at least a new request
				 * has arrived but we have not disabled the
				 * timer because the request was too small,
				 * 2) then the block layer has unplugged
				 * the device, causing the dispatch to be
				 * invoked.
				 *
				 * Since the device is unplugged, now the
				 * requests are probably large enough to
				 * provide a reasonable throughput.
				 * So we disable idling.
				 */
				bfq_clear_bfqq_wait_request(bfqq);
				hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
				bfqg_stats_update_idle_time(bfqq_group(bfqq));
			}
			goto keep_queue;
		}
	}

	/*
	 * No requests pending. However, if the in-service queue is idling
	 * for a new request, or has requests waiting for a completion and
	 * may idle after their completion, then keep it anyway.
	 */
	if (bfq_bfqq_wait_request(bfqq) ||
	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
		bfqq = NULL;
		goto keep_queue;
	}

	reason = BFQQE_NO_MORE_REQUESTS;
expire:
	bfq_bfqq_expire(bfqd, bfqq, false, reason);
new_queue:
	bfqq = bfq_set_in_service_queue(bfqd);
	bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
	goto check_queue;
keep_queue:
	if (bfqq)
		bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
	else
		bfq_log(bfqd, "select_queue: no queue returned");

	return bfqq;
}
/*
 * Dispatch next request from bfqq.
 */
static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
						 struct bfq_queue *bfqq)
{
	struct request *rq = bfqq->next_rq;
	unsigned long service_to_charge;

	service_to_charge = bfq_serv_to_charge(rq, bfqq);

	bfq_bfqq_served(bfqq, service_to_charge);

	bfq_dispatch_remove(bfqd->queue, rq);

	if (!bfqd->in_service_bic) {
		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
		bfqd->in_service_bic = RQ_BIC(rq);
	}

	/*
	 * Expire bfqq, pretending that its budget expired, if bfqq
	 * belongs to CLASS_IDLE and other queues are waiting for
	 * service.
	 */
	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
		goto expire;

	return rq;

expire:
	bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
	return rq;
}
static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;

	/*
	 * Avoiding lock: a race on bfqd->busy_queues should cause at
	 * most a call to dispatch for nothing
	 */
	return !list_empty_careful(&bfqd->dispatch) ||
		bfqd->busy_queues > 0;
}
static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq = NULL;
	struct bfq_queue *bfqq = NULL;

	if (!list_empty(&bfqd->dispatch)) {
		rq = list_first_entry(&bfqd->dispatch, struct request,
				      queuelist);
		list_del_init(&rq->queuelist);

		bfqq = RQ_BFQQ(rq);

		if (bfqq) {
			/*
			 * Increment counters here, because this
			 * dispatch does not follow the standard
			 * dispatch flow (where counters are
			 * incremented)
			 */
			bfqq->dispatched++;

			goto inc_in_driver_start_rq;
		}

		/*
		 * We exploit the put_rq_private hook to decrement
		 * rq_in_driver, but put_rq_private will not be
		 * invoked on this request. So, to avoid unbalance,
		 * just start this request, without incrementing
		 * rq_in_driver. As a negative consequence,
		 * rq_in_driver is deceptively lower than it should be
		 * while this request is in service. This may cause
		 * bfq_schedule_dispatch to be invoked uselessly.
		 *
		 * As for implementing an exact solution, the
		 * put_request hook, if defined, is probably invoked
		 * also on this request. So, by exploiting this hook,
		 * we could 1) increment rq_in_driver here, and 2)
		 * decrement it in put_request. Such a solution would
		 * let the value of the counter be always accurate,
		 * but it would entail using an extra interface
		 * function. This cost seems higher than the benefit,
		 * being the frequency of non-elevator-private
		 * requests very low.
		 */
		goto start_rq;
	}

	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);

	if (bfqd->busy_queues == 0)
		goto exit;

	/*
	 * Force device to serve one request at a time if
	 * strict_guarantees is true. Forcing this service scheme is
	 * currently the ONLY way to guarantee that the request
	 * service order enforced by the scheduler is respected by a
	 * queueing device. Otherwise the device is free even to make
	 * some unlucky request wait for as long as the device
	 * wishes.
	 *
	 * Of course, serving one request at a time may cause loss of
	 * throughput.
	 */
	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
		goto exit;

	bfqq = bfq_select_queue(bfqd);
	if (!bfqq)
		goto exit;

	rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);

	if (rq) {
inc_in_driver_start_rq:
		bfqd->rq_in_driver++;
start_rq:
		rq->rq_flags |= RQF_STARTED;
	}
exit:
	return rq;
}
static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq;

	spin_lock_irq(&bfqd->lock);
	rq = __bfq_dispatch_request(hctx);
	spin_unlock_irq(&bfqd->lock);

	return rq;
}
/*
 * Task holds one reference to the queue, dropped when task exits. Each rq
 * in-flight on this queue also holds a reference, dropped when rq is freed.
 *
 * Scheduler lock must be held here. Recall not to use bfqq after calling
 * this function on it.
 */
static void bfq_put_queue(struct bfq_queue *bfqq)
{
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_group *bfqg = bfqq_group(bfqq);
#endif

	if (bfqq->bfqd)
		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
			     bfqq, bfqq->ref);

	bfqq->ref--;
	if (bfqq->ref)
		return;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);

	kmem_cache_free(bfq_pool, bfqq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	bfqg_put(bfqg);
#endif
}
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	if (bfqq == bfqd->in_service_queue) {
		__bfq_bfqq_expire(bfqd, bfqq);
		bfq_schedule_dispatch(bfqd);
	}

	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);

	bfq_put_queue(bfqq); /* release process reference */
}
static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
	struct bfq_data *bfqd;

	if (bfqq)
		bfqd = bfqq->bfqd; /* NULL if scheduler already exited */

	if (bfqq && bfqd) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);
		bfq_exit_bfqq(bfqd, bfqq);
		bic_set_bfqq(bic, NULL, is_sync);
		spin_unlock_irqrestore(&bfqd->lock, flags);
	}
}

static void bfq_exit_icq(struct io_cq *icq)
{
	struct bfq_io_cq *bic = icq_to_bic(icq);

	bfq_exit_icq_bfqq(bic, true);
	bfq_exit_icq_bfqq(bic, false);
}
/*
 * Update the entity prio values; note that the new values will not
 * be used until the next (re)activation.
 */
static void
bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
{
	struct task_struct *tsk = current;
	int ioprio_class;
	struct bfq_data *bfqd = bfqq->bfqd;

	if (!bfqd)
		return;

	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	switch (ioprio_class) {
	default:
		dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
			"bfq: bad prio class %d\n", ioprio_class);
	case IOPRIO_CLASS_NONE:
		/*
		 * No prio set, inherit CPU scheduling settings.
		 */
		bfqq->new_ioprio = task_nice_ioprio(tsk);
		bfqq->new_ioprio_class = task_nice_ioclass(tsk);
		break;
	case IOPRIO_CLASS_RT:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
		break;
	case IOPRIO_CLASS_BE:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
		break;
	case IOPRIO_CLASS_IDLE:
		bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
		bfqq->new_ioprio = 7;
		bfq_clear_bfqq_idle_window(bfqq);
		break;
	}

	if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
		pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
			bfqq->new_ioprio);
		bfqq->new_ioprio = IOPRIO_BE_NR;
	}

	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
	bfqq->entity.prio_changed = 1;
}
static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
{
	struct bfq_data *bfqd = bic_to_bfqd(bic);
	struct bfq_queue *bfqq;
	int ioprio = bic->icq.ioc->ioprio;

	/*
	 * This condition may trigger on a newly created bic, be sure to
	 * drop the lock before returning.
	 */
	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
		return;

	bic->ioprio = ioprio;

	bfqq = bic_to_bfqq(bic, false);
	if (bfqq) {
		/* release process reference on this queue */
		bfq_put_queue(bfqq);
		bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
		bic_set_bfqq(bic, bfqq, false);
	}

	bfqq = bic_to_bfqq(bic, true);
	if (bfqq)
		bfq_set_next_ioprio_data(bfqq, bic);
}
static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_io_cq *bic, pid_t pid, int is_sync)
{
	RB_CLEAR_NODE(&bfqq->entity.rb_node);
	INIT_LIST_HEAD(&bfqq->fifo);

	bfqq->ref = 0;
	bfqq->bfqd = bfqd;

	if (bic)
		bfq_set_next_ioprio_data(bfqq, bic);

	if (is_sync) {
		if (!bfq_class_idle(bfqq))
			bfq_mark_bfqq_idle_window(bfqq);
		bfq_mark_bfqq_sync(bfqq);
	} else
		bfq_clear_bfqq_sync(bfqq);

	/* set end request to minus infinity from now */
	bfqq->ttime.last_end_request = ktime_get_ns() + 1;

	bfq_mark_bfqq_IO_bound(bfqq);

	bfqq->pid = pid;

	/* Tentative initial value to trade off between thr and lat */
	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
	bfqq->budget_timeout = bfq_smallest_from_now();

	/* first request is almost certainly seeky */
	bfqq->seek_history = 1;
}
static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
					       struct bfq_group *bfqg,
					       int ioprio_class, int ioprio)
{
	switch (ioprio_class) {
	case IOPRIO_CLASS_RT:
		return &bfqg->async_bfqq[0][ioprio];
	case IOPRIO_CLASS_NONE:
		ioprio = IOPRIO_NORM;
		/* fall through */
	case IOPRIO_CLASS_BE:
		return &bfqg->async_bfqq[1][ioprio];
	case IOPRIO_CLASS_IDLE:
		return &bfqg->async_idle_bfqq;
	default:
		return NULL;
	}
}
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic)
{
	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	struct bfq_queue **async_bfqq = NULL;
	struct bfq_queue *bfqq;
	struct bfq_group *bfqg;

	rcu_read_lock();

	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
	if (!bfqg) {
		bfqq = &bfqd->oom_bfqq;
		goto out;
	}

	if (!is_sync) {
		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
						  ioprio);
		bfqq = *async_bfqq;
		if (bfqq)
			goto out;
	}

	bfqq = kmem_cache_alloc_node(bfq_pool,
				     GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
				     bfqd->queue->node);

	if (bfqq) {
		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
			      is_sync);
		bfq_init_entity(&bfqq->entity, bfqg);
		bfq_log_bfqq(bfqd, bfqq, "allocated");
	} else {
		bfqq = &bfqd->oom_bfqq;
		bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
		goto out;
	}

	/*
	 * Pin the queue now that it's allocated, scheduler exit will
	 * prune it.
	 */
	if (async_bfqq) {
		bfqq->ref++; /*
			      * Extra group reference, w.r.t. sync
			      * queue. This extra reference is removed
			      * only if bfqq->bfqg disappears, to
			      * guarantee that this queue is not freed
			      * until its group goes away.
			      */
		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
			     bfqq, bfqq->ref);
		*async_bfqq = bfqq;
	}

out:
	bfqq->ref++; /* get a process reference to this queue */
	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
	rcu_read_unlock();
	return bfqq;
}
static void bfq_update_io_thinktime(struct bfq_data *bfqd,
				    struct bfq_queue *bfqq)
{
	struct bfq_ttime *ttime = &bfqq->ttime;
	u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;

	elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);

	ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
	ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
				     ttime->ttime_samples);
}
static void
bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		       struct request *rq)
{
	bfqq->seek_history <<= 1;
	bfqq->seek_history |=
		get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
		(!blk_queue_nonrot(bfqd->queue) ||
		 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
}
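
/*
 * Illustrative sketch (not compiled) of the seek-history heuristic
 * above: each new request shifts a one-bit verdict (seeky or not) into
 * a small bitmap, and a queue is then considered seeky when enough of
 * the most recent verdicts are seeky. The window size and the popcount
 * threshold below are made-up placeholders, not the values used by BFQ.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_HIST_BITS	8	/* hypothetical history window */
#define SKETCH_SEEKY_THRESH	4	/* hypothetical threshold */

static uint32_t sketch_push_seek_sample(uint32_t seek_history, bool seeky)
{
	return ((seek_history << 1) | (seeky ? 1 : 0)) &
		((1u << SKETCH_HIST_BITS) - 1);
}

static bool sketch_queue_is_seeky(uint32_t seek_history)
{
	return __builtin_popcount(seek_history) >= SKETCH_SEEKY_THRESH;
}
#endif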
/*
 * Disable idle window if the process thinks too long or seeks so much that
 * it doesn't matter.
 */
static void bfq_update_idle_window(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq,
				   struct bfq_io_cq *bic)
{
	int enable_idle;

	/* Don't idle for async or idle io prio class. */
	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
		return;

	enable_idle = bfq_bfqq_idle_window(bfqq);

	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
	    bfqd->bfq_slice_idle == 0 ||
	    (bfqd->hw_tag && BFQQ_SEEKY(bfqq)))
		enable_idle = 0;
	else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
		if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)
			enable_idle = 0;
		else
			enable_idle = 1;
	}
	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
		     enable_idle);

	if (enable_idle)
		bfq_mark_bfqq_idle_window(bfqq);
	else
		bfq_clear_bfqq_idle_window(bfqq);
}
/*
 * Called when a new fs request (rq) is added to bfqq. Check if there's
 * something we should do about it.
 */
static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			    struct request *rq)
{
	struct bfq_io_cq *bic = RQ_BIC(rq);

	if (rq->cmd_flags & REQ_META)
		bfqq->meta_pending++;

	bfq_update_io_thinktime(bfqd, bfqq);
	bfq_update_io_seektime(bfqd, bfqq, rq);
	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
	    !BFQQ_SEEKY(bfqq))
		bfq_update_idle_window(bfqd, bfqq, bic);

	bfq_log_bfqq(bfqd, bfqq,
		     "rq_enqueued: idle_window=%d (seeky %d)",
		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));

	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
		bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
				 blk_rq_sectors(rq) < 32;
		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);

		/*
		 * There is just this request queued: if the request
		 * is small and the queue is not to be expired, then
		 * just exit.
		 *
		 * In this way, if the device is being idled to wait
		 * for a new request from the in-service queue, we
		 * avoid unplugging the device and committing the
		 * device to serve just a small request. On the
		 * contrary, we wait for the block layer to decide
		 * when to unplug the device: hopefully, new requests
		 * will be merged to this one quickly, then the device
		 * will be unplugged and larger requests will be
		 * dispatched.
		 */
		if (small_req && !budget_timeout)
			return;

		/*
		 * A large enough request arrived, or the queue is to
		 * be expired: in both cases disk idling is to be
		 * stopped, so clear wait_request flag and reset
		 * timer.
		 */
		bfq_clear_bfqq_wait_request(bfqq);
		hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
		bfqg_stats_update_idle_time(bfqq_group(bfqq));

		/*
		 * The queue is not empty, because a new request just
		 * arrived. Hence we can safely expire the queue, in
		 * case of budget timeout, without risking that the
		 * timestamps of the queue are not updated correctly.
		 * See [1] for more details.
		 */
		if (budget_timeout)
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_BUDGET_TIMEOUT);
	}
}
static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);

	bfq_add_request(rq);

	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
	list_add_tail(&rq->queuelist, &bfqq->fifo);

	bfq_rq_enqueued(bfqd, bfqq, rq);
}
static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
			       bool at_head)
{
	struct request_queue *q = hctx->queue;
	struct bfq_data *bfqd = q->elevator->elevator_data;

	spin_lock_irq(&bfqd->lock);
	if (blk_mq_sched_try_insert_merge(q, rq)) {
		spin_unlock_irq(&bfqd->lock);
		return;
	}

	spin_unlock_irq(&bfqd->lock);

	blk_mq_sched_request_inserted(rq);

	spin_lock_irq(&bfqd->lock);
	if (at_head || blk_rq_is_passthrough(rq)) {
		if (at_head)
			list_add(&rq->queuelist, &bfqd->dispatch);
		else
			list_add_tail(&rq->queuelist, &bfqd->dispatch);
	} else {
		__bfq_insert_request(bfqd, rq);

		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}
	}

	spin_unlock_irq(&bfqd->lock);
}
static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
				struct list_head *list, bool at_head)
{
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		bfq_insert_request(hctx, rq, at_head);
	}
}
static void bfq_update_hw_tag(struct bfq_data *bfqd)
{
	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
				       bfqd->rq_in_driver);

	if (bfqd->hw_tag == 1)
		return;

	/*
	 * This sample is valid if the number of outstanding requests
	 * is large enough to allow a queueing behavior. Note that the
	 * sum is not exact, as it's not taking into account deactivated
	 * requests.
	 */
	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
		return;

	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
		return;

	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
	bfqd->max_rq_in_driver = 0;
	bfqd->hw_tag_samples = 0;
}
static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
{
	u64 now_ns;
	u32 delta_us;

	bfq_update_hw_tag(bfqd);

	bfqd->rq_in_driver--;
	bfqq->dispatched--;

	now_ns = ktime_get_ns();

	bfqq->ttime.last_end_request = now_ns;

	/*
	 * Using us instead of ns, to get a reasonable precision in
	 * computing rate in next check.
	 */
	delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);

	/*
	 * If the request took rather long to complete, and, according
	 * to the maximum request size recorded, this completion latency
	 * implies that the request was certainly served at a very low
	 * rate (less than 1M sectors/sec), then the whole observation
	 * interval that lasts up to this time instant cannot be a
	 * valid time interval for computing a new peak rate. Invoke
	 * bfq_update_rate_reset to have the following three steps
	 * taken:
	 * - close the observation interval at the last (previous)
	 *   request dispatch or completion
	 * - compute rate, if possible, for that observation interval
	 * - reset to zero samples, which will trigger a proper
	 *   re-initialization of the observation interval on next
	 *   dispatch
	 */
	if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
	    (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
			1UL<<(BFQ_RATE_SHIFT - 10))
		bfq_update_rate_reset(bfqd, NULL);
	bfqd->last_completion = now_ns;

	/*
	 * If this is the in-service queue, check if it needs to be expired,
	 * or if we want to idle in case it has no pending requests.
	 */
	if (bfqd->in_service_queue == bfqq) {
		if (bfq_bfqq_budget_new(bfqq))
			bfq_set_budget_timeout(bfqd);

		if (bfq_bfqq_must_idle(bfqq)) {
			bfq_arm_slice_timer(bfqd);
			return;
		} else if (bfq_may_expire_for_budg_timeout(bfqq))
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_BUDGET_TIMEOUT);
		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
			 (bfqq->dispatched == 0 ||
			  !bfq_bfqq_may_idle(bfqq)))
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_NO_MORE_REQUESTS);
	}
}
static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
{
	bfqq->allocated--;

	bfq_put_queue(bfqq);
}
static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;

	if (rq->rq_flags & RQF_STARTED)
		bfqg_stats_update_completion(bfqq_group(bfqq),
					     rq_start_time_ns(rq),
					     rq_io_start_time_ns(rq),
					     rq->cmd_flags);

	if (likely(rq->rq_flags & RQF_STARTED)) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);

		bfq_completed_request(bfqq, bfqd);
		bfq_put_rq_priv_body(bfqq);

		spin_unlock_irqrestore(&bfqd->lock, flags);
	} else {
		/*
		 * Request rq may be still/already in the scheduler,
		 * in which case we need to remove it. And we cannot
		 * defer such a check and removal, to avoid
		 * inconsistencies in the time interval from the end
		 * of this function to the start of the deferred work.
		 * This situation seems to occur only in process
		 * context, as a consequence of a merge. In the
		 * current version of the code, this implies that the
		 * lock is held.
		 */

		if (!RB_EMPTY_NODE(&rq->rb_node))
			bfq_remove_request(q, rq);
		bfq_put_rq_priv_body(bfqq);
	}

	rq->elv.priv[0] = NULL;
	rq->elv.priv[1] = NULL;
}
/*
 * Allocate bfq data structures associated with this request.
 */
static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
			      struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
	const int is_sync = rq_is_sync(rq);
	struct bfq_queue *bfqq;

	spin_lock_irq(&bfqd->lock);

	bfq_check_ioprio_change(bic, bio);

	if (!bic)
		goto queue_fail;

	bfq_bic_update_cgroup(bic, bio);

	bfqq = bic_to_bfqq(bic, is_sync);
	if (!bfqq || bfqq == &bfqd->oom_bfqq) {
		if (bfqq)
			bfq_put_queue(bfqq);
		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
		bic_set_bfqq(bic, bfqq, is_sync);
	}

	bfqq->ref++;
	bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
		     rq, bfqq, bfqq->ref);

	rq->elv.priv[0] = bic;
	rq->elv.priv[1] = bfqq;

	spin_unlock_irq(&bfqd->lock);

	return 0;

queue_fail:
	spin_unlock_irq(&bfqd->lock);

	return 1;
}
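/*
 * Expire the queue for which the idle-slice timer has fired, unless it
 * is no longer the in-service queue, and then schedule a new dispatch.
 */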
static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	enum bfqq_expiration reason;
	unsigned long flags;

	spin_lock_irqsave(&bfqd->lock, flags);
	bfq_clear_bfqq_wait_request(bfqq);

	if (bfqq != bfqd->in_service_queue) {
		spin_unlock_irqrestore(&bfqd->lock, flags);
		return;
	}

	if (bfq_bfqq_budget_timeout(bfqq))
		/*
		 * Also here the queue can be safely expired
		 * for budget timeout without wasting
		 * guarantees
		 */
		reason = BFQQE_BUDGET_TIMEOUT;
	else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
		/*
		 * The queue may not be empty upon timer expiration,
		 * because we may not disable the timer when the
		 * first request of the in-service queue arrives
		 * during disk idling.
		 */
		reason = BFQQE_TOO_IDLE;
	else
		goto schedule_dispatch;

	bfq_bfqq_expire(bfqd, bfqq, true, reason);

schedule_dispatch:
	spin_unlock_irqrestore(&bfqd->lock, flags);
	bfq_schedule_dispatch(bfqd);
}
/*
 * Handler of the expiration of the timer running if the in-service queue
 * is idling inside its time slice.
 */
static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
{
	struct bfq_data *bfqd = container_of(timer, struct bfq_data,
					     idle_slice_timer);
	struct bfq_queue *bfqq = bfqd->in_service_queue;

	/*
	 * Theoretical race here: the in-service queue can be NULL or
	 * different from the queue that was idling if a new request
	 * arrives for the current queue and there is a full dispatch
	 * cycle that changes the in-service queue. This can hardly
	 * happen, but in the worst case we just expire a queue too
	 * early.
	 */
	if (bfqq)
		bfq_idle_slice_timer_body(bfqq);

	return HRTIMER_NORESTART;
}
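/*
 * Reparent the async queue pointed to by *bfqq_ptr to the root group
 * and drop the reference held through that pointer.
 */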
static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
				 struct bfq_queue **bfqq_ptr)
{
	struct bfq_queue *bfqq = *bfqq_ptr;

	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
	if (bfqq) {
		bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);

		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
			     bfqq, bfqq->ref);
		bfq_put_queue(bfqq);
		*bfqq_ptr = NULL;
	}
}
/*
 * Release all the bfqg references to its async queues. If we are
 * deallocating the group these queues may still contain requests, so
 * we reparent them to the root cgroup (i.e., the only one that will
 * exist for sure until all the requests on a device are gone).
 */
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
{
	int i, j;

	for (i = 0; i < 2; i++)
		for (j = 0; j < IOPRIO_BE_NR; j++)
			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);

	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
}
static void bfq_exit_queue(struct elevator_queue *e)
{
	struct bfq_data *bfqd = e->elevator_data;
	struct bfq_queue *bfqq, *n;

	hrtimer_cancel(&bfqd->idle_slice_timer);

	spin_lock_irq(&bfqd->lock);
	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
	spin_unlock_irq(&bfqd->lock);

	hrtimer_cancel(&bfqd->idle_slice_timer);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
#else
	spin_lock_irq(&bfqd->lock);
	bfq_put_async_queues(bfqd, bfqd->root_group);
	kfree(bfqd->root_group);
	spin_unlock_irq(&bfqd->lock);
#endif

	kfree(bfqd);
}
static void bfq_init_root_group(struct bfq_group *root_group,
				struct bfq_data *bfqd)
{
	int i;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	root_group->entity.parent = NULL;
	root_group->my_entity = NULL;
	root_group->bfqd = bfqd;
#endif
	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
	root_group->sched_data.bfq_class_idle_last_service = jiffies;
}
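/*
 * Set up the per-device scheduler data when bfq is attached to a
 * request queue.
 */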
static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
	struct bfq_data *bfqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
	if (!bfqd) {
		kobject_put(&eq->kobj);
		return -ENOMEM;
	}
	eq->elevator_data = bfqd;

	spin_lock_irq(q->queue_lock);
	q->elevator = eq;
	spin_unlock_irq(q->queue_lock);

	/*
	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
	 * Grab a permanent reference to it, so that the normal code flow
	 * will not attempt to free it.
	 */
	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
	bfqd->oom_bfqq.ref++;
	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
	bfqd->oom_bfqq.entity.new_weight =
		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
	/*
	 * Trigger weight initialization, according to ioprio, at the
	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
	 * class won't be changed any more.
	 */
	bfqd->oom_bfqq.entity.prio_changed = 1;

	bfqd->queue = q;

	INIT_LIST_HEAD(&bfqd->dispatch);

	hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL);
	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;

	INIT_LIST_HEAD(&bfqd->active_list);
	INIT_LIST_HEAD(&bfqd->idle_list);

	bfqd->hw_tag = -1;

	bfqd->bfq_max_budget = bfq_default_max_budget;

	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
	bfqd->bfq_back_max = bfq_back_max;
	bfqd->bfq_back_penalty = bfq_back_penalty;
	bfqd->bfq_slice_idle = bfq_slice_idle;
	bfqd->bfq_timeout = bfq_timeout;

	bfqd->bfq_requests_within_timer = 120;

	spin_lock_init(&bfqd->lock);

	/*
	 * The invocation of the next bfq_create_group_hierarchy
	 * function is the head of a chain of function calls
	 * (bfq_create_group_hierarchy->blkcg_activate_policy->
	 * blk_mq_freeze_queue) that may lead to the invocation of the
	 * has_work hook function. For this reason,
	 * bfq_create_group_hierarchy is invoked only after all
	 * scheduler data has been initialized, apart from the fields
	 * that can be initialized only after invoking
	 * bfq_create_group_hierarchy. This, in particular, enables
	 * has_work to correctly return false. Of course, to avoid
	 * other inconsistencies, the blk-mq stack must then refrain
	 * from invoking further scheduler hooks before this init
	 * function is finished.
	 */
	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
	if (!bfqd->root_group)
		goto out_free;
	bfq_init_root_group(bfqd->root_group, bfqd);
	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);

	return 0;

out_free:
	kfree(bfqd);
	kobject_put(&eq->kobj);
	return -ENOMEM;
}
static void bfq_slab_kill(void)
{
	kmem_cache_destroy(bfq_pool);
}

static int __init bfq_slab_setup(void)
{
	bfq_pool = KMEM_CACHE(bfq_queue, 0);
	if (!bfq_pool)
		return -ENOMEM;
	return 0;
}
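/*
 * Helpers shared by the sysfs show/store functions generated by the
 * macros below.
 */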
static ssize_t bfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%u\n", var);
}

static ssize_t bfq_var_store(unsigned long *var, const char *page,
			     size_t count)
{
	unsigned long new_val;
	int ret = kstrtoul(page, 10, &new_val);

	if (ret == 0)
		*var = new_val;

	return count;
}
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	if (__CONV == 1)						\
		__data = jiffies_to_msecs(__data);			\
	else if (__CONV == 2)						\
		__data = div_u64(__data, NSEC_PER_MSEC);		\
	return bfq_var_show(__data, (page));				\
}
SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
#undef SHOW_FUNCTION
#define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	__data = div_u64(__data, NSEC_PER_USEC);			\
	return bfq_var_show(__data, (page));				\
}
USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
#undef USEC_SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
static ssize_t								\
__FUNC(struct elevator_queue *e, const char *page, size_t count)	\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned long uninitialized_var(__data);			\
	int ret = bfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	if (__CONV == 1)						\
		*(__PTR) = msecs_to_jiffies(__data);			\
	else if (__CONV == 2)						\
		*(__PTR) = (u64)__data * NSEC_PER_MSEC;			\
	else								\
		*(__PTR) = __data;					\
	return ret;							\
}
STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
		INT_MAX, 2);
STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
		INT_MAX, 2);
STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
		INT_MAX, 0);
STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
#undef STORE_FUNCTION
#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)			\
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned long uninitialized_var(__data);			\
	int ret = bfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	*(__PTR) = (u64)__data * NSEC_PER_USEC;				\
	return ret;							\
}
USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
		    UINT_MAX);
#undef USEC_STORE_FUNCTION
static ssize_t bfq_max_budget_store(struct elevator_queue *e,
				    const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data == 0)
		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
	else {
		if (__data > INT_MAX)
			__data = INT_MAX;
		bfqd->bfq_max_budget = __data;
	}

	bfqd->bfq_user_max_budget = __data;

	return ret;
}
/*
 * Leaving this name to preserve name compatibility with cfq
 * parameters, but this timeout is used for both sync and async.
 */
static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
				      const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data < 1)
		__data = 1;
	else if (__data > INT_MAX)
		__data = INT_MAX;

	bfqd->bfq_timeout = msecs_to_jiffies(__data);
	if (bfqd->bfq_user_max_budget == 0)
		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);

	return ret;
}
static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
					   const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data > 1)
		__data = 1;
	if (!bfqd->strict_guarantees && __data == 1
	    && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
		bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;

	bfqd->strict_guarantees = __data;

	return ret;
}
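/*
 * The attributes defined below are exported under
 * /sys/block/<dev>/queue/iosched/ once bfq is the active scheduler for
 * a device. For example (device name chosen for illustration only):
 *
 *   echo bfq > /sys/block/sda/queue/scheduler
 *   cat /sys/block/sda/queue/iosched/max_budget
 */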
#define BFQ_ATTR(name) \
	__ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)

static struct elv_fs_entry bfq_attrs[] = {
	BFQ_ATTR(fifo_expire_sync),
	BFQ_ATTR(fifo_expire_async),
	BFQ_ATTR(back_seek_max),
	BFQ_ATTR(back_seek_penalty),
	BFQ_ATTR(slice_idle),
	BFQ_ATTR(slice_idle_us),
	BFQ_ATTR(max_budget),
	BFQ_ATTR(timeout_sync),
	BFQ_ATTR(strict_guarantees),
	__ATTR_NULL
};
static struct elevator_type iosched_bfq_mq = {
	.ops.mq = {
		.get_rq_priv		= bfq_get_rq_private,
		.put_rq_priv		= bfq_put_rq_private,
		.exit_icq		= bfq_exit_icq,
		.insert_requests	= bfq_insert_requests,
		.dispatch_request	= bfq_dispatch_request,
		.next_request		= elv_rb_latter_request,
		.former_request		= elv_rb_former_request,
		.allow_merge		= bfq_allow_bio_merge,
		.bio_merge		= bfq_bio_merge,
		.request_merge		= bfq_request_merge,
		.requests_merged	= bfq_requests_merged,
		.request_merged		= bfq_request_merged,
		.has_work		= bfq_has_work,
		.init_sched		= bfq_init_queue,
		.exit_sched		= bfq_exit_queue,
	},

	.uses_mq		= true,
	.icq_size		= sizeof(struct bfq_io_cq),
	.icq_align		= __alignof__(struct bfq_io_cq),
	.elevator_attrs		= bfq_attrs,
	.elevator_name		= "bfq",
	.elevator_owner		= THIS_MODULE,
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct blkcg_policy blkcg_policy_bfq = {
	.dfl_cftypes		= bfq_blkg_files,
	.legacy_cftypes		= bfq_blkcg_legacy_files,

	.cpd_alloc_fn		= bfq_cpd_alloc,
	.cpd_init_fn		= bfq_cpd_init,
	.cpd_bind_fn		= bfq_cpd_init,
	.cpd_free_fn		= bfq_cpd_free,

	.pd_alloc_fn		= bfq_pd_alloc,
	.pd_init_fn		= bfq_pd_init,
	.pd_offline_fn		= bfq_pd_offline,
	.pd_free_fn		= bfq_pd_free,
	.pd_reset_stats_fn	= bfq_pd_reset_stats,
};
#endif
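/*
 * Module init/exit: register and unregister the cgroup policy (when
 * group scheduling is enabled), the bfq_queue slab cache, and the
 * elevator itself.
 */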
static int __init bfq_init(void)
{
	int ret;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	ret = blkcg_policy_register(&blkcg_policy_bfq);
	if (ret)
		return ret;
#endif

	ret = -ENOMEM;
	if (bfq_slab_setup())
		goto err_pol_unreg;

	ret = elv_register(&iosched_bfq_mq);
	if (ret)
		goto err_pol_unreg;

	return 0;

err_pol_unreg:
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	return ret;
}

static void __exit bfq_exit(void)
{
	elv_unregister(&iosched_bfq_mq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	bfq_slab_kill();
}

module_init(bfq_init);
module_exit(bfq_exit);
MODULE_AUTHOR("Paolo Valente");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");