/*
 * Budget Fair Queueing (BFQ) I/O scheduler.
 *
 * Based on ideas and code from CFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini@google.com>
 *
 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BFQ is a proportional-share I/O scheduler, with some extra
 * low-latency capabilities. BFQ also supports full hierarchical
 * scheduling through cgroups. The next paragraphs provide an
 * introduction to BFQ's inner workings. Details on BFQ benefits,
 * usage and limitations can be found in
 * Documentation/block/bfq-iosched.txt.
 *
 * BFQ is a proportional-share storage-I/O scheduling algorithm based
 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
 * budgets, measured in number of sectors, to processes instead of
 * time slices. The device is not granted to the in-service process
 * for a given time slice, but until it has exhausted its assigned
 * budget. This change from the time to the service domain enables BFQ
 * to distribute the device throughput among processes as desired,
 * without any distortion due to throughput fluctuations, or to device
 * internal queueing. BFQ uses an ad hoc internal scheduler, called
 * B-WF2Q+, to schedule processes according to their budgets. More
 * precisely, BFQ schedules queues associated with processes. Each
 * process/queue is assigned a user-configurable weight, and B-WF2Q+
 * guarantees that each queue receives a fraction of the throughput
 * proportional to its weight. Thanks to the accurate policy of
 * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
 * processes issuing sequential requests (to boost the throughput),
 * and yet guarantee a low latency to interactive and soft real-time
 * applications.
 *
 * In particular, to provide these low-latency guarantees, BFQ
 * explicitly privileges the I/O of two classes of time-sensitive
 * applications: interactive and soft real-time. This feature enables
 * BFQ to provide applications in these classes with a very low
 * latency. Finally, BFQ also features additional heuristics for
 * preserving both a low latency and a high throughput on NCQ-capable,
 * rotational or flash-based devices, and to get the job done quickly
 * for applications consisting of many I/O-bound processes.
 *
 * BFQ is described in [1], which also contains a reference to the
 * initial, more theoretical paper on BFQ. The interested reader can
 * find in the latter paper full details on the main algorithm, as
 * well as formulas of the guarantees and formal proofs of all the
 * properties. With respect to the version of BFQ presented in these
 * papers, this implementation adds a few more heuristics, such as the
 * one that guarantees a low latency to soft real-time applications,
 * and a hierarchical extension based on H-WF2Q+.
 *
 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
 * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
 * with O(log N) complexity derives from the one introduced with EEVDF
 * in [3].
 *
 * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
 *     Scheduler", Proceedings of the First Workshop on Mobile System
 *     Technologies (MST-2015), May 2015.
 *     http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
 *
 * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
 *     Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
 *     Oct 1997.
 *     http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
 *
 * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
 *     First: A Flexible and Accurate Mechanism for Proportional Share
 *     Resource Allocation", technical report.
 *     http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
 */
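
/*
 * Illustrative note (not part of the scheduler): per the description
 * above, B-WF2Q+ timestamps each queue with a virtual finish time
 *
 *	F_i = S_i + budget_i / weight_i
 *
 * and always serves the eligible queue with the smallest F_i. For
 * example, two always-backlogged queues with equal budgets and
 * weights 100 and 200 accumulate finish times at a 2:1 ratio, so the
 * second queue receives roughly two thirds of the device throughput.
 */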
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>
#include <linux/blk-cgroup.h>
#define BFQ_IOPRIO_CLASSES	3
#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

#define BFQ_MIN_WEIGHT			1
#define BFQ_MAX_WEIGHT			1000
#define BFQ_WEIGHT_CONVERSION_COEFF	10

#define BFQ_DEFAULT_QUEUE_IOPRIO	4

#define BFQ_WEIGHT_LEGACY_DFL	100
#define BFQ_DEFAULT_GRP_IOPRIO	0
#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

/*
 * Soft real-time applications are far more latency-sensitive than
 * interactive ones. Over-raise the weight of the former to privilege
 * them over the latter.
 */
#define BFQ_SOFTRT_WEIGHT_FACTOR	100
/**
 * struct bfq_service_tree - per ioprio_class service tree.
 *
 * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
 * ioprio_class has its own independent scheduler, and so its own
 * bfq_service_tree.  All the fields are protected by the queue lock
 * of the containing bfqd.
 */
struct bfq_service_tree {
	/* tree for active entities (i.e., those backlogged) */
	struct rb_root active;
	/* tree for idle entities (i.e., not backlogged, with V <= F_i) */
	struct rb_root idle;

	/* idle entity with minimum F_i */
	struct bfq_entity *first_idle;
	/* idle entity with maximum F_i */
	struct bfq_entity *last_idle;

	/* scheduler virtual time */
	u64 vtime;
	/* scheduler weight sum; active and idle entities contribute to it */
	unsigned long wsum;
};
/**
 * struct bfq_sched_data - multi-class scheduler.
 *
 * bfq_sched_data is the basic scheduler queue.  It supports three
 * ioprio_classes, and can be used either as a toplevel queue or as an
 * intermediate queue in a hierarchical setup.  @next_in_service
 * points to the active entity of the sched_data service trees that
 * will be scheduled next. It is used to reduce the number of steps
 * needed for each hierarchical-schedule update.
 *
 * The supported ioprio_classes are the same as in CFQ, in descending
 * priority order: IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
 * Requests from higher priority queues are served before all the
 * requests from lower priority queues; among requests of the same
 * queue, requests are served according to B-WF2Q+.
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_sched_data {
	/* entity in service */
	struct bfq_entity *in_service_entity;
	/* head-of-line entity (see comments above) */
	struct bfq_entity *next_in_service;
	/* array of service trees, one per ioprio_class */
	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
	/* last time CLASS_IDLE was served */
	unsigned long bfq_class_idle_last_service;
};
/**
 * struct bfq_weight_counter - counter of the number of all active entities
 *                             with a given weight.
 */
struct bfq_weight_counter {
	unsigned int weight; /* weight of the entities this counter refers to */
	unsigned int num_active; /* nr of active entities with this weight */
	/*
	 * Weights tree member (see bfq_data's @queue_weights_tree and
	 * @group_weights_tree)
	 */
	struct rb_node weights_node;
};
/**
 * struct bfq_entity - schedulable entity.
 *
 * A bfq_entity is used to represent either a bfq_queue (leaf node in the
 * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
 * entity belongs to the sched_data of the parent group in the cgroup
 * hierarchy. Non-leaf entities have also their own sched_data, stored
 * in @my_sched_data.
 *
 * Each entity stores independently its priority values; this would
 * allow different weights on different devices, but this
 * functionality is not exported to userspace by now.  Priorities and
 * weights are updated lazily, first storing the new values into the
 * new_* fields, then setting the @prio_changed flag.  As soon as
 * there is a transition in the entity state that allows the priority
 * update to take place, the effective and the requested priority
 * values are synchronized.
 *
 * Unless cgroups are used, the weight value is calculated from the
 * ioprio to export the same interface as CFQ.  When dealing with
 * ``well-behaved'' queues (i.e., queues that do not spend too much
 * time to consume their budget and have true sequential behavior, and
 * when there are no external factors breaking anticipation), the
 * relative weights at each level of the cgroups hierarchy should be
 * guaranteed.  All the fields are protected by the queue lock of the
 * containing bfqd.
 */
struct bfq_entity {
	/* service_tree member */
	struct rb_node rb_node;
	/* pointer to the weight counter associated with this entity */
	struct bfq_weight_counter *weight_counter;

	/*
	 * Flag, true if the entity is on a tree (either the active or
	 * the idle one of its service_tree) or is in service.
	 */
	bool on_st;

	/* B-WF2Q+ start and finish timestamps [sectors/weight] */
	u64 start, finish;

	/* tree the entity is enqueued into; %NULL if not on a tree */
	struct rb_root *tree;

	/*
	 * minimum start time of the (active) subtree rooted at this
	 * entity; used for O(log N) lookups into active trees
	 */
	u64 min_start;

	/* amount of service received during the last service slot */
	int service;

	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
	int budget;

	/* weight of the queue */
	int weight;
	/* next weight if a change is in progress */
	int new_weight;

	/* original weight, used to implement weight boosting */
	int orig_weight;

	/* parent entity, for hierarchical scheduling */
	struct bfq_entity *parent;

	/*
	 * For non-leaf nodes in the hierarchy, the associated
	 * scheduler queue, %NULL on leaf nodes.
	 */
	struct bfq_sched_data *my_sched_data;
	/* the scheduler queue this entity belongs to */
	struct bfq_sched_data *sched_data;

	/* flag, set to request a weight, ioprio or ioprio_class change */
	int prio_changed;
};
/**
 * struct bfq_ttime - per process thinktime stats.
 */
struct bfq_ttime {
	/* completion time of the last request */
	u64 last_end_request;

	/* total process thinktime */
	u64 ttime_total;
	/* number of thinktime samples */
	unsigned long ttime_samples;
	/* average process thinktime */
	u64 ttime_mean;
};
/**
 * struct bfq_queue - leaf schedulable entity.
 *
 * A bfq_queue is a leaf request queue; it can be associated with an
 * io_context or more, if it is async or shared between cooperating
 * processes. @cgroup holds a reference to the cgroup, to be sure that it
 * does not disappear while a bfqq still references it (mostly to avoid
 * races between request issuing and task migration followed by cgroup
 * destruction).
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_queue {
	/* reference counter */
	int ref;
	/* parent bfq_data */
	struct bfq_data *bfqd;

	/* current ioprio and ioprio class */
	unsigned short ioprio, ioprio_class;
	/* next ioprio and ioprio class if a change is in progress */
	unsigned short new_ioprio, new_ioprio_class;

	/*
	 * Shared bfq_queue if queue is cooperating with one or more
	 * other queues.
	 */
	struct bfq_queue *new_bfqq;
	/* request-position tree member (see bfq_group's @rq_pos_tree) */
	struct rb_node pos_node;
	/* request-position tree root (see bfq_group's @rq_pos_tree) */
	struct rb_root *pos_root;

	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* number of sync and async requests queued */
	int queued[2];
	/* number of requests currently allocated */
	int allocated;
	/* number of pending metadata requests */
	int meta_pending;
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* entity representing this queue in the scheduler */
	struct bfq_entity entity;

	/* maximum budget allowed from the feedback mechanism */
	int max_budget;
	/* budget expiration (in jiffies) */
	unsigned long budget_timeout;

	/* number of requests on the dispatch list or inside the driver */
	int dispatched;

	/* status flags */
	unsigned long flags;

	/* node for active/idle bfqq list inside parent bfqd */
	struct list_head bfqq_list;

	/* associated @bfq_ttime struct */
	struct bfq_ttime ttime;

	/* bit vector: a 1 for each seeky request in history */
	u32 seek_history;

	/* node for the device's burst list */
	struct hlist_node burst_list_node;

	/* position of the last request enqueued */
	sector_t last_request_pos;

	/* Number of consecutive pairs of request completion and
	 * arrival, such that the queue becomes idle after the
	 * completion, but the next request arrives within an idle
	 * time slice; used only if the queue's IO_bound flag has been
	 * cleared.
	 */
	unsigned int requests_within_timer;

	/* pid of the process owning the queue, used for logging purposes */
	pid_t pid;

	/*
	 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
	 * if the queue is shared.
	 */
	struct bfq_io_cq *bic;

	/* current maximum weight-raising time for this queue */
	unsigned long wr_cur_max_time;
	/*
	 * Minimum time instant such that, only if a new request is
	 * enqueued after this time instant in an idle @bfq_queue with
	 * no outstanding requests, then the task associated with the
	 * queue is deemed as soft real-time (see the comments on
	 * the function bfq_bfqq_softrt_next_start())
	 */
	unsigned long soft_rt_next_start;
	/*
	 * Start time of the current weight-raising period if
	 * the @bfq-queue is being weight-raised, otherwise
	 * finish time of the last weight-raising period.
	 */
	unsigned long last_wr_start_finish;
	/* factor by which the weight of this queue is multiplied */
	unsigned int wr_coeff;
	/*
	 * Time of the last transition of the @bfq_queue from idle to
	 * backlogged.
	 */
	unsigned long last_idle_bklogged;
	/*
	 * Cumulative service received from the @bfq_queue since the
	 * last transition from idle to backlogged.
	 */
	unsigned long service_from_backlogged;

	/*
	 * Value of wr start time when switching to soft rt
	 */
	unsigned long wr_start_at_switch_to_srt;

	unsigned long split_time; /* time of last split */
};
/**
 * struct bfq_io_cq - per (request_queue, io_context) structure.
 */
struct bfq_io_cq {
	/* associated io_cq structure */
	struct io_cq icq; /* must be the first member */
	/* array of two process queues, the sync and the async */
	struct bfq_queue *bfqq[2];
	/* per (request_queue, blkcg) ioprio */
	int ioprio;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	uint64_t blkcg_serial_nr; /* the current blkcg serial */
#endif

	/*
	 * Snapshot of the idle window before merging; taken to
	 * remember this value while the queue is merged, so as to be
	 * able to restore it in case of split.
	 */
	bool saved_idle_window;
	/*
	 * Same purpose as the previous two fields for the I/O bound
	 * classification of a queue.
	 */
	bool saved_IO_bound;

	/*
	 * Same purpose as the previous fields for the value of the
	 * field keeping the queue's belonging to a large burst
	 */
	bool saved_in_large_burst;
	/*
	 * True if the queue belonged to a burst list before its merge
	 * with another cooperating queue.
	 */
	bool was_in_burst_list;

	/*
	 * Similar to previous fields: save wr information.
	 */
	unsigned long saved_wr_coeff;
	unsigned long saved_last_wr_start_finish;
	unsigned long saved_wr_start_at_switch_to_srt;
	unsigned int saved_wr_cur_max_time;
	struct bfq_ttime saved_ttime;
};
enum bfq_device_speed {
	BFQ_BFQD_FAST,
	BFQ_BFQD_SLOW,
};
/**
 * struct bfq_data - per-device data structure.
 *
 * All the fields are protected by @lock.
 */
struct bfq_data {
	/* device request queue */
	struct request_queue *queue;
	/* dispatch queue */
	struct list_head dispatch;

	/* root bfq_group for the device */
	struct bfq_group *root_group;

	/*
	 * rbtree of weight counters of @bfq_queues, sorted by
	 * weight. Used to keep track of whether all @bfq_queues have
	 * the same weight. The tree contains one counter for each
	 * distinct weight associated to some active and not
	 * weight-raised @bfq_queue (see the comments to the functions
	 * bfq_weights_tree_[add|remove] for further details).
	 */
	struct rb_root queue_weights_tree;
	/*
	 * rbtree of non-queue @bfq_entity weight counters, sorted by
	 * weight. Used to keep track of whether all @bfq_groups have
	 * the same weight. The tree contains one counter for each
	 * distinct weight associated to some active @bfq_group (see
	 * the comments to the functions bfq_weights_tree_[add|remove]
	 * for further details).
	 */
	struct rb_root group_weights_tree;

	/*
	 * Number of bfq_queues containing requests (including the
	 * queue in service, even if it is idling).
	 */
	int busy_queues;
	/* number of weight-raised busy @bfq_queues */
	int wr_busy_queues;
	/* number of queued requests */
	int queued;
	/* number of requests dispatched and waiting for completion */
	int rq_in_driver;

	/*
	 * Maximum number of requests in driver in the last
	 * @hw_tag_samples completed requests.
	 */
	int max_rq_in_driver;
	/* number of samples used to calculate hw_tag */
	int hw_tag_samples;
	/* flag set to one if the driver is showing a queueing behavior */
	int hw_tag;

	/* number of budgets assigned */
	int budgets_assigned;

	/*
	 * Timer set when idling (waiting) for the next request from
	 * the queue in service.
	 */
	struct hrtimer idle_slice_timer;

	/* bfq_queue in service */
	struct bfq_queue *in_service_queue;
	/* bfq_io_cq (bic) associated with the @in_service_queue */
	struct bfq_io_cq *in_service_bic;

	/* on-disk position of the last served request */
	sector_t last_position;

	/* time of last request completion (ns) */
	u64 last_completion;

	/* time of first rq dispatch in current observation interval (ns) */
	u64 first_dispatch;
	/* time of last rq dispatch in current observation interval (ns) */
	u64 last_dispatch;

	/* beginning of the last budget */
	ktime_t last_budget_start;
	/* beginning of the last idle slice */
	ktime_t last_idling_start;

	/* number of samples in current observation interval */
	int peak_rate_samples;
	/* num of samples of seq dispatches in current observation interval */
	u32 sequential_samples;
	/* total num of sectors transferred in current observation interval */
	u64 tot_sectors_dispatched;
	/* max rq size seen during current observation interval (sectors) */
	u32 last_rq_max_size;
	/* time elapsed from first dispatch in current observ. interval (us) */
	u64 delta_from_first;
	/*
	 * Current estimate of the device peak rate, measured in
	 * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
	 * BFQ_RATE_SHIFT is performed to increase precision in
	 * fixed-point calculations.
	 */
	u32 peak_rate;

	/* maximum budget allotted to a bfq_queue before rescheduling */
	int bfq_max_budget;

	/* list of all the bfq_queues active on the device */
	struct list_head active_list;
	/* list of all the bfq_queues idle on the device */
	struct list_head idle_list;

	/*
	 * Timeout for async/sync requests; when it fires, requests
	 * are served in fifo order.
	 */
	u64 bfq_fifo_expire[2];
	/* weight of backward seeks wrt forward ones */
	unsigned int bfq_back_penalty;
	/* maximum allowed backward seek */
	unsigned int bfq_back_max;
	/* maximum idling time */
	u32 bfq_slice_idle;

	/* user-configured max budget value (0 for auto-tuning) */
	int bfq_user_max_budget;
	/*
	 * Timeout for bfq_queues to consume their budget; used to
	 * prevent seeky queues from imposing long latencies to
	 * sequential or quasi-sequential ones (this also implies that
	 * seeky queues cannot receive guarantees in the service
	 * domain; after a timeout they are charged for the time they
	 * have been in service, to preserve fairness among them, but
	 * without service-domain guarantees).
	 */
	unsigned int bfq_timeout;

	/*
	 * Number of consecutive requests that must be issued within
	 * the idle time slice to set again idling to a queue which
	 * was marked as non-I/O-bound (see the definition of the
	 * IO_bound flag for further details).
	 */
	unsigned int bfq_requests_within_timer;

	/*
	 * Force device idling whenever needed to provide accurate
	 * service guarantees, without caring about throughput
	 * issues. CAVEAT: this may even increase latencies, in case
	 * of useless idling for processes that did stop doing I/O.
	 */
	bool strict_guarantees;

	/*
	 * Last time at which a queue entered the current burst of
	 * queues being activated shortly after each other; for more
	 * details about this and the following parameters related to
	 * a burst of activations, see the comments on the function
	 * bfq_handle_burst.
	 */
	unsigned long last_ins_in_burst;
	/*
	 * Reference time interval used to decide whether a queue has
	 * been activated shortly after @last_ins_in_burst.
	 */
	unsigned long bfq_burst_interval;
	/* number of queues in the current burst of queue activations */
	int burst_size;

	/* common parent entity for the queues in the burst */
	struct bfq_entity *burst_parent_entity;
	/* Maximum burst size above which the current queue-activation
	 * burst is deemed as 'large'.
	 */
	unsigned long bfq_large_burst_thresh;
	/* true if a large queue-activation burst is in progress */
	bool large_burst;
	/*
	 * Head of the burst list (as for the above fields, more
	 * details in the comments on the function bfq_handle_burst).
	 */
	struct hlist_head burst_list;

	/* if set to true, low-latency heuristics are enabled */
	bool low_latency;
	/*
	 * Maximum factor by which the weight of a weight-raised queue
	 * is multiplied.
	 */
	unsigned int bfq_wr_coeff;
	/* maximum duration of a weight-raising period (jiffies) */
	unsigned int bfq_wr_max_time;

	/* Maximum weight-raising duration for soft real-time processes */
	unsigned int bfq_wr_rt_max_time;
	/*
	 * Minimum idle period after which weight-raising may be
	 * reactivated for a queue (in jiffies).
	 */
	unsigned int bfq_wr_min_idle_time;
	/*
	 * Minimum period between request arrivals after which
	 * weight-raising may be reactivated for an already busy async
	 * queue (in jiffies).
	 */
	unsigned long bfq_wr_min_inter_arr_async;

	/* Max service-rate for a soft real-time queue, in sectors/sec */
	unsigned int bfq_wr_max_softrt_rate;
	/*
	 * Cached value of the product R*T, used for computing the
	 * maximum duration of weight raising automatically.
	 */
	u64 RT_prod;
	/* device-speed class for the low-latency heuristic */
	enum bfq_device_speed device_speed;

	/* fallback dummy bfqq for extreme OOM conditions */
	struct bfq_queue oom_bfqq;

	spinlock_t lock;

	/*
	 * bic associated with the task issuing current bio for
	 * merging. This and the next field are used as a support to
	 * be able to perform the bic lookup, needed by bio-merge
	 * functions, before the scheduler lock is taken, and thus
	 * avoid taking the request-queue lock while the scheduler
	 * lock is being held.
	 */
	struct bfq_io_cq *bio_bic;
	/* bfqq associated with the task issuing current bio for merging */
	struct bfq_queue *bio_bfqq;

	/*
	 * io context to put right after bfqd->lock is released. This
	 * field is used to perform put_io_context, when needed, only
	 * after the scheduler lock has been released, and thus
	 * prevent an ioc->lock from being possibly taken while the
	 * scheduler lock is being held.
	 */
	struct io_context *ioc_to_put;
};
enum bfqq_state_flags {
	BFQQF_just_created = 0,	/* queue just allocated */
	BFQQF_busy,		/* has requests or is in service */
	BFQQF_wait_request,	/* waiting for a request */
	BFQQF_non_blocking_wait_rq, /*
				     * waiting for a request
				     * without idling the device
				     */
	BFQQF_fifo_expire,	/* FIFO checked in this slice */
	BFQQF_idle_window,	/* slice idling enabled */
	BFQQF_sync,		/* synchronous queue */
	BFQQF_IO_bound,		/*
				 * bfqq has timed-out at least once
				 * having consumed at most 2/10 of
				 * its budget
				 */
	BFQQF_in_large_burst,	/*
				 * bfqq activated in a large burst,
				 * see comments to bfq_handle_burst.
				 */
	BFQQF_softrt_update,	/*
				 * may need softrt-next-start
				 * update
				 */
	BFQQF_coop,		/* bfqq is shared */
	BFQQF_split_coop	/* shared bfqq will be split */
};
#define BFQ_BFQQ_FNS(name)						\
static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__set_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__clear_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
{									\
	return test_bit(BFQQF_##name, &(bfqq)->flags);			\
}

BFQ_BFQQ_FNS(just_created);
BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(non_blocking_wait_rq);
BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(idle_window);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(IO_bound);
BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
#undef BFQ_BFQQ_FNS
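
/*
 * Example (illustration only): the expansions above generate, e.g.,
 * bfq_mark_bfqq_sync(), bfq_clear_bfqq_sync() and bfq_bfqq_sync() as
 * non-atomic accessors for BFQQF_sync in bfqq->flags; the logging
 * macros below rely on bfq_bfqq_sync() to tag trace messages with
 * 'S' (sync) or 'A' (async).
 */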
/* Logging facilities. */
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
			  __pbuf, ##args);				\
} while (0)

#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
} while (0)

#else /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\
			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
			  ##args)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)

#endif /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log(bfqd, fmt, args...) \
	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
/* Expiration reasons. */
enum bfqq_expiration {
	BFQQE_TOO_IDLE = 0,	/*
				 * queue has been idling for
				 * too long
				 */
	BFQQE_BUDGET_TIMEOUT,	/* budget took too long to be used */
	BFQQE_BUDGET_EXHAUSTED,	/* budget consumed */
	BFQQE_NO_MORE_REQUESTS,	/* the queue has no more requests */
	BFQQE_PREEMPTED		/* preemption in progress */
};
struct bfqg_stats {
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	/* number of ios merged */
	struct blkg_rwstat merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat queued;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	uint64_t start_group_wait_time;
	uint64_t start_idle_time;
	uint64_t start_empty_time;
	uint16_t flags;
#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED

/**
 * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
 *
 * @ps: @blkcg_policy_storage that this structure inherits
 * @weight: weight of the bfq_group
 */
struct bfq_group_data {
	/* must be the first member */
	struct blkcg_policy_data pd;

	unsigned int weight;
};
/**
 * struct bfq_group - per (device, cgroup) data structure.
 * @entity: schedulable entity to insert into the parent group sched_data.
 * @sched_data: own sched_data, to contain child entities (they may be
 *              both bfq_queues and bfq_groups).
 * @bfqd: the bfq_data for the device this group acts upon.
 * @async_bfqq: array of async queues for all the tasks belonging to
 *              the group, one queue per ioprio value per ioprio_class,
 *              except for the idle class that has only one queue.
 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
 *             to avoid too many special cases during group creation/
 *             migration.
 * @stats: stats for this bfqg.
 * @active_entities: number of active entities belonging to the group;
 *                   unused for the root group. Used to know whether there
 *                   are groups with more than one active @bfq_entity
 *                   (see the comments on the function
 *                   bfq_bfqq_may_idle()).
 * @rq_pos_tree: rbtree sorted by next_request position, used when
 *               determining if two or more queues have interleaving
 *               requests (see bfq_find_close_cooperator()).
 *
 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
 * there is a set of bfq_groups, each one collecting the lower-level
 * entities belonging to the group that are acting on the same device.
 *
 * Locking works as follows:
 *    o @bfqd is protected by the queue lock, RCU is used to access it
 *      from the readers.
 *    o All the other fields are protected by the @bfqd queue lock.
 */
struct bfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	struct bfq_entity entity;
	struct bfq_sched_data sched_data;

	void *bfqd;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct bfq_entity *my_entity;

	int active_entities;

	struct rb_root rq_pos_tree;

	struct bfqg_stats stats;
};

#else /* CONFIG_BFQ_GROUP_IOSCHED */

struct bfq_group {
	struct bfq_sched_data sched_data;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct rb_root rq_pos_tree;
};
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);

static unsigned int bfq_class_idx(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	return bfqq ? bfqq->ioprio_class - 1 :
		BFQ_DEFAULT_GRP_CLASS - 1;
}

static struct bfq_service_tree *
bfq_entity_service_tree(struct bfq_entity *entity)
{
	struct bfq_sched_data *sched_data = entity->sched_data;
	unsigned int idx = bfq_class_idx(entity);

	return sched_data->service_tree + idx;
}

static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	return bic->bfqq[is_sync];
}

static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
			 bool is_sync)
{
	bic->bfqq[is_sync] = bfqq;
}

static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
{
	return bic->icq.q->elevator->elevator_data;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED

static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
{
	struct bfq_entity *group_entity = bfqq->entity.parent;

	if (!group_entity)
		group_entity = &bfqq->bfqd->root_group->entity;

	return container_of(group_entity, struct bfq_group, entity);
}

#else

static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
{
	return bfqq->bfqd->root_group;
}

#endif

static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
static void bfq_put_queue(struct bfq_queue *bfqq);
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic);
static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
				    struct bfq_group *bfqg);
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
/* Expiration time of sync (0) and async (1) requests, in ns. */
static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };

/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
static const int bfq_back_max = 16 * 1024;

/* Penalty of a backwards seek, in number of sectors. */
static const int bfq_back_penalty = 2;

/* Idling period duration, in ns. */
static u64 bfq_slice_idle = NSEC_PER_SEC / 125;

/* Minimum number of assigned budgets for which stats are safe to compute. */
static const int bfq_stats_min_budgets = 194;

/* Default maximum budget values, in sectors and number of requests. */
static const int bfq_default_max_budget = 16 * 1024;

/*
 * Async to sync throughput distribution is controlled as follows:
 * when an async request is served, the entity is charged the number
 * of sectors of the request, multiplied by the factor below
 */
static const int bfq_async_charge_factor = 10;

/* Default timeout values, in jiffies, approximating CFQ defaults. */
static const int bfq_timeout = HZ / 8;

static struct kmem_cache *bfq_pool;

/* Below this threshold (in ns), we consider thinktime immediate. */
#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)

/* hw_tag detection: parallel requests threshold and min samples needed. */
#define BFQ_HW_QUEUE_THRESHOLD	4
#define BFQ_HW_QUEUE_SAMPLES	32

#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)

/* Min number of samples required to perform peak-rate update */
#define BFQ_RATE_MIN_SAMPLES	32
/* Min observation time interval required to perform a peak-rate update (ns) */
#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
/* Target observation time interval for a peak-rate update (ns) */
#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC

/* Shift used for peak-rate fixed-precision calculations. */
#define BFQ_RATE_SHIFT		16
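
/*
 * Example (illustration only): with BFQ_RATE_SHIFT == 16, a measured
 * peak rate of 100 sectors/usec is stored as 100 << 16 == 6553600, so
 * rates well below one sector/usec still retain 16 fractional bits of
 * precision in the integer-only computations that use them.
 */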
/*
 * By default, BFQ computes the duration of the weight raising for
 * interactive applications automatically, using the following formula:
 * duration = (R / r) * T, where r is the peak rate of the device, and
 * R and T are two reference parameters.
 * In particular, R is the peak rate of the reference device (see below),
 * and T is a reference time: given the systems that are likely to be
 * installed on the reference device according to its speed class, T is
 * about the maximum time needed, under BFQ and while reading two files in
 * parallel, to load typical large applications on these systems.
 * In practice, the slower/faster the device at hand is, the more/less it
 * takes to load applications with respect to the reference device.
 * Accordingly, the longer/shorter BFQ grants weight raising to interactive
 * applications.
 *
 * BFQ uses four different reference pairs (R, T), depending on:
 * . whether the device is rotational or non-rotational;
 * . whether the device is slow, such as old or portable HDDs, as well as
 *   SD cards, or fast, such as newer HDDs and SSDs.
 *
 * The device's speed class is dynamically (re)detected in
 * bfq_update_peak_rate() every time the estimated peak rate is updated.
 *
 * In the following definitions, R_slow[0]/R_fast[0] and
 * T_slow[0]/T_fast[0] are the reference values for a slow/fast
 * rotational device, whereas R_slow[1]/R_fast[1] and
 * T_slow[1]/T_fast[1] are the reference values for a slow/fast
 * non-rotational device. Finally, device_speed_thresh are the
 * thresholds used to switch between speed classes. The reference
 * rates are not the actual peak rates of the devices used as a
 * reference, but slightly lower values. The reason for using these
 * slightly lower values is that the peak-rate estimator tends to
 * yield slightly lower values than the actual peak rate (it can yield
 * the actual peak rate only if there is only one process doing I/O,
 * and the process does sequential I/O).
 *
 * Both the reference peak rates and the thresholds are measured in
 * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
 */
static int R_slow[2] = {1000, 10700};
static int R_fast[2] = {14000, 33000};

/*
 * To improve readability, a conversion function is used to initialize
 * the following arrays, which entails that they can be initialized
 * only in a function.
 */
static int T_slow[2];
static int T_fast[2];
static int device_speed_thresh[2];
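
/*
 * Minimal sketch (illustration only; the helper name is hypothetical,
 * the real computation lives in the weight-raising code) of the
 * duration formula above. Assuming r and R are expressed in the same
 * fixed-point unit ([BFQ_RATE_SHIFT * sectors/usec]) and T in jiffies,
 * the shifts cancel out in the division and the result is directly a
 * number of jiffies.
 */
static inline unsigned long bfq_example_wr_duration(u64 r, u64 R, u64 T)
{
	/* duration = (R / r) * T, computed as (R * T) / r for precision */
	return (unsigned long)div64_u64(R * T, r);
}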
#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
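
/*
 * Example (illustration only): once a request has been prepared by
 * bfq, the owning bic and queue can be recovered anywhere in the
 * hotpath as RQ_BIC(rq) and RQ_BFQQ(rq), since the two elv.priv slots
 * of the request are expected to hold exactly those two pointers.
 */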
/**
 * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
 * @icq: the iocontext queue.
 */
static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
{
	/* bic->icq is the first member, %NULL will convert to %NULL */
	return container_of(icq, struct bfq_io_cq, icq);
}
/**
 * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
 * @bfqd: the lookup key.
 * @ioc: the io_context of the process doing I/O.
 * @q: the request queue.
 */
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
					struct io_context *ioc,
					struct request_queue *q)
{
	if (ioc) {
		unsigned long flags;
		struct bfq_io_cq *icq;

		spin_lock_irqsave(q->queue_lock, flags);
		icq = icq_to_bic(ioc_lookup_icq(ioc, q));
		spin_unlock_irqrestore(q->queue_lock, flags);

		return icq;
	}

	return NULL;
}
/*
 * Scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing.
 */
static void bfq_schedule_dispatch(struct bfq_data *bfqd)
{
	if (bfqd->queued != 0) {
		bfq_log(bfqd, "schedule dispatch");
		blk_mq_run_hw_queues(bfqd->queue, true);
	}
}
/*
 * Next two functions release bfqd->lock and put the io context
 * pointed to by bfqd->ioc_to_put. This delayed put is used to not
 * risk taking an ioc->lock while the scheduler lock is being held.
 */
static void bfq_unlock_put_ioc(struct bfq_data *bfqd)
{
	struct io_context *ioc_to_put = bfqd->ioc_to_put;

	bfqd->ioc_to_put = NULL;
	spin_unlock_irq(&bfqd->lock);

	if (ioc_to_put)
		put_io_context(ioc_to_put);
}

static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd,
				       unsigned long flags)
{
	struct io_context *ioc_to_put = bfqd->ioc_to_put;

	bfqd->ioc_to_put = NULL;
	spin_unlock_irqrestore(&bfqd->lock, flags);

	if (ioc_to_put)
		put_io_context(ioc_to_put);
}
/**
 * bfq_gt - compare two timestamps.
 * @a: first ts.
 * @b: second ts.
 *
 * Return @a > @b, dealing with wrapping correctly.
 */
static int bfq_gt(u64 a, u64 b)
{
	return (s64)(a - b) > 0;
}
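
/*
 * Example (illustration only): the signed difference makes the
 * comparison robust to wraparound. With 64-bit timestamps,
 * bfq_gt(2, ULLONG_MAX) returns 1 because (s64)(2 - ULLONG_MAX) == 3,
 * i.e., 2 is correctly treated as "later" than a just-wrapped
 * ULLONG_MAX.
 */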
static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
{
	struct rb_node *node = tree->rb_node;

	return rb_entry(node, struct bfq_entity, rb_node);
}

static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
/*
 * bfq_update_next_in_service - update sd->next_in_service
 * @sd: sched_data for which to perform the update.
 * @new_entity: if not NULL, pointer to the entity whose activation,
 *		requeueing or repositioning triggered the invocation of
 *		this function.
 *
 * This function is called to update sd->next_in_service, which, in
 * its turn, may change as a consequence of the insertion or
 * extraction of an entity into/from one of the active trees of
 * sd. These insertions/extractions occur as a consequence of
 * activations/deactivations of entities, with some activations being
 * 'true' activations, and other activations being requeueings (i.e.,
 * implementing the second, requeueing phase of the mechanism used to
 * reposition an entity in its active tree; see comments on
 * __bfq_activate_entity and __bfq_requeue_entity for details). In
 * both the last two activation sub-cases, new_entity points to the
 * just activated or requeued entity.
 *
 * Returns true if sd->next_in_service changes in such a way that
 * entity->parent may become the next_in_service for its parent
 * entity.
 */
static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
				       struct bfq_entity *new_entity)
{
	struct bfq_entity *next_in_service = sd->next_in_service;
	bool parent_sched_may_change = false;

	/*
	 * If this update is triggered by the activation, requeueing
	 * or repositioning of an entity that does not coincide with
	 * sd->next_in_service, then a full lookup in the active tree
	 * can be avoided. In fact, it is enough to check whether the
	 * just-modified entity has a higher priority than
	 * sd->next_in_service, or, even if it has the same priority
	 * as sd->next_in_service, is eligible and has a lower virtual
	 * finish time than sd->next_in_service. If this compound
	 * condition holds, then the new entity becomes the new
	 * next_in_service. Otherwise no change is needed.
	 */
	if (new_entity && new_entity != sd->next_in_service) {
		/*
		 * Flag used to decide whether to replace
		 * sd->next_in_service with new_entity. Tentatively
		 * set to true, and left as true if
		 * sd->next_in_service is NULL.
		 */
		bool replace_next = true;

		/*
		 * If there is already a next_in_service candidate
		 * entity, then compare class priorities or timestamps
		 * to decide whether to replace sd->next_in_service
		 * with new_entity.
		 */
		if (next_in_service) {
			unsigned int new_entity_class_idx =
				bfq_class_idx(new_entity);
			struct bfq_service_tree *st =
				sd->service_tree + new_entity_class_idx;

			/*
			 * For efficiency, evaluate the most likely
			 * sub-condition first.
			 */
			replace_next =
				(new_entity_class_idx ==
				 bfq_class_idx(next_in_service)
				 &&
				 !bfq_gt(new_entity->start, st->vtime)
				 &&
				 bfq_gt(next_in_service->finish,
					new_entity->finish))
				||
				new_entity_class_idx <
				bfq_class_idx(next_in_service);
		}

		if (replace_next)
			next_in_service = new_entity;
	} else /* invoked because of a deactivation: lookup needed */
		next_in_service = bfq_lookup_next_entity(sd);

	if (next_in_service) {
		parent_sched_may_change = !sd->next_in_service ||
			bfq_update_parent_budget(next_in_service);
	}

	sd->next_in_service = next_in_service;

	if (!next_in_service)
		return parent_sched_may_change;

	return parent_sched_may_change;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED
/* both next loops stop at one of the child entities of the root group */
#define for_each_entity(entity)	\
	for (; entity ; entity = entity->parent)

/*
 * For each iteration, compute parent in advance, so as to be safe if
 * entity is deallocated during the iteration. Such a deallocation may
 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
 * containing entity.
 */
#define for_each_entity_safe(entity, parent) \
	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
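
/*
 * Usage sketch (illustration only; the function name is hypothetical):
 * for_each_entity_safe() samples the parent pointer before the loop
 * body runs, so the body may drop the last reference to the entity
 * (e.g., via bfq_put_queue()) without breaking the upward walk.
 */
static void __maybe_unused bfq_example_walk_up(struct bfq_entity *entity)
{
	struct bfq_entity *parent;

	for_each_entity_safe(entity, parent) {
		/* entity may be freed by the time this iteration ends */
	}
}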
/*
 * Returns true if this budget change may let next_in_service->parent
 * become the next_in_service entity for its parent entity.
 */
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	struct bfq_entity *bfqg_entity;
	struct bfq_group *bfqg;
	struct bfq_sched_data *group_sd;
	bool ret = false;

	group_sd = next_in_service->sched_data;

	bfqg = container_of(group_sd, struct bfq_group, sched_data);
	/*
	 * bfq_group's my_entity field is not NULL only if the group
	 * is not the root group. We must not touch the root entity
	 * as it must never become an in-service entity.
	 */
	bfqg_entity = bfqg->my_entity;
	if (bfqg_entity) {
		if (bfqg_entity->budget > next_in_service->budget)
			ret = true;
		bfqg_entity->budget = next_in_service->budget;
	}

	return ret;
}
/*
 * This function tells whether entity stops being a candidate for next
 * service, according to the following logic.
 *
 * This function is invoked for an entity that is about to be set in
 * service. If such an entity is a queue, then the entity is no longer
 * a candidate for next service (i.e., a candidate entity to serve
 * after the in-service entity is expired). The function then returns
 * true.
 *
 * In contrast, the entity could still be a candidate for next service
 * if it is not a queue, and has more than one child. In fact, even if
 * one of its children is about to be set in service, other children
 * may still be the next to serve. As a consequence, a non-queue
 * entity is not a candidate for next-service only if it has only one
 * child. And only if this condition holds, then the function returns
 * true for a non-queue entity.
 */
static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	struct bfq_group *bfqg;

	if (bfq_entity_to_bfqq(entity))
		return true;

	bfqg = container_of(entity, struct bfq_group, entity);

	if (bfqg->active_entities == 1)
		return true;

	return false;
}
#else /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Next two macros are fake loops when cgroups support is not
 * enabled. In fact, in such a case, there is only one level to go up
 * (to reach the root group).
 */
#define for_each_entity(entity)	\
	for (; entity ; entity = NULL)

#define for_each_entity_safe(entity, parent) \
	for (parent = NULL; entity ; entity = parent)

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	return false;
}

static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	return true;
}

#endif /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Shift for timestamp calculations. This actually limits the maximum
 * service allowed in one timestamp delta (small shift values increase it),
 * the maximum total weight that can be used for the queues in the system
 * (big shift values increase it), and the period of virtual time
 * wraparounds.
 */
#define WFQ_SERVICE_SHIFT	22

static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = NULL;

	if (!entity->my_sched_data)
		bfqq = container_of(entity, struct bfq_queue, entity);

	return bfqq;
}
/**
 * bfq_delta - map service into the virtual time domain.
 * @service: amount of service.
 * @weight: scale factor (weight of an entity or weight sum).
 */
static u64 bfq_delta(unsigned long service, unsigned long weight)
{
	u64 d = (u64)service << WFQ_SERVICE_SHIFT;

	do_div(d, weight);
	return d;
}
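
/*
 * Example (illustration only): with WFQ_SERVICE_SHIFT == 22, charging
 * 8 sectors to an entity of weight 40 yields a virtual-time delta of
 * (8 << 22) / 40 ~= 838860 units; doubling the weight halves the
 * delta, which is precisely what makes service proportional to
 * weight.
 */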
/**
 * bfq_calc_finish - assign the finish time to an entity.
 * @entity: the entity to act upon.
 * @service: the service to be charged to the entity.
 */
static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->finish = entity->start +
		bfq_delta(service, entity->weight);

	if (bfqq) {
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: serv %lu, w %d",
			service, entity->weight);
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: start %llu, finish %llu, delta %llu",
			entity->start, entity->finish,
			bfq_delta(service, entity->weight));
	}
}
/**
 * bfq_entity_of - get an entity from a node.
 * @node: the node field of the entity.
 *
 * Convert a node pointer to the relative entity. This is used only
 * to simplify the logic of some functions and not as the generic
 * conversion mechanism because, e.g., in the tree walking functions,
 * the check for a %NULL value would be redundant.
 */
static struct bfq_entity *bfq_entity_of(struct rb_node *node)
{
	struct bfq_entity *entity = NULL;

	if (node)
		entity = rb_entry(node, struct bfq_entity, rb_node);

	return entity;
}
/**
 * bfq_extract - remove an entity from a tree.
 * @root: the tree root.
 * @entity: the entity to remove.
 */
static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
{
	entity->tree = NULL;
	rb_erase(&entity->rb_node, root);
}
/**
 * bfq_idle_extract - extract an entity from the idle tree.
 * @st: the service tree of the owning @entity.
 * @entity: the entity being removed.
 */
static void bfq_idle_extract(struct bfq_service_tree *st,
			     struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *next;

	if (entity == st->first_idle) {
		next = rb_next(&entity->rb_node);
		st->first_idle = bfq_entity_of(next);
	}

	if (entity == st->last_idle) {
		next = rb_prev(&entity->rb_node);
		st->last_idle = bfq_entity_of(next);
	}

	bfq_extract(&st->idle, entity);

	if (bfqq)
		list_del(&bfqq->bfqq_list);
}
/**
 * bfq_insert - generic tree insertion.
 * @root: tree root.
 * @entity: entity to insert.
 *
 * This is used for the idle and the active tree, since they are both
 * ordered by finish time.
 */
static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
{
	struct bfq_entity *entry;
	struct rb_node **node = &root->rb_node;
	struct rb_node *parent = NULL;

	while (*node) {
		parent = *node;
		entry = rb_entry(parent, struct bfq_entity, rb_node);

		if (bfq_gt(entry->finish, entity->finish))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}

	rb_link_node(&entity->rb_node, parent, node);
	rb_insert_color(&entity->rb_node, root);

	entity->tree = root;
}
/**
 * bfq_update_min - update the min_start field of an entity.
 * @entity: the entity to update.
 * @node: one of its children.
 *
 * This function is called when @entity may store an invalid value for
 * min_start due to updates to the active tree. The function assumes
 * that the subtree rooted at @node (which may be its left or its right
 * child) has a valid min_start value.
 */
static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
{
	struct bfq_entity *child;

	if (node) {
		child = rb_entry(node, struct bfq_entity, rb_node);
		if (bfq_gt(entity->min_start, child->min_start))
			entity->min_start = child->min_start;
	}
}
/**
 * bfq_update_active_node - recalculate min_start.
 * @node: the node to update.
 *
 * @node may have changed position or one of its children may have moved,
 * this function updates its min_start value. The left and right subtrees
 * are assumed to hold a correct min_start value.
 */
static void bfq_update_active_node(struct rb_node *node)
{
	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);

	entity->min_start = entity->start;
	bfq_update_min(entity, node->rb_right);
	bfq_update_min(entity, node->rb_left);
}
/**
 * bfq_update_active_tree - update min_start for the whole active tree.
 * @node: the starting node.
 *
 * @node must be the deepest modified node after an update. This function
 * updates its min_start using the values held by its children, assuming
 * that they did not change, and then updates all the nodes that may have
 * changed in the path to the root. The only nodes that may have changed
 * are the ones in the path or their siblings.
 */
static void bfq_update_active_tree(struct rb_node *node)
{
	struct rb_node *parent;

up:
	bfq_update_active_node(node);

	parent = rb_parent(node);
	if (!parent)
		return;

	if (node == parent->rb_left && parent->rb_right)
		bfq_update_active_node(parent->rb_right);
	else if (parent->rb_left)
		bfq_update_active_node(parent->rb_left);

	node = parent;
	goto up;
}
static void bfq_weights_tree_add(struct bfq_data *bfqd,
				 struct bfq_entity *entity,
				 struct rb_root *root);

static void bfq_weights_tree_remove(struct bfq_data *bfqd,
				    struct bfq_entity *entity,
				    struct rb_root *root);
/**
 * bfq_active_insert - insert an entity in the active tree of its
 *                     service_tree.
 * @st: the service tree of the entity.
 * @entity: the entity being inserted.
 *
 * The active tree is ordered by finish time, but an extra key is kept
 * per each node, containing the minimum value for the start times of
 * its children (and the node itself), so it's possible to search for
 * the eligible node with the lowest finish time in logarithmic time.
 */
static void bfq_active_insert(struct bfq_service_tree *st,
			      struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node = &entity->rb_node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	bfq_insert(&st->active, entity);

	if (node->rb_left)
		node = node->rb_left;
	else if (node->rb_right)
		node = node->rb_right;

	bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	else /* bfq_group */
		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);

	if (bfqg != bfqd->root_group)
		bfqg->active_entities++;
#endif
}
/**
 * bfq_ioprio_to_weight - calc a weight from an ioprio.
 * @ioprio: the ioprio value to convert.
 */
static unsigned short bfq_ioprio_to_weight(int ioprio)
{
	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
}
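
/*
 * Example (illustration only): with IOPRIO_BE_NR == 8 and
 * BFQ_WEIGHT_CONVERSION_COEFF == 10, the default ioprio 4 maps to
 * weight (8 - 4) * 10 == 40, while the highest best-effort priority,
 * ioprio 0, maps to weight 80.
 */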
/**
 * bfq_weight_to_ioprio - calc an ioprio from a weight.
 * @weight: the weight value to convert.
 *
 * To preserve as much as possible the old only-ioprio user interface,
 * 0 is used as an escape ioprio value for weights (numerically) equal or
 * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
 */
static unsigned short bfq_weight_to_ioprio(int weight)
{
	return max_t(int, 0,
		     IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
}
static void bfq_get_entity(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	if (bfqq) {
		bfqq->ref++;
		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
			     bfqq, bfqq->ref);
	}
}
/**
 * bfq_find_deepest - find the deepest node that an extraction can modify.
 * @node: the node being removed.
 *
 * Do the first step of an extraction in an rb tree, looking for the
 * node that will replace @node, and returning the deepest node that
 * the following modifications to the tree can touch. If @node is the
 * last node in the tree return %NULL.
 */
static struct rb_node *bfq_find_deepest(struct rb_node *node)
{
	struct rb_node *deepest;

	if (!node->rb_right && !node->rb_left)
		deepest = rb_parent(node);
	else if (!node->rb_right)
		deepest = node->rb_left;
	else if (!node->rb_left)
		deepest = node->rb_right;
	else {
		deepest = rb_next(node);
		if (deepest->rb_right)
			deepest = deepest->rb_right;
		else if (rb_parent(deepest) != node)
			deepest = rb_parent(deepest);
	}

	return deepest;
}
/**
 * bfq_active_extract - remove an entity from the active tree.
 * @st: the service_tree containing the tree.
 * @entity: the entity being removed.
 */
static void bfq_active_extract(struct bfq_service_tree *st,
			       struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	node = bfq_find_deepest(&entity->rb_node);
	bfq_extract(&st->active, entity);

	if (node)
		bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_del(&bfqq->bfqq_list);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	else /* bfq_group */
		bfq_weights_tree_remove(bfqd, entity,
					&bfqd->group_weights_tree);

	if (bfqg != bfqd->root_group)
		bfqg->active_entities--;
#endif
}
/**
 * bfq_idle_insert - insert an entity into the idle tree.
 * @st: the service tree containing the tree.
 * @entity: the entity to insert.
 */
static void bfq_idle_insert(struct bfq_service_tree *st,
			    struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
		st->first_idle = entity;
	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
		st->last_idle = entity;

	bfq_insert(&st->idle, entity);

	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
}
/**
 * bfq_forget_entity - do not consider entity any longer for scheduling
 * @st: the service tree.
 * @entity: the entity being removed.
 * @is_in_service: true if entity is currently the in-service entity.
 *
 * Forget everything about @entity. In addition, if entity represents
 * a queue, and the latter is not in service, then release the service
 * reference to the queue (the one taken through bfq_get_entity). In
 * fact, in this case, there is really no more service reference to
 * the queue, as the latter is also outside any service tree. If,
 * instead, the queue is in service, then __bfq_bfqd_reset_in_service
 * will take care of putting the reference when the queue finally
 * stops being served.
 */
static void bfq_forget_entity(struct bfq_service_tree *st,
			      struct bfq_entity *entity,
			      bool is_in_service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->on_st = false;
	st->wsum -= entity->weight;
	if (bfqq && !is_in_service)
		bfq_put_queue(bfqq);
}
1820 * bfq_put_idle_entity - release the idle tree ref of an entity.
1821 * @st: service tree for the entity.
1822 * @entity: the entity being released.
1824 static void bfq_put_idle_entity(struct bfq_service_tree
*st
,
1825 struct bfq_entity
*entity
)
1827 bfq_idle_extract(st
, entity
);
1828 bfq_forget_entity(st
, entity
,
1829 entity
== entity
->sched_data
->in_service_entity
);
/**
 * bfq_forget_idle - update the idle tree if necessary.
 * @st: the service tree to act upon.
 *
 * To preserve the global O(log N) complexity we only remove one entry here;
 * as the idle tree will not grow indefinitely this can be done safely.
 */
static void bfq_forget_idle(struct bfq_service_tree *st)
{
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
	    !bfq_gt(last_idle->finish, st->vtime)) {
		/*
		 * Forget the whole idle tree, increasing the vtime past
		 * the last finish time of idle entities.
		 */
		st->vtime = last_idle->finish;
	}

	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
		bfq_put_idle_entity(st, first_idle);
}
static struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
				struct bfq_entity *entity)
{
	struct bfq_service_tree *new_st = old_st;

	if (entity->prio_changed) {
		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
		unsigned int prev_weight, new_weight;
		struct bfq_data *bfqd = NULL;
		struct rb_root *root;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		struct bfq_sched_data *sd;
		struct bfq_group *bfqg;
#endif

		if (bfqq)
			bfqd = bfqq->bfqd;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		else {
			sd = entity->my_sched_data;
			bfqg = container_of(sd, struct bfq_group, sched_data);
			bfqd = (struct bfq_data *)bfqg->bfqd;
		}
#endif

		old_st->wsum -= entity->weight;

		if (entity->new_weight != entity->orig_weight) {
			if (entity->new_weight < BFQ_MIN_WEIGHT ||
			    entity->new_weight > BFQ_MAX_WEIGHT) {
				pr_crit("update_weight_prio: new_weight %d\n",
					entity->new_weight);
				if (entity->new_weight < BFQ_MIN_WEIGHT)
					entity->new_weight = BFQ_MIN_WEIGHT;
				else
					entity->new_weight = BFQ_MAX_WEIGHT;
			}
			entity->orig_weight = entity->new_weight;
			if (bfqq)
				bfqq->ioprio =
				  bfq_weight_to_ioprio(entity->orig_weight);
		}

		if (bfqq)
			bfqq->ioprio_class = bfqq->new_ioprio_class;
		entity->prio_changed = 0;

		/*
		 * NOTE: here we may be changing the weight too early,
		 * this will cause unfairness. The correct approach
		 * would have required additional complexity to defer
		 * weight changes to the proper time instants (i.e.,
		 * when entity->finish <= old_st->vtime).
		 */
		new_st = bfq_entity_service_tree(entity);

		prev_weight = entity->weight;
		new_weight = entity->orig_weight *
			     (bfqq ? bfqq->wr_coeff : 1);
		/*
		 * If the weight of the entity changes, remove the entity
		 * from its old weight counter (if there is a counter
		 * associated with the entity), and add it to the counter
		 * associated with its new weight.
		 */
		if (prev_weight != new_weight) {
			root = bfqq ? &bfqd->queue_weights_tree :
				      &bfqd->group_weights_tree;
			bfq_weights_tree_remove(bfqd, entity, root);
		}
		entity->weight = new_weight;
		/*
		 * Add the entity to its weights tree only if it is
		 * not associated with a weight-raised queue.
		 */
		if (prev_weight != new_weight &&
		    (bfqq ? bfqq->wr_coeff == 1 : 1))
			/* If we get here, root has been initialized. */
			bfq_weights_tree_add(bfqd, entity, root);

		new_st->wsum += entity->weight;

		if (new_st != old_st)
			entity->start = new_st->vtime;
	}

	return new_st;
}
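
/*
 * Illustrative sketch (not part of the scheduler): the effective
 * weight computed above is orig_weight times the queue's wr_coeff,
 * with orig_weight first clamped to [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT].
 * A user-space model; the clamp bounds match BFQ's, while the raising
 * coefficient below is chosen only for the example.
 */
#if 0
#define BFQ_MIN_WEIGHT	1
#define BFQ_MAX_WEIGHT	1000

static int model_effective_weight(int new_weight, int wr_coeff)
{
	if (new_weight < BFQ_MIN_WEIGHT)
		new_weight = BFQ_MIN_WEIGHT;
	else if (new_weight > BFQ_MAX_WEIGHT)
		new_weight = BFQ_MAX_WEIGHT;

	return new_weight * wr_coeff;	/* wr_coeff is 1 if not raised */
}

/*
 * model_effective_weight(100, 1) == 100; while weight-raised with,
 * e.g., coefficient 30, the same queue temporarily weighs 3000
 * inside its service tree.
 */
#endif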
static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);

/**
 * bfq_bfqq_served - update the scheduler status after selection for
 *		     service.
 * @bfqq: the queue being served.
 * @served: bytes to transfer.
 *
 * NOTE: this can be optimized, as the timestamps of upper level entities
 * are synchronized every time a new bfqq is selected for service. By now,
 * we keep it to better check consistency.
 */
static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
{
	struct bfq_entity *entity = &bfqq->entity;
	struct bfq_service_tree *st;

	for_each_entity(entity) {
		st = bfq_entity_service_tree(entity);

		entity->service += served;

		st->vtime += bfq_delta(served, st->wsum);
		bfq_forget_idle(st);
	}
	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
}
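
/*
 * Illustrative sketch (not part of the scheduler): the virtual time
 * advances by served/wsum (this is what bfq_delta() computes, modulo
 * fixed-point shifting), so equal vtime progress corresponds to
 * service shares proportional to the weights. A rough user-space
 * model using plain integer arithmetic instead of bfq_delta():
 */
#if 0
struct model_st {
	unsigned long long vtime;
	unsigned long wsum;	/* sum of the weights of active queues */
};

static void model_serve(struct model_st *st, unsigned long served)
{
	st->vtime += served / st->wsum;
}

/*
 * Two queues with weights 10 and 30 (wsum = 40): for their finish
 * timestamps to advance in lockstep, the first may consume 10 sectors
 * for every 30 sectors of the second, i.e., a 1:3 throughput split.
 */
#endif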
/**
 * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
 *			  of the time interval during which bfqq has been in
 *			  service.
 * @bfqd: the device
 * @bfqq: the queue that needs a service update.
 * @time_ms: the amount of time during which the queue has received service
 *
 * If a queue does not consume its budget fast enough, then providing
 * the queue with service fairness may impair throughput, more or less
 * severely. For this reason, queues that consume their budget slowly
 * are provided with time fairness instead of service fairness. This
 * goal is achieved through the BFQ scheduling engine, even if such an
 * engine works in the service, and not in the time domain. The trick
 * is charging these queues with an inflated amount of service, equal
 * to the amount of service that they would have received during their
 * service slot if they had been fast, i.e., if their requests had
 * been dispatched at a rate equal to the estimated peak rate.
 *
 * It is worth noting that time fairness can cause important
 * distortions in terms of bandwidth distribution, on devices with
 * internal queueing. The reason is that I/O requests dispatched
 * during the service slot of a queue may be served after that service
 * slot is finished, and may have a total processing time loosely
 * correlated with the duration of the service slot. This is
 * especially true for short service slots.
 */
static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
				 unsigned long time_ms)
{
	struct bfq_entity *entity = &bfqq->entity;
	int tot_serv_to_charge = entity->service;
	unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);

	if (time_ms > 0 && time_ms < timeout_ms)
		tot_serv_to_charge =
			(bfqd->bfq_max_budget * time_ms) / timeout_ms;

	if (tot_serv_to_charge < entity->service)
		tot_serv_to_charge = entity->service;

	/* Increase budget to avoid inconsistencies */
	if (tot_serv_to_charge > entity->budget)
		entity->budget = tot_serv_to_charge;

	bfq_bfqq_served(bfqq,
			max_t(int, 0, tot_serv_to_charge - entity->service));
}
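
/*
 * Illustrative sketch (not part of the scheduler): the inflated
 * charge computed above is max_budget scaled by the fraction of the
 * budget timeout actually consumed. A user-space model with made-up
 * numbers:
 */
#if 0
static int model_serv_to_charge(int max_budget, unsigned long time_ms,
				unsigned int timeout_ms, int service)
{
	int tot = service;

	if (time_ms > 0 && time_ms < timeout_ms)
		tot = (max_budget * time_ms) / timeout_ms;

	return tot < service ? service : tot;
}

/*
 * E.g., with max_budget = 16384 sectors, timeout = 125 ms and a queue
 * that stayed in service for 25 ms while dispatching only 100
 * sectors: it is charged 16384 * 25 / 125 = 3276 sectors, as if it
 * had been fast for a fifth of its slot.
 */
#endif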
static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
					struct bfq_service_tree *st,
					bool backshifted)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	st = __bfq_entity_update_weight_prio(st, entity);
	bfq_calc_finish(entity, entity->budget);

	/*
	 * If some queues enjoy backshifting for a while, then their
	 * (virtual) finish timestamps may happen to become lower and
	 * lower than the system virtual time. In particular, if
	 * these queues often happen to be idle for short time
	 * periods, and during such time periods other queues with
	 * higher timestamps happen to be busy, then the backshifted
	 * timestamps of the former queues can become much lower than
	 * the system virtual time. In fact, to serve the queues with
	 * higher timestamps while the ones with lower timestamps are
	 * idle, the system virtual time may be pushed-up to much
	 * higher values than the finish timestamps of the idle
	 * queues. As a consequence, the finish timestamps of all new
	 * or newly activated queues may end up being much larger than
	 * those of lucky queues with backshifted timestamps. The
	 * latter queues may then monopolize the device for a lot of
	 * time. This would simply break service guarantees.
	 *
	 * To reduce this problem, push up a little bit the
	 * backshifted timestamps of the queue associated with this
	 * entity (only a queue can happen to have the backshifted
	 * flag set): just enough to let the finish timestamp of the
	 * queue be equal to the current value of the system virtual
	 * time. This may introduce a little unfairness among queues
	 * with backshifted timestamps, but it does not break
	 * worst-case fairness guarantees.
	 *
	 * As a special case, if bfqq is weight-raised, push up
	 * timestamps much less, to keep very low the probability that
	 * this push up causes the backshifted finish timestamps of
	 * weight-raised queues to become higher than the backshifted
	 * finish timestamps of non weight-raised queues.
	 */
	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
		unsigned long delta = st->vtime - entity->finish;

		if (bfqq)
			delta /= bfqq->wr_coeff;

		entity->start += delta;
		entity->finish += delta;
	}

	bfq_active_insert(st, entity);
}
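
/*
 * Illustrative numbers for the push-up above (made up for the
 * example): if st->vtime = 1000 and entity->finish = 400, the
 * timestamps of a non-weight-raised queue (wr_coeff == 1) are moved
 * forward by delta = 600, while a weight-raised queue with
 * wr_coeff == 30 is moved forward only by 600 / 30 = 20, so its
 * privilege over non-raised queues is essentially preserved.
 */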
/**
 * __bfq_activate_entity - handle activation of entity.
 * @entity: the entity being activated.
 * @non_blocking_wait_rq: true if entity was waiting for a request
 *
 * Called for a 'true' activation, i.e., if entity is not active and
 * one of its children receives a new request.
 *
 * Basically, this function updates the timestamps of entity and
 * inserts entity into its active tree, after possibly extracting it
 * from its idle tree.
 */
static void __bfq_activate_entity(struct bfq_entity *entity,
				  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	bool backshifted = false;
	unsigned long long min_vstart;

	/* See comments on bfq_bfqq_update_budg_for_activation */
	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
		backshifted = true;
		min_vstart = entity->finish;
	} else
		min_vstart = st->vtime;

	if (entity->tree == &st->idle) {
		/*
		 * Must be on the idle tree, bfq_idle_extract() will
		 * check for that.
		 */
		bfq_idle_extract(st, entity);
		entity->start = bfq_gt(min_vstart, entity->finish) ?
			min_vstart : entity->finish;
	} else {
		/*
		 * The finish time of the entity may be invalid, and
		 * it is in the past for sure, otherwise the queue
		 * would have been on the idle tree.
		 */
		entity->start = min_vstart;
		st->wsum += entity->weight;
		/*
		 * entity is about to be inserted into a service tree,
		 * and then set in service: get a reference to make
		 * sure entity does not disappear until it is no
		 * longer in service or scheduled for service.
		 */
		bfq_get_entity(entity);

		entity->on_st = true;
	}

	bfq_update_fin_time_enqueue(entity, st, backshifted);
}
/**
 * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
 * @entity: the entity being requeued or repositioned.
 *
 * Requeueing is needed if this entity stops being served, which
 * happens if a leaf descendant entity has expired. On the other hand,
 * repositioning is needed if the next_in_service entity for the child
 * entity has changed. See the comments inside the function for
 * details.
 *
 * Basically, this function: 1) removes entity from its active tree if
 * present there, 2) updates the timestamps of entity and 3) inserts
 * entity back into its active tree (in the new, right position for
 * the new values of the timestamps).
 */
static void __bfq_requeue_entity(struct bfq_entity *entity)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (entity == sd->in_service_entity) {
		/*
		 * We are requeueing the current in-service entity,
		 * which may have to be done for one of the following
		 * reasons:
		 * - entity represents the in-service queue, and the
		 *   in-service queue is being requeued after an
		 *   expiration;
		 * - entity represents a group, and its budget has
		 *   changed because one of its child entities has
		 *   just been either activated or requeued for some
		 *   reason; the timestamps of the entity need then to
		 *   be updated, and the entity needs to be enqueued
		 *   or repositioned accordingly.
		 *
		 * In particular, before requeueing, the start time of
		 * the entity must be moved forward to account for the
		 * service that the entity has received while in
		 * service. This is done by the next instructions. The
		 * finish time will then be updated according to this
		 * new value of the start time, and to the budget of
		 * the entity.
		 */
		bfq_calc_finish(entity, entity->service);
		entity->start = entity->finish;
		/*
		 * In addition, if the entity had more than one child
		 * when set in service, then it was not extracted from
		 * the active tree. This implies that the position of
		 * the entity in the active tree may need to be
		 * changed now, because we have just updated the start
		 * time of the entity, and we will update its finish
		 * time in a moment (the requeueing is then, more
		 * precisely, a repositioning in this case). To
		 * implement this repositioning, we: 1) dequeue the
		 * entity here, 2) update the finish time and
		 * requeue the entity according to the new
		 * timestamps below.
		 */
		if (entity->tree)
			bfq_active_extract(st, entity);
	} else { /* The entity is already active, and not in service */
		/*
		 * In this case, this function gets called only if the
		 * next_in_service entity below this entity has
		 * changed, and this change has caused the budget of
		 * this entity to change, which, finally implies that
		 * the finish time of this entity must be
		 * updated. Such an update may cause the scheduling,
		 * i.e., the position in the active tree, of this
		 * entity to change. We handle this change by: 1)
		 * dequeueing the entity here, 2) updating the finish
		 * time and requeueing the entity according to the new
		 * timestamps below. This is the same approach as the
		 * non-extracted-entity sub-case above.
		 */
		bfq_active_extract(st, entity);
	}

	bfq_update_fin_time_enqueue(entity, st, false);
}

static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
					  struct bfq_sched_data *sd,
					  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (sd->in_service_entity == entity || entity->tree == &st->active)
		/*
		 * in service or already queued on the active tree,
		 * requeue or reposition
		 */
		__bfq_requeue_entity(entity);
	else
		/*
		 * Not in service and not queued on its active tree:
		 * the activity is idle and this is a true activation.
		 */
		__bfq_activate_entity(entity, non_blocking_wait_rq);
}
/**
 * bfq_activate_requeue_entity - activate or requeue an entity representing a
 *				 bfq_queue, and activate, requeue or reposition
 *				 all ancestors for which such an update becomes
 *				 necessary.
 * @entity: the entity to activate.
 * @non_blocking_wait_rq: true if this entity was waiting for a request
 * @requeue: true if this is a requeue, which implies that bfqq is
 *	     being expired; thus ALL its ancestors stop being served and must
 *	     therefore be requeued
 */
static void bfq_activate_requeue_entity(struct bfq_entity *entity,
					bool non_blocking_wait_rq,
					bool requeue)
{
	struct bfq_sched_data *sd;

	for_each_entity(entity) {
		sd = entity->sched_data;
		__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);

		if (!bfq_update_next_in_service(sd, entity) && !requeue)
			break;
	}
}

/**
 * __bfq_deactivate_entity - deactivate an entity from its service tree.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: if false, the entity will not be put into the
 *			idle tree.
 *
 * Deactivates an entity, independently of its previous state. Must
 * be invoked only if entity is on a service tree. Extracts the entity
 * from that tree, and if necessary and allowed, puts it on the idle
 * tree.
 */
static bool __bfq_deactivate_entity(struct bfq_entity *entity,
				    bool ins_into_idle_tree)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	int is_in_service = entity == sd->in_service_entity;

	if (!entity->on_st) /* entity never activated, or already inactive */
		return false;

	if (is_in_service)
		bfq_calc_finish(entity, entity->service);

	if (entity->tree == &st->active)
		bfq_active_extract(st, entity);
	else if (!is_in_service && entity->tree == &st->idle)
		bfq_idle_extract(st, entity);

	if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
		bfq_forget_entity(st, entity, is_in_service);
	else
		bfq_idle_insert(st, entity);

	return true;
}
/**
 * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: true if the entity can be put on the idle tree
 * @expiration: true if this function is being invoked in the expiration path
 *		of the in-service queue
 */
static void bfq_deactivate_entity(struct bfq_entity *entity,
				  bool ins_into_idle_tree,
				  bool expiration)
{
	struct bfq_sched_data *sd;
	struct bfq_entity *parent = NULL;

	for_each_entity_safe(entity, parent) {
		sd = entity->sched_data;

		if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
			/*
			 * entity is not in any tree any more, so
			 * this deactivation is a no-op, and there is
			 * nothing to change for upper-level entities
			 * (in case of expiration, this can never
			 * happen).
			 */
			return;
		}

		if (sd->next_in_service == entity)
			/*
			 * entity was the next_in_service entity,
			 * then, since entity has just been
			 * deactivated, a new one must be found.
			 */
			bfq_update_next_in_service(sd, NULL);

		if (sd->next_in_service)
			/*
			 * The parent entity is still backlogged,
			 * because next_in_service is not NULL. So, no
			 * further upwards deactivation must be
			 * performed. Yet, next_in_service has
			 * changed. Then the schedule does need to be
			 * updated upwards.
			 */
			break;

		/*
		 * If we get here, then the parent is no more
		 * backlogged and we need to propagate the
		 * deactivation upwards. Thus let the loop go on.
		 */

		/*
		 * Also let parent be queued into the idle tree on
		 * deactivation, to preserve service guarantees, and
		 * assuming that who invoked this function does not
		 * need parent entities too to be removed completely.
		 */
		ins_into_idle_tree = true;
	}

	/*
	 * If the deactivation loop is fully executed, then there are
	 * no more entities to touch and next loop is not executed at
	 * all. Otherwise, requeue remaining entities if they are
	 * about to stop receiving service, or reposition them if this
	 * is not the case.
	 */
	entity = parent;
	for_each_entity(entity) {
		/*
		 * Invoke __bfq_requeue_entity on entity, even if
		 * already active, to requeue/reposition it in the
		 * active tree (because sd->next_in_service has
		 * changed)
		 */
		__bfq_requeue_entity(entity);

		sd = entity->sched_data;
		if (!bfq_update_next_in_service(sd, entity) &&
		    !expiration)
			/*
			 * next_in_service unchanged or not causing
			 * any change in entity->parent->sd, and no
			 * requeueing needed for expiration: stop
			 * here.
			 */
			break;
	}
}
/**
 * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
 *                       if needed, to have at least one entity eligible.
 * @st: the service tree to act upon.
 *
 * Assumes that st is not empty.
 */
static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
{
	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);

	if (bfq_gt(root_entity->min_start, st->vtime))
		return root_entity->min_start;

	return st->vtime;
}
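
/*
 * Illustrative numbers (made up): if st->vtime = 100 while the
 * smallest start time in the active tree (root_entity->min_start) is
 * 130, then no entity is eligible (start <= vtime holds for none),
 * and the caller lets the virtual time jump to 130 so that at least
 * the earliest-starting entity becomes eligible.
 */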
static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
{
	if (new_value > st->vtime) {
		st->vtime = new_value;
		bfq_forget_idle(st);
	}
}

/**
 * bfq_first_active_entity - find the eligible entity with
 *                           the smallest finish time
 * @st: the service tree to select from.
 * @vtime: the system virtual time to use as a reference for eligibility
 *
 * This function searches the first schedulable entity, starting from the
 * root of the tree and going on the left every time on this side there is
 * a subtree with at least one eligible (start <= vtime) entity. The path on
 * the right is followed only if a) the left subtree contains no eligible
 * entities and b) no eligible entity has been found yet.
 */
static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
						  u64 vtime)
{
	struct bfq_entity *entry, *first = NULL;
	struct rb_node *node = st->active.rb_node;

	while (node) {
		entry = rb_entry(node, struct bfq_entity, rb_node);
left:
		if (!bfq_gt(entry->start, vtime))
			first = entry;

		if (node->rb_left) {
			entry = rb_entry(node->rb_left,
					 struct bfq_entity, rb_node);
			if (!bfq_gt(entry->min_start, vtime)) {
				node = node->rb_left;
				goto left;
			}
		}
		if (first)
			break;
		node = node->rb_right;
	}

	return first;
}
/**
 * __bfq_lookup_next_entity - return the first eligible entity in @st.
 * @st: the service tree.
 *
 * If there is no in-service entity for the sched_data st belongs to,
 * then return the entity that will be set in service if:
 * 1) the parent entity this st belongs to is set in service;
 * 2) no entity belonging to such parent entity undergoes a state change
 * that would influence the timestamps of the entity (e.g., becomes idle,
 * becomes backlogged, changes its budget, ...).
 *
 * In this first case, update the virtual time in @st too (see the
 * comments on this update inside the function).
 *
 * In contrast, if there is an in-service entity, then return the
 * entity that would be set in service if not only the above
 * conditions, but also the next one held true: the currently
 * in-service entity, on expiration,
 * 1) gets a finish time equal to the current one, or
 * 2) is not eligible any more, or
 * 3) is idle.
 */
static struct bfq_entity *
__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
{
	struct bfq_entity *entity;
	u64 new_vtime;

	if (RB_EMPTY_ROOT(&st->active))
		return NULL;

	/*
	 * Get the value of the system virtual time for which at
	 * least one entity is eligible.
	 */
	new_vtime = bfq_calc_vtime_jump(st);

	/*
	 * If there is no in-service entity for the sched_data this
	 * active tree belongs to, then push the system virtual time
	 * up to the value that guarantees that at least one entity is
	 * eligible. If, instead, there is an in-service entity, then
	 * do not make any such update, because there is already an
	 * eligible entity, namely the in-service one (even if the
	 * entity is not on st, because it was extracted when set in
	 * service).
	 */
	if (!in_service)
		bfq_update_vtime(st, new_vtime);

	entity = bfq_first_active_entity(st, new_vtime);

	return entity;
}
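
/*
 * Illustrative sketch (not part of the scheduler): under B-WF2Q+ an
 * entity is eligible when its (virtual) start time does not exceed
 * the tree's virtual time, and among eligible entities the one with
 * the smallest finish time is picked. A minimal model of the
 * eligibility predicate:
 */
#if 0
struct model_entity {
	unsigned long long start, finish;
};

static int model_eligible(const struct model_entity *e,
			  unsigned long long vtime)
{
	return e->start <= vtime;
}

/*
 * With vtime = 50, an entity {start = 40, finish = 90} is eligible
 * while {start = 60, finish = 70} is not, even though the latter has
 * the smaller finish time.
 */
#endif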
/**
 * bfq_lookup_next_entity - return the first eligible entity in @sd.
 * @sd: the sched_data.
 *
 * This function is invoked when there has been a change in the trees
 * for sd, and we need to know what is the new next entity after this
 * change.
 */
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
{
	struct bfq_service_tree *st = sd->service_tree;
	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
	struct bfq_entity *entity = NULL;
	int class_idx = 0;

	/*
	 * Choose from idle class, if needed to guarantee a minimum
	 * bandwidth to this class (and if there is some active entity
	 * in idle class). This should also mitigate
	 * priority-inversion problems in case a low priority task is
	 * holding file system resources.
	 */
	if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
				   BFQ_CL_IDLE_TIMEOUT)) {
		if (!RB_EMPTY_ROOT(&idle_class_st->active))
			class_idx = BFQ_IOPRIO_CLASSES - 1;
		/* About to be served if backlogged, or not yet backlogged */
		sd->bfq_class_idle_last_service = jiffies;
	}

	/*
	 * Find the next entity to serve for the highest-priority
	 * class, unless the idle class needs to be served.
	 */
	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
		entity = __bfq_lookup_next_entity(st + class_idx,
						  sd->in_service_entity);

		if (entity)
			break;
	}

	return entity;
}

static bool next_queue_may_preempt(struct bfq_data *bfqd)
{
	struct bfq_sched_data *sd = &bfqd->root_group->sched_data;

	return sd->next_in_service != sd->in_service_entity;
}
/*
 * Get next queue for service.
 */
static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
{
	struct bfq_entity *entity = NULL;
	struct bfq_sched_data *sd;
	struct bfq_queue *bfqq;

	if (bfqd->busy_queues == 0)
		return NULL;

	/*
	 * Traverse the path from the root to the leaf entity to
	 * serve. Set in service all the entities visited along the
	 * way.
	 */
	sd = &bfqd->root_group->sched_data;
	for (; sd ; sd = entity->my_sched_data) {
		/*
		 * WARNING. We are about to set the in-service entity
		 * to sd->next_in_service, i.e., to the (cached) value
		 * returned by bfq_lookup_next_entity(sd) the last
		 * time it was invoked, i.e., the last time when the
		 * service order in sd changed as a consequence of the
		 * activation or deactivation of an entity. In this
		 * respect, if we execute bfq_lookup_next_entity(sd)
		 * in this very moment, it may, although with low
		 * probability, yield a different entity than that
		 * pointed to by sd->next_in_service. This rare event
		 * happens in case there was no CLASS_IDLE entity to
		 * serve for sd when bfq_lookup_next_entity(sd) was
		 * invoked for the last time, while there is now one
		 * such entity.
		 *
		 * If the above event happens, then the scheduling of
		 * such entity in CLASS_IDLE is postponed until the
		 * service of the sd->next_in_service entity
		 * finishes. In fact, when the latter is expired,
		 * bfq_lookup_next_entity(sd) gets called again,
		 * exactly to update sd->next_in_service.
		 */

		/* Make next_in_service entity become in_service_entity */
		entity = sd->next_in_service;
		sd->in_service_entity = entity;

		/*
		 * Reset the accumulator of the amount of service that
		 * the entity is about to receive.
		 */
		entity->service = 0;

		/*
		 * If entity is no longer a candidate for next
		 * service, then we extract it from its active tree,
		 * for the following reason. To further boost the
		 * throughput in some special case, BFQ needs to know
		 * which is the next candidate entity to serve, while
		 * there is already an entity in service. In this
		 * respect, to make it easy to compute/update the next
		 * candidate entity to serve after the current
		 * candidate has been set in service, there is a case
		 * where it is necessary to extract the current
		 * candidate from its service tree. Such a case is
		 * when the entity just set in service cannot be also
		 * a candidate for next service. Details about when
		 * this condition holds are reported in the comments
		 * on the function bfq_no_longer_next_in_service()
		 * invoked below.
		 */
		if (bfq_no_longer_next_in_service(entity))
			bfq_active_extract(bfq_entity_service_tree(entity),
					   entity);

		/*
		 * For the same reason why we may have just extracted
		 * entity from its active tree, we may need to update
		 * next_in_service for the sched_data of entity too,
		 * regardless of whether entity has been extracted.
		 * In fact, even if entity has not been extracted, a
		 * descendant entity may get extracted. Such an event
		 * would cause a change in next_in_service for the
		 * level of the descendant entity, and thus possibly
		 * back to upper levels.
		 *
		 * We cannot perform the resulting needed update
		 * before the end of this loop, because, to know which
		 * is the correct next-to-serve candidate entity for
		 * each level, we need first to find the leaf entity
		 * to set in service. In fact, only after we know
		 * which is the next-to-serve leaf entity, we can
		 * discover whether the parent entity of the leaf
		 * entity becomes the next-to-serve, and so on.
		 */
	}

	bfqq = bfq_entity_to_bfqq(entity);

	/*
	 * We can finally update all next-to-serve entities along the
	 * path from the leaf entity just set in service to the root.
	 */
	for_each_entity(entity) {
		struct bfq_sched_data *sd = entity->sched_data;

		if (!bfq_update_next_in_service(sd, NULL))
			break;
	}

	return bfqq;
}
static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
{
	struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
	struct bfq_entity *entity = in_serv_entity;

	if (bfqd->in_service_bic) {
		/*
		 * Schedule the release of a reference to
		 * bfqd->in_service_bic->icq.ioc to right after the
		 * scheduler lock is released. This ioc is not
		 * released immediately, to not risk to possibly take
		 * an ioc->lock while holding the scheduler lock.
		 */
		bfqd->ioc_to_put = bfqd->in_service_bic->icq.ioc;
		bfqd->in_service_bic = NULL;
	}

	bfq_clear_bfqq_wait_request(in_serv_bfqq);
	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
	bfqd->in_service_queue = NULL;

	/*
	 * When this function is called, all in-service entities have
	 * been properly deactivated or requeued, so we can safely
	 * execute the final step: reset in_service_entity along the
	 * path from entity to the root.
	 */
	for_each_entity(entity)
		entity->sched_data->in_service_entity = NULL;

	/*
	 * in_serv_entity is no longer in service, so, if it is in no
	 * service tree either, then release the service reference to
	 * the queue it represents (taken with bfq_get_entity).
	 */
	if (!in_serv_entity->on_st)
		bfq_put_queue(in_serv_bfqq);
}
static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
				bool ins_into_idle_tree, bool expiration)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
}

static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
				    false);
	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
}

static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_activate_requeue_entity(entity, false,
				    bfqq == bfqd->in_service_queue);
}

static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);

/*
 * Called when the bfqq no longer has requests pending, remove it from
 * the service tree. As a special case, it can be invoked during an
 * expiration.
 */
static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			      bool expiration)
{
	bfq_log_bfqq(bfqd, bfqq, "del from busy");

	bfq_clear_bfqq_busy(bfqq);

	bfqd->busy_queues--;

	if (!bfqq->dispatched)
		bfq_weights_tree_remove(bfqd, &bfqq->entity,
					&bfqd->queue_weights_tree);

	if (bfqq->wr_coeff > 1)
		bfqd->wr_busy_queues--;

	bfqg_stats_update_dequeue(bfqq_group(bfqq));

	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
}
/*
 * Called when an inactive queue receives a new request.
 */
static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqd, bfqq, "add to busy");

	bfq_activate_bfqq(bfqd, bfqq);

	bfq_mark_bfqq_busy(bfqq);
	bfqd->busy_queues++;

	if (!bfqq->dispatched)
		if (bfqq->wr_coeff == 1)
			bfq_weights_tree_add(bfqd, &bfqq->entity,
					     &bfqd->queue_weights_tree);

	if (bfqq->wr_coeff > 1)
		bfqd->wr_busy_queues++;
}

#ifdef CONFIG_BFQ_GROUP_IOSCHED

/* bfqg stats flags */
enum bfqg_stats_flags {
	BFQG_stats_waiting = 0,
	BFQG_stats_idling,
	BFQG_stats_empty,
};

#define BFQG_FLAG_FNS(name)						\
static void bfqg_stats_mark_##name(struct bfqg_stats *stats)		\
{									\
	stats->flags |= (1 << BFQG_stats_##name);			\
}									\
static void bfqg_stats_clear_##name(struct bfqg_stats *stats)		\
{									\
	stats->flags &= ~(1 << BFQG_stats_##name);			\
}									\
static int bfqg_stats_##name(struct bfqg_stats *stats)			\
{									\
	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\
}									\

BFQG_FLAG_FNS(waiting)
BFQG_FLAG_FNS(idling)
BFQG_FLAG_FNS(empty)
#undef BFQG_FLAG_FNS
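
/*
 * For reference, BFQG_FLAG_FNS(waiting) expands to the three helpers
 * bfqg_stats_mark_waiting(), bfqg_stats_clear_waiting() and
 * bfqg_stats_waiting(), which respectively set, clear and test the
 * BFQG_stats_waiting bit in stats->flags; idling and empty get the
 * same treatment.
 */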
/* This should be called with the queue_lock held. */
static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
{
	unsigned long long now;

	if (!bfqg_stats_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		blkg_stat_add(&stats->group_wait_time,
			      now - stats->start_group_wait_time);
	bfqg_stats_clear_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
						 struct bfq_group *curr_bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (bfqg_stats_waiting(stats))
		return;
	if (bfqg == curr_bfqg)
		return;
	stats->start_group_wait_time = sched_clock();
	bfqg_stats_mark_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
{
	unsigned long long now;

	if (!bfqg_stats_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		blkg_stat_add(&stats->empty_time,
			      now - stats->start_empty_time);
	bfqg_stats_clear_empty(stats);
}

static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
{
	blkg_stat_add(&bfqg->stats.dequeue, 1);
}

static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (blkg_rwstat_total(&stats->queued))
		return;

	/*
	 * group is already marked empty. This can happen if bfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (bfqg_stats_empty(stats))
		return;

	stats->start_empty_time = sched_clock();
	bfqg_stats_mark_empty(stats);
}

static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (bfqg_stats_idling(stats)) {
		unsigned long long now = sched_clock();

		if (time_after64(now, stats->start_idle_time))
			blkg_stat_add(&stats->idle_time,
				      now - stats->start_idle_time);
		bfqg_stats_clear_idling(stats);
	}
}

static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	stats->start_idle_time = sched_clock();
	bfqg_stats_mark_idling(stats);
}

static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	blkg_stat_add(&stats->avg_queue_size_sum,
		      blkg_rwstat_total(&stats->queued));
	blkg_stat_add(&stats->avg_queue_size_samples, 1);
	bfqg_stats_update_group_wait_time(stats);
}
/*
 * blk-cgroup policy-related handlers
 * The following functions help in converting between blk-cgroup
 * internal structures and BFQ-specific structures.
 */

static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
}

static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
{
	return pd_to_blkg(&bfqg->pd);
}

static struct blkcg_policy blkcg_policy_bfq;

static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
{
	return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
}

/*
 * bfq_group handlers
 * The following functions help in navigating the bfq_group hierarchy
 * by allowing to find the parent of a bfq_group or the bfq_group
 * associated to a bfq_queue.
 */

static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
{
	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;

	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
}

static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
{
	struct bfq_entity *group_entity = bfqq->entity.parent;

	return group_entity ? container_of(group_entity, struct bfq_group,
					   entity) :
			      bfqq->bfqd->root_group;
}

/*
 * The following two functions handle get and put of a bfq_group by
 * wrapping the related blk-cgroup hooks.
 */

static void bfqg_get(struct bfq_group *bfqg)
{
	return blkg_get(bfqg_to_blkg(bfqg));
}

static void bfqg_put(struct bfq_group *bfqg)
{
	return blkg_put(bfqg_to_blkg(bfqg));
}
static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
				     struct bfq_queue *bfqq,
				     unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.queued, op, 1);
	bfqg_stats_end_empty_time(&bfqg->stats);
	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
}

static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.queued, op, -1);
}

static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.merged, op, 1);
}

static void bfqg_stats_update_completion(struct bfq_group *bfqg,
					 uint64_t start_time, uint64_t io_start_time,
					 unsigned int op)
{
	struct bfqg_stats *stats = &bfqg->stats;
	unsigned long long now = sched_clock();

	if (time_after64(now, io_start_time))
		blkg_rwstat_add(&stats->service_time, op,
				now - io_start_time);
	if (time_after64(io_start_time, start_time))
		blkg_rwstat_add(&stats->wait_time, op,
				io_start_time - start_time);
}

static void bfqg_stats_reset(struct bfqg_stats *stats)
{
	/* queued stats shouldn't be cleared */
	blkg_rwstat_reset(&stats->merged);
	blkg_rwstat_reset(&stats->service_time);
	blkg_rwstat_reset(&stats->wait_time);
	blkg_stat_reset(&stats->time);
	blkg_stat_reset(&stats->avg_queue_size_sum);
	blkg_stat_reset(&stats->avg_queue_size_samples);
	blkg_stat_reset(&stats->dequeue);
	blkg_stat_reset(&stats->group_wait_time);
	blkg_stat_reset(&stats->idle_time);
	blkg_stat_reset(&stats->empty_time);
}
/* @to += @from */
static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
{
	if (!to || !from)
		return;

	/* queued stats shouldn't be cleared */
	blkg_rwstat_add_aux(&to->merged, &from->merged);
	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
	blkg_stat_add_aux(&to->time, &from->time);
	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
	blkg_stat_add_aux(&to->avg_queue_size_samples,
			  &from->avg_queue_size_samples);
	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
}
/*
 * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
 * recursive stats can still account for the amount used by this bfqg after
 * it's gone.
 */
static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
{
	struct bfq_group *parent;

	if (!bfqg) /* root_group */
		return;

	parent = bfqg_parent(bfqg);

	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);

	if (unlikely(!parent))
		return;

	bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
	bfqg_stats_reset(&bfqg->stats);
}

static void bfq_init_entity(struct bfq_entity *entity,
			    struct bfq_group *bfqg)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->weight = entity->new_weight;
	entity->orig_weight = entity->new_weight;
	if (bfqq) {
		bfqq->ioprio = bfqq->new_ioprio;
		bfqq->ioprio_class = bfqq->new_ioprio_class;
		bfqg_get(bfqg);
	}
	entity->parent = bfqg->my_entity; /* NULL for root group */
	entity->sched_data = &bfqg->sched_data;
}

static void bfqg_stats_exit(struct bfqg_stats *stats)
{
	blkg_rwstat_exit(&stats->merged);
	blkg_rwstat_exit(&stats->service_time);
	blkg_rwstat_exit(&stats->wait_time);
	blkg_rwstat_exit(&stats->queued);
	blkg_stat_exit(&stats->time);
	blkg_stat_exit(&stats->avg_queue_size_sum);
	blkg_stat_exit(&stats->avg_queue_size_samples);
	blkg_stat_exit(&stats->dequeue);
	blkg_stat_exit(&stats->group_wait_time);
	blkg_stat_exit(&stats->idle_time);
	blkg_stat_exit(&stats->empty_time);
}

static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
	if (blkg_rwstat_init(&stats->merged, gfp) ||
	    blkg_rwstat_init(&stats->service_time, gfp) ||
	    blkg_rwstat_init(&stats->wait_time, gfp) ||
	    blkg_rwstat_init(&stats->queued, gfp) ||
	    blkg_stat_init(&stats->time, gfp) ||
	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
	    blkg_stat_init(&stats->dequeue, gfp) ||
	    blkg_stat_init(&stats->group_wait_time, gfp) ||
	    blkg_stat_init(&stats->idle_time, gfp) ||
	    blkg_stat_init(&stats->empty_time, gfp)) {
		bfqg_stats_exit(stats);
		return -ENOMEM;
	}

	return 0;
}
static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
{
	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
}

static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
{
	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
}

static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
{
	struct bfq_group_data *bgd;

	bgd = kzalloc(sizeof(*bgd), gfp);
	if (!bgd)
		return NULL;
	return &bgd->pd;
}

static void bfq_cpd_init(struct blkcg_policy_data *cpd)
{
	struct bfq_group_data *d = cpd_to_bfqgd(cpd);

	d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
		CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
}

static void bfq_cpd_free(struct blkcg_policy_data *cpd)
{
	kfree(cpd_to_bfqgd(cpd));
}

static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
{
	struct bfq_group *bfqg;

	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
	if (!bfqg)
		return NULL;

	if (bfqg_stats_init(&bfqg->stats, gfp)) {
		kfree(bfqg);
		return NULL;
	}

	return &bfqg->pd;
}

static void bfq_pd_init(struct blkg_policy_data *pd)
{
	struct blkcg_gq *blkg = pd_to_blkg(pd);
	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
	struct bfq_entity *entity = &bfqg->entity;
	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);

	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
	entity->my_sched_data = &bfqg->sched_data;
	bfqg->my_entity = entity; /*
				   * the root_group's will be set to NULL
				   * in bfq_init_queue()
				   */
	bfqg->bfqd = bfqd;
	bfqg->active_entities = 0;
	bfqg->rq_pos_tree = RB_ROOT;
}

static void bfq_pd_free(struct blkg_policy_data *pd)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);

	bfqg_stats_exit(&bfqg->stats);
	return kfree(bfqg);
}

static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);

	bfqg_stats_reset(&bfqg->stats);
}
static void bfq_group_set_parent(struct bfq_group *bfqg,
				 struct bfq_group *parent)
{
	struct bfq_entity *entity;

	entity = &bfqg->entity;
	entity->parent = parent->my_entity;
	entity->sched_data = &parent->sched_data;
}

static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
					 struct blkcg *blkcg)
{
	struct blkcg_gq *blkg;

	blkg = blkg_lookup(blkcg, bfqd->queue);
	if (likely(blkg))
		return blkg_to_bfqg(blkg);
	return NULL;
}

static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
					    struct blkcg *blkcg)
{
	struct bfq_group *bfqg, *parent;
	struct bfq_entity *entity;

	bfqg = bfq_lookup_bfqg(bfqd, blkcg);

	if (unlikely(!bfqg))
		return NULL;

	/*
	 * Update chain of bfq_groups as we might be handling a leaf group
	 * which, along with some of its relatives, has not been hooked yet
	 * to the private hierarchy of BFQ.
	 */
	entity = &bfqg->entity;
	for_each_entity(entity) {
		bfqg = container_of(entity, struct bfq_group, entity);
		if (bfqg != bfqd->root_group) {
			parent = bfqg_parent(bfqg);
			if (!parent)
				parent = bfqd->root_group;
			bfq_group_set_parent(bfqg, parent);
		}
	}

	return bfqg;
}

static void bfq_pos_tree_add_move(struct bfq_data *bfqd,
				  struct bfq_queue *bfqq);
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason);
/**
 * bfq_bfqq_move - migrate @bfqq to @bfqg.
 * @bfqd: queue descriptor.
 * @bfqq: the queue to move.
 * @bfqg: the group to move to.
 *
 * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
 * it on the new one. Avoid putting the entity on the old group idle tree.
 *
 * Must be called under the queue lock; the cgroup owning @bfqg must
 * not disappear (by now this just means that we are called under
 * rcu_read_lock()).
 */
static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_group *bfqg)
{
	struct bfq_entity *entity = &bfqq->entity;

	/* If bfqq is empty, then bfq_bfqq_expire also invokes
	 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
	 * from data structures related to current group. Otherwise we
	 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
	 * we do below.
	 */
	if (bfqq == bfqd->in_service_queue)
		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
				false, BFQQE_PREEMPTED);

	if (bfq_bfqq_busy(bfqq))
		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
	else if (entity->on_st)
		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
	bfqg_put(bfqq_group(bfqq));

	/*
	 * Here we use a reference to bfqg. We don't need a refcounter
	 * as the cgroup reference will not be dropped, so that its
	 * destroy() callback will not be invoked.
	 */
	entity->parent = bfqg->my_entity;
	entity->sched_data = &bfqg->sched_data;
	bfqg_get(bfqg);

	if (bfq_bfqq_busy(bfqq)) {
		bfq_pos_tree_add_move(bfqd, bfqq);
		bfq_activate_bfqq(bfqd, bfqq);
	}

	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
		bfq_schedule_dispatch(bfqd);
}
/**
 * __bfq_bic_change_cgroup - move @bic to @cgroup.
 * @bfqd: the queue descriptor.
 * @bic: the bic to move.
 * @blkcg: the blk-cgroup to move to.
 *
 * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
 * has to make sure that the reference to cgroup is valid across the call.
 *
 * NOTE: an alternative approach might have been to store the current
 * cgroup in bfqq and getting a reference to it, reducing the lookup
 * time here, at the price of slightly more complex code.
 */
static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
						 struct bfq_io_cq *bic,
						 struct blkcg *blkcg)
{
	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
	struct bfq_group *bfqg;
	struct bfq_entity *entity;

	bfqg = bfq_find_set_group(bfqd, blkcg);

	if (unlikely(!bfqg))
		bfqg = bfqd->root_group;

	if (async_bfqq) {
		entity = &async_bfqq->entity;

		if (entity->sched_data != &bfqg->sched_data) {
			bic_set_bfqq(bic, NULL, 0);
			bfq_log_bfqq(bfqd, async_bfqq,
				     "bic_change_group: %p %d",
				     async_bfqq, async_bfqq->ref);
			bfq_put_queue(async_bfqq);
		}
	}

	if (sync_bfqq) {
		entity = &sync_bfqq->entity;
		if (entity->sched_data != &bfqg->sched_data)
			bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
	}

	return bfqg;
}

static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
{
	struct bfq_data *bfqd = bic_to_bfqd(bic);
	struct bfq_group *bfqg = NULL;
	uint64_t serial_nr;

	rcu_read_lock();
	serial_nr = bio_blkcg(bio)->css.serial_nr;

	/*
	 * Check whether blkcg has changed. The condition may trigger
	 * spuriously on a newly created cic but there's no harm.
	 */
	if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
		goto out;

	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
	bic->blkcg_serial_nr = serial_nr;
out:
	rcu_read_unlock();
}
/**
 * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
 * @st: the service tree being flushed.
 */
static void bfq_flush_idle_tree(struct bfq_service_tree *st)
{
	struct bfq_entity *entity = st->first_idle;

	for (; entity ; entity = st->first_idle)
		__bfq_deactivate_entity(entity, false);
}

/**
 * bfq_reparent_leaf_entity - move leaf entity to the root_group.
 * @bfqd: the device data structure with the root group.
 * @entity: the entity to move.
 */
static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
				     struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
}

/**
 * bfq_reparent_active_entities - move to the root group all active
 *                                entities.
 * @bfqd: the device data structure with the root group.
 * @bfqg: the group to move from.
 * @st: the service tree with the entities.
 *
 * Needs queue_lock to be taken and reference to be valid over the call.
 */
static void bfq_reparent_active_entities(struct bfq_data *bfqd,
					 struct bfq_group *bfqg,
					 struct bfq_service_tree *st)
{
	struct rb_root *active = &st->active;
	struct bfq_entity *entity = NULL;

	if (!RB_EMPTY_ROOT(&st->active))
		entity = bfq_entity_of(rb_first(active));

	for (; entity ; entity = bfq_entity_of(rb_first(active)))
		bfq_reparent_leaf_entity(bfqd, entity);

	if (bfqg->sched_data.in_service_entity)
		bfq_reparent_leaf_entity(bfqd,
					 bfqg->sched_data.in_service_entity);
}
/**
 * bfq_pd_offline - deactivate the entity associated with @pd,
 *                  and reparent its children entities.
 * @pd: descriptor of the policy going offline.
 *
 * blkio already grabs the queue_lock for us, so no need to use
 * RCU-based magic
 */
static void bfq_pd_offline(struct blkg_policy_data *pd)
{
	struct bfq_service_tree *st;
	struct bfq_group *bfqg = pd_to_bfqg(pd);
	struct bfq_data *bfqd = bfqg->bfqd;
	struct bfq_entity *entity = bfqg->my_entity;
	unsigned long flags;
	int i;

	if (!entity) /* root group */
		return;

	spin_lock_irqsave(&bfqd->lock, flags);
	/*
	 * Empty all service_trees belonging to this group before
	 * deactivating the group itself.
	 */
	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
		st = bfqg->sched_data.service_tree + i;

		/*
		 * The idle tree may still contain bfq_queues belonging
		 * to exited task because they never migrated to a different
		 * cgroup from the one being destroyed now. No one else
		 * can access them so it's safe to act without any lock.
		 */
		bfq_flush_idle_tree(st);

		/*
		 * It may happen that some queues are still active
		 * (busy) upon group destruction (if the corresponding
		 * processes have been forced to terminate). We move
		 * all the leaf entities corresponding to these queues
		 * to the root_group.
		 * Also, it may happen that the group has an entity
		 * in service, which is disconnected from the active
		 * tree: it must be moved, too.
		 * There is no need to put the sync queues, as the
		 * scheduler has taken no reference.
		 */
		bfq_reparent_active_entities(bfqd, bfqg, st);
	}

	__bfq_deactivate_entity(entity, false);
	bfq_put_async_queues(bfqd, bfqg);

	bfq_unlock_put_ioc_restore(bfqd, flags);
	/*
	 * @blkg is going offline and will be ignored by
	 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
	 * that they don't get lost. If IOs complete after this point, the
	 * stats for them will be lost. Oh well...
	 */
	bfqg_stats_xfer_dead(bfqg);
}
static void bfq_end_wr_async(struct bfq_data *bfqd)
{
	struct blkcg_gq *blkg;

	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
		struct bfq_group *bfqg = blkg_to_bfqg(blkg);

		bfq_end_wr_async_queues(bfqd, bfqg);
	}
	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
}

static int bfq_io_show_weight(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
	unsigned int val = 0;

	if (bfqgd)
		val = bfqgd->weight;

	seq_printf(sf, "%u\n", val);

	return 0;
}

static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
				    struct cftype *cftype,
				    u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
	struct blkcg_gq *blkg;
	int ret = -ERANGE;

	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
		return ret;

	ret = 0;
	spin_lock_irq(&blkcg->lock);
	bfqgd->weight = (unsigned short)val;
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		struct bfq_group *bfqg = blkg_to_bfqg(blkg);

		if (!bfqg)
			continue;
		/*
		 * Setting the prio_changed flag of the entity
		 * to 1 with new_weight == weight would re-set
		 * the value of the weight to its ioprio mapping.
		 * Set the flag only if necessary.
		 */
		if ((unsigned short)val != bfqg->entity.new_weight) {
			bfqg->entity.new_weight = (unsigned short)val;
			/*
			 * Make sure that the above new value has been
			 * stored in bfqg->entity.new_weight before
			 * setting the prio_changed flag. In fact,
			 * this flag may be read asynchronously (in
			 * critical sections protected by a different
			 * lock than that held here), and finding this
			 * flag set may cause the execution of the code
			 * for updating parameters whose value may
			 * depend also on bfqg->entity.new_weight (in
			 * __bfq_entity_update_weight_prio).
			 * This barrier makes sure that the new value
			 * of bfqg->entity.new_weight is correctly
			 * seen in that code.
			 */
			smp_wmb();
			bfqg->entity.prio_changed = 1;
		}
	}
	spin_unlock_irq(&blkcg->lock);

	return ret;
}
static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
				 char *buf, size_t nbytes,
				 loff_t off)
{
	u64 weight;
	/* First unsigned long found in the file is used */
	int ret = kstrtoull(strim(buf), 0, &weight);

	if (ret)
		return ret;

	return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
}
static int bfqg_print_stat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
			  &blkcg_policy_bfq, seq_cft(sf)->private, false);
	return 0;
}

static int bfqg_print_rwstat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
			  &blkcg_policy_bfq, seq_cft(sf)->private, true);
	return 0;
}

static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
					  &blkcg_policy_bfq, off);
	return __blkg_prfill_u64(sf, pd, sum);
}

static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
					struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
							   &blkcg_policy_bfq,
							   off);
	return __blkg_prfill_rwstat(sf, pd, &sum);
}

static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
			  seq_cft(sf)->private, false);
	return 0;
}

static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
			  seq_cft(sf)->private, true);
	return 0;
}
static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
			       int off)
{
	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);

	return __blkg_prfill_u64(sf, pd, sum >> 9);
}

static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
	return 0;
}

static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
					 struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
					offsetof(struct blkcg_gq, stat_bytes));
	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);

	return __blkg_prfill_u64(sf, pd, sum >> 9);
}

static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
			  false);
	return 0;
}
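
/*
 * Note on the shift above: blkcg tracks stat_bytes in bytes, while
 * the bfq.sectors files export 512-byte sectors, hence the >> 9
 * (e.g., 1 MiB = 1048576 bytes >> 9 = 2048 sectors).
 */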
static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);
	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
	u64 v = 0;

	if (samples) {
		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
		v = div64_u64(v, samples);
	}
	__blkg_prfill_u64(sf, pd, v);
	return 0;
}
/* print avg_queue_size */
static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
			  0, false);
	return 0;
}
static struct bfq_group *
bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
{
	int ret;

	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
	if (ret)
		return NULL;

	return blkg_to_bfqg(bfqd->queue->root_blkg);
}
static struct cftype bfq_blkcg_legacy_files[] = {
	{
		.name = "bfq.weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = bfq_io_show_weight,
		.write_u64 = bfq_io_set_weight_legacy,
	},

	/* statistics, covers only the tasks in the bfqg */
	{
		.name = "bfq.time",
		.private = offsetof(struct bfq_group, stats.time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.sectors",
		.seq_show = bfqg_print_stat_sectors,
	},
	{
		.name = "bfq.io_service_bytes",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_bytes,
	},
	{
		.name = "bfq.io_serviced",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_ios,
	},
	{
		.name = "bfq.io_service_time",
		.private = offsetof(struct bfq_group, stats.service_time),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_wait_time",
		.private = offsetof(struct bfq_group, stats.wait_time),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_merged",
		.private = offsetof(struct bfq_group, stats.merged),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_queued",
		.private = offsetof(struct bfq_group, stats.queued),
		.seq_show = bfqg_print_rwstat,
	},

	/* the same statistics which cover the bfqg and its descendants */
	{
		.name = "bfq.time_recursive",
		.private = offsetof(struct bfq_group, stats.time),
		.seq_show = bfqg_print_stat_recursive,
	},
	{
		.name = "bfq.sectors_recursive",
		.seq_show = bfqg_print_stat_sectors_recursive,
	},
	{
		.name = "bfq.io_service_bytes_recursive",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_bytes_recursive,
	},
	{
		.name = "bfq.io_serviced_recursive",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_ios_recursive,
	},
	{
		.name = "bfq.io_service_time_recursive",
		.private = offsetof(struct bfq_group, stats.service_time),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_wait_time_recursive",
		.private = offsetof(struct bfq_group, stats.wait_time),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_merged_recursive",
		.private = offsetof(struct bfq_group, stats.merged),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_queued_recursive",
		.private = offsetof(struct bfq_group, stats.queued),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.avg_queue_size",
		.seq_show = bfqg_print_avg_queue_size,
	},
	{
		.name = "bfq.group_wait_time",
		.private = offsetof(struct bfq_group, stats.group_wait_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.idle_time",
		.private = offsetof(struct bfq_group, stats.idle_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.empty_time",
		.private = offsetof(struct bfq_group, stats.empty_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.dequeue",
		.private = offsetof(struct bfq_group, stats.dequeue),
		.seq_show = bfqg_print_stat,
	},
	{ }	/* terminate */
};
static struct cftype bfq_blkg_files[] = {
	{
		.name = "bfq.weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = bfq_io_show_weight,
		.write = bfq_io_set_weight,
	},
	{ }	/* terminate */
};
#else	/* CONFIG_BFQ_GROUP_IOSCHED */

static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
			struct bfq_queue *bfqq, unsigned int op) { }
static inline void
bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
static inline void
bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
			uint64_t start_time, uint64_t io_start_time,
			unsigned int op) { }
static inline void
bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
				     struct bfq_group *curr_bfqg) { }
static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_group *bfqg) {}
static void bfq_init_entity(struct bfq_entity *entity,
			    struct bfq_group *bfqg)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->weight = entity->new_weight;
	entity->orig_weight = entity->new_weight;
	if (bfqq) {
		bfqq->ioprio = bfqq->new_ioprio;
		bfqq->ioprio_class = bfqq->new_ioprio_class;
	}
	entity->sched_data = &bfqg->sched_data;
}
static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}

static void bfq_end_wr_async(struct bfq_data *bfqd)
{
	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
}
static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
					    struct blkcg *blkcg)
{
	return bfqd->root_group;
}

static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
{
	return bfqq->bfqd->root_group;
}

static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd,
						    int node)
{
	struct bfq_group *bfqg;
	int i;

	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
	if (!bfqg)
		return NULL;

	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

	return bfqg;
}
#endif	/* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define bfq_sample_valid(samples)	((samples) > 80)
/*
 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
 * We choose the request that is closest to the head right now. Distance
 * behind the head is penalized and only allowed to a certain extent.
 */
static struct request *bfq_choose_req(struct bfq_data *bfqd,
				      struct request *rq1,
				      struct request *rq2,
				      sector_t last)
{
	sector_t s1, s2, d1 = 0, d2 = 0;
	unsigned long back_max;
#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */
#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */
	unsigned int wrap = 0; /* bit mask: requests behind the disk head? */

	if (!rq1 || rq1 == rq2)
		return rq2;
	if (!rq2)
		return rq1;

	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
		return rq1;
	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
		return rq2;
	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
		return rq1;
	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
		return rq2;

	s1 = blk_rq_pos(rq1);
	s2 = blk_rq_pos(rq2);

	/*
	 * By definition, 1KiB is 2 sectors.
	 */
	back_max = bfqd->bfq_back_max * 2;

	/*
	 * Strict one way elevator _except_ in the case where we allow
	 * short backward seeks which are biased as twice the cost of a
	 * similar forward seek.
	 */
	if (s1 >= last)
		d1 = s1 - last;
	else if (s1 + back_max >= last)
		d1 = (last - s1) * bfqd->bfq_back_penalty;
	else
		wrap |= BFQ_RQ1_WRAP;

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2 + back_max >= last)
		d2 = (last - s2) * bfqd->bfq_back_penalty;
	else
		wrap |= BFQ_RQ2_WRAP;

	/* Found required data */

	/*
	 * By doing switch() on the bit mask "wrap" we avoid having to
	 * check two variables for all permutations: --> faster!
	 */
	switch (wrap) {
	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
		if (d1 < d2)
			return rq1;
		else if (d2 < d1)
			return rq2;

		if (s1 >= s2)
			return rq1;
		else
			return rq2;

	case BFQ_RQ2_WRAP:
		return rq1;
	case BFQ_RQ1_WRAP:
		return rq2;
	case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */
	default:
		/*
		 * Since both rqs are wrapped,
		 * start with the one that's further behind head
		 * (--> only *one* back seek required),
		 * since back seek takes more time than forward.
		 */
		if (s1 <= s2)
			return rq1;
		else
			return rq2;
	}
}
static struct bfq_queue *
bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
		       sector_t sector, struct rb_node **ret_parent,
		       struct rb_node ***rb_link)
{
	struct rb_node **p, *parent;
	struct bfq_queue *bfqq = NULL;

	parent = NULL;
	p = &root->rb_node;
	while (*p) {
		struct rb_node **n;

		parent = *p;
		bfqq = rb_entry(parent, struct bfq_queue, pos_node);

		/*
		 * Sort strictly based on sector. Smallest to the left,
		 * largest to the right.
		 */
		if (sector > blk_rq_pos(bfqq->next_rq))
			n = &(*p)->rb_right;
		else if (sector < blk_rq_pos(bfqq->next_rq))
			n = &(*p)->rb_left;
		else
			break;
		p = n;
		bfqq = NULL;
	}

	*ret_parent = parent;
	if (rb_link)
		*rb_link = p;

	bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
		(unsigned long long)sector,
		bfqq ? bfqq->pid : 0);

	return bfqq;
}
static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct rb_node **p, *parent;
	struct bfq_queue *__bfqq;

	if (bfqq->pos_root) {
		rb_erase(&bfqq->pos_node, bfqq->pos_root);
		bfqq->pos_root = NULL;
	}

	if (bfq_class_idle(bfqq))
		return;
	if (!bfqq->next_rq)
		return;

	bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
	__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
			blk_rq_pos(bfqq->next_rq), &parent, &p);
	if (!__bfqq) {
		rb_link_node(&bfqq->pos_node, parent, p);
		rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
	} else
		bfqq->pos_root = NULL;
}
/*
 * Tell whether there are active queues or groups with differentiated weights.
 */
static bool bfq_differentiated_weights(struct bfq_data *bfqd)
{
	/*
	 * For weights to differ, at least one of the trees must contain
	 * at least two nodes.
	 */
	return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
		(bfqd->queue_weights_tree.rb_node->rb_left ||
		 bfqd->queue_weights_tree.rb_node->rb_right)
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	       ) ||
	       (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
		(bfqd->group_weights_tree.rb_node->rb_left ||
		 bfqd->group_weights_tree.rb_node->rb_right)
#endif
	       );
}
/*
 * The following function returns true if every queue must receive the
 * same share of the throughput (this condition is used when deciding
 * whether idling may be disabled, see the comments in the function
 * bfq_bfqq_may_idle()).
 *
 * Such a scenario occurs when:
 * 1) all active queues have the same weight,
 * 2) all active groups at the same level in the groups tree have the same
 *    weight,
 * 3) all active groups at the same level in the groups tree have the same
 *    number of children.
 *
 * Unfortunately, keeping the necessary state for evaluating exactly the
 * above symmetry conditions would be quite complex and time-consuming.
 * Therefore this function evaluates, instead, the following stronger
 * sub-conditions, for which it is much easier to maintain the needed
 * state:
 * 1) all active queues have the same weight,
 * 2) all active groups have the same weight,
 * 3) all active groups have at most one active child each.
 * In particular, the last two conditions are always true if hierarchical
 * support and the cgroups interface are not enabled, thus no state needs
 * to be maintained in this case.
 */
static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
{
	return !bfq_differentiated_weights(bfqd);
}
/*
 * If the weight-counter tree passed as input contains no counter for
 * the weight of the input entity, then add that counter; otherwise just
 * increment the existing counter.
 *
 * Note that weight-counter trees contain few nodes in mostly symmetric
 * scenarios. For example, if all queues have the same weight, then the
 * weight-counter tree for the queues may contain at most one node.
 * This holds even if low_latency is on, because weight-raised queues
 * are not inserted in the tree.
 * In most scenarios, the rate at which nodes are created/destroyed
 * should be low too.
 */
static void bfq_weights_tree_add(struct bfq_data *bfqd,
				 struct bfq_entity *entity,
				 struct rb_root *root)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;

	/*
	 * Do not insert if the entity is already associated with a
	 * counter, which happens if:
	 * 1) the entity is associated with a queue,
	 * 2) a request arrival has caused the queue to become both
	 *    non-weight-raised, and hence change its weight, and
	 *    backlogged; in this respect, each of the two events
	 *    causes an invocation of this function,
	 * 3) this is the invocation of this function caused by the
	 *    second event. This second invocation is actually useless,
	 *    and we handle this fact by exiting immediately. More
	 *    efficient or clearer solutions might possibly be adopted.
	 */
	if (entity->weight_counter)
		return;

	while (*new) {
		struct bfq_weight_counter *__counter = container_of(*new,
						struct bfq_weight_counter,
						weights_node);
		parent = *new;

		if (entity->weight == __counter->weight) {
			entity->weight_counter = __counter;
			goto inc_counter;
		}
		if (entity->weight < __counter->weight)
			new = &((*new)->rb_left);
		else
			new = &((*new)->rb_right);
	}

	entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
					 GFP_ATOMIC);

	/*
	 * In the unlucky event of an allocation failure, we just
	 * exit. This will cause the weight of entity to not be
	 * considered in bfq_differentiated_weights, which, in its
	 * turn, causes the scenario to be deemed wrongly symmetric in
	 * case entity's weight would have been the only weight making
	 * the scenario asymmetric. On the bright side, no unbalance
	 * will however occur when entity becomes inactive again (the
	 * invocation of this function is triggered by an activation
	 * of entity). In fact, bfq_weights_tree_remove does nothing
	 * if !entity->weight_counter.
	 */
	if (unlikely(!entity->weight_counter))
		return;

	entity->weight_counter->weight = entity->weight;
	rb_link_node(&entity->weight_counter->weights_node, parent, new);
	rb_insert_color(&entity->weight_counter->weights_node, root);

inc_counter:
	entity->weight_counter->num_active++;
}
/*
 * Decrement the weight counter associated with the entity, and, if the
 * counter reaches 0, remove the counter from the tree.
 * See the comments to the function bfq_weights_tree_add() for considerations
 * about overhead.
 */
static void bfq_weights_tree_remove(struct bfq_data *bfqd,
				    struct bfq_entity *entity,
				    struct rb_root *root)
{
	if (!entity->weight_counter)
		return;

	entity->weight_counter->num_active--;
	if (entity->weight_counter->num_active > 0)
		goto reset_entity_pointer;

	rb_erase(&entity->weight_counter->weights_node, root);
	kfree(entity->weight_counter);

reset_entity_pointer:
	entity->weight_counter = NULL;
}
/*
 * Return expired entry, or NULL to just start from scratch in rbtree.
 */
static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
				      struct request *last)
{
	struct request *rq;

	if (bfq_bfqq_fifo_expire(bfqq))
		return NULL;

	bfq_mark_bfqq_fifo_expire(bfqq);

	rq = rq_entry_fifo(bfqq->fifo.next);

	if (rq == last || ktime_get_ns() < rq->fifo_time)
		return NULL;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
	return rq;
}
static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
					struct bfq_queue *bfqq,
					struct request *last)
{
	struct rb_node *rbnext = rb_next(&last->rb_node);
	struct rb_node *rbprev = rb_prev(&last->rb_node);
	struct request *next, *prev = NULL;

	/* Follow expired path, else get first next available. */
	next = bfq_check_fifo(bfqq, last);
	if (next)
		return next;

	if (rbprev)
		prev = rb_entry_rq(rbprev);

	if (rbnext)
		next = rb_entry_rq(rbnext);
	else {
		rbnext = rb_first(&bfqq->sort_list);
		if (rbnext && rbnext != &last->rb_node)
			next = rb_entry_rq(rbnext);
	}

	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
}
/* see the definition of bfq_async_charge_factor for details */
static unsigned long bfq_serv_to_charge(struct request *rq,
					struct bfq_queue *bfqq)
{
	if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
		return blk_rq_sectors(rq);

	/*
	 * If there are no weight-raised queues, then amplify service
	 * by just the async charge factor; otherwise amplify service
	 * by twice the async charge factor, to further reduce latency
	 * for weight-raised queues.
	 */
	if (bfqq->bfqd->wr_busy_queues == 0)
		return blk_rq_sectors(rq) * bfq_async_charge_factor;

	return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
}
/*
 * bfq_updated_next_req - update the queue after a new next_rq selection.
 * @bfqd: the device data the queue belongs to.
 * @bfqq: the queue to update.
 *
 * If the first request of a queue changes we make sure that the queue
 * has enough budget to serve at least its first request (if the
 * request has grown). We do this because if the queue has not enough
 * budget for its first request, it has to go through two dispatch
 * rounds to actually get it dispatched.
 */
static void bfq_updated_next_req(struct bfq_data *bfqd,
				 struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;
	struct request *next_rq = bfqq->next_rq;
	unsigned long new_budget;

	if (!next_rq)
		return;

	if (bfqq == bfqd->in_service_queue)
		/*
		 * In order not to break guarantees, budgets cannot be
		 * changed after an entity has been selected.
		 */
		return;

	new_budget = max_t(unsigned long, bfqq->max_budget,
			   bfq_serv_to_charge(next_rq, bfqq));
	if (entity->budget != new_budget) {
		entity->budget = new_budget;
		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
					 new_budget);
		bfq_requeue_bfqq(bfqd, bfqq);
	}
}
static void
bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
{
	if (bic->saved_idle_window)
		bfq_mark_bfqq_idle_window(bfqq);
	else
		bfq_clear_bfqq_idle_window(bfqq);

	if (bic->saved_IO_bound)
		bfq_mark_bfqq_IO_bound(bfqq);
	else
		bfq_clear_bfqq_IO_bound(bfqq);

	bfqq->ttime = bic->saved_ttime;
	bfqq->wr_coeff = bic->saved_wr_coeff;
	bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
	bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
	bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;

	if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
	    time_is_before_jiffies(bfqq->last_wr_start_finish +
				   bfqq->wr_cur_max_time))) {
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			     "resume state: switching off wr");

		bfqq->wr_coeff = 1;
	}

	/* make sure weight will be updated, however we got here */
	bfqq->entity.prio_changed = 1;
}
static int bfqq_process_refs(struct bfq_queue *bfqq)
{
	return bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
}
/* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_queue *item;
	struct hlist_node *n;

	hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
		hlist_del_init(&item->burst_list_node);
	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
	bfqd->burst_size = 1;
	bfqd->burst_parent_entity = bfqq->entity.parent;
}
/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	/* Increment burst size to take into account also bfqq */
	bfqd->burst_size++;

	if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
		struct bfq_queue *pos, *bfqq_item;
		struct hlist_node *n;

		/*
		 * Enough queues have been activated shortly after each
		 * other to consider this burst as large.
		 */
		bfqd->large_burst = true;

		/*
		 * We can now mark all queues in the burst list as
		 * belonging to a large burst.
		 */
		hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
				     burst_list_node)
			bfq_mark_bfqq_in_large_burst(bfqq_item);
		bfq_mark_bfqq_in_large_burst(bfqq);

		/*
		 * From now on, and until the current burst finishes, any
		 * new queue being activated shortly after the last queue
		 * was inserted in the burst can be immediately marked as
		 * belonging to a large burst. So the burst list is not
		 * needed any more. Remove it.
		 */
		hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
					  burst_list_node)
			hlist_del_init(&pos->burst_list_node);
	} else /*
		* Burst not yet large: add bfqq to the burst list. Do
		* not increment the ref counter for bfqq, because bfqq
		* is removed from the burst list before freeing bfqq
		* in put_queue.
		*/
		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
}
/*
 * If many queues belonging to the same group happen to be created
 * shortly after each other, then the processes associated with these
 * queues have typically a common goal. In particular, bursts of queue
 * creations are usually caused by services or applications that spawn
 * many parallel threads/processes. Examples are systemd during boot,
 * or git grep. To help these processes get their job done as soon as
 * possible, it is usually better to not grant either weight-raising
 * or device idling to their queues.
 *
 * In this comment we describe, firstly, the reasons why this fact
 * holds, and, secondly, the next function, which implements the main
 * steps needed to properly mark these queues so that they can then be
 * treated in a different way.
 *
 * The above services or applications benefit mostly from a high
 * throughput: the quicker the requests of the activated queues are
 * cumulatively served, the sooner the target job of these queues gets
 * completed. As a consequence, weight-raising any of these queues,
 * which also implies idling the device for it, is almost always
 * counterproductive. In most cases it just lowers throughput.
 *
 * On the other hand, a burst of queue creations may be caused also by
 * the start of an application that does not consist of a lot of
 * parallel I/O-bound threads. In fact, with a complex application,
 * several short processes may need to be executed to start-up the
 * application. In this respect, to start an application as quickly as
 * possible, the best thing to do is in any case to privilege the I/O
 * related to the application with respect to all other
 * I/O. Therefore, the best strategy to start as quickly as possible
 * an application that causes a burst of queue creations is to
 * weight-raise all the queues created during the burst. This is the
 * exact opposite of the best strategy for the other type of bursts.
 *
 * In the end, to take the best action for each of the two cases, the
 * two types of bursts need to be distinguished. Fortunately, this
 * seems relatively easy, by looking at the sizes of the bursts. In
 * particular, we found a threshold such that only bursts with a
 * larger size than that threshold are apparently caused by
 * services or commands such as systemd or git grep. For brevity,
 * hereafter we call just 'large' these bursts. BFQ *does not*
 * weight-raise queues whose creation occurs in a large burst. In
 * addition, for each of these queues BFQ performs or does not perform
 * idling depending on which choice boosts the throughput more. The
 * exact choice depends on the device and request pattern at
 * hand.
 *
 * Unfortunately, false positives may occur while an interactive task
 * is starting (e.g., an application is being started). The
 * consequence is that the queues associated with the task do not
 * enjoy weight raising as expected. Fortunately these false positives
 * are very rare. They typically occur if some service happens to
 * start doing I/O exactly when the interactive task starts.
 *
 * Turning back to the next function, it implements all the steps
 * needed to detect the occurrence of a large burst and to properly
 * mark all the queues belonging to it (so that they can then be
 * treated in a different way). This goal is achieved by maintaining a
 * "burst list" that holds, temporarily, the queues that belong to the
 * burst in progress. The list is then used to mark these queues as
 * belonging to a large burst if the burst does become large. The main
 * steps are the following.
 *
 * . when the very first queue is created, the queue is inserted into the
 *   list (as it could be the first queue in a possible burst)
 *
 * . if the current burst has not yet become large, and a queue Q that does
 *   not yet belong to the burst is activated shortly after the last time
 *   at which a new queue entered the burst list, then the function appends
 *   Q to the burst list
 *
 * . if, as a consequence of the previous step, the burst size reaches
 *   the large-burst threshold, then
 *
 *     . all the queues in the burst list are marked as belonging to a
 *       large burst
 *
 *     . the burst list is deleted; in fact, the burst list already served
 *       its purpose (keeping temporarily track of the queues in a burst,
 *       so as to be able to mark them as belonging to a large burst in the
 *       previous sub-step), and now is not needed any more
 *
 *     . the device enters a large-burst mode
 *
 * . if a queue Q that does not belong to the burst is created while
 *   the device is in large-burst mode and shortly after the last time
 *   at which a queue either entered the burst list or was marked as
 *   belonging to the current large burst, then Q is immediately marked
 *   as belonging to a large burst.
 *
 * . if a queue Q that does not belong to the burst is created a while
 *   later, i.e., not shortly after the last time at which a queue
 *   either entered the burst list or was marked as belonging to the
 *   current large burst, then the current burst is deemed as finished and:
 *
 *     . the large-burst mode is reset if set
 *
 *     . the burst list is emptied
 *
 *     . Q is inserted in the burst list, as Q may be the first queue
 *       in a possible new burst (then the burst list contains just Q
 *       after this step).
 */
static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	/*
	 * If bfqq is already in the burst list or is part of a large
	 * burst, or finally has just been split, then there is
	 * nothing else to do.
	 */
	if (!hlist_unhashed(&bfqq->burst_list_node) ||
	    bfq_bfqq_in_large_burst(bfqq) ||
	    time_is_after_eq_jiffies(bfqq->split_time +
				     msecs_to_jiffies(10)))
		return;

	/*
	 * If bfqq's creation happens late enough, or bfqq belongs to
	 * a different group than the burst group, then the current
	 * burst is finished, and related data structures must be
	 * reset.
	 *
	 * In this respect, consider the special case where bfqq is
	 * the very first queue created after BFQ is selected for this
	 * device. In this case, last_ins_in_burst and
	 * burst_parent_entity are not yet significant when we get
	 * here. But it is easy to verify that, whether or not the
	 * following condition is true, bfqq will end up being
	 * inserted into the burst list. In particular the list will
	 * happen to contain only bfqq. And this is exactly what has
	 * to happen, as bfqq may be the first queue of the first
	 * burst.
	 */
	if (time_is_before_jiffies(bfqd->last_ins_in_burst +
	    bfqd->bfq_burst_interval) ||
	    bfqq->entity.parent != bfqd->burst_parent_entity) {
		bfqd->large_burst = false;
		bfq_reset_burst_list(bfqd, bfqq);
		goto end;
	}

	/*
	 * If we get here, then bfqq is being activated shortly after the
	 * last queue. So, if the current burst is also large, we can mark
	 * bfqq as belonging to this large burst immediately.
	 */
	if (bfqd->large_burst) {
		bfq_mark_bfqq_in_large_burst(bfqq);
		goto end;
	}

	/*
	 * If we get here, then a large-burst state has not yet been
	 * reached, but bfqq is being activated shortly after the last
	 * queue. Then we add bfqq to the burst.
	 */
	bfq_add_to_burst(bfqd, bfqq);
end:
	/*
	 * At this point, bfqq either has been added to the current
	 * burst or has caused the current burst to terminate and a
	 * possible new burst to start. In particular, in the second
	 * case, bfqq has become the first queue in the possible new
	 * burst. In both cases last_ins_in_burst needs to be moved
	 * forward.
	 */
	bfqd->last_ins_in_burst = jiffies;
}
static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	return entity->budget - entity->service;
}
/*
 * If enough samples have been computed, return the current max budget
 * stored in bfqd, which is dynamically updated according to the
 * estimated disk peak rate; otherwise return the default max budget.
 */
static int bfq_max_budget(struct bfq_data *bfqd)
{
	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
		return bfq_default_max_budget;
	else
		return bfqd->bfq_max_budget;
}

/*
 * Return min budget, which is a fraction of the current or default
 * max budget (trying with 1/32)
 */
static int bfq_min_budget(struct bfq_data *bfqd)
{
	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
		return bfq_default_max_budget / 32;
	else
		return bfqd->bfq_max_budget / 32;
}
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason);
/*
 * The next function, invoked after the input queue bfqq switches from
 * idle to busy, updates the budget of bfqq. The function also tells
 * whether the in-service queue should be expired, by returning
 * true. The purpose of expiring the in-service queue is to give bfqq
 * the chance to possibly preempt the in-service queue, and the reason
 * for preempting the in-service queue is to achieve one of the two
 * goals below.
 *
 * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
 * expired because it has remained idle. In particular, bfqq may have
 * expired for one of the following two reasons:
 *
 * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
 *   and did not make it to issue a new request before its last
 *   request was served;
 *
 * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
 *   a new request before the expiration of the idling-time.
 *
 * Even if bfqq has expired for one of the above reasons, the process
 * associated with the queue may however be issuing requests greedily,
 * and thus be sensitive to the bandwidth it receives (bfqq may have
 * remained idle for other reasons: CPU high load, bfqq not enjoying
 * idling, I/O throttling somewhere in the path from the process to
 * the I/O scheduler, ...). But if, after every expiration for one of
 * the above two reasons, bfqq has to wait for the service of at least
 * one full budget of another queue before being served again, then
 * bfqq is likely to get a much lower bandwidth or resource time than
 * its reserved ones. To address this issue, two countermeasures need
 * to be taken.
 *
 * First, the budget and the timestamps of bfqq need to be updated in
 * a special way on bfqq reactivation: they need to be updated as if
 * bfqq did not remain idle and did not expire. In fact, if they are
 * computed as if bfqq expired and remained idle until reactivation,
 * then the process associated with bfqq is treated as if, instead of
 * being greedy, it stopped issuing requests when bfqq remained idle,
 * and restarts issuing requests only on this reactivation. In other
 * words, the scheduler does not help the process recover the "service
 * hole" between bfqq expiration and reactivation. As a consequence,
 * the process receives a lower bandwidth than its reserved one. In
 * contrast, to recover this hole, the budget must be updated as if
 * bfqq was not expired at all before this reactivation, i.e., it must
 * be set to the value of the remaining budget when bfqq was
 * expired. Along the same line, timestamps need to be assigned the
 * value they had the last time bfqq was selected for service, i.e.,
 * before last expiration. Thus timestamps need to be back-shifted
 * with respect to their normal computation (see [1] for more details
 * on this tricky aspect).
 *
 * Secondly, to allow the process to recover the hole, the in-service
 * queue must be expired too, to give bfqq the chance to preempt it
 * immediately. In fact, if bfqq has to wait for a full budget of the
 * in-service queue to be completed, then it may become impossible to
 * let the process recover the hole, even if the back-shifted
 * timestamps of bfqq are lower than those of the in-service queue. If
 * this happens for most or all of the holes, then the process may not
 * receive its reserved bandwidth. In this respect, it is worth noting
 * that, since the service of outstanding requests is not preemptible,
 * a little fraction of the holes may however be unrecoverable, thereby
 * causing a little loss of bandwidth.
 *
 * The last important point is detecting whether bfqq does need this
 * bandwidth recovery. In this respect, the next function deems the
 * process associated with bfqq greedy, and thus allows it to recover
 * the hole, if: 1) the process is waiting for the arrival of a new
 * request (which implies that bfqq expired for one of the above two
 * reasons), and 2) such a request has arrived soon. The first
 * condition is controlled through the flag non_blocking_wait_rq,
 * while the second through the flag arrived_in_time. If both
 * conditions hold, then the function computes the budget in the
 * above-described special way, and signals that the in-service queue
 * should be expired. Timestamp back-shifting is done later in
 * __bfq_activate_entity.
 *
 * 2. Reduce latency. Even if timestamps are not backshifted to let
 * the process associated with bfqq recover a service hole, bfqq may
 * however happen to have, after being (re)activated, a lower finish
 * timestamp than the in-service queue. That is, the next budget of
 * bfqq may have to be completed before the one of the in-service
 * queue. If this is the case, then preempting the in-service queue
 * allows this goal to be achieved, apart from the unpreemptible,
 * outstanding requests mentioned above.
 *
 * Unfortunately, regardless of which of the above two goals one wants
 * to achieve, service trees need first to be updated to know whether
 * the in-service queue must be preempted. To have service trees
 * correctly updated, the in-service queue must be expired and
 * rescheduled, and bfqq must be scheduled too. This is one of the
 * most costly operations (in future versions, the scheduling
 * mechanism may be re-designed in such a way to make it possible to
 * know whether preemption is needed without needing to update service
 * trees). In addition, queue preemptions almost always cause random
 * I/O, and thus loss of throughput. Because of these facts, the next
 * function adopts the following simple scheme to avoid both costly
 * operations and too frequent preemptions: it requests the expiration
 * of the in-service queue (unconditionally) only for queues that need
 * to recover a hole, or that either are weight-raised or deserve to
 * be weight-raised.
 */
static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
						struct bfq_queue *bfqq,
						bool arrived_in_time,
						bool wr_or_deserves_wr)
{
	struct bfq_entity *entity = &bfqq->entity;

	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
		/*
		 * We do not clear the flag non_blocking_wait_rq here, as
		 * the latter is used in bfq_activate_bfqq to signal
		 * that timestamps need to be back-shifted (and is
		 * cleared right after).
		 */

		/*
		 * In the next assignment we rely on the fact that
		 * neither entity->service nor entity->budget are
		 * updated on expiration if bfqq is empty (see
		 * __bfq_bfqq_recalc_budget). Thus both quantities
		 * remain unchanged after such an expiration, and the
		 * following statement therefore assigns to
		 * entity->budget the remaining budget on such an
		 * expiration. For clarity, entity->service is not
		 * updated on expiration in any case, and, in normal
		 * operation, is reset only when bfqq is selected for
		 * service (see bfq_get_next_queue).
		 */
		entity->budget = min_t(unsigned long,
				       bfq_bfqq_budget_left(bfqq),
				       bfqq->max_budget);

		return true;
	}

	entity->budget = max_t(unsigned long, bfqq->max_budget,
			       bfq_serv_to_charge(bfqq->next_rq, bfqq));
	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
	return wr_or_deserves_wr;
}
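/*
 * Worked example (hypothetical numbers): suppose bfqq was expired while
 * idling with 12000 sectors of budget left, and max_budget == 16384. On a
 * timely reactivation, the first branch above sets budget to
 * min(12000, 16384) = 12000, i.e., exactly the service hole to recover;
 * on a late reactivation, the budget is recomputed from scratch as
 * max(16384, serv_to_charge(next_rq)).
 */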
static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
{
	u64 dur;

	if (bfqd->bfq_wr_max_time > 0)
		return bfqd->bfq_wr_max_time;

	dur = bfqd->RT_prod;
	do_div(dur, bfqd->peak_rate);

	/*
	 * Limit duration between 3 and 13 seconds. Tests show that
	 * higher values than 13 seconds often yield the opposite of
	 * the desired result, i.e., worsen responsiveness by letting
	 * non-interactive and non-soft-real-time applications
	 * preserve weight raising for a too long time interval.
	 *
	 * On the other end, lower values than 3 seconds make it
	 * difficult for most interactive tasks to complete their jobs
	 * before weight-raising finishes.
	 */
	if (dur > msecs_to_jiffies(13000))
		dur = msecs_to_jiffies(13000);
	else if (dur < msecs_to_jiffies(3000))
		dur = msecs_to_jiffies(3000);

	return dur;
}
*bfqd
,
4811 struct bfq_queue
*bfqq
,
4812 unsigned int old_wr_coeff
,
4813 bool wr_or_deserves_wr
,
4818 if (old_wr_coeff
== 1 && wr_or_deserves_wr
) {
4819 /* start a weight-raising period */
4821 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
;
4822 bfqq
->wr_cur_max_time
= bfq_wr_duration(bfqd
);
4824 bfqq
->wr_start_at_switch_to_srt
= jiffies
;
4825 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
*
4826 BFQ_SOFTRT_WEIGHT_FACTOR
;
4827 bfqq
->wr_cur_max_time
=
4828 bfqd
->bfq_wr_rt_max_time
;
4832 * If needed, further reduce budget to make sure it is
4833 * close to bfqq's backlog, so as to reduce the
4834 * scheduling-error component due to a too large
4835 * budget. Do not care about throughput consequences,
4836 * but only about latency. Finally, do not assign a
4837 * too small budget either, to avoid increasing
4838 * latency by causing too frequent expirations.
4840 bfqq
->entity
.budget
= min_t(unsigned long,
4841 bfqq
->entity
.budget
,
4842 2 * bfq_min_budget(bfqd
));
4843 } else if (old_wr_coeff
> 1) {
4844 if (interactive
) { /* update wr coeff and duration */
4845 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
;
4846 bfqq
->wr_cur_max_time
= bfq_wr_duration(bfqd
);
4847 } else if (in_burst
)
4851 * The application is now or still meeting the
4852 * requirements for being deemed soft rt. We
4853 * can then correctly and safely (re)charge
4854 * the weight-raising duration for the
4855 * application with the weight-raising
4856 * duration for soft rt applications.
4858 * In particular, doing this recharge now, i.e.,
4859 * before the weight-raising period for the
4860 * application finishes, reduces the probability
4861 * of the following negative scenario:
4862 * 1) the weight of a soft rt application is
4863 * raised at startup (as for any newly
4864 * created application),
4865 * 2) since the application is not interactive,
4866 * at a certain time weight-raising is
4867 * stopped for the application,
4868 * 3) at that time the application happens to
4869 * still have pending requests, and hence
4870 * is destined to not have a chance to be
4871 * deemed soft rt before these requests are
4872 * completed (see the comments to the
4873 * function bfq_bfqq_softrt_next_start()
4874 * for details on soft rt detection),
4875 * 4) these pending requests experience a high
4876 * latency because the application is not
4877 * weight-raised while they are pending.
4879 if (bfqq
->wr_cur_max_time
!=
4880 bfqd
->bfq_wr_rt_max_time
) {
4881 bfqq
->wr_start_at_switch_to_srt
=
4882 bfqq
->last_wr_start_finish
;
4884 bfqq
->wr_cur_max_time
=
4885 bfqd
->bfq_wr_rt_max_time
;
4886 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
*
4887 BFQ_SOFTRT_WEIGHT_FACTOR
;
4889 bfqq
->last_wr_start_finish
= jiffies
;
static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
					struct bfq_queue *bfqq)
{
	return bfqq->dispatched == 0 &&
		time_is_before_jiffies(
			bfqq->budget_timeout +
			bfqd->bfq_wr_min_idle_time);
}
static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
					     struct bfq_queue *bfqq,
					     int old_wr_coeff,
					     struct request *rq,
					     bool *interactive)
{
	bool soft_rt, in_burst, wr_or_deserves_wr,
		bfqq_wants_to_preempt,
		idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
		/*
		 * See the comments on
		 * bfq_bfqq_update_budg_for_activation for
		 * details on the usage of the next variable.
		 */
		arrived_in_time =  ktime_get_ns() <=
			bfqq->ttime.last_end_request +
			bfqd->bfq_slice_idle * 3;

	bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);

	/*
	 * bfqq deserves to be weight-raised if:
	 * - it is sync,
	 * - it does not belong to a large burst,
	 * - it has been idle for enough time or is soft real-time,
	 * - is linked to a bfq_io_cq (it is not shared in any sense).
	 */
	in_burst = bfq_bfqq_in_large_burst(bfqq);
	soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
		!in_burst &&
		time_is_before_jiffies(bfqq->soft_rt_next_start);
	*interactive = !in_burst && idle_for_long_time;
	wr_or_deserves_wr = bfqd->low_latency &&
		(bfqq->wr_coeff > 1 ||
		 (bfq_bfqq_sync(bfqq) &&
		  bfqq->bic && (*interactive || soft_rt)));

	/*
	 * Using the last flag, update budget and check whether bfqq
	 * may want to preempt the in-service queue.
	 */
	bfqq_wants_to_preempt =
		bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
						    arrived_in_time,
						    wr_or_deserves_wr);

	/*
	 * If bfqq happened to be activated in a burst, but has been
	 * idle for much more than an interactive queue, then we
	 * assume that, in the overall I/O initiated in the burst, the
	 * I/O associated with bfqq is finished. So bfqq does not need
	 * to be treated as a queue belonging to a burst
	 * anymore. Accordingly, we reset bfqq's in_large_burst flag
	 * if set, and remove bfqq from the burst list if it's
	 * there. We do not decrement burst_size, because the fact
	 * that bfqq does not need to belong to the burst list any
	 * more does not invalidate the fact that bfqq was created in
	 * a burst.
	 */
	if (likely(!bfq_bfqq_just_created(bfqq)) &&
	    idle_for_long_time &&
	    time_is_before_jiffies(
		    bfqq->budget_timeout +
		    msecs_to_jiffies(10000))) {
		hlist_del_init(&bfqq->burst_list_node);
		bfq_clear_bfqq_in_large_burst(bfqq);
	}

	bfq_clear_bfqq_just_created(bfqq);

	if (!bfq_bfqq_IO_bound(bfqq)) {
		if (arrived_in_time) {
			bfqq->requests_within_timer++;
			if (bfqq->requests_within_timer >=
			    bfqd->bfq_requests_within_timer)
				bfq_mark_bfqq_IO_bound(bfqq);
		} else
			bfqq->requests_within_timer = 0;
	}

	if (bfqd->low_latency) {
		if (unlikely(time_is_after_jiffies(bfqq->split_time)))
			/* wraparound */
			bfqq->split_time =
				jiffies - bfqd->bfq_wr_min_idle_time - 1;

		if (time_is_before_jiffies(bfqq->split_time +
					   bfqd->bfq_wr_min_idle_time)) {
			bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
							 old_wr_coeff,
							 wr_or_deserves_wr,
							 *interactive,
							 in_burst,
							 soft_rt);

			if (old_wr_coeff != bfqq->wr_coeff)
				bfqq->entity.prio_changed = 1;
		}
	}

	bfqq->last_idle_bklogged = jiffies;
	bfqq->service_from_backlogged = 0;
	bfq_clear_bfqq_softrt_update(bfqq);

	bfq_add_bfqq_busy(bfqd, bfqq);

	/*
	 * Expire in-service queue only if preemption may be needed
	 * for guarantees. In this respect, the function
	 * next_queue_may_preempt just checks a simple, necessary
	 * condition, and not a sufficient condition based on
	 * timestamps. In fact, for the latter condition to be
	 * evaluated, timestamps would need first to be updated, and
	 * this operation is quite costly (see the comments on the
	 * function bfq_bfqq_update_budg_for_activation).
	 */
	if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
	    bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
	    next_queue_may_preempt(bfqd))
		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
				false, BFQQE_PREEMPTED);
}
static void bfq_add_request(struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;
	struct request *next_rq, *prev;
	unsigned int old_wr_coeff = bfqq->wr_coeff;
	bool interactive = false;

	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
	bfqq->queued[rq_is_sync(rq)]++;
	bfqd->queued++;

	elv_rb_add(&bfqq->sort_list, rq);

	/*
	 * Check if this request is a better next-serve candidate.
	 */
	prev = bfqq->next_rq;
	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
	bfqq->next_rq = next_rq;

	/*
	 * Adjust priority tree position, if next_rq changes.
	 */
	if (prev != bfqq->next_rq)
		bfq_pos_tree_add_move(bfqd, bfqq);

	if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
		bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
						 rq, &interactive);
	else {
		if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
		    time_is_before_jiffies(
				bfqq->last_wr_start_finish +
				bfqd->bfq_wr_min_inter_arr_async)) {
			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

			bfqd->wr_busy_queues++;
			bfqq->entity.prio_changed = 1;
		}
		if (prev != bfqq->next_rq)
			bfq_updated_next_req(bfqd, bfqq);
	}

	/*
	 * Assign jiffies to last_wr_start_finish in the following
	 * cases:
	 *
	 * . if bfqq is not going to be weight-raised, because, for
	 *   non weight-raised queues, last_wr_start_finish stores the
	 *   arrival time of the last request; as of now, this piece
	 *   of information is used only for deciding whether to
	 *   weight-raise async queues
	 *
	 * . if bfqq is not weight-raised, because, if bfqq is now
	 *   switching to weight-raised, then last_wr_start_finish
	 *   stores the time when weight-raising starts
	 *
	 * . if bfqq is interactive, because, regardless of whether
	 *   bfqq is currently weight-raised, the weight-raising
	 *   period must start or restart (this case is considered
	 *   separately because it is not detected by the above
	 *   conditions, if bfqq is already weight-raised)
	 *
	 * last_wr_start_finish has to be updated also if bfqq is soft
	 * real-time, because the weight-raising period is constantly
	 * restarted on idle-to-busy transitions for these queues, but
	 * this is already done in bfq_bfqq_handle_idle_busy_switch if
	 * needed.
	 */
	if (bfqd->low_latency &&
		(old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
		bfqq->last_wr_start_finish = jiffies;
}
static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
					  struct bio *bio,
					  struct request_queue *q)
{
	struct bfq_queue *bfqq = bfqd->bio_bfqq;

	if (bfqq)
		return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));

	return NULL;
}

static sector_t get_sdist(sector_t last_pos, struct request *rq)
{
	if (last_pos)
		return abs(blk_rq_pos(rq) - last_pos);

	return 0;
}
#if 0 /* Still not clear if we can do without next two functions */
static void bfq_activate_request(struct request_queue *q, struct request *rq)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;

	bfqd->rq_in_driver++;
}

static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;

	bfqd->rq_in_driver--;
}
#endif
static void bfq_remove_request(struct request_queue *q,
			       struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;
	const int sync = rq_is_sync(rq);

	if (bfqq->next_rq == rq) {
		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
		bfq_updated_next_req(bfqd, bfqq);
	}

	if (rq->queuelist.prev != &rq->queuelist)
		list_del_init(&rq->queuelist);
	bfqq->queued[sync]--;
	bfqd->queued--;
	elv_rb_del(&bfqq->sort_list, rq);

	elv_rqhash_del(q, rq);
	if (q->last_merge == rq)
		q->last_merge = NULL;

	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
		bfqq->next_rq = NULL;

		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
			bfq_del_bfqq_busy(bfqd, bfqq, false);
			/*
			 * bfqq emptied. In normal operation, when
			 * bfqq is empty, bfqq->entity.service and
			 * bfqq->entity.budget must contain,
			 * respectively, the service received and the
			 * budget used last time bfqq emptied. These
			 * facts do not hold in this case, as at least
			 * this last removal occurred while bfqq is
			 * not in service. To avoid inconsistencies,
			 * reset both bfqq->entity.service and
			 * bfqq->entity.budget, if bfqq has still a
			 * process that may issue I/O requests to it.
			 */
			bfqq->entity.budget = bfqq->entity.service = 0;
		}

		/*
		 * Remove queue from request-position tree as it is empty.
		 */
		if (bfqq->pos_root) {
			rb_erase(&bfqq->pos_node, bfqq->pos_root);
			bfqq->pos_root = NULL;
		}
	}

	if (rq->cmd_flags & REQ_META)
		bfqq->meta_pending--;

	bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
}
static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
{
	struct request_queue *q = hctx->queue;
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct request *free = NULL;
	/*
	 * bfq_bic_lookup grabs the queue_lock: invoke it now and
	 * store its return value for later use, to avoid nesting
	 * queue_lock inside the bfqd->lock. We assume that the bic
	 * returned by bfq_bic_lookup does not go away before
	 * bfqd->lock is taken.
	 */
	struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
	bool ret;

	spin_lock_irq(&bfqd->lock);

	if (bic)
		bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
	else
		bfqd->bio_bfqq = NULL;
	bfqd->bio_bic = bic;

	ret = blk_mq_sched_try_merge(q, bio, &free);

	if (free)
		blk_mq_free_request(free);
	spin_unlock_irq(&bfqd->lock);

	return ret;
}
static int bfq_request_merge(struct request_queue *q, struct request **req,
			     struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct request *__rq;

	__rq = bfq_find_rq_fmerge(bfqd, bio, q);
	if (__rq && elv_bio_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_FRONT_MERGE;
	}

	return ELEVATOR_NO_MERGE;
}
static void bfq_request_merged(struct request_queue *q, struct request *req,
			       enum elv_merge type)
{
	if (type == ELEVATOR_FRONT_MERGE &&
	    rb_prev(&req->rb_node) &&
	    blk_rq_pos(req) <
	    blk_rq_pos(container_of(rb_prev(&req->rb_node),
				    struct request, rb_node))) {
		struct bfq_queue *bfqq = RQ_BFQQ(req);
		struct bfq_data *bfqd = bfqq->bfqd;
		struct request *prev, *next_rq;

		/* Reposition request in its sort_list */
		elv_rb_del(&bfqq->sort_list, req);
		elv_rb_add(&bfqq->sort_list, req);

		/* Choose next request to be served for bfqq */
		prev = bfqq->next_rq;
		next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
					 bfqd->last_position);
		bfqq->next_rq = next_rq;
		/*
		 * If next_rq changes, update both the queue's budget to
		 * fit the new request and the queue's position in its
		 * rq_pos_tree.
		 */
		if (prev != bfqq->next_rq) {
			bfq_updated_next_req(bfqd, bfqq);
			bfq_pos_tree_add_move(bfqd, bfqq);
		}
	}
}
static void bfq_requests_merged(struct request_queue *q, struct request *rq,
				struct request *next)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);

	if (!RB_EMPTY_NODE(&rq->rb_node))
		goto end;
	spin_lock_irq(&bfqq->bfqd->lock);

	/*
	 * If next and rq belong to the same bfq_queue and next is older
	 * than rq, then reposition rq in the fifo (by substituting next
	 * with rq). Otherwise, if next and rq belong to different
	 * bfq_queues, never reposition rq: in fact, we would have to
	 * reposition it with respect to next's position in its own fifo,
	 * which would most certainly be too expensive with respect to
	 * the benefits.
	 */
	if (bfqq == next_bfqq &&
	    !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
	    next->fifo_time < rq->fifo_time) {
		list_del_init(&rq->queuelist);
		list_replace_init(&next->queuelist, &rq->queuelist);
		rq->fifo_time = next->fifo_time;
	}

	if (bfqq->next_rq == next)
		bfqq->next_rq = rq;

	bfq_remove_request(q, next);

	spin_unlock_irq(&bfqq->bfqd->lock);
end:
	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
}
/* Must be called with bfqq != NULL */
static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
{
	if (bfq_bfqq_busy(bfqq))
		bfqq->bfqd->wr_busy_queues--;
	bfqq->wr_coeff = 1;
	bfqq->wr_cur_max_time = 0;
	bfqq->last_wr_start_finish = jiffies;
	/*
	 * Trigger a weight change on the next invocation of
	 * __bfq_entity_update_weight_prio.
	 */
	bfqq->entity.prio_changed = 1;
}
static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
				    struct bfq_group *bfqg)
{
	int i, j;

	for (i = 0; i < 2; i++)
		for (j = 0; j < IOPRIO_BE_NR; j++)
			if (bfqg->async_bfqq[i][j])
				bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
	if (bfqg->async_idle_bfqq)
		bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
}
static void bfq_end_wr(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq;

	spin_lock_irq(&bfqd->lock);

	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
		bfq_bfqq_end_wr(bfqq);
	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
		bfq_bfqq_end_wr(bfqq);
	bfq_end_wr_async(bfqd);

	spin_unlock_irq(&bfqd->lock);
}
static sector_t bfq_io_struct_pos(void *io_struct, bool request)
{
	if (request)
		return blk_rq_pos(io_struct);
	else
		return ((struct bio *)io_struct)->bi_iter.bi_sector;
}

static int bfq_rq_close_to_sector(void *io_struct, bool request,
				  sector_t sector)
{
	return abs(bfq_io_struct_pos(io_struct, request) - sector) <=
	       BFQQ_SEEK_THR;
}
static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
					 struct bfq_queue *bfqq,
					 sector_t sector)
{
	struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
	struct rb_node *parent, *node;
	struct bfq_queue *__bfqq;

	if (RB_EMPTY_ROOT(root))
		return NULL;

	/*
	 * First, if we find a request starting at the end of the last
	 * request, choose it.
	 */
	__bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
	if (__bfqq)
		return __bfqq;

	/*
	 * If the exact sector wasn't found, the parent of the NULL leaf
	 * will contain the closest sector (rq_pos_tree sorted by
	 * next_request position).
	 */
	__bfqq = rb_entry(parent, struct bfq_queue, pos_node);
	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
		return __bfqq;

	if (blk_rq_pos(__bfqq->next_rq) < sector)
		node = rb_next(&__bfqq->pos_node);
	else
		node = rb_prev(&__bfqq->pos_node);
	if (!node)
		return NULL;

	__bfqq = rb_entry(node, struct bfq_queue, pos_node);
	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
		return __bfqq;

	return NULL;
}
static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,
						   struct bfq_queue *cur_bfqq,
						   sector_t sector)
{
	struct bfq_queue *bfqq;

	/*
	 * We shall notice if some of the queues are cooperating,
	 * e.g., working closely on the same area of the device. In
	 * that case, we can group them together and: 1) don't waste
	 * time idling, and 2) serve the union of their requests in
	 * the best possible order for throughput.
	 */
	bfqq = bfqq_find_close(bfqd, cur_bfqq, sector);
	if (!bfqq || bfqq == cur_bfqq)
		return NULL;

	return bfqq;
}
static struct bfq_queue *
bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
{
	int process_refs, new_process_refs;
	struct bfq_queue *__bfqq;

	/*
	 * If there are no process references on the new_bfqq, then it is
	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
	 * may have dropped their last reference (not just their last process
	 * reference).
	 */
	if (!bfqq_process_refs(new_bfqq))
		return NULL;

	/* Avoid a circular list and skip interim queue merges. */
	while ((__bfqq = new_bfqq->new_bfqq)) {
		if (__bfqq == bfqq)
			return NULL;
		new_bfqq = __bfqq;
	}

	process_refs = bfqq_process_refs(bfqq);
	new_process_refs = bfqq_process_refs(new_bfqq);
	/*
	 * If the process for the bfqq has gone away, there is no
	 * sense in merging the queues.
	 */
	if (process_refs == 0 || new_process_refs == 0)
		return NULL;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
		new_bfqq->pid);

	/*
	 * Merging is just a redirection: the requests of the process
	 * owning one of the two queues are redirected to the other queue.
	 * The latter queue, in its turn, is set as shared if this is the
	 * first time that the requests of some process are redirected to
	 * it.
	 *
	 * We redirect bfqq to new_bfqq and not the opposite, because we
	 * are in the context of the process owning bfqq, hence we have
	 * the io_cq of this process. So we can immediately configure this
	 * io_cq to redirect the requests of the process to new_bfqq.
	 *
	 * NOTE, even if new_bfqq coincides with the in-service queue, the
	 * io_cq of new_bfqq is not available, because, if the in-service
	 * queue is shared, bfqd->in_service_bic may not point to the
	 * io_cq of the in-service queue.
	 * Redirecting the requests of the process owning bfqq to the
	 * currently in-service queue is in any case the best option, as
	 * we feed the in-service queue with new requests close to the
	 * last request served and, by doing so, hopefully increase the
	 * throughput.
	 */
	bfqq->new_bfqq = new_bfqq;
	new_bfqq->ref += process_refs;
	return new_bfqq;
}
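/*
 * Illustrative note (not from the original source): if Q1 was
 * already scheduled for merging into Q2 (Q1->new_bfqq == Q2), and a
 * new queue Q0 is now set up to merge with Q1, the loop above walks
 * the ->new_bfqq chain so that Q0 is redirected directly to Q2,
 * i.e., to the final queue of the chain, skipping the interim merge
 * with Q1.
 */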
static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
					struct bfq_queue *new_bfqq)
{
	if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
	    (bfqq->ioprio_class != new_bfqq->ioprio_class))
		return false;

	/*
	 * If either of the queues has already been detected as seeky,
	 * then merging it with the other queue is unlikely to lead to
	 * sequential I/O.
	 */
	if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))
		return false;

	/*
	 * Interleaved I/O is known to be done by (some) applications
	 * only for reads, so it does not make sense to merge async
	 * queues.
	 */
	if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq))
		return false;

	return true;
}
/*
 * If this function returns true, then bfqq cannot be merged. The idea
 * is that true cooperation happens very early after processes start
 * to do I/O. Usually, late cooperations are just accidental false
 * positives. In case bfqq is weight-raised, such false positives
 * would evidently degrade latency guarantees for bfqq.
 */
static bool wr_from_too_long(struct bfq_queue *bfqq)
{
	return bfqq->wr_coeff > 1 &&
		time_is_before_jiffies(bfqq->last_wr_start_finish +
				       msecs_to_jiffies(100));
}
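/*
 * Illustrative note (not from the original source): a queue whose
 * weight-raising period started, say, 50 ms ago may still be merged,
 * while one that has been weight-raised for more than 100 ms is
 * filtered out, on the assumption that genuinely cooperating
 * processes are detected soon after they start doing I/O.
 */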
/*
 * Attempt to schedule a merge of bfqq with the currently in-service
 * queue or with a close queue among the scheduled queues. Return
 * NULL if no merge was scheduled, a pointer to the shared bfq_queue
 * structure otherwise.
 *
 * The OOM queue is not allowed to participate to cooperation: in fact, since
 * the requests temporarily redirected to the OOM queue could be redirected
 * again to dedicated queues at any time, the state needed to correctly
 * handle merging with the OOM queue would be quite complex and expensive
 * to maintain. Besides, in such a critical condition as an out of memory,
 * the benefits of queue merging may be little relevant, or even negligible.
 *
 * Weight-raised queues can be merged only if their weight-raising
 * period has just started. In fact cooperating processes are usually
 * started together. Thus, with this filter we avoid false positives
 * that would jeopardize low-latency guarantees.
 *
 * WARNING: queue merging may impair fairness among non-weight raised
 * queues, for at least two reasons: 1) the original weight of a
 * merged queue may change during the merged state, 2) even being the
 * weight the same, a merged queue may be bloated with many more
 * requests than the ones produced by its originally-associated
 * process.
 */
static struct bfq_queue *
bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		     void *io_struct, bool request)
{
	struct bfq_queue *in_service_bfqq, *new_bfqq;

	if (bfqq->new_bfqq)
		return bfqq->new_bfqq;

	if (!io_struct ||
	    wr_from_too_long(bfqq) ||
	    unlikely(bfqq == &bfqd->oom_bfqq))
		return NULL;

	/* If there is only one backlogged queue, don't search. */
	if (bfqd->busy_queues == 1)
		return NULL;

	in_service_bfqq = bfqd->in_service_queue;

	if (!in_service_bfqq || in_service_bfqq == bfqq ||
	    !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) ||
	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))
		goto check_scheduled;

	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
	    bfqq->entity.parent == in_service_bfqq->entity.parent &&
	    bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
		if (new_bfqq)
			return new_bfqq;
	}
	/*
	 * Check whether there is a cooperator among currently scheduled
	 * queues. The only thing we need is that the bio/request is not
	 * NULL, as we need it to establish whether a cooperator exists.
	 */
check_scheduled:
	new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
					     bfq_io_struct_pos(io_struct, request));

	if (new_bfqq && !wr_from_too_long(new_bfqq) &&
	    likely(new_bfqq != &bfqd->oom_bfqq) &&
	    bfq_may_be_close_cooperator(bfqq, new_bfqq))
		return bfq_setup_merge(bfqq, new_bfqq);

	return NULL;
}
static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
{
	struct bfq_io_cq *bic = bfqq->bic;

	/*
	 * If !bfqq->bic, the queue is already shared or its requests
	 * have already been redirected to a shared queue; both idle window
	 * and weight raising state have already been saved. Do nothing.
	 */
	if (!bic)
		return;

	bic->saved_ttime = bfqq->ttime;
	bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
	bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
	bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
	bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
	bic->saved_wr_coeff = bfqq->wr_coeff;
	bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
	bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
	bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
}
static void bfq_get_bic_reference(struct bfq_queue *bfqq)
{
	/*
	 * If bfqq->bic has a non-NULL value, the bic to which it belongs
	 * is about to begin using a shared bfq_queue.
	 */
	if (bfqq->bic)
		atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
}
static void
bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
		struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
{
	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
		(unsigned long)new_bfqq->pid);
	/* Save weight raising and idle window of the merged queues */
	bfq_bfqq_save_state(bfqq);
	bfq_bfqq_save_state(new_bfqq);
	if (bfq_bfqq_IO_bound(bfqq))
		bfq_mark_bfqq_IO_bound(new_bfqq);
	bfq_clear_bfqq_IO_bound(bfqq);

	/*
	 * If bfqq is weight-raised, then let new_bfqq inherit
	 * weight-raising. To reduce false positives, neglect the case
	 * where bfqq has just been created, but has not yet made it
	 * to be weight-raised (which may happen because EQM may merge
	 * bfqq even before bfq_add_request is executed for the first
	 * time for bfqq). Handling this case would however be very
	 * easy, thanks to the flag just_created.
	 */
	if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {
		new_bfqq->wr_coeff = bfqq->wr_coeff;
		new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time;
		new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish;
		new_bfqq->wr_start_at_switch_to_srt =
			bfqq->wr_start_at_switch_to_srt;
		if (bfq_bfqq_busy(new_bfqq))
			bfqd->wr_busy_queues++;
		new_bfqq->entity.prio_changed = 1;
	}

	if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */
		bfqq->wr_coeff = 1;
		bfqq->entity.prio_changed = 1;
		if (bfq_bfqq_busy(bfqq))
			bfqd->wr_busy_queues--;
	}

	bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d",
		     bfqd->wr_busy_queues);

	/*
	 * Grab a reference to the bic, to prevent it from being destroyed
	 * before being possibly touched by a bfq_split_bfqq().
	 */
	bfq_get_bic_reference(bfqq);
	bfq_get_bic_reference(new_bfqq);
	/*
	 * Merge queues (that is, let bic redirect its requests to new_bfqq)
	 */
	bic_set_bfqq(bic, new_bfqq, 1);
	bfq_mark_bfqq_coop(new_bfqq);
	/*
	 * new_bfqq now belongs to at least two bics (it is a shared queue):
	 * set new_bfqq->bic to NULL. bfqq either:
	 * - does not belong to any bic any more, and hence bfqq->bic must
	 *   be set to NULL, or
	 * - is a queue whose owning bics have already been redirected to a
	 *   different queue, hence the queue is destined to not belong to
	 *   any bic soon and bfqq->bic is already NULL (therefore the next
	 *   assignment causes no harm).
	 */
	new_bfqq->bic = NULL;
	bfqq->bic = NULL;
	/* release process reference to bfqq */
	bfq_put_queue(bfqq);
}
static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
				struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	bool is_sync = op_is_sync(bio->bi_opf);
	struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq;

	/*
	 * Disallow merge of a sync bio into an async request.
	 */
	if (is_sync && !rq_is_sync(rq))
		return false;

	/*
	 * Lookup the bfqq that this bio will be queued with. Allow
	 * merge only if rq is queued there.
	 */
	if (!bfqq)
		return false;

	/*
	 * We take advantage of this function to perform an early merge
	 * of the queues of possible cooperating processes.
	 */
	new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
	if (new_bfqq) {
		/*
		 * bic still points to bfqq, then it has not yet been
		 * redirected to some other bfq_queue, and a queue
		 * merge between bfqq and new_bfqq can be safely
		 * fulfilled, i.e., bic can be redirected to new_bfqq
		 * and bfqq can be put.
		 */
		bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
				new_bfqq);
		/*
		 * If we get here, bio will be queued into new_queue,
		 * so use new_bfqq to decide whether bio and rq can be
		 * merged.
		 */
		bfqq = new_bfqq;

		/*
		 * Change also bfqd->bio_bfqq, as
		 * bfqd->bio_bic now points to new_bfqq, and
		 * this function may be invoked again (and then may
		 * use again bfqd->bio_bfqq).
		 */
		bfqd->bio_bfqq = bfqq;
	}

	return bfqq == RQ_BFQQ(rq);
}
/*
 * Set the maximum time for the in-service queue to consume its
 * budget. This prevents seeky processes from lowering the throughput.
 * In practice, a time-slice service scheme is used with seeky
 * processes.
 */
static void bfq_set_budget_timeout(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq)
{
	unsigned int timeout_coeff;

	if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
		timeout_coeff = 1;
	else
		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;

	bfqd->last_budget_start = ktime_get();

	bfqq->budget_timeout = jiffies +
		bfqd->bfq_timeout * timeout_coeff;
}
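/*
 * Illustrative example (not from the original source): for a
 * non-soft-real-time queue whose weight has been raised to twice its
 * original weight, timeout_coeff evaluates to 2, so the queue is
 * granted up to 2 * bfqd->bfq_timeout jiffies to consume its budget
 * before the budget timeout fires.
 */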
static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
				       struct bfq_queue *bfqq)
{
	if (bfqq) {
		bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
		bfq_clear_bfqq_fifo_expire(bfqq);

		bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;

		if (time_is_before_jiffies(bfqq->last_wr_start_finish) &&
		    bfqq->wr_coeff > 1 &&
		    bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
		    time_is_before_jiffies(bfqq->budget_timeout)) {
			/*
			 * For soft real-time queues, move the start
			 * of the weight-raising period forward by the
			 * time the queue has not received any
			 * service. Otherwise, a relatively long
			 * service delay is likely to cause the
			 * weight-raising period of the queue to end,
			 * because of the short duration of the
			 * weight-raising period of a soft real-time
			 * queue. It is worth noting that this move
			 * is not so dangerous for the other queues,
			 * because soft real-time queues are not
			 * greedy.
			 *
			 * To not add a further variable, we use the
			 * overloaded field budget_timeout to
			 * determine for how long the queue has not
			 * received service, i.e., how much time has
			 * elapsed since the queue expired. However,
			 * this is a little imprecise, because
			 * budget_timeout is set to jiffies if bfqq
			 * not only expires, but also remains with no
			 * request.
			 */
			if (time_after(bfqq->budget_timeout,
				       bfqq->last_wr_start_finish))
				bfqq->last_wr_start_finish +=
					jiffies - bfqq->budget_timeout;
			else
				bfqq->last_wr_start_finish = jiffies;
		}

		bfq_set_budget_timeout(bfqd, bfqq);
		bfq_log_bfqq(bfqd, bfqq,
			     "set_in_service_queue, cur-budget = %d",
			     bfqq->entity.budget);
	}

	bfqd->in_service_queue = bfqq;
}
/*
 * Get and set a new queue for service.
 */
static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);

	__bfq_set_in_service_queue(bfqd, bfqq);
	return bfqq;
}
static void bfq_arm_slice_timer(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfqd->in_service_queue;
	struct bfq_io_cq *bic;
	u32 sl;

	/* Processes have exited, don't wait. */
	bic = bfqd->in_service_bic;
	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
		return;

	bfq_mark_bfqq_wait_request(bfqq);

	/*
	 * We don't want to idle for seeks, but we do want to allow
	 * fair distribution of slice time for a process doing back-to-back
	 * seeks. So allow a little bit of time for him to submit a new rq.
	 */
	sl = bfqd->bfq_slice_idle;
	/*
	 * Unless the queue is being weight-raised or the scenario is
	 * asymmetric, grant only minimum idle time if the queue
	 * is seeky. A long idling is preserved for a weight-raised
	 * queue, or, more in general, in an asymmetric scenario,
	 * because a long idling is needed for guaranteeing to a queue
	 * its reserved share of the throughput (in particular, it is
	 * needed if the queue has a higher weight than some other
	 * queue).
	 */
	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
	    bfq_symmetric_scenario(bfqd))
		sl = min_t(u64, sl, BFQ_MIN_TT);

	bfqd->last_idling_start = ktime_get();
	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
		      HRTIMER_MODE_REL);
	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
}
/*
 * In autotuning mode, max_budget is dynamically recomputed as the
 * amount of sectors transferred in timeout at the estimated peak
 * rate. This enables BFQ to utilize a full timeslice with a full
 * budget, even if the in-service queue is served at peak rate. And
 * this maximises throughput with sequential workloads.
 */
static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
{
	return (u64)bfqd->peak_rate * USEC_PER_MSEC *
		jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
}
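/*
 * Illustrative example (not from the original source, with made-up
 * numbers): peak_rate is kept in sectors/usec, left-shifted by
 * BFQ_RATE_SHIFT. For an estimated rate of 1 sector/usec (~512 MB/s
 * with 512-byte sectors) and a 125-ms bfq_timeout, the formula above
 * yields about 1 * 1000 * 125 = 125000 sectors of budget, i.e., the
 * amount the device can transfer in one full timeout at the
 * estimated peak rate.
 */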
/*
 * Update parameters related to throughput and responsiveness, as a
 * function of the estimated peak rate. See comments on
 * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
 */
static void update_thr_responsiveness_params(struct bfq_data *bfqd)
{
	int dev_type = blk_queue_nonrot(bfqd->queue);

	if (bfqd->bfq_user_max_budget == 0)
		bfqd->bfq_max_budget =
			bfq_calc_max_budget(bfqd);

	if (bfqd->device_speed == BFQ_BFQD_FAST &&
	    bfqd->peak_rate < device_speed_thresh[dev_type]) {
		bfqd->device_speed = BFQ_BFQD_SLOW;
		bfqd->RT_prod = R_slow[dev_type] *
			T_slow[dev_type];
	} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
		   bfqd->peak_rate > device_speed_thresh[dev_type]) {
		bfqd->device_speed = BFQ_BFQD_FAST;
		bfqd->RT_prod = R_fast[dev_type] *
			T_fast[dev_type];
	}

	bfq_log(bfqd,
		"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu sects/sec",
		dev_type == 0 ? "ROT" : "NONROT",
		bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
		bfqd->device_speed == BFQ_BFQD_FAST ?
		(USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
		(USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
		(USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
		BFQ_RATE_SHIFT);
}
static void bfq_reset_rate_computation(struct bfq_data *bfqd,
				       struct request *rq)
{
	if (rq != NULL) { /* new rq dispatch now, reset accordingly */
		bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
		bfqd->peak_rate_samples = 1;
		bfqd->sequential_samples = 0;
		bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
			blk_rq_sectors(rq);
	} else /* no new rq dispatched, just reset the number of samples */
		bfqd->peak_rate_samples = 0; /* full re-init on next disp. */

	bfq_log(bfqd,
		"reset_rate_computation at end, sample %u/%u tot_sects %llu",
		bfqd->peak_rate_samples, bfqd->sequential_samples,
		bfqd->tot_sectors_dispatched);
}
static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
{
	u32 rate, weight, divisor;

	/*
	 * For the convergence property to hold (see comments on
	 * bfq_update_peak_rate()) and for the assessment to be
	 * reliable, a minimum number of samples must be present, and
	 * a minimum amount of time must have elapsed. If not so, do
	 * not compute new rate. Just reset parameters, to get ready
	 * for a new evaluation attempt.
	 */
	if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
	    bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
		goto reset_computation;

	/*
	 * If a new request completion has occurred after last
	 * dispatch, then, to approximate the rate at which requests
	 * have been served by the device, it is more precise to
	 * extend the observation interval to the last completion.
	 */
	bfqd->delta_from_first =
		max_t(u64, bfqd->delta_from_first,
		      bfqd->last_completion - bfqd->first_dispatch);

	/*
	 * Rate computed in sects/usec, and not sects/nsec, for
	 * precision issues.
	 */
	rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
			div_u64(bfqd->delta_from_first, NSEC_PER_USEC));

	/*
	 * Peak rate not updated if:
	 * - the percentage of sequential dispatches is below 3/4 of the
	 *   total, and rate is below the current estimated peak rate
	 * - rate is unreasonably high (> 20M sectors/sec)
	 */
	if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
	     rate <= bfqd->peak_rate) ||
		rate > 20<<BFQ_RATE_SHIFT)
		goto reset_computation;

	/*
	 * We have to update the peak rate, at last! To this purpose,
	 * we use a low-pass filter. We compute the smoothing constant
	 * of the filter as a function of the 'weight' of the new
	 * measured rate.
	 *
	 * As can be seen in next formulas, we define this weight as a
	 * quantity proportional to how sequential the workload is,
	 * and to how long the observation time interval is.
	 *
	 * The weight runs from 0 to 8. The maximum value of the
	 * weight, 8, yields the minimum value for the smoothing
	 * constant. At this minimum value for the smoothing constant,
	 * the measured rate contributes for half of the next value of
	 * the estimated peak rate.
	 *
	 * So, the first step is to compute the weight as a function
	 * of how sequential the workload is. Note that the weight
	 * cannot reach 9, because bfqd->sequential_samples cannot
	 * become equal to bfqd->peak_rate_samples, which, in its
	 * turn, holds true because bfqd->sequential_samples is not
	 * incremented for the first sample.
	 */
	weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;

	/*
	 * Second step: further refine the weight as a function of the
	 * duration of the observation interval.
	 */
	weight = min_t(u32, 8,
		       div_u64(weight * bfqd->delta_from_first,
			       BFQ_RATE_REF_INTERVAL));

	/*
	 * Divisor ranging from 10, for minimum weight, to 2, for
	 * maximum weight.
	 */
	divisor = 10 - weight;

	/*
	 * Finally, update peak rate:
	 *
	 * peak_rate = peak_rate * (divisor-1) / divisor  +  rate / divisor
	 */
	bfqd->peak_rate *= divisor-1;
	bfqd->peak_rate /= divisor;
	rate /= divisor; /* smoothing constant alpha = 1/divisor */

	bfqd->peak_rate += rate;
	update_thr_responsiveness_params(bfqd);

reset_computation:
	bfq_reset_rate_computation(bfqd, rq);
}
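/*
 * Worked example (not from the original source): for a fully
 * sequential workload observed over at least BFQ_RATE_REF_INTERVAL,
 * the weight saturates at 8, so divisor = 2 and the update becomes
 * peak_rate = peak_rate/2 + rate/2, i.e., the new measurement
 * contributes half of the next estimate. For a mostly random
 * workload with weight 0, divisor = 10 and the new measurement
 * contributes only one tenth.
 */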
/*
 * Update the read/write peak rate (the main quantity used for
 * auto-tuning, see update_thr_responsiveness_params()).
 *
 * It is not trivial to estimate the peak rate (correctly): because of
 * the presence of sw and hw queues between the scheduler and the
 * device components that finally serve I/O requests, it is hard to
 * say exactly when a given dispatched request is served inside the
 * device, and for how long. As a consequence, it is hard to know
 * precisely at what rate a given set of requests is actually served
 * by the device.
 *
 * On the opposite end, the dispatch time of any request is trivially
 * available, and, from this piece of information, the "dispatch rate"
 * of requests can be immediately computed. So, the idea in the next
 * function is to use what is known, namely request dispatch times
 * (plus, when useful, request completion times), to estimate what is
 * unknown, namely in-device request service rate.
 *
 * The main issue is that, because of the above facts, the rate at
 * which a certain set of requests is dispatched over a certain time
 * interval can vary greatly with respect to the rate at which the
 * same requests are then served. But, since the size of any
 * intermediate queue is limited, and the service scheme is lossless
 * (no request is silently dropped), the following obvious convergence
 * property holds: the number of requests dispatched MUST become
 * closer and closer to the number of requests completed as the
 * observation interval grows. This is the key property used in
 * the next function to estimate the peak service rate as a function
 * of the observed dispatch rate. The function assumes to be invoked
 * on every request dispatch.
 */
static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
{
	u64 now_ns = ktime_get_ns();

	if (bfqd->peak_rate_samples == 0) { /* first dispatch */
		bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
			bfqd->peak_rate_samples);
		bfq_reset_rate_computation(bfqd, rq);
		goto update_last_values; /* will add one sample */
	}

	/*
	 * Device idle for very long: the observation interval lasting
	 * up to this dispatch cannot be a valid observation interval
	 * for computing a new peak rate (similarly to the late-
	 * completion event in bfq_completed_request()). Go to
	 * update_rate_and_reset to have the following three steps
	 * taken:
	 * - close the observation interval at the last (previous)
	 *   request dispatch or completion
	 * - compute rate, if possible, for that observation interval
	 * - start a new observation interval with this dispatch
	 */
	if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
	    bfqd->rq_in_driver == 0)
		goto update_rate_and_reset;

	/* Update sampling information */
	bfqd->peak_rate_samples++;

	if ((bfqd->rq_in_driver > 0 ||
		now_ns - bfqd->last_completion < BFQ_MIN_TT)
	     && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
		bfqd->sequential_samples++;

	bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);

	/* Reset max observed rq size every 32 dispatches */
	if (likely(bfqd->peak_rate_samples % 32))
		bfqd->last_rq_max_size =
			max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
	else
		bfqd->last_rq_max_size = blk_rq_sectors(rq);

	bfqd->delta_from_first = now_ns - bfqd->first_dispatch;

	/* Target observation interval not yet reached, go on sampling */
	if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
		goto update_last_values;

update_rate_and_reset:
	bfq_update_rate_reset(bfqd, rq);
update_last_values:
	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
	bfqd->last_dispatch = now_ns;
}
/*
 * Remove request from internal lists.
 */
static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);

	/*
	 * For consistency, the next instruction should have been
	 * executed after removing the request from the queue and
	 * dispatching it. We execute instead this instruction before
	 * bfq_remove_request() (and hence introduce a temporary
	 * inconsistency), for efficiency. In fact, should this
	 * dispatch occur for a non in-service bfqq, this anticipated
	 * increment prevents two counters related to bfqq->dispatched
	 * from risking to be, first, uselessly decremented, and then
	 * incremented again when the (new) value of bfqq->dispatched
	 * happens to be taken into account.
	 */
	bfqq->dispatched++;
	bfq_update_peak_rate(q->elevator->elevator_data, rq);

	bfq_remove_request(q, rq);
}
static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	/*
	 * If this bfqq is shared between multiple processes, check
	 * to make sure that those processes are still issuing I/Os
	 * within the mean seek distance. If not, it may be time to
	 * break the queues apart again.
	 */
	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
		bfq_mark_bfqq_split_coop(bfqq);

	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
		if (bfqq->dispatched == 0)
			/*
			 * Overloading budget_timeout field to store
			 * the time at which the queue remains with no
			 * backlog and no outstanding request; used by
			 * the weight-raising mechanism.
			 */
			bfqq->budget_timeout = jiffies;

		bfq_del_bfqq_busy(bfqd, bfqq, true);
	} else {
		bfq_requeue_bfqq(bfqd, bfqq);
		/*
		 * Resort priority tree of potential close cooperators.
		 */
		bfq_pos_tree_add_move(bfqd, bfqq);
	}

	/*
	 * All in-service entities must have been properly deactivated
	 * or requeued before executing the next function, which
	 * resets all in-service entities as no more in service.
	 */
	__bfq_bfqd_reset_in_service(bfqd);
}
/**
 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
 * @bfqd: device data.
 * @bfqq: queue to update.
 * @reason: reason for expiration.
 *
 * Handle the feedback on @bfqq budget at queue expiration.
 * See the body for detailed comments.
 */
static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
				     struct bfq_queue *bfqq,
				     enum bfqq_expiration reason)
{
	struct request *next_rq;
	int budget, min_budget;

	min_budget = bfq_min_budget(bfqd);

	if (bfqq->wr_coeff == 1)
		budget = bfqq->max_budget;
	else /*
	      * Use a constant, low budget for weight-raised queues,
	      * to help achieve a low latency. Keep it slightly higher
	      * than the minimum possible budget, to cause a little
	      * bit fewer expirations.
	      */
		budget = 2 * min_budget;

	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
		budget, bfq_min_budget(bfqd));
	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));

	if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
		switch (reason) {
			/*
			 * Caveat: in all the following cases we trade latency
			 * for throughput.
			 */
		case BFQQE_TOO_IDLE:
			/*
			 * This is the only case where we may reduce
			 * the budget: if there is no request of the
			 * process still waiting for completion, then
			 * we assume (tentatively) that the timer has
			 * expired because the batch of requests of
			 * the process could have been served with a
			 * smaller budget. Hence, betting that
			 * process will behave in the same way when it
			 * becomes backlogged again, we reduce its
			 * next budget. As long as we guess right,
			 * this budget cut reduces the latency
			 * experienced by the process.
			 *
			 * However, if there are still outstanding
			 * requests, then the process may have not yet
			 * issued its next request just because it is
			 * still waiting for the completion of some of
			 * the still outstanding ones. So in this
			 * subcase we do not reduce its budget, on the
			 * contrary we increase it to possibly boost
			 * the throughput, as discussed in the
			 * comments to the BUDGET_TIMEOUT case.
			 */
			if (bfqq->dispatched > 0) /* still outstanding reqs */
				budget = min(budget * 2, bfqd->bfq_max_budget);
			else {
				if (budget > 5 * min_budget)
					budget -= 4 * min_budget;
				else
					budget = min_budget;
			}
			break;
		case BFQQE_BUDGET_TIMEOUT:
			/*
			 * We double the budget here because it gives
			 * the chance to boost the throughput if this
			 * is not a seeky process (and has bumped into
			 * this timeout because of, e.g., ZBR).
			 */
			budget = min(budget * 2, bfqd->bfq_max_budget);
			break;
		case BFQQE_BUDGET_EXHAUSTED:
			/*
			 * The process still has backlog, and did not
			 * let either the budget timeout or the disk
			 * idling timeout expire. Hence it is not
			 * seeky, has a short thinktime and may be
			 * happy with a higher budget too. So
			 * definitely increase the budget of this good
			 * candidate to boost the disk throughput.
			 */
			budget = min(budget * 4, bfqd->bfq_max_budget);
			break;
		case BFQQE_NO_MORE_REQUESTS:
			/*
			 * For queues that expire for this reason, it
			 * is particularly important to keep the
			 * budget close to the actual service they
			 * need. Doing so reduces the timestamp
			 * misalignment problem described in the
			 * comments in the body of
			 * __bfq_activate_entity. In fact, suppose
			 * that a queue systematically expires for
			 * BFQQE_NO_MORE_REQUESTS and presents a
			 * new request in time to enjoy timestamp
			 * back-shifting. The larger the budget of the
			 * queue is with respect to the service the
			 * queue actually requests in each service
			 * slot, the more times the queue can be
			 * reactivated with the same virtual finish
			 * time. It follows that, even if this finish
			 * time is pushed to the system virtual time
			 * to reduce the consequent timestamp
			 * misalignment, the queue unjustly enjoys for
			 * many re-activations a lower finish time
			 * than all newly activated queues.
			 *
			 * The service needed by bfqq is measured
			 * quite precisely by bfqq->entity.service.
			 * Since bfqq does not enjoy device idling,
			 * bfqq->entity.service is equal to the number
			 * of sectors that the process associated with
			 * bfqq requested to read/write before waiting
			 * for request completions, or blocking for
			 * other reasons.
			 */
			budget = max_t(int, bfqq->entity.service, min_budget);
			break;
		default:
			return;
		}
	} else if (!bfq_bfqq_sync(bfqq)) {
		/*
		 * Async queues get always the maximum possible
		 * budget, as for them we do not care about latency
		 * (in addition, their ability to dispatch is limited
		 * by the charging factor).
		 */
		budget = bfqd->bfq_max_budget;
	}

	bfqq->max_budget = budget;

	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
	    !bfqd->bfq_user_max_budget)
		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);

	/*
	 * If there is still backlog, then assign a new budget, making
	 * sure that it is large enough for the next request. Since
	 * the finish time of bfqq must be kept in sync with the
	 * budget, be sure to call __bfq_bfqq_expire() *after* this
	 * update.
	 *
	 * If there is no backlog, then no need to update the budget;
	 * it will be updated on the arrival of a new request.
	 */
	next_rq = bfqq->next_rq;
	if (next_rq)
		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
					    bfq_serv_to_charge(next_rq, bfqq));

	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
		next_rq ? blk_rq_sectors(next_rq) : 0,
		bfqq->entity.budget);
}
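/*
 * Illustrative example (not from the original source): a sync,
 * non-weight-raised queue that keeps expiring for BFQQE_TOO_IDLE
 * with no requests in flight sees its budget cut by 4 * min_budget
 * at each expiration (down to min_budget), whereas one that keeps
 * exhausting its budget has it quadrupled at each expiration, up to
 * bfqd->bfq_max_budget.
 */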
/*
 * Return true if the process associated with bfqq is "slow". The slow
 * flag is used, in addition to the budget timeout, to reduce the
 * amount of service provided to seeky processes, and thus reduce
 * their chances to lower the throughput. More details in the comments
 * on the function bfq_bfqq_expire().
 *
 * An important observation is in order: as discussed in the comments
 * on the function bfq_update_peak_rate(), with devices with internal
 * queues, it is hard if ever possible to know when and for how long
 * an I/O request is processed by the device (apart from the trivial
 * I/O pattern where a new request is dispatched only after the
 * previous one has been completed). This makes it hard to evaluate
 * the real rate at which the I/O requests of each bfq_queue are
 * served. In fact, for an I/O scheduler like BFQ, serving a
 * bfq_queue means just dispatching its requests during its service
 * slot (i.e., until the budget of the queue is exhausted, or the
 * queue remains idle, or, finally, a timeout fires). But, during the
 * service slot of a bfq_queue, around 100 ms at most, the device may
 * be even still processing requests of bfq_queues served in previous
 * service slots. On the opposite end, the requests of the in-service
 * bfq_queue may be completed after the service slot of the queue
 * finishes.
 *
 * Anyway, unless more sophisticated solutions are used
 * (where possible), the sum of the sizes of the requests dispatched
 * during the service slot of a bfq_queue is probably the only
 * approximation available for the service received by the bfq_queue
 * during its service slot. And this sum is the quantity used in this
 * function to evaluate the I/O speed of a process.
 */
static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			     bool compensate, enum bfqq_expiration reason,
			     unsigned long *delta_ms)
{
	ktime_t delta_ktime;
	u32 delta_usecs;
	bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */

	if (!bfq_bfqq_sync(bfqq))
		return false;

	if (compensate)
		delta_ktime = bfqd->last_idling_start;
	else
		delta_ktime = ktime_get();
	delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
	delta_usecs = ktime_to_us(delta_ktime);

	/* don't use too short time intervals */
	if (delta_usecs < 1000) {
		if (blk_queue_nonrot(bfqd->queue))
			/*
			 * give same worst-case guarantees as idling
			 * for seeky
			 */
			*delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
		else /* charge at least one seek */
			*delta_ms = bfq_slice_idle / NSEC_PER_MSEC;

		return slow;
	}

	*delta_ms = delta_usecs / USEC_PER_MSEC;

	/*
	 * Use only long (> 20ms) intervals to filter out excessive
	 * spikes in service rate estimation.
	 */
	if (delta_usecs > 20000) {
		/*
		 * Caveat for rotational devices: processes doing I/O
		 * in the slower disk zones tend to be slow(er) even
		 * if not seeky. In this respect, the estimated peak
		 * rate is likely to be an average over the disk
		 * surface. Accordingly, to not be too harsh with
		 * unlucky processes, a process is deemed slow only if
		 * its rate has been lower than half of the estimated
		 * peak rate.
		 */
		slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
	}

	bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);

	return slow;
}
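/*
 * Illustrative example (not from the original source, with made-up
 * numbers): if the queue was in service for 25 ms (a long enough
 * interval, > 20 ms) and consumed only 2000 sectors of service while
 * bfqd->bfq_max_budget is 16384, then 2000 < 16384/2 and the process
 * is deemed slow; with, say, 10000 sectors of service it would not
 * be.
 */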
/*
 * To be deemed as soft real-time, an application must meet two
 * requirements. First, the application must not require an average
 * bandwidth higher than the approximate bandwidth required to playback or
 * record a compressed high-definition video.
 * The next function is invoked on the completion of the last request of a
 * batch, to compute the next-start time instant, soft_rt_next_start, such
 * that, if the next request of the application does not arrive before
 * soft_rt_next_start, then the above requirement on the bandwidth is met.
 *
 * The second requirement is that the request pattern of the application is
 * isochronous, i.e., that, after issuing a request or a batch of requests,
 * the application stops issuing new requests until all its pending requests
 * have been completed. After that, the application may issue a new batch,
 * and so on.
 * For this reason the next function is invoked to compute
 * soft_rt_next_start only for applications that meet this requirement,
 * whereas soft_rt_next_start is set to infinity for applications that do
 * not.
 *
 * Unfortunately, even a greedy application may happen to behave in an
 * isochronous way if the CPU load is high. In fact, the application may
 * stop issuing requests while the CPUs are busy serving other processes,
 * then restart, then stop again for a while, and so on. In addition, if
 * the disk achieves a low enough throughput with the request pattern
 * issued by the application (e.g., because the request pattern is random
 * and/or the device is slow), then the application may meet the above
 * bandwidth requirement too. To prevent such a greedy application to be
 * deemed as soft real-time, a further rule is used in the computation of
 * soft_rt_next_start: soft_rt_next_start must be higher than the current
 * time plus the maximum time for which the arrival of a request is waited
 * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
 * This filters out greedy applications, as the latter issue instead their
 * next request as soon as possible after the last one has been completed
 * (in contrast, when a batch of requests is completed, a soft real-time
 * application spends some time processing data).
 *
 * Unfortunately, the last filter may easily generate false positives if
 * only bfqd->bfq_slice_idle is used as a reference time interval and one
 * or both the following cases occur:
 * 1) HZ is so low that the duration of a jiffy is comparable to or higher
 *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
 *    HZ=100.
 * 2) jiffies, instead of increasing at a constant rate, may stop increasing
 *    for a while, then suddenly 'jump' by several units to recover the lost
 *    increments. This seems to happen, e.g., inside virtual machines.
 * To address this issue, we do not use as a reference time interval just
 * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
 * particular we add the minimum number of jiffies for which the filter
 * seems to be quite precise also in embedded systems and KVM/QEMU virtual
 * machines.
 */
static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
						struct bfq_queue *bfqq)
{
	return max(bfqq->last_idle_bklogged +
		   HZ * bfqq->service_from_backlogged /
		   bfqd->bfq_wr_max_softrt_rate,
		   jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
}
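/*
 * Worked example (not from the original source, with made-up
 * numbers): if the queue received 7000 sectors of service since it
 * last became idle and backlogged, and bfq_wr_max_softrt_rate is
 * 7000 sectors/sec, the first term evaluates to last_idle_bklogged
 * plus one second worth of jiffies: a truly soft real-time reader
 * must not come back before that instant, or it would exceed the
 * target bandwidth.
 */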
/*
 * Return the farthest future time instant according to jiffies
 * macros.
 */
static unsigned long bfq_greatest_from_now(void)
{
	return jiffies + MAX_JIFFY_OFFSET;
}

/*
 * Return the farthest past time instant according to jiffies
 * macros.
 */
static unsigned long bfq_smallest_from_now(void)
{
	return jiffies - MAX_JIFFY_OFFSET;
}
/**
 * bfq_bfqq_expire - expire a queue.
 * @bfqd: device owning the queue.
 * @bfqq: the queue to expire.
 * @compensate: if true, compensate for the time spent idling.
 * @reason: the reason causing the expiration.
 *
 * If the process associated with bfqq does slow I/O (e.g., because it
 * issues random requests), we charge bfqq with the time it has been
 * in service instead of the service it has received (see
 * bfq_bfqq_charge_time for details on how this goal is achieved). As
 * a consequence, bfqq will typically get higher timestamps upon
 * reactivation, and hence it will be rescheduled as if it had
 * received more service than what it has actually received. In the
 * end, bfqq receives less service in proportion to how slowly its
 * associated process consumes its budgets (and hence how seriously it
 * tends to lower the throughput). In addition, this time-charging
 * strategy guarantees time fairness among slow processes. In
 * contrast, if the process associated with bfqq is not slow, we
 * charge bfqq exactly with the service it has received.
 *
 * Charging time to the first type of queues and the exact service to
 * the other has the effect of using the WF2Q+ policy to schedule the
 * former on a timeslice basis, without violating service domain
 * guarantees among the latter.
 */
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason)
{
	bool slow;
	unsigned long delta = 0;
	struct bfq_entity *entity = &bfqq->entity;
	int ref;

	/*
	 * Check whether the process is slow (see bfq_bfqq_is_slow).
	 */
	slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);

	/*
	 * Increase service_from_backlogged before next statement,
	 * because the possible next invocation of
	 * bfq_bfqq_charge_time would likely inflate
	 * entity->service. In contrast, service_from_backlogged must
	 * contain real service, to enable the soft real-time
	 * heuristic to correctly compute the bandwidth consumed by
	 * bfqq.
	 */
	bfqq->service_from_backlogged += entity->service;

	/*
	 * As above explained, charge slow (typically seeky) and
	 * timed-out queues with the time and not the service
	 * received, to favor sequential workloads.
	 *
	 * Processes doing I/O in the slower disk zones will tend to
	 * be slow(er) even if not seeky. Therefore, since the
	 * estimated peak rate is actually an average over the disk
	 * surface, these processes may timeout just for bad luck. To
	 * avoid punishing them, do not charge time to processes that
	 * succeeded in consuming at least 2/3 of their budget. This
	 * allows BFQ to preserve enough elasticity to still perform
	 * bandwidth, and not time, distribution with little unlucky
	 * or quasi-sequential processes.
	 */
	if (bfqq->wr_coeff == 1 &&
	    (slow ||
	     (reason == BFQQE_BUDGET_TIMEOUT &&
	      bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
		bfq_bfqq_charge_time(bfqd, bfqq, delta);

	if (reason == BFQQE_TOO_IDLE &&
	    entity->service <= 2 * entity->budget / 10)
		bfq_clear_bfqq_IO_bound(bfqq);

	if (bfqd->low_latency && bfqq->wr_coeff == 1)
		bfqq->last_wr_start_finish = jiffies;

	if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
	    RB_EMPTY_ROOT(&bfqq->sort_list)) {
		/*
		 * If we get here, and there are no outstanding
		 * requests, then the request pattern is isochronous
		 * (see the comments on the function
		 * bfq_bfqq_softrt_next_start()). Thus we can compute
		 * soft_rt_next_start. If, instead, the queue still
		 * has outstanding requests, then we have to wait for
		 * the completion of all the outstanding requests to
		 * discover whether the request pattern is actually
		 * isochronous.
		 */
		if (bfqq->dispatched == 0)
			bfqq->soft_rt_next_start =
				bfq_bfqq_softrt_next_start(bfqd, bfqq);
		else {
			/*
			 * The application is still waiting for the
			 * completion of one or more requests:
			 * prevent it from possibly being incorrectly
			 * deemed as soft real-time by setting its
			 * soft_rt_next_start to infinity. In fact,
			 * without this assignment, the application
			 * would be incorrectly deemed as soft
			 * real-time if:
			 * 1) it issued a new request before the
			 *    completion of all its in-flight
			 *    requests, and
			 * 2) at that time, its soft_rt_next_start
			 *    happened to be in the past.
			 */
			bfqq->soft_rt_next_start =
				bfq_greatest_from_now();
			/*
			 * Schedule an update of soft_rt_next_start to when
			 * the task may be discovered to be isochronous.
			 */
			bfq_mark_bfqq_softrt_update(bfqq);
		}
	}

	bfq_log_bfqq(bfqd, bfqq,
		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));

	/*
	 * Increase, decrease or leave budget unchanged according to
	 * reason.
	 */
	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
	ref = bfqq->ref;
	__bfq_bfqq_expire(bfqd, bfqq);

	/* mark bfqq as waiting a request only if a bic still points to it */
	if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
	    reason != BFQQE_BUDGET_TIMEOUT &&
	    reason != BFQQE_BUDGET_EXHAUSTED)
		bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
}
/*
 * Budget timeout is not implemented through a dedicated timer, but
 * just checked on request arrivals and completions, as well as on
 * idle timer expirations.
 */
static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
{
	return time_is_before_eq_jiffies(bfqq->budget_timeout);
}
/*
 * If we expire a queue that is actively waiting (i.e., with the
 * device idled) for the arrival of a new request, then we may incur
 * the timestamp misalignment problem described in the body of the
 * function __bfq_activate_entity. Hence we return true only if this
 * condition does not hold, or if the queue is slow enough to deserve
 * only to be kicked off for preserving a high throughput.
 */
static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqq->bfqd, bfqq,
		"may_budget_timeout: wait_request %d left %d timeout %d",
		bfq_bfqq_wait_request(bfqq),
		bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
		bfq_bfqq_budget_timeout(bfqq));

	return (!bfq_bfqq_wait_request(bfqq) ||
		bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
		&&
		bfq_bfqq_budget_timeout(bfqq);
}
/*
 * For a queue that becomes empty, device idling is allowed only if
 * this function returns true for the queue. As a consequence, since
 * device idling plays a critical role in both throughput boosting and
 * service guarantees, the return value of this function plays a
 * critical role in both these aspects as well.
 *
 * In a nutshell, this function returns true only if idling is
 * beneficial for throughput or, even if detrimental for throughput,
 * idling is however necessary to preserve service guarantees (low
 * latency, desired throughput distribution, ...). In particular, on
 * NCQ-capable devices, this function tries to return false, so as to
 * help keep the drives' internal queues full, whenever this helps the
 * device boost the throughput without causing any service-guarantee
 * issue.
 *
 * In more detail, the return value of this function is obtained by,
 * first, computing a number of boolean variables that take into
 * account throughput and service-guarantee issues, and, then,
 * combining these variables in a logical expression. Most of the
 * issues taken into account are not trivial. We discuss these issues
 * individually while introducing the variables.
 */
static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	bool idling_boosts_thr, idling_boosts_thr_without_issues,
		idling_needed_for_service_guarantees,
		asymmetric_scenario;

	if (bfqd->strict_guarantees)
		return true;

	/*
	 * The next variable takes into account the cases where idling
	 * boosts the throughput.
	 *
	 * The value of the variable is computed considering, first, that
	 * idling is virtually always beneficial for the throughput if:
	 * (a) the device is not NCQ-capable, or
	 * (b) regardless of the presence of NCQ, the device is rotational
	 *     and the request pattern for bfqq is I/O-bound and sequential.
	 *
	 * Secondly, and in contrast to the above item (b), idling an
	 * NCQ-capable flash-based device would not boost the
	 * throughput even with sequential I/O; rather it would lower
	 * the throughput in proportion to how fast the device
	 * is. Accordingly, the next variable is true if any of the
	 * above conditions (a) and (b) is true, and, in particular,
	 * happens to be false if bfqd is an NCQ-capable flash-based
	 * device.
	 */
	idling_boosts_thr = !bfqd->hw_tag ||
		(!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&
		 bfq_bfqq_idle_window(bfqq));

	/*
	 * The value of the next variable,
	 * idling_boosts_thr_without_issues, is equal to that of
	 * idling_boosts_thr, unless a special case holds. In this
	 * special case, described below, idling may cause problems to
	 * weight-raised queues.
	 *
	 * When the request pool is saturated (e.g., in the presence
	 * of write hogs), if the processes associated with
	 * non-weight-raised queues ask for requests at a lower rate,
	 * then processes associated with weight-raised queues have a
	 * higher probability to get a request from the pool
	 * immediately (or at least soon) when they need one. Thus
	 * they have a higher probability to actually get a fraction
	 * of the device throughput proportional to their high
	 * weight. This is especially true with NCQ-capable drives,
	 * which enqueue several requests in advance, and further
	 * reorder internally-queued requests.
	 *
	 * For this reason, we force to false the value of
	 * idling_boosts_thr_without_issues if there are weight-raised
	 * busy queues. In this case, and if bfqq is not weight-raised,
	 * this guarantees that the device is not idled for bfqq (if,
	 * instead, bfqq is weight-raised, then idling will be
	 * guaranteed by another variable, see below). Combined with
	 * the timestamping rules of BFQ (see [1] for details), this
	 * behavior causes bfqq, and hence any sync non-weight-raised
	 * queue, to get a lower number of requests served, and thus
	 * to ask for a lower number of requests from the request
	 * pool, before the busy weight-raised queues get served
	 * again. This often mitigates starvation problems in the
	 * presence of heavy write workloads and NCQ, thereby
	 * guaranteeing a higher application and system responsiveness
	 * in these hostile scenarios.
	 */
	idling_boosts_thr_without_issues = idling_boosts_thr &&
		bfqd->wr_busy_queues == 0;

	/*
	 * There is then a case where idling must be performed not
	 * for throughput concerns, but to preserve service
	 * guarantees.
	 *
	 * To introduce this case, we can note that allowing the drive
	 * to enqueue more than one request at a time, and hence
	 * delegating de facto final scheduling decisions to the
	 * drive's internal scheduler, entails loss of control on the
	 * actual request service order. In particular, the critical
	 * situation is when requests from different processes happen
	 * to be present, at the same time, in the internal queue(s)
	 * of the drive. In such a situation, the drive, by deciding
	 * the service order of the internally-queued requests, does
	 * determine also the actual throughput distribution among
	 * these processes. But the drive typically has no notion or
	 * concern about per-process throughput distribution, and
	 * makes its decisions only on a per-request basis. Therefore,
	 * the service distribution enforced by the drive's internal
	 * scheduler is likely to coincide with the desired
	 * device-throughput distribution only in a completely
	 * symmetric scenario where:
	 * (i)  each of these processes must get the same throughput as
	 *      the others;
	 * (ii) all these processes have the same I/O pattern
	 *      (either sequential or random).
	 * In fact, in such a scenario, the drive will tend to treat
	 * the requests of each of these processes in about the same
	 * way as the requests of the others, and thus to provide
	 * each of these processes with about the same throughput
	 * (which is exactly the desired throughput distribution). In
	 * contrast, in any asymmetric scenario, device idling is
	 * certainly needed to guarantee that bfqq receives its
	 * assigned fraction of the device throughput (see [1] for
	 * details).
	 *
	 * We address this issue by controlling, actually, only the
	 * symmetry sub-condition (i), i.e., provided that
	 * sub-condition (i) holds, idling is not performed,
	 * regardless of whether sub-condition (ii) holds. In other
	 * words, only if sub-condition (i) holds, then idling is
	 * allowed, and the device tends to be prevented from queueing
	 * many requests, possibly of several processes. The reason
	 * for not controlling also sub-condition (ii) is that we
	 * exploit preemption to preserve guarantees in case of
	 * symmetric scenarios, even if (ii) does not hold, as
	 * explained in the next two paragraphs.
	 *
	 * Even if a queue, say Q, is expired when it remains idle, Q
	 * can still preempt the new in-service queue if the next
	 * request of Q arrives soon (see the comments on
	 * bfq_bfqq_update_budg_for_activation). If all queues and
	 * groups have the same weight, this form of preemption,
	 * combined with the hole-recovery heuristic described in the
	 * comments on function bfq_bfqq_update_budg_for_activation,
	 * are enough to preserve a correct bandwidth distribution in
	 * the mid term, even without idling. In fact, even if not
	 * idling allows the internal queues of the device to contain
	 * many requests, and thus to reorder requests, we can rather
	 * safely assume that the internal scheduler still preserves a
	 * minimum of mid-term fairness. The motivation for using
	 * preemption instead of idling is that, by not idling,
	 * service guarantees are preserved without minimally
	 * sacrificing throughput. In other words, both a high
	 * throughput and its desired distribution are obtained.
	 *
	 * More precisely, this preemption-based, idleless approach
	 * provides fairness in terms of IOPS, and not sectors per
	 * second. This can be seen with a simple example. Suppose
	 * that there are two queues with the same weight, but that
	 * the first queue receives requests of 8 sectors, while the
	 * second queue receives requests of 1024 sectors. In
	 * addition, suppose that each of the two queues contains at
	 * most one request at a time, which implies that each queue
	 * always remains idle after it is served. Finally, after
	 * remaining idle, each queue receives very quickly a new
	 * request. It follows that the two queues are served
	 * alternatively, preempting each other if needed. This
	 * implies that, although both queues have the same weight,
	 * the queue with large requests receives a service that is
	 * 1024/8 times as high as the service received by the other
	 * queue.
	 *
	 * On the other hand, device idling is performed, and thus
	 * pure sector-domain guarantees are provided, for the
	 * following queues, which are likely to need stronger
	 * throughput guarantees: weight-raised queues, and queues
	 * with a higher weight than other queues. When such queues
	 * are active, sub-condition (i) is false, which triggers
	 * device idling.
	 *
	 * According to the above considerations, the next variable is
	 * true (only) if sub-condition (i) holds. To compute the
	 * value of this variable, we not only use the return value of
	 * the function bfq_symmetric_scenario(), but also check
	 * whether bfqq is being weight-raised, because
	 * bfq_symmetric_scenario() does not take into account also
	 * weight-raised queues (see comments on
	 * bfq_weights_tree_add()).
	 *
	 * As a side note, it is worth considering that the above
	 * device-idling countermeasures may however fail in the
	 * following unlucky scenario: if idling is (correctly)
	 * disabled in a time period during which all symmetry
	 * sub-conditions hold, and hence the device is allowed to
	 * enqueue many requests, but at some later point in time some
	 * sub-condition stops to hold, then it may become impossible
	 * to let requests be served in the desired order until all
	 * the requests already queued in the device have been served.
	 */
	asymmetric_scenario = bfqq->wr_coeff > 1 ||
		!bfq_symmetric_scenario(bfqd);

	/*
	 * Finally, there is a case where maximizing throughput is the
	 * best choice even if it may cause unfairness toward
	 * bfqq. Such a case is when bfqq became active in a burst of
	 * queue activations. Queues that became active during a large
	 * burst benefit only from throughput, as discussed in the
	 * comments on bfq_handle_burst. Thus, if bfqq became active
	 * in a burst and not idling the device maximizes throughput,
	 * then the device must not be idled, because not idling the
	 * device provides bfqq and all other queues in the burst with
	 * maximum benefit. Combining this and the above case, we can
	 * now establish when idling is actually needed to preserve
	 * service guarantees.
	 */
	idling_needed_for_service_guarantees =
		asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);

	/*
	 * We have now all the components we need to compute the return
	 * value of the function, which is true only if both the following
	 * conditions hold:
	 * 1) bfqq is sync, because idling makes sense only for sync queues;
	 * 2) idling either boosts the throughput (without issues), or
	 *    is necessary to preserve service guarantees.
	 */
	return bfq_bfqq_sync(bfqq) &&
		(idling_boosts_thr_without_issues ||
		 idling_needed_for_service_guarantees);
}
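/*
 * Illustrative summary (not from the original source): a sync,
 * sequential, I/O-bound queue on a rotational device with no busy
 * weight-raised queues gets idling, because
 * idling_boosts_thr_without_issues is true; a seeky queue on an
 * NCQ-capable flash device in a symmetric scenario gets no idling,
 * letting the drive's internal queue stay full.
 */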
/*
 * If the in-service queue is empty but the function bfq_bfqq_may_idle
 * returns true, then:
 * 1) the queue must remain in service and cannot be expired, and
 * 2) the device must be idled to wait for the possible arrival of a new
 *    request for the queue.
 * See the comments on the function bfq_bfqq_may_idle for the reasons
 * why performing device idling is the best choice to boost the throughput
 * and preserve service guarantees when bfq_bfqq_may_idle itself
 * returns true.
 */
static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;

	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
	       bfq_bfqq_may_idle(bfqq);
}
/*
 * Select a queue for service. If we have a current queue in service,
 * check whether to continue servicing it, or retrieve and set a new one.
 */
static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq;
	struct request *next_rq;
	enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;

	bfqq = bfqd->in_service_queue;
	if (!bfqq)
		goto new_queue;

	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

	if (bfq_may_expire_for_budg_timeout(bfqq) &&
	    !bfq_bfqq_wait_request(bfqq) &&
	    !bfq_bfqq_must_idle(bfqq))
		goto expire;

check_queue:
	/*
	 * This loop is rarely executed more than once. Even when it
	 * happens, it is much more convenient to re-execute this loop
	 * than to return NULL and trigger a new dispatch to get a
	 * request served.
	 */
	next_rq = bfqq->next_rq;
	/*
	 * If bfqq has requests queued and it has enough budget left to
	 * serve them, keep the queue, otherwise expire it.
	 */
	if (next_rq) {
		if (bfq_serv_to_charge(next_rq, bfqq) >
			bfq_bfqq_budget_left(bfqq)) {
			/*
			 * Expire the queue for budget exhaustion,
			 * which makes sure that the next budget is
			 * enough to serve the next request, even if
			 * it comes from the fifo expired path.
			 */
			reason = BFQQE_BUDGET_EXHAUSTED;
			goto expire;
		} else {
			/*
			 * The idle timer may be pending because we may
			 * not disable disk idling even when a new request
			 * arrives.
			 */
			if (bfq_bfqq_wait_request(bfqq)) {
				/*
				 * If we get here: 1) at least a new request
				 * has arrived but we have not disabled the
				 * timer because the request was too small,
				 * 2) then the block layer has unplugged
				 * the device, causing the dispatch to be
				 * invoked.
				 *
				 * Since the device is unplugged, now the
				 * requests are probably large enough to
				 * provide a reasonable throughput.
				 * So we disable idling.
				 */
				bfq_clear_bfqq_wait_request(bfqq);
				hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
				bfqg_stats_update_idle_time(bfqq_group(bfqq));
			}
			goto keep_queue;
		}
	}

	/*
	 * No requests pending. However, if the in-service queue is idling
	 * for a new request, or has requests waiting for a completion and
	 * may idle after their completion, then keep it anyway.
	 */
	if (bfq_bfqq_wait_request(bfqq) ||
	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
		bfqq = NULL;
		goto keep_queue;
	}

	reason = BFQQE_NO_MORE_REQUESTS;
expire:
	bfq_bfqq_expire(bfqd, bfqq, false, reason);
new_queue:
	bfqq = bfq_set_in_service_queue(bfqd);
	if (bfqq) {
		bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
		goto check_queue;
	}
keep_queue:
	if (bfqq)
		bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
	else
		bfq_log(bfqd, "select_queue: no queue returned");

	return bfqq;
}
static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
		bfq_log_bfqq(bfqd, bfqq,
			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
			jiffies_to_msecs(bfqq->wr_cur_max_time),
			bfqq->wr_coeff,
			bfqq->entity.weight, bfqq->entity.orig_weight);

		if (entity->prio_changed)
			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");

		/*
		 * If the queue was activated in a burst, or too much
		 * time has elapsed from the beginning of this
		 * weight-raising period, then end weight raising.
		 */
		if (bfq_bfqq_in_large_burst(bfqq))
			bfq_bfqq_end_wr(bfqq);
		else if (time_is_before_jiffies(bfqq->last_wr_start_finish +
						bfqq->wr_cur_max_time)) {
			if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
			time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
					       bfq_wr_duration(bfqd)))
				bfq_bfqq_end_wr(bfqq);
			else {
				/* switch back to interactive wr */
				bfqq->wr_coeff = bfqd->bfq_wr_coeff;
				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
				bfqq->last_wr_start_finish =
					bfqq->wr_start_at_switch_to_srt;
				bfqq->entity.prio_changed = 1;
			}
		}
	}
	/* Update weight both if it must be raised and if it must be lowered */
	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
		__bfq_entity_update_weight_prio(
			bfq_entity_service_tree(entity),
			entity);
}
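/*
 * Illustrative note (not part of the scheduler): the weight-raising
 * expiry test above relies on the jiffies wrap-safe comparison done by
 * time_is_before_jiffies(). The compiled-out, user-space sketch below
 * shows the same test with plain integers; names and values are made
 * up for the example and only loosely mirror the bfqq fields.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

static bool wr_period_expired(unsigned long now, unsigned long start,
			      unsigned long max_time)
{
	/* same idea as time_is_before_jiffies(start + max_time) */
	return (long)(now - (start + max_time)) > 0;
}

int main(void)
{
	/* a raising period of 300 ticks that started 400 ticks ago */
	printf("expired: %d\n", wr_period_expired(1000, 600, 300)); /* 1 */
	return 0;
}
#endif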
/*
 * Dispatch next request from bfqq.
 */
static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
						 struct bfq_queue *bfqq)
{
	struct request *rq = bfqq->next_rq;
	unsigned long service_to_charge;

	service_to_charge = bfq_serv_to_charge(rq, bfqq);

	bfq_bfqq_served(bfqq, service_to_charge);

	bfq_dispatch_remove(bfqd->queue, rq);

	/*
	 * If weight raising has to terminate for bfqq, then next
	 * function causes an immediate update of bfqq's weight,
	 * without waiting for next activation. As a consequence, on
	 * expiration, bfqq will be timestamped as if it had never been
	 * weight-raised during this service slot, even if it has
	 * received part or even most of the service as a
	 * weight-raised queue. This inflates bfqq's timestamps, which
	 * is beneficial, as bfqq is then more willing to leave the
	 * device immediately to possible other weight-raised queues.
	 */
	bfq_update_wr_data(bfqd, bfqq);

	if (!bfqd->in_service_bic) {
		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
		bfqd->in_service_bic = RQ_BIC(rq);
	}

	/*
	 * Expire bfqq, pretending that its budget expired, if bfqq
	 * belongs to CLASS_IDLE and other queues are waiting for
	 * service.
	 */
	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
		goto expire;

	return rq;

expire:
	bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
	return rq;
}
static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;

	/*
	 * Avoiding lock: a race on bfqd->busy_queues should cause at
	 * most a call to dispatch for nothing.
	 */
	return !list_empty_careful(&bfqd->dispatch) ||
		bfqd->busy_queues > 0;
}
static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq = NULL;
	struct bfq_queue *bfqq = NULL;

	if (!list_empty(&bfqd->dispatch)) {
		rq = list_first_entry(&bfqd->dispatch, struct request,
				      queuelist);
		list_del_init(&rq->queuelist);

		bfqq = RQ_BFQQ(rq);

		if (bfqq) {
			/*
			 * Increment counters here, because this
			 * dispatch does not follow the standard
			 * dispatch flow (where counters are
			 * incremented).
			 */
			bfqq->dispatched++;

			goto inc_in_driver_start_rq;
		}

		/*
		 * We exploit the put_rq_private hook to decrement
		 * rq_in_driver, but put_rq_private will not be
		 * invoked on this request. So, to avoid unbalance,
		 * just start this request, without incrementing
		 * rq_in_driver. As a negative consequence,
		 * rq_in_driver is deceptively lower than it should be
		 * while this request is in service. This may cause
		 * bfq_schedule_dispatch to be invoked uselessly.
		 *
		 * As for implementing an exact solution, the
		 * put_request hook, if defined, is probably invoked
		 * also on this request. So, by exploiting this hook,
		 * we could 1) increment rq_in_driver here, and 2)
		 * decrement it in put_request. Such a solution would
		 * let the value of the counter be always accurate,
		 * but it would entail using an extra interface
		 * function. This cost seems higher than the benefit,
		 * being the frequency of non-elevator-private
		 * requests very low.
		 */
		goto start_rq;
	}

	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);

	if (bfqd->busy_queues == 0)
		goto exit;

	/*
	 * Force device to serve one request at a time if
	 * strict_guarantees is true. Forcing this service scheme is
	 * currently the ONLY way to guarantee that the request
	 * service order enforced by the scheduler is respected by a
	 * queueing device. Otherwise the device is free even to make
	 * some unlucky request wait for as long as the device
	 * wishes.
	 *
	 * Of course, serving one request at a time may cause loss of
	 * throughput.
	 */
	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
		goto exit;

	bfqq = bfq_select_queue(bfqd);
	if (!bfqq)
		goto exit;

	rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);

	if (rq) {
inc_in_driver_start_rq:
		bfqd->rq_in_driver++;
start_rq:
		rq->rq_flags |= RQF_STARTED;
	}
exit:
	return rq;
}
static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq;

	spin_lock_irq(&bfqd->lock);

	rq = __bfq_dispatch_request(hctx);
	bfq_unlock_put_ioc(bfqd);

	return rq;
}
/*
 * Task holds one reference to the queue, dropped when task exits. Each rq
 * in-flight on this queue also holds a reference, dropped when rq is freed.
 *
 * Scheduler lock must be held here. Recall not to use bfqq after calling
 * this function on it.
 */
static void bfq_put_queue(struct bfq_queue *bfqq)
{
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_group *bfqg = bfqq_group(bfqq);
#endif

	if (bfqq->bfqd)
		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
			     bfqq, bfqq->ref);

	bfqq->ref--;
	if (bfqq->ref)
		return;

	if (bfq_bfqq_sync(bfqq))
		/*
		 * The fact that this queue is being destroyed does not
		 * invalidate the fact that this queue may have been
		 * activated during the current burst. As a consequence,
		 * although the queue does not exist anymore, and hence
		 * needs to be removed from the burst list if there,
		 * the burst size must not be decremented.
		 */
		hlist_del_init(&bfqq->burst_list_node);

	kmem_cache_free(bfq_pool, bfqq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	bfqg_put(bfqg);
#endif
}
static void bfq_put_cooperator(struct bfq_queue *bfqq)
{
	struct bfq_queue *__bfqq, *next;

	/*
	 * If this queue was scheduled to merge with another queue, be
	 * sure to drop the reference taken on that queue (and others in
	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
	 */
	__bfqq = bfqq->new_bfqq;
	while (__bfqq) {
		if (__bfqq == bfqq)
			break;
		next = __bfqq->new_bfqq;
		bfq_put_queue(__bfqq);
		__bfqq = next;
	}
}
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	if (bfqq == bfqd->in_service_queue) {
		__bfq_bfqq_expire(bfqd, bfqq);
		bfq_schedule_dispatch(bfqd);
	}

	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);

	bfq_put_cooperator(bfqq);

	bfq_put_queue(bfqq); /* release process reference */
}
static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
	struct bfq_data *bfqd;

	if (bfqq)
		bfqd = bfqq->bfqd; /* NULL if scheduler already exited */

	if (bfqq && bfqd) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);
		/*
		 * If the bic is using a shared queue, put the
		 * reference taken on the io_context when the bic
		 * started using a shared bfq_queue. This put cannot
		 * make ioc->ref_count reach 0, so there is no risk
		 * of taking ioc->lock (which could lead to deadlock
		 * scenarios).
		 */
		if (is_sync && bfq_bfqq_coop(bfqq))
			put_io_context(bic->icq.ioc);

		bfq_exit_bfqq(bfqd, bfqq);
		bic_set_bfqq(bic, NULL, is_sync);
		bfq_unlock_put_ioc_restore(bfqd, flags);
	}
}
static void bfq_exit_icq(struct io_cq *icq)
{
	struct bfq_io_cq *bic = icq_to_bic(icq);

	bfq_exit_icq_bfqq(bic, true);
	bfq_exit_icq_bfqq(bic, false);
}
/*
 * Update the entity prio values; note that the new values will not
 * be used until the next (re)activation.
 */
static void
bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
{
	struct task_struct *tsk = current;
	int ioprio_class;
	struct bfq_data *bfqd = bfqq->bfqd;

	if (!bfqd)
		return;

	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	switch (ioprio_class) {
	default:
		dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
			"bfq: bad prio class %d\n", ioprio_class);
	case IOPRIO_CLASS_NONE:
		/*
		 * No prio set, inherit CPU scheduling settings.
		 */
		bfqq->new_ioprio = task_nice_ioprio(tsk);
		bfqq->new_ioprio_class = task_nice_ioclass(tsk);
		break;
	case IOPRIO_CLASS_RT:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
		break;
	case IOPRIO_CLASS_BE:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
		break;
	case IOPRIO_CLASS_IDLE:
		bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
		bfqq->new_ioprio = 7;
		bfq_clear_bfqq_idle_window(bfqq);
		break;
	}

	if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
		pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
			bfqq->new_ioprio);
		bfqq->new_ioprio = IOPRIO_BE_NR;
	}

	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
	bfqq->entity.prio_changed = 1;
}
static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
{
	struct bfq_data *bfqd = bic_to_bfqd(bic);
	struct bfq_queue *bfqq;
	int ioprio = bic->icq.ioc->ioprio;

	/*
	 * This condition may trigger on a newly created bic, be sure to
	 * drop the lock before returning.
	 */
	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
		return;

	bic->ioprio = ioprio;

	bfqq = bic_to_bfqq(bic, false);
	if (bfqq) {
		/* release process reference on this queue */
		bfq_put_queue(bfqq);
		bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
		bic_set_bfqq(bic, bfqq, false);
	}

	bfqq = bic_to_bfqq(bic, true);
	if (bfqq)
		bfq_set_next_ioprio_data(bfqq, bic);
}
static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_io_cq *bic, pid_t pid, int is_sync)
{
	RB_CLEAR_NODE(&bfqq->entity.rb_node);
	INIT_LIST_HEAD(&bfqq->fifo);
	INIT_HLIST_NODE(&bfqq->burst_list_node);

	bfqq->ref = 0;
	bfqq->bfqd = bfqd;

	if (bic)
		bfq_set_next_ioprio_data(bfqq, bic);

	if (is_sync) {
		if (!bfq_class_idle(bfqq))
			bfq_mark_bfqq_idle_window(bfqq);
		bfq_mark_bfqq_sync(bfqq);
		bfq_mark_bfqq_just_created(bfqq);
	} else
		bfq_clear_bfqq_sync(bfqq);

	/* set end request to minus infinity from now */
	bfqq->ttime.last_end_request = ktime_get_ns() + 1;

	bfq_mark_bfqq_IO_bound(bfqq);

	bfqq->pid = pid;

	/* Tentative initial value to trade off between thr and lat */
	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
	bfqq->budget_timeout = bfq_smallest_from_now();

	bfqq->wr_coeff = 1;
	bfqq->last_wr_start_finish = jiffies;
	bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
	bfqq->split_time = bfq_smallest_from_now();

	/*
	 * Set to the value for which bfqq will not be deemed as
	 * soft rt when it becomes backlogged.
	 */
	bfqq->soft_rt_next_start = bfq_greatest_from_now();

	/* first request is almost certainly seeky */
	bfqq->seek_history = 1;
}
static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
					       struct bfq_group *bfqg,
					       int ioprio_class, int ioprio)
{
	switch (ioprio_class) {
	case IOPRIO_CLASS_RT:
		return &bfqg->async_bfqq[0][ioprio];
	case IOPRIO_CLASS_NONE:
		ioprio = IOPRIO_NORM;
		/* fall through */
	case IOPRIO_CLASS_BE:
		return &bfqg->async_bfqq[1][ioprio];
	case IOPRIO_CLASS_IDLE:
		return &bfqg->async_idle_bfqq;
	default:
		return NULL;
	}
}
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic)
{
	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	struct bfq_queue **async_bfqq = NULL;
	struct bfq_queue *bfqq;
	struct bfq_group *bfqg;

	rcu_read_lock();

	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
	if (!bfqg) {
		bfqq = &bfqd->oom_bfqq;
		goto out;
	}

	if (!is_sync) {
		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
						  ioprio);
		bfqq = *async_bfqq;
		if (bfqq)
			goto out;
	}

	bfqq = kmem_cache_alloc_node(bfq_pool,
				     GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
				     bfqd->queue->node);

	if (bfqq) {
		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
			      is_sync);
		bfq_init_entity(&bfqq->entity, bfqg);
		bfq_log_bfqq(bfqd, bfqq, "allocated");
	} else {
		bfqq = &bfqd->oom_bfqq;
		bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
		goto out;
	}

	/*
	 * Pin the queue now that it's allocated, scheduler exit will
	 * prune it.
	 */
	if (async_bfqq) {
		bfqq->ref++; /*
			      * Extra group reference, w.r.t. sync
			      * queue. This extra reference is removed
			      * only if bfqq->bfqg disappears, to
			      * guarantee that this queue is not freed
			      * until its group goes away.
			      */
		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
			     bfqq, bfqq->ref);
		*async_bfqq = bfqq;
	}

out:
	bfqq->ref++; /* get a process reference to this queue */
	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
	rcu_read_unlock();
	return bfqq;
}
static void bfq_update_io_thinktime(struct bfq_data *bfqd,
				    struct bfq_queue *bfqq)
{
	struct bfq_ttime *ttime = &bfqq->ttime;
	u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;

	elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);

	ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
	ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
				     ttime->ttime_samples);
}
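/*
 * Illustrative note: the update above is a fixed-point EWMA, keeping
 * 7/8 of the old estimate and adding 1/8 of the new sample, with
 * samples scaled by 256 so that ttime_mean = ttime_total /
 * ttime_samples stays in the sample's unit. The compiled-out,
 * user-space sketch below replays the arithmetic on arbitrary values.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long long samples = 0, total = 0, mean;
	unsigned long long elapsed[3] = { 1000, 2000, 500 }; /* example ns */
	int i;

	for (i = 0; i < 3; i++) {
		samples = (7 * samples + 256) / 8;
		total = (7 * total + 256 * elapsed[i]) / 8;
		mean = (total + 128) / samples;
		printf("after sample %d: mean %llu ns\n", i, mean);
	}
	return 0;
}
#endif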
static void
bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		       struct request *rq)
{
	bfqq->seek_history <<= 1;
	bfqq->seek_history |=
		get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
		(!blk_queue_nonrot(bfqd->queue) ||
		 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
}
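/*
 * Illustrative note: seek_history is a shift register with one bit per
 * request (1 = seeky); BFQQ_SEEKY() then classifies the queue by
 * counting the set bits. The compiled-out sketch below uses a made-up
 * threshold, not the kernel's.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

#define EXAMPLE_SEEKY_THRESH 4 /* assumption, not BFQ's actual value */

static bool queue_seeky(unsigned int seek_history)
{
	return __builtin_popcount(seek_history) > EXAMPLE_SEEKY_THRESH;
}

int main(void)
{
	unsigned int hist = 0;
	int sample[6] = { 1, 1, 0, 1, 1, 1 }; /* 1 = request was seeky */
	int i;

	for (i = 0; i < 6; i++) {
		hist <<= 1;
		hist |= sample[i];
	}
	printf("seeky: %d\n", queue_seeky(hist)); /* 5 bits set -> 1 */
	return 0;
}
#endif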
/*
 * Disable idle window if the process thinks too long or seeks so much that
 * it doesn't matter.
 */
static void bfq_update_idle_window(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq,
				   struct bfq_io_cq *bic)
{
	int enable_idle;

	/* Don't idle for async or idle io prio class. */
	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
		return;

	/* Idle window just restored, statistics are meaningless. */
	if (time_is_after_eq_jiffies(bfqq->split_time +
				     bfqd->bfq_wr_min_idle_time))
		return;

	enable_idle = bfq_bfqq_idle_window(bfqq);

	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
	    bfqd->bfq_slice_idle == 0 ||
	    (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
	     bfqq->wr_coeff == 1))
		enable_idle = 0;
	else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
		if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle &&
		    bfqq->wr_coeff == 1)
			enable_idle = 0;
		else
			enable_idle = 1;
	}
	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
		     enable_idle);

	if (enable_idle)
		bfq_mark_bfqq_idle_window(bfqq);
	else
		bfq_clear_bfqq_idle_window(bfqq);
}
/*
 * Called when a new fs request (rq) is added to bfqq. Check if there's
 * something we should do about it.
 */
static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			    struct request *rq)
{
	struct bfq_io_cq *bic = RQ_BIC(rq);

	if (rq->cmd_flags & REQ_META)
		bfqq->meta_pending++;

	bfq_update_io_thinktime(bfqd, bfqq);
	bfq_update_io_seektime(bfqd, bfqq, rq);
	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
	    !BFQQ_SEEKY(bfqq))
		bfq_update_idle_window(bfqd, bfqq, bic);

	bfq_log_bfqq(bfqd, bfqq,
		     "rq_enqueued: idle_window=%d (seeky %d)",
		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));

	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
		bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
				 blk_rq_sectors(rq) < 32;
		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);

		/*
		 * There is just this request queued: if the request
		 * is small and the queue is not to be expired, then
		 * just exit.
		 *
		 * In this way, if the device is being idled to wait
		 * for a new request from the in-service queue, we
		 * avoid unplugging the device and committing the
		 * device to serve just a small request. On the
		 * contrary, we wait for the block layer to decide
		 * when to unplug the device: hopefully, new requests
		 * will be merged to this one quickly, then the device
		 * will be unplugged and larger requests will be
		 * dispatched.
		 */
		if (small_req && !budget_timeout)
			return;

		/*
		 * A large enough request arrived, or the queue is to
		 * be expired: in both cases disk idling is to be
		 * stopped, so clear wait_request flag and reset
		 * timer.
		 */
		bfq_clear_bfqq_wait_request(bfqq);
		hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
		bfqg_stats_update_idle_time(bfqq_group(bfqq));

		/*
		 * The queue is not empty, because a new request just
		 * arrived. Hence we can safely expire the queue, in
		 * case of budget timeout, without risking that the
		 * timestamps of the queue are not updated correctly.
		 * See [1] for more details.
		 */
		if (budget_timeout)
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_BUDGET_TIMEOUT);
	}
}
static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq),
		*new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);

	if (new_bfqq) {
		if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
			new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
		/*
		 * Release the request's reference to the old bfqq
		 * and make sure one is taken to the shared queue.
		 */
		new_bfqq->allocated++;
		bfqq->allocated--;
		new_bfqq->ref++;
		bfq_clear_bfqq_just_created(bfqq);
		/*
		 * If the bic associated with the process
		 * issuing this request still points to bfqq
		 * (and thus has not been already redirected
		 * to new_bfqq or even some other bfq_queue),
		 * then complete the merge and redirect it to
		 * new_bfqq.
		 */
		if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
			bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
					bfqq, new_bfqq);
		/*
		 * rq is about to be enqueued into new_bfqq,
		 * release rq reference on bfqq
		 */
		bfq_put_queue(bfqq);
		rq->elv.priv[1] = new_bfqq;
		bfqq = new_bfqq;
	}

	bfq_add_request(rq);

	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
	list_add_tail(&rq->queuelist, &bfqq->fifo);

	bfq_rq_enqueued(bfqd, bfqq, rq);
}
static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
			       bool at_head)
{
	struct request_queue *q = hctx->queue;
	struct bfq_data *bfqd = q->elevator->elevator_data;

	spin_lock_irq(&bfqd->lock);
	if (blk_mq_sched_try_insert_merge(q, rq)) {
		spin_unlock_irq(&bfqd->lock);
		return;
	}

	spin_unlock_irq(&bfqd->lock);

	blk_mq_sched_request_inserted(rq);

	spin_lock_irq(&bfqd->lock);
	if (at_head || blk_rq_is_passthrough(rq)) {
		if (at_head)
			list_add(&rq->queuelist, &bfqd->dispatch);
		else
			list_add_tail(&rq->queuelist, &bfqd->dispatch);
	} else {
		__bfq_insert_request(bfqd, rq);

		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}
	}

	bfq_unlock_put_ioc(bfqd);
}
static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
				struct list_head *list, bool at_head)
{
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		bfq_insert_request(hctx, rq, at_head);
	}
}
static void bfq_update_hw_tag(struct bfq_data *bfqd)
{
	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
				       bfqd->rq_in_driver);

	if (bfqd->hw_tag == 1)
		return;

	/*
	 * This sample is valid if the number of outstanding requests
	 * is large enough to allow a queueing behavior. Note that the
	 * sum is not exact, as it's not taking into account deactivated
	 * requests.
	 */
	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
		return;

	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
		return;

	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
	bfqd->max_rq_in_driver = 0;
	bfqd->hw_tag_samples = 0;
}
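/*
 * Illustrative note: hw_tag flips to 1 only after a full window of
 * valid samples has seen a sufficiently deep driver queue, which
 * filters out transient bursts. The compiled-out, user-space sketch
 * below mimics that hysteresis; both constants are stand-ins for the
 * kernel's BFQ_HW_QUEUE_THRESHOLD/BFQ_HW_QUEUE_SAMPLES.
 */
#if 0
#include <stdio.h>

#define EX_THRESHOLD 4
#define EX_SAMPLES 32

int main(void)
{
	int hw_tag = 0, samples = 0, max_in_driver = 0, t;

	for (t = 0; t < 200 && !hw_tag; t++) {
		int in_driver = 2 + (t % 7); /* synthetic queue depth */

		if (in_driver > max_in_driver)
			max_in_driver = in_driver;
		if (in_driver < EX_THRESHOLD)
			continue; /* sample not valid */
		if (samples++ < EX_SAMPLES)
			continue; /* window not full yet */
		hw_tag = max_in_driver > EX_THRESHOLD;
	}
	printf("hw_tag: %d\n", hw_tag); /* 1: depth repeatedly exceeded 4 */
	return 0;
}
#endif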
static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
{
	u64 now_ns;
	u32 delta_us;

	bfq_update_hw_tag(bfqd);

	bfqd->rq_in_driver--;
	bfqq->dispatched--;

	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
		/*
		 * Set budget_timeout (which we overload to store the
		 * time at which the queue remains with no backlog and
		 * no outstanding request; used by the weight-raising
		 * mechanism).
		 */
		bfqq->budget_timeout = jiffies;

		bfq_weights_tree_remove(bfqd, &bfqq->entity,
					&bfqd->queue_weights_tree);
	}

	now_ns = ktime_get_ns();

	bfqq->ttime.last_end_request = now_ns;

	/*
	 * Using us instead of ns, to get a reasonable precision in
	 * computing rate in next check.
	 */
	delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);

	/*
	 * If the request took rather long to complete, and, according
	 * to the maximum request size recorded, this completion latency
	 * implies that the request was certainly served at a very low
	 * rate (less than 1M sectors/sec), then the whole observation
	 * interval that lasts up to this time instant cannot be a
	 * valid time interval for computing a new peak rate. Invoke
	 * bfq_update_rate_reset to have the following three steps
	 * taken:
	 * - close the observation interval at the last (previous)
	 *   request dispatch or completion
	 * - compute rate, if possible, for that observation interval
	 * - reset to zero samples, which will trigger a proper
	 *   re-initialization of the observation interval on next
	 *   dispatch
	 */
	if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
	   (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
			1UL<<(BFQ_RATE_SHIFT - 10))
		bfq_update_rate_reset(bfqd, NULL);
	bfqd->last_completion = now_ns;

	/*
	 * If we are waiting to discover whether the request pattern
	 * of the task associated with the queue is actually
	 * isochronous, and both requisites for this condition to hold
	 * are now satisfied, then compute soft_rt_next_start (see the
	 * comments on the function bfq_bfqq_softrt_next_start()). We
	 * schedule this delayed check when bfqq expires, if it still
	 * has in-flight requests.
	 */
	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
	    RB_EMPTY_ROOT(&bfqq->sort_list))
		bfqq->soft_rt_next_start =
			bfq_bfqq_softrt_next_start(bfqd, bfqq);

	/*
	 * If this is the in-service queue, check if it needs to be expired,
	 * or if we want to idle in case it has no pending requests.
	 */
	if (bfqd->in_service_queue == bfqq) {
		if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
			bfq_arm_slice_timer(bfqd);
			return;
		} else if (bfq_may_expire_for_budg_timeout(bfqq))
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_BUDGET_TIMEOUT);
		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
			 (bfqq->dispatched == 0 ||
			  !bfq_bfqq_may_idle(bfqq)))
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_NO_MORE_REQUESTS);
	}
}
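/*
 * Illustrative note: the rate-validity test above compares a
 * fixed-point rate, (size << BFQ_RATE_SHIFT) / delta_us, against
 * 1 << (BFQ_RATE_SHIFT - 10). The compiled-out sketch below works one
 * example through that arithmetic; BFQ_RATE_SHIFT = 16 is an
 * assumption made only to get concrete numbers.
 */
#if 0
#include <stdio.h>

#define EX_RATE_SHIFT 16

int main(void)
{
	unsigned long size = 8;		/* sectors, example */
	unsigned long delta_us = 20000;	/* 20 ms completion gap */
	unsigned long rate = (size << EX_RATE_SHIFT) / delta_us; /* 26 */

	/* 8 sectors / 20 ms = 400 sectors/s: below the cutoff */
	printf("too slow: %d\n", rate < (1UL << (EX_RATE_SHIFT - 10)));
	return 0;
}
#endif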
static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
{
	bfqq->allocated--;

	bfq_put_queue(bfqq);
}
static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;

	if (rq->rq_flags & RQF_STARTED)
		bfqg_stats_update_completion(bfqq_group(bfqq),
					     rq_start_time_ns(rq),
					     rq_io_start_time_ns(rq),
					     rq->cmd_flags);

	if (likely(rq->rq_flags & RQF_STARTED)) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);

		bfq_completed_request(bfqq, bfqd);
		bfq_put_rq_priv_body(bfqq);

		bfq_unlock_put_ioc_restore(bfqd, flags);
	} else {
		/*
		 * Request rq may be still/already in the scheduler,
		 * in which case we need to remove it. And we cannot
		 * defer such a check and removal, to avoid
		 * inconsistencies in the time interval from the end
		 * of this function to the start of the deferred work.
		 * This situation seems to occur only in process
		 * context, as a consequence of a merge. In the
		 * current version of the code, this implies that the
		 * lock is held.
		 */

		if (!RB_EMPTY_NODE(&rq->rb_node))
			bfq_remove_request(q, rq);
		bfq_put_rq_priv_body(bfqq);
	}

	rq->elv.priv[0] = NULL;
	rq->elv.priv[1] = NULL;
}
/*
 * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
 * was the last process referring to that bfqq.
 */
static struct bfq_queue *
bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");

	if (bfqq_process_refs(bfqq) == 1) {
		bfqq->pid = current->pid;
		bfq_clear_bfqq_coop(bfqq);
		bfq_clear_bfqq_split_coop(bfqq);
		return bfqq;
	}

	bic_set_bfqq(bic, NULL, 1);

	bfq_put_cooperator(bfqq);

	bfq_put_queue(bfqq);
	return NULL;
}
static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
						   struct bfq_io_cq *bic,
						   struct bio *bio,
						   bool split, bool is_sync,
						   bool *new_queue)
{
	struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);

	if (likely(bfqq && bfqq != &bfqd->oom_bfqq))
		return bfqq;

	if (new_queue)
		*new_queue = true;

	if (bfqq)
		bfq_put_queue(bfqq);
	bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);

	bic_set_bfqq(bic, bfqq, is_sync);
	if (split && is_sync) {
		if ((bic->was_in_burst_list && bfqd->large_burst) ||
		    bic->saved_in_large_burst)
			bfq_mark_bfqq_in_large_burst(bfqq);
		else {
			bfq_clear_bfqq_in_large_burst(bfqq);
			if (bic->was_in_burst_list)
				hlist_add_head(&bfqq->burst_list_node,
					       &bfqd->burst_list);
		}
		bfqq->split_time = jiffies;
	}

	return bfqq;
}
/*
 * Allocate bfq data structures associated with this request.
 */
static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
			      struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
	const int is_sync = rq_is_sync(rq);
	struct bfq_queue *bfqq;
	bool new_queue = false;

	spin_lock_irq(&bfqd->lock);

	bfq_check_ioprio_change(bic, bio);

	if (!bic)
		goto queue_fail;

	bfq_bic_update_cgroup(bic, bio);

	bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
					 &new_queue);

	if (likely(!new_queue)) {
		/* If the queue was seeky for too long, break it apart. */
		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");

			/* Update bic before losing reference to bfqq */
			if (bfq_bfqq_in_large_burst(bfqq))
				bic->saved_in_large_burst = true;

			bfqq = bfq_split_bfqq(bic, bfqq);
			/*
			 * A reference to bic->icq.ioc needs to be
			 * released after a queue split. Do not do it
			 * immediately, so as not to risk taking an
			 * ioc->lock while holding the scheduler
			 * lock.
			 */
			bfqd->ioc_to_put = bic->icq.ioc;

			if (!bfqq)
				bfqq = bfq_get_bfqq_handle_split(bfqd, bic,
								 bio, true,
								 is_sync, NULL);
		}
	}

	bfqq->allocated++;
	bfqq->ref++;
	bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
		     rq, bfqq, bfqq->ref);

	rq->elv.priv[0] = bic;
	rq->elv.priv[1] = bfqq;

	/*
	 * If a bfq_queue has only one process reference, it is owned
	 * by only this bic: we can then set bfqq->bic = bic. In
	 * addition, if the queue has also just been split, we have to
	 * resume its state.
	 */
	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
		bfqq->bic = bic;
		if (bfqd->ioc_to_put) { /* if true, there has been a split */
			/*
			 * The queue has just been split from a shared
			 * queue: restore the idle window and the
			 * possible weight raising period.
			 */
			bfq_bfqq_resume_state(bfqq, bic);
		}
	}

	if (unlikely(bfq_bfqq_just_created(bfqq)))
		bfq_handle_burst(bfqd, bfqq);

	bfq_unlock_put_ioc(bfqd);

	return 0;

queue_fail:
	spin_unlock_irq(&bfqd->lock);

	return 1;
}
static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	enum bfqq_expiration reason;
	unsigned long flags;

	spin_lock_irqsave(&bfqd->lock, flags);
	bfq_clear_bfqq_wait_request(bfqq);

	if (bfqq != bfqd->in_service_queue) {
		spin_unlock_irqrestore(&bfqd->lock, flags);
		return;
	}

	if (bfq_bfqq_budget_timeout(bfqq))
		/*
		 * Also here the queue can be safely expired
		 * for budget timeout without wasting
		 * guarantees
		 */
		reason = BFQQE_BUDGET_TIMEOUT;
	else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
		/*
		 * The queue may not be empty upon timer expiration,
		 * because we may not disable the timer when the
		 * first request of the in-service queue arrives
		 * during disk idling.
		 */
		reason = BFQQE_TOO_IDLE;
	else
		goto schedule_dispatch;

	bfq_bfqq_expire(bfqd, bfqq, true, reason);

schedule_dispatch:
	bfq_unlock_put_ioc_restore(bfqd, flags);
	bfq_schedule_dispatch(bfqd);
}
/*
 * Handler of the expiration of the timer running if the in-service queue
 * is idling inside its time slice.
 */
static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
{
	struct bfq_data *bfqd = container_of(timer, struct bfq_data,
					     idle_slice_timer);
	struct bfq_queue *bfqq = bfqd->in_service_queue;

	/*
	 * Theoretical race here: the in-service queue can be NULL or
	 * different from the queue that was idling if a new request
	 * arrives for the current queue and there is a full dispatch
	 * cycle that changes the in-service queue. This can hardly
	 * happen, but in the worst case we just expire a queue too
	 * early.
	 */
	if (bfqq)
		bfq_idle_slice_timer_body(bfqq);

	return HRTIMER_NORESTART;
}
static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
				 struct bfq_queue **bfqq_ptr)
{
	struct bfq_queue *bfqq = *bfqq_ptr;

	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
	if (bfqq) {
		bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);

		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
			     bfqq, bfqq->ref);
		bfq_put_queue(bfqq);
		*bfqq_ptr = NULL;
	}
}
/*
 * Release all the bfqg references to its async queues. If we are
 * deallocating the group these queues may still contain requests, so
 * we reparent them to the root cgroup (i.e., the only one that will
 * exist for sure until all the requests on a device are gone).
 */
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
{
	int i, j;

	for (i = 0; i < 2; i++)
		for (j = 0; j < IOPRIO_BE_NR; j++)
			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);

	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
}
static void bfq_exit_queue(struct elevator_queue *e)
{
	struct bfq_data *bfqd = e->elevator_data;
	struct bfq_queue *bfqq, *n;

	hrtimer_cancel(&bfqd->idle_slice_timer);

	spin_lock_irq(&bfqd->lock);
	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
	spin_unlock_irq(&bfqd->lock);

	hrtimer_cancel(&bfqd->idle_slice_timer);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
#else
	spin_lock_irq(&bfqd->lock);
	bfq_put_async_queues(bfqd, bfqd->root_group);
	kfree(bfqd->root_group);
	spin_unlock_irq(&bfqd->lock);
#endif

	kfree(bfqd);
}
static void bfq_init_root_group(struct bfq_group *root_group,
				struct bfq_data *bfqd)
{
	int i;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	root_group->entity.parent = NULL;
	root_group->my_entity = NULL;
	root_group->bfqd = bfqd;
#endif
	root_group->rq_pos_tree = RB_ROOT;
	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
	root_group->sched_data.bfq_class_idle_last_service = jiffies;
}
static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
	struct bfq_data *bfqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
	if (!bfqd) {
		kobject_put(&eq->kobj);
		return -ENOMEM;
	}
	eq->elevator_data = bfqd;

	spin_lock_irq(q->queue_lock);
	q->elevator = eq;
	spin_unlock_irq(q->queue_lock);

	/*
	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
	 * Grab a permanent reference to it, so that the normal code flow
	 * will not attempt to free it.
	 */
	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
	bfqd->oom_bfqq.ref++;
	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
	bfqd->oom_bfqq.entity.new_weight =
		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);

	/* oom_bfqq does not participate in bursts */
	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);

	/*
	 * Trigger weight initialization, according to ioprio, at the
	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
	 * class won't be changed any more.
	 */
	bfqd->oom_bfqq.entity.prio_changed = 1;

	bfqd->queue = q;

	INIT_LIST_HEAD(&bfqd->dispatch);

	hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL);
	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;

	bfqd->queue_weights_tree = RB_ROOT;
	bfqd->group_weights_tree = RB_ROOT;

	INIT_LIST_HEAD(&bfqd->active_list);
	INIT_LIST_HEAD(&bfqd->idle_list);
	INIT_HLIST_HEAD(&bfqd->burst_list);

	bfqd->hw_tag = -1;

	bfqd->bfq_max_budget = bfq_default_max_budget;

	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
	bfqd->bfq_back_max = bfq_back_max;
	bfqd->bfq_back_penalty = bfq_back_penalty;
	bfqd->bfq_slice_idle = bfq_slice_idle;
	bfqd->bfq_timeout = bfq_timeout;

	bfqd->bfq_requests_within_timer = 120;

	bfqd->bfq_large_burst_thresh = 8;
	bfqd->bfq_burst_interval = msecs_to_jiffies(180);

	bfqd->low_latency = true;

	/*
	 * Trade-off between responsiveness and fairness.
	 */
	bfqd->bfq_wr_coeff = 30;
	bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
	bfqd->bfq_wr_max_time = 0;
	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
	bfqd->bfq_wr_max_softrt_rate = 7000; /*
					      * Approximate rate required
					      * to playback or record a
					      * high-definition compressed
					      * video.
					      */
	bfqd->wr_busy_queues = 0;

	/*
	 * Begin by assuming, optimistically, that the device is a
	 * high-speed one, and that its peak rate is equal to 2/3 of
	 * the highest reference rate.
	 */
	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
			T_fast[blk_queue_nonrot(bfqd->queue)];
	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
	bfqd->device_speed = BFQ_BFQD_FAST;

	spin_lock_init(&bfqd->lock);

	/*
	 * The invocation of the next bfq_create_group_hierarchy
	 * function is the head of a chain of function calls
	 * (bfq_create_group_hierarchy->blkcg_activate_policy->
	 * blk_mq_freeze_queue) that may lead to the invocation of the
	 * has_work hook function. For this reason,
	 * bfq_create_group_hierarchy is invoked only after all
	 * scheduler data has been initialized, apart from the fields
	 * that can be initialized only after invoking
	 * bfq_create_group_hierarchy. This, in particular, enables
	 * has_work to correctly return false. Of course, to avoid
	 * other inconsistencies, the blk-mq stack must then refrain
	 * from invoking further scheduler hooks before this init
	 * function is finished.
	 */
	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
	if (!bfqd->root_group)
		goto out_free;
	bfq_init_root_group(bfqd->root_group, bfqd);
	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);

	return 0;

out_free:
	kfree(bfqd);
	kobject_put(&eq->kobj);
	return -ENOMEM;
}
static void bfq_slab_kill(void)
{
	kmem_cache_destroy(bfq_pool);
}

static int __init bfq_slab_setup(void)
{
	bfq_pool = KMEM_CACHE(bfq_queue, 0);
	if (!bfq_pool)
		return -ENOMEM;
	return 0;
}
static ssize_t bfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%u\n", var);
}

static ssize_t bfq_var_store(unsigned long *var, const char *page,
			     size_t count)
{
	unsigned long new_val;
	int ret = kstrtoul(page, 10, &new_val);

	if (ret == 0)
		*var = new_val;

	return count;
}
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	if (__CONV == 1)						\
		__data = jiffies_to_msecs(__data);			\
	else if (__CONV == 2)						\
		__data = div_u64(__data, NSEC_PER_MSEC);		\
	return bfq_var_show(__data, (page));				\
}
SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
#undef SHOW_FUNCTION

#define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	__data = div_u64(__data, NSEC_PER_USEC);			\
	return bfq_var_show(__data, (page));				\
}
USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
#undef USEC_SHOW_FUNCTION
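/*
 * For reference, a hand expansion of one generated show function,
 * SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1), with
 * __CONV substituted (1 selects the jiffies-to-msecs conversion);
 * compiled out to avoid redefining the function generated above:
 */
#if 0
static ssize_t bfq_timeout_sync_show(struct elevator_queue *e, char *page)
{
	struct bfq_data *bfqd = e->elevator_data;
	u64 __data = bfqd->bfq_timeout;

	if (1 == 1)
		__data = jiffies_to_msecs(__data);
	else if (1 == 2)
		__data = div_u64(__data, NSEC_PER_MSEC);
	return bfq_var_show(__data, (page));
}
#endif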
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
static ssize_t								\
__FUNC(struct elevator_queue *e, const char *page, size_t count)	\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned long uninitialized_var(__data);			\
	int ret = bfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	if (__CONV == 1)						\
		*(__PTR) = msecs_to_jiffies(__data);			\
	else if (__CONV == 2)						\
		*(__PTR) = (u64)__data * NSEC_PER_MSEC;			\
	else								\
		*(__PTR) = __data;					\
	return ret;							\
}
STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
		INT_MAX, 2);
STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
		INT_MAX, 2);
STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
		INT_MAX, 0);
STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
#undef STORE_FUNCTION

#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)			\
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned long uninitialized_var(__data);			\
	int ret = bfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	*(__PTR) = (u64)__data * NSEC_PER_USEC;				\
	return ret;							\
}
USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
		    UINT_MAX);
#undef USEC_STORE_FUNCTION
static ssize_t bfq_max_budget_store(struct elevator_queue *e,
				    const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data == 0)
		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
	else {
		if (__data > INT_MAX)
			__data = INT_MAX;
		bfqd->bfq_max_budget = __data;
	}

	bfqd->bfq_user_max_budget = __data;

	return ret;
}
/*
 * Leaving this name to preserve name compatibility with cfq
 * parameters, but this timeout is used for both sync and async.
 */
static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
				      const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data < 1)
		__data = 1;
	else if (__data > INT_MAX)
		__data = INT_MAX;

	bfqd->bfq_timeout = msecs_to_jiffies(__data);
	if (bfqd->bfq_user_max_budget == 0)
		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);

	return ret;
}
static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
					   const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data > 1)
		__data = 1;
	if (!bfqd->strict_guarantees && __data == 1
	    && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
		bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;

	bfqd->strict_guarantees = __data;

	return ret;
}
static ssize_t bfq_low_latency_store(struct elevator_queue *e,
				     const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data > 1)
		__data = 1;
	if (__data == 0 && bfqd->low_latency != 0)
		bfq_end_wr(bfqd);
	bfqd->low_latency = __data;

	return ret;
}
#define BFQ_ATTR(name) \
	__ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)

static struct elv_fs_entry bfq_attrs[] = {
	BFQ_ATTR(fifo_expire_sync),
	BFQ_ATTR(fifo_expire_async),
	BFQ_ATTR(back_seek_max),
	BFQ_ATTR(back_seek_penalty),
	BFQ_ATTR(slice_idle),
	BFQ_ATTR(slice_idle_us),
	BFQ_ATTR(max_budget),
	BFQ_ATTR(timeout_sync),
	BFQ_ATTR(strict_guarantees),
	BFQ_ATTR(low_latency),
	__ATTR_NULL
};
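/*
 * Usage note: once bfq is selected for a queue, each attribute above
 * becomes a file under /sys/block/<dev>/queue/iosched/. For instance
 * (device name given for illustration only):
 *
 *   echo bfq > /sys/block/sda/queue/scheduler
 *   cat /sys/block/sda/queue/iosched/low_latency
 *   echo 0 > /sys/block/sda/queue/iosched/slice_idle
 */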
static struct elevator_type iosched_bfq_mq = {
	.ops.mq = {
		.get_rq_priv		= bfq_get_rq_private,
		.put_rq_priv		= bfq_put_rq_private,
		.exit_icq		= bfq_exit_icq,
		.insert_requests	= bfq_insert_requests,
		.dispatch_request	= bfq_dispatch_request,
		.next_request		= elv_rb_latter_request,
		.former_request		= elv_rb_former_request,
		.allow_merge		= bfq_allow_bio_merge,
		.bio_merge		= bfq_bio_merge,
		.request_merge		= bfq_request_merge,
		.requests_merged	= bfq_requests_merged,
		.request_merged		= bfq_request_merged,
		.has_work		= bfq_has_work,
		.init_sched		= bfq_init_queue,
		.exit_sched		= bfq_exit_queue,
	},

	.uses_mq		= true,
	.icq_size		= sizeof(struct bfq_io_cq),
	.icq_align		= __alignof__(struct bfq_io_cq),
	.elevator_attrs		= bfq_attrs,
	.elevator_name		= "bfq",
	.elevator_owner		= THIS_MODULE,
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct blkcg_policy blkcg_policy_bfq = {
	.dfl_cftypes		= bfq_blkg_files,
	.legacy_cftypes		= bfq_blkcg_legacy_files,

	.cpd_alloc_fn		= bfq_cpd_alloc,
	.cpd_init_fn		= bfq_cpd_init,
	.cpd_bind_fn		= bfq_cpd_init,
	.cpd_free_fn		= bfq_cpd_free,

	.pd_alloc_fn		= bfq_pd_alloc,
	.pd_init_fn		= bfq_pd_init,
	.pd_offline_fn		= bfq_pd_offline,
	.pd_free_fn		= bfq_pd_free,
	.pd_reset_stats_fn	= bfq_pd_reset_stats,
};
#endif
static int __init bfq_init(void)
{
	int ret;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	ret = blkcg_policy_register(&blkcg_policy_bfq);
	if (ret)
		return ret;
#endif

	ret = -ENOMEM;
	if (bfq_slab_setup())
		goto err_pol_unreg;

	/*
	 * Times to load large popular applications for the typical
	 * systems installed on the reference devices (see the
	 * comments before the definitions of the next two
	 * arrays). Actually, we use slightly slower values, as the
	 * estimated peak rate tends to be smaller than the actual
	 * peak rate. The reason for this last fact is that estimates
	 * are computed over much shorter time intervals than the long
	 * intervals typically used for benchmarking. Why? First, to
	 * adapt more quickly to variations. Second, because an I/O
	 * scheduler cannot rely on a peak-rate-evaluation workload to
	 * be run for a long time.
	 */
	T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
	T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
	T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
	T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */

	/*
	 * Thresholds that determine the switch between speed classes
	 * (see the comments before the definition of the array
	 * device_speed_thresh). These thresholds are biased towards
	 * transitions to the fast class. This is safer than the
	 * opposite bias. In fact, a wrong transition to the slow
	 * class results in short weight-raising periods, because the
	 * speed of the device then tends to be higher than the
	 * reference peak rate. On the opposite end, a wrong
	 * transition to the fast class tends to increase
	 * weight-raising periods, because of the opposite reason.
	 */
	device_speed_thresh[0] = (4 * R_slow[0]) / 3;
	device_speed_thresh[1] = (4 * R_slow[1]) / 3;

	ret = elv_register(&iosched_bfq_mq);
	if (ret)
		goto err_pol_unreg;

	return 0;

err_pol_unreg:
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	return ret;
}
static void __exit bfq_exit(void)
{
	elv_unregister(&iosched_bfq_mq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	bfq_slab_kill();
}

module_init(bfq_init);
module_exit(bfq_exit);

MODULE_AUTHOR("Paolo Valente");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");