/*
 * Budget Fair Queueing (BFQ) I/O scheduler.
 *
 * Based on ideas and code from CFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini@google.com>
 *
 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BFQ is a proportional-share I/O scheduler, with some extra
 * low-latency capabilities. BFQ also supports full hierarchical
 * scheduling through cgroups. The next paragraphs provide an
 * introduction to BFQ's inner workings. Details on BFQ's benefits,
 * usage and limitations can be found in
 * Documentation/block/bfq-iosched.txt.
 *
 * BFQ is a proportional-share storage-I/O scheduling algorithm based
 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
 * budgets, measured in number of sectors, to processes instead of
 * time slices. The device is not granted to the in-service process
 * for a given time slice, but until it has exhausted its assigned
 * budget. This change from the time to the service domain enables BFQ
 * to distribute the device throughput among processes as desired,
 * without any distortion due to throughput fluctuations, or to device
 * internal queueing. BFQ uses an ad hoc internal scheduler, called
 * B-WF2Q+, to schedule processes according to their budgets. More
 * precisely, BFQ schedules queues associated with processes. Each
 * process/queue is assigned a user-configurable weight, and B-WF2Q+
 * guarantees that each queue receives a fraction of the throughput
 * proportional to its weight. Thanks to the accurate policy of
 * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
 * processes issuing sequential requests (to boost the throughput),
 * and yet guarantee a low latency to interactive and soft real-time
 * applications.
 *
 * In particular, to provide these low-latency guarantees, BFQ
 * explicitly privileges the I/O of two classes of time-sensitive
 * applications: interactive and soft real-time. This feature enables
 * BFQ to provide applications in these classes with a very low
 * latency. Finally, BFQ also features additional heuristics for
 * preserving both a low latency and a high throughput on NCQ-capable,
 * rotational or flash-based devices, and for getting the job done
 * quickly for applications consisting of many I/O-bound processes.
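 *
 * As a purely illustrative example (not drawn from the references
 * below): if two continuously backlogged processes are assigned
 * weights 100 and 200, then B-WF2Q+ lets the second receive, over any
 * sufficiently long time interval, about twice as many sectors of
 * service as the first, regardless of request sizes and of any
 * internal reordering performed by the drive.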
 *
 * BFQ is described in [1], where also a reference to the initial,
 * more theoretical paper on BFQ can be found. The interested reader
 * can find in the latter paper full details on the main algorithm,
 * as well as formulas of the guarantees and formal proofs of all the
 * properties. With respect to the version of BFQ presented in these
 * papers, this implementation adds a few more heuristics, such as the
 * one that guarantees a low latency to soft real-time applications,
 * and a hierarchical extension based on H-WF2Q+.
 *
 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
 * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
 * with O(log N) complexity derives from the one introduced with EEVDF
 * in [3].
 *
 * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
 *     Scheduler", Proceedings of the First Workshop on Mobile System
 *     Technologies (MST-2015), May 2015.
 *     http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
 *
 * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
 *     Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
 *     Oct 1997.
 *     http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
 *
 * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
 *     First: A Flexible and Accurate Mechanism for Proportional Share
 *     Resource Allocation", technical report.
 *     http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>
#include <linux/blk-cgroup.h>
#define BFQ_IOPRIO_CLASSES	3
#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

#define BFQ_MIN_WEIGHT			1
#define BFQ_MAX_WEIGHT			1000
#define BFQ_WEIGHT_CONVERSION_COEFF	10

#define BFQ_DEFAULT_QUEUE_IOPRIO	4

#define BFQ_WEIGHT_LEGACY_DFL	100
#define BFQ_DEFAULT_GRP_IOPRIO	0
#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

/*
 * Soft real-time applications are far more latency-sensitive than
 * interactive ones. Over-raise the weight of the former to privilege
 * them against the latter.
 */
#define BFQ_SOFTRT_WEIGHT_FACTOR	100
/**
 * struct bfq_service_tree - per ioprio_class service tree.
 *
 * Each service tree represents a B-WF2Q+ scheduler on its own. Each
 * ioprio_class has its own independent scheduler, and so its own
 * bfq_service_tree. All the fields are protected by the queue lock
 * of the containing bfqd.
 */
struct bfq_service_tree {
	/* tree for active entities (i.e., those backlogged) */
	struct rb_root active;
	/* tree for idle entities (i.e., not backlogged, with V <= F_i) */
	struct rb_root idle;

	/* idle entity with minimum F_i */
	struct bfq_entity *first_idle;
	/* idle entity with maximum F_i */
	struct bfq_entity *last_idle;

	/* scheduler virtual time */
	u64 vtime;
	/* scheduler weight sum; active and idle entities contribute to it */
	unsigned long wsum;
};
/**
 * struct bfq_sched_data - multi-class scheduler.
 *
 * bfq_sched_data is the basic scheduler queue. It supports three
 * ioprio_classes, and can be used either as a toplevel queue or as an
 * intermediate queue in a hierarchical setup. @next_in_service
 * points to the active entity of the sched_data service trees that
 * will be scheduled next. It is used to reduce the number of steps
 * needed for each hierarchical-schedule update.
 *
 * The supported ioprio_classes are the same as in CFQ, in descending
 * priority order: IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
 * All the requests of a higher-priority class are served before any
 * request of a lower-priority class; within a class, queues are
 * scheduled according to B-WF2Q+.
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_sched_data {
	/* entity in service */
	struct bfq_entity *in_service_entity;
	/* head-of-line entity (see comments above) */
	struct bfq_entity *next_in_service;
	/* array of service trees, one per ioprio_class */
	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
	/* last time CLASS_IDLE was served */
	unsigned long bfq_class_idle_last_service;
};
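
/*
 * Illustrative note: with the ioprio classes numbered
 * IOPRIO_CLASS_RT = 1, IOPRIO_CLASS_BE = 2 and IOPRIO_CLASS_IDLE = 3
 * (as in linux/ioprio.h), bfq_class_idx() below maps them to
 * service_tree[0], service_tree[1] and service_tree[2] respectively,
 * so index 0 always holds the highest-priority class.
 */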
/**
 * struct bfq_entity - schedulable entity.
 *
 * A bfq_entity is used to represent either a bfq_queue (leaf node in the
 * cgroup hierarchy) or a bfq_group in the upper-level scheduler. Each
 * entity belongs to the sched_data of the parent group in the cgroup
 * hierarchy. Non-leaf entities also have their own sched_data, stored
 * in @my_sched_data.
 *
 * Each entity stores independently its priority values; this would
 * allow different weights on different devices, but this
 * functionality is not exported to userspace for now. Priorities and
 * weights are updated lazily, first storing the new values into the
 * new_* fields, then setting the @prio_changed flag. As soon as
 * there is a transition in the entity state that allows the priority
 * update to take place, the effective and the requested priority
 * values are synchronized.
 *
 * Unless cgroups are used, the weight value is calculated from the
 * ioprio to export the same interface as CFQ. When dealing with
 * "well-behaved" queues (i.e., queues that do not spend too much
 * time to consume their budget and have true sequential behavior, and
 * when there are no external factors breaking anticipation), the
 * relative weights at each level of the cgroups hierarchy should be
 * guaranteed. All the fields are protected by the queue lock of the
 * containing bfqd.
 */
struct bfq_entity {
	/* service_tree member */
	struct rb_node rb_node;

	/*
	 * Flag, true if the entity is on a tree (either the active or
	 * the idle one of its service_tree) or is in service.
	 */
	bool on_st;

	/* B-WF2Q+ start and finish timestamps [sectors/weight] */
	u64 start, finish;

	/* tree the entity is enqueued into; %NULL if not on a tree */
	struct rb_root *tree;

	/*
	 * minimum start time of the (active) subtree rooted at this
	 * entity; used for O(log N) lookups into active trees
	 */
	u64 min_start;

	/* amount of service received during the last service slot */
	int service;

	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
	int budget;

	/* weight of the queue */
	int weight;
	/* next weight if a change is in progress */
	int new_weight;

	/* original weight, used to implement weight boosting */
	int orig_weight;

	/* parent entity, for hierarchical scheduling */
	struct bfq_entity *parent;

	/*
	 * For non-leaf nodes in the hierarchy, the associated
	 * scheduler queue, %NULL on leaf nodes.
	 */
	struct bfq_sched_data *my_sched_data;
	/* the scheduler queue this entity belongs to */
	struct bfq_sched_data *sched_data;

	/* flag, set to request a weight, ioprio or ioprio_class change */
	int prio_changed;
};
/**
 * struct bfq_ttime - per process thinktime stats.
 */
struct bfq_ttime {
	/* completion time of the last request */
	u64 last_end_request;

	/* total process thinktime */
	u64 ttime_total;
	/* number of thinktime samples */
	unsigned long ttime_samples;
	/* average process thinktime */
	u64 ttime_mean;
};
/**
 * struct bfq_queue - leaf schedulable entity.
 *
 * A bfq_queue is a leaf request queue; it can be associated with one
 * or more io_contexts, if it is async or shared between cooperating
 * processes. @cgroup holds a reference to the cgroup, to be sure that it
 * does not disappear while a bfqq still references it (mostly to avoid
 * races between request issuing and task migration followed by cgroup
 * destruction).
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_queue {
	/* reference counter */
	int ref;
	/* parent bfq_data */
	struct bfq_data *bfqd;

	/* current ioprio and ioprio class */
	unsigned short ioprio, ioprio_class;
	/* next ioprio and ioprio class if a change is in progress */
	unsigned short new_ioprio, new_ioprio_class;

	/*
	 * Shared bfq_queue if queue is cooperating with one or more
	 * other queues.
	 */
	struct bfq_queue *new_bfqq;
	/* request-position tree member (see bfq_group's @rq_pos_tree) */
	struct rb_node pos_node;
	/* request-position tree root (see bfq_group's @rq_pos_tree) */
	struct rb_root *pos_root;

	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* number of sync and async requests queued */
	int queued[2];
	/* number of requests currently allocated */
	int allocated;
	/* number of pending metadata requests */
	int meta_pending;
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* entity representing this queue in the scheduler */
	struct bfq_entity entity;

	/* maximum budget allowed from the feedback mechanism */
	int max_budget;
	/* budget expiration (in jiffies) */
	unsigned long budget_timeout;

	/* number of requests on the dispatch list or inside driver */
	int dispatched;

	/* status flags */
	unsigned long flags;

	/* node for active/idle bfqq list inside parent bfqd */
	struct list_head bfqq_list;

	/* associated @bfq_ttime struct */
	struct bfq_ttime ttime;

	/* bit vector: a 1 for each seeky request in history */
	u32 seek_history;
	/* position of the last request enqueued */
	sector_t last_request_pos;

	/*
	 * Number of consecutive pairs of request completion and
	 * arrival, such that the queue becomes idle after the
	 * completion, but the next request arrives within an idle
	 * time slice; used only if the queue's IO_bound flag has been
	 * cleared.
	 */
	unsigned int requests_within_timer;

	/* pid of the process owning the queue, used for logging purposes */
	pid_t pid;

	/*
	 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
	 * if the queue is shared.
	 */
	struct bfq_io_cq *bic;

	/* current maximum weight-raising time for this queue */
	unsigned long wr_cur_max_time;
	/*
	 * Minimum time instant such that, only if a new request is
	 * enqueued after this time instant in an idle @bfq_queue with
	 * no outstanding requests, then the task associated with the
	 * queue is deemed as soft real-time (see the comments on
	 * the function bfq_bfqq_softrt_next_start())
	 */
	unsigned long soft_rt_next_start;
	/*
	 * Start time of the current weight-raising period if
	 * the @bfq-queue is being weight-raised, otherwise
	 * finish time of the last weight-raising period.
	 */
	unsigned long last_wr_start_finish;
	/* factor by which the weight of this queue is multiplied */
	unsigned int wr_coeff;
	/*
	 * Time of the last transition of the @bfq_queue from idle to
	 * backlogged.
	 */
	unsigned long last_idle_bklogged;
	/*
	 * Cumulative service received from the @bfq_queue since the
	 * last transition from idle to backlogged.
	 */
	unsigned long service_from_backlogged;
	/*
	 * Value of wr start time when switching to soft rt
	 */
	unsigned long wr_start_at_switch_to_srt;

	unsigned long split_time; /* time of last split */
};
/**
 * struct bfq_io_cq - per (request_queue, io_context) structure.
 */
struct bfq_io_cq {
	/* associated io_cq structure */
	struct io_cq icq; /* must be the first member */
	/* array of two process queues, the sync and the async */
	struct bfq_queue *bfqq[2];
	/* per (request_queue, blkcg) ioprio */
	int ioprio;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	uint64_t blkcg_serial_nr; /* the current blkcg serial */
#endif

	/*
	 * Snapshot of the idle window before merging; taken to
	 * remember this value while the queue is merged, so as to be
	 * able to restore it in case of split.
	 */
	bool saved_idle_window;
	/*
	 * Same purpose as the previous field, for the I/O bound
	 * classification of a queue.
	 */
	bool saved_IO_bound;

	/*
	 * Similar to previous fields: save wr information.
	 */
	unsigned long saved_wr_coeff;
	unsigned long saved_last_wr_start_finish;
	unsigned long saved_wr_start_at_switch_to_srt;
	unsigned int saved_wr_cur_max_time;
	struct bfq_ttime saved_ttime;
};
enum bfq_device_speed {
	BFQ_BFQD_FAST,
	BFQ_BFQD_SLOW,
};
/**
 * struct bfq_data - per-device data structure.
 *
 * All the fields are protected by @lock.
 */
struct bfq_data {
	/* device request queue */
	struct request_queue *queue;
	struct list_head dispatch;

	/* root bfq_group for the device */
	struct bfq_group *root_group;

	/*
	 * Number of bfq_queues containing requests (including the
	 * queue in service, even if it is idling).
	 */
	int busy_queues;
	/* number of weight-raised busy @bfq_queues */
	int wr_busy_queues;
	/* number of queued requests */
	int queued;
	/* number of requests dispatched and waiting for completion */
	int rq_in_driver;

	/*
	 * Maximum number of requests in driver in the last
	 * @hw_tag_samples completed requests.
	 */
	int max_rq_in_driver;
	/* number of samples used to calculate hw_tag */
	int hw_tag_samples;
	/* flag set to one if the driver is showing a queueing behavior */
	int hw_tag;

	/* number of budgets assigned */
	int budgets_assigned;

	/*
	 * Timer set when idling (waiting) for the next request from
	 * the queue in service.
	 */
	struct hrtimer idle_slice_timer;

	/* bfq_queue in service */
	struct bfq_queue *in_service_queue;
	/* bfq_io_cq (bic) associated with the @in_service_queue */
	struct bfq_io_cq *in_service_bic;

	/* on-disk position of the last served request */
	sector_t last_position;

	/* time of last request completion (ns) */
	u64 last_completion;

	/* time of first rq dispatch in current observation interval (ns) */
	u64 first_dispatch;
	/* time of last rq dispatch in current observation interval (ns) */
	u64 last_dispatch;

	/* beginning of the last budget */
	ktime_t last_budget_start;
	/* beginning of the last idle slice */
	ktime_t last_idling_start;

	/* number of samples in current observation interval */
	int peak_rate_samples;
	/* num of samples of seq dispatches in current observation interval */
	u32 sequential_samples;
	/* total num of sectors transferred in current observation interval */
	u64 tot_sectors_dispatched;
	/* max rq size seen during current observation interval (sectors) */
	u32 last_rq_max_size;
	/* time elapsed from first dispatch in current observ. interval (us) */
	u64 delta_from_first;
	/*
	 * Current estimate of the device peak rate, measured in
	 * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
	 * BFQ_RATE_SHIFT is performed to increase precision in
	 * fixed-point calculations.
	 */
	u32 peak_rate;

	/* maximum budget allotted to a bfq_queue before rescheduling */
	int bfq_max_budget;

	/* list of all the bfq_queues active on the device */
	struct list_head active_list;
	/* list of all the bfq_queues idle on the device */
	struct list_head idle_list;

	/*
	 * Timeout for async/sync requests; when it fires, requests
	 * are served in fifo order.
	 */
	u64 bfq_fifo_expire[2];
	/* weight of backward seeks wrt forward ones */
	unsigned int bfq_back_penalty;
	/* maximum allowed backward seek */
	unsigned int bfq_back_max;
	/* maximum idling time */
	u64 bfq_slice_idle;

	/* user-configured max budget value (0 for auto-tuning) */
	int bfq_user_max_budget;
	/*
	 * Timeout for bfq_queues to consume their budget; used to
	 * prevent seeky queues from imposing long latencies on
	 * sequential or quasi-sequential ones (this also implies that
	 * seeky queues cannot receive guarantees in the service
	 * domain; after a timeout they are charged for the time they
	 * have been in service, to preserve fairness among them, but
	 * without service-domain guarantees).
	 */
	unsigned int bfq_timeout;

	/*
	 * Number of consecutive requests that must be issued within
	 * the idle time slice to re-enable idling for a queue that
	 * was marked as non-I/O-bound (see the definition of the
	 * IO_bound flag for further details).
	 */
	unsigned int bfq_requests_within_timer;

	/*
	 * Force device idling whenever needed to provide accurate
	 * service guarantees, without caring about throughput
	 * issues. CAVEAT: this may even increase latencies, in case
	 * of useless idling for processes that did stop doing I/O.
	 */
	bool strict_guarantees;

	/* if set to true, low-latency heuristics are enabled */
	bool low_latency;
	/*
	 * Maximum factor by which the weight of a weight-raised queue
	 * is multiplied.
	 */
	unsigned int bfq_wr_coeff;
	/* maximum duration of a weight-raising period (jiffies) */
	unsigned int bfq_wr_max_time;

	/* Maximum weight-raising duration for soft real-time processes */
	unsigned int bfq_wr_rt_max_time;
	/*
	 * Minimum idle period after which weight-raising may be
	 * reactivated for a queue (in jiffies).
	 */
	unsigned int bfq_wr_min_idle_time;
	/*
	 * Minimum period between request arrivals after which
	 * weight-raising may be reactivated for an already busy async
	 * queue (in jiffies).
	 */
	unsigned long bfq_wr_min_inter_arr_async;

	/* Max service-rate for a soft real-time queue, in sectors/sec */
	unsigned int bfq_wr_max_softrt_rate;
	/*
	 * Cached value of the product R*T, used for computing the
	 * maximum duration of weight raising automatically.
	 */
	u64 RT_prod;
	/* device-speed class for the low-latency heuristic */
	enum bfq_device_speed device_speed;

	/* fallback dummy bfqq for extreme OOM conditions */
	struct bfq_queue oom_bfqq;

	spinlock_t lock;

	/*
	 * bic associated with the task issuing current bio for
	 * merging. This and the next field are used as a support to
	 * be able to perform the bic lookup, needed by bio-merge
	 * functions, before the scheduler lock is taken, and thus
	 * avoid taking the request-queue lock while the scheduler
	 * lock is being held.
	 */
	struct bfq_io_cq *bio_bic;
	/* bfqq associated with the task issuing current bio for merging */
	struct bfq_queue *bio_bfqq;

	/*
	 * io context to put right after bfqd->lock is released. This
	 * field is used to defer put_io_context, when needed, until
	 * after the scheduler lock has been released, and thus
	 * prevent an ioc->lock from possibly being taken while the
	 * scheduler lock is being held.
	 */
	struct io_context *ioc_to_put;
};
enum bfqq_state_flags {
	BFQQF_busy = 0,		/* has requests or is in service */
	BFQQF_wait_request,	/* waiting for a request */
	BFQQF_non_blocking_wait_rq, /*
				     * waiting for a request
				     * without idling the device
				     */
	BFQQF_fifo_expire,	/* FIFO checked in this slice */
	BFQQF_idle_window,	/* slice idling enabled */
	BFQQF_sync,		/* synchronous queue */
	BFQQF_IO_bound,		/*
				 * bfqq has timed-out at least once
				 * having consumed at most 2/10 of
				 * its budget
				 */
	BFQQF_softrt_update,	/*
				 * may need softrt-next-start
				 * update
				 */
	BFQQF_coop,		/* bfqq is shared */
	BFQQF_split_coop	/* shared bfqq will be split */
};
#define BFQ_BFQQ_FNS(name)						\
static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__set_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__clear_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
{									\
	return test_bit(BFQQF_##name, &(bfqq)->flags);			\
}

BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(non_blocking_wait_rq);
BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(idle_window);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(IO_bound);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
#undef BFQ_BFQQ_FNS
/* Logging facilities. */
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
			  __pbuf, ##args);				\
} while (0)

#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
} while (0)

#else /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\
			bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)

#endif /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log(bfqd, fmt, args...) \
	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
/* Expiration reasons. */
enum bfqq_expiration {
	BFQQE_TOO_IDLE = 0,		/*
					 * queue has been idling for
					 * too long
					 */
	BFQQE_BUDGET_TIMEOUT,	/* budget took too long to be used */
	BFQQE_BUDGET_EXHAUSTED,	/* budget consumed */
	BFQQE_NO_MORE_REQUESTS,	/* the queue has no more requests */
	BFQQE_PREEMPTED		/* preemption in progress */
};
struct bfqg_stats {
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	/* number of ios merged */
	struct blkg_rwstat merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat queued;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	uint64_t start_group_wait_time;
	uint64_t start_idle_time;
	uint64_t start_empty_time;
	uint16_t flags;
#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED

/*
 * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
 *
 * @pd: &blkcg_policy_data that this structure inherits
 * @weight: weight of the bfq_group
 */
struct bfq_group_data {
	/* must be the first member */
	struct blkcg_policy_data pd;

	unsigned int weight;
};
/**
 * struct bfq_group - per (device, cgroup) data structure.
 * @entity: schedulable entity to insert into the parent group sched_data.
 * @sched_data: own sched_data, to contain child entities (they may be
 *              both bfq_queues and bfq_groups).
 * @bfqd: the bfq_data for the device this group acts upon.
 * @async_bfqq: array of async queues for all the tasks belonging to
 *              the group, one queue per ioprio value per ioprio_class,
 *              except for the idle class that has only one queue.
 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
 *             to avoid too many special cases during group creation/
 *             migration.
 * @stats: stats for this bfqg.
 * @rq_pos_tree: rbtree sorted by next_request position, used when
 *               determining if two or more queues have interleaving
 *               requests (see bfq_find_close_cooperator()).
 *
 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
 * there is a set of bfq_groups, each one collecting the lower-level
 * entities belonging to the group that are acting on the same device.
 *
 * Locking works as follows:
 *    o @bfqd is protected by the queue lock, RCU is used to access it
 *      from the readers.
 *    o All the other fields are protected by the @bfqd queue lock.
 */
struct bfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	struct bfq_entity entity;
	struct bfq_sched_data sched_data;

	void *bfqd;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct bfq_entity *my_entity;

	struct rb_root rq_pos_tree;

	struct bfqg_stats stats;
};
#else /* CONFIG_BFQ_GROUP_IOSCHED */

struct bfq_group {
	struct bfq_sched_data sched_data;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct rb_root rq_pos_tree;
};
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);

static unsigned int bfq_class_idx(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	return bfqq ? bfqq->ioprio_class - 1 :
		BFQ_DEFAULT_GRP_CLASS - 1;
}
static struct bfq_service_tree *
bfq_entity_service_tree(struct bfq_entity *entity)
{
	struct bfq_sched_data *sched_data = entity->sched_data;
	unsigned int idx = bfq_class_idx(entity);

	return sched_data->service_tree + idx;
}
static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	return bic->bfqq[is_sync];
}
static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
			 bool is_sync)
{
	bic->bfqq[is_sync] = bfqq;
}
static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
{
	return bic->icq.q->elevator->elevator_data;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED

static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
{
	struct bfq_entity *group_entity = bfqq->entity.parent;

	if (!group_entity)
		group_entity = &bfqq->bfqd->root_group->entity;

	return container_of(group_entity, struct bfq_group, entity);
}
#else /* CONFIG_BFQ_GROUP_IOSCHED */

static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
{
	return bfqq->bfqd->root_group;
}

#endif /* CONFIG_BFQ_GROUP_IOSCHED */
static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
static void bfq_put_queue(struct bfq_queue *bfqq);
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic);
static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
				    struct bfq_group *bfqg);
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
/* Expiration time of sync (0) and async (1) requests, in ns. */
static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };

/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
static const int bfq_back_max = 16 * 1024;

/* Penalty of a backwards seek, in number of sectors. */
static const int bfq_back_penalty = 2;

/* Idling period duration, in ns. */
static u64 bfq_slice_idle = NSEC_PER_SEC / 125;

/* Minimum number of assigned budgets for which stats are safe to compute. */
static const int bfq_stats_min_budgets = 194;

/* Default maximum budget values, in sectors and number of requests. */
static const int bfq_default_max_budget = 16 * 1024;

/*
 * Async to sync throughput distribution is controlled as follows:
 * when an async request is served, the entity is charged the number
 * of sectors of the request, multiplied by the factor below
 */
static const int bfq_async_charge_factor = 10;

/* Default timeout values, in jiffies, approximating CFQ defaults. */
static const int bfq_timeout = HZ / 8;

static struct kmem_cache *bfq_pool;
/* Below this threshold (in ns), we consider thinktime immediate. */
#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)

/* hw_tag detection: parallel requests threshold and min samples needed. */
#define BFQ_HW_QUEUE_THRESHOLD	4
#define BFQ_HW_QUEUE_SAMPLES	32

#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)
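/*
 * Illustrative note: seek_history (see struct bfq_queue above) records
 * a 1 for each seeky request among the last 32 sampled ones, so
 * hweight32() counts those samples and BFQQ_SEEKY() flags a queue as
 * seeky as soon as more than 32/8 = 4 of them were seeky.
 */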
/* Min number of samples required to perform peak-rate update */
#define BFQ_RATE_MIN_SAMPLES	32
/* Min observation time interval required to perform a peak-rate update (ns) */
#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
/* Target observation time interval for a peak-rate update (ns) */
#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC

/* Shift used for peak-rate fixed-precision calculations. */
#define BFQ_RATE_SHIFT		16
/*
 * By default, BFQ computes the duration of the weight raising for
 * interactive applications automatically, using the following formula:
 * duration = (R / r) * T, where r is the peak rate of the device, and
 * R and T are two reference parameters.
 * In particular, R is the peak rate of the reference device (see below),
 * and T is a reference time: given the systems that are likely to be
 * installed on the reference device according to its speed class, T is
 * about the maximum time needed, under BFQ and while reading two files in
 * parallel, to load typical large applications on these systems.
 * In practice, the slower/faster the device at hand is, the more/less it
 * takes to load applications with respect to the reference device.
 * Accordingly, the longer/shorter BFQ grants weight raising to interactive
 * applications.
 *
 * BFQ uses four different reference pairs (R, T), depending on:
 * . whether the device is rotational or non-rotational;
 * . whether the device is slow, such as old or portable HDDs, as well as
 *   SD cards, or fast, such as newer HDDs and SSDs.
 *
 * The device's speed class is dynamically (re)detected in
 * bfq_update_peak_rate() every time the estimated peak rate is updated.
 *
 * In the following definitions, R_slow[0]/R_fast[0] and
 * T_slow[0]/T_fast[0] are the reference values for a slow/fast
 * rotational device, whereas R_slow[1]/R_fast[1] and
 * T_slow[1]/T_fast[1] are the reference values for a slow/fast
 * non-rotational device. Finally, device_speed_thresh are the
 * thresholds used to switch between speed classes. The reference
 * rates are not the actual peak rates of the devices used as a
 * reference, but slightly lower values. The reason for using these
 * slightly lower values is that the peak-rate estimator tends to
 * yield slightly lower values than the actual peak rate (it can yield
 * the actual peak rate only if there is only one process doing I/O,
 * and the process does sequential I/O).
 *
 * Both the reference peak rates and the thresholds are measured in
 * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
 */
static int R_slow[2] = {1000, 10700};
static int R_fast[2] = {14000, 33000};
/*
 * To improve readability, a conversion function is used to initialize the
 * following arrays, which entails that they can be initialized only in a
 * function.
 */
static int T_slow[2];
static int T_fast[2];
static int device_speed_thresh[2];
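
/*
 * Purely illustrative sketch (not a helper defined by this file) of
 * how the formula "duration = (R / r) * T" described above combines
 * the cached product R*T (bfqd->RT_prod, see struct bfq_data) with
 * the estimated peak rate r (bfqd->peak_rate): a device twice as fast
 * as the reference one ends up with half the reference duration.
 */
static inline unsigned long bfq_illustrative_wr_duration(struct bfq_data *bfqd)
{
	u64 dur = bfqd->RT_prod;	/* R * T for the current speed class */

	do_div(dur, bfqd->peak_rate);	/* divide by the peak rate r */
	return dur;			/* resulting weight-raising duration */
}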
#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
/**
 * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
 * @icq: the iocontext queue.
 */
static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
{
	/* bic->icq is the first member, %NULL will convert to %NULL */
	return container_of(icq, struct bfq_io_cq, icq);
}
/**
 * bfq_bic_lookup - search into @ioc a bic associated with @bfqd.
 * @bfqd: the lookup key.
 * @ioc: the io_context of the process doing I/O.
 * @q: the request queue.
 */
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
					struct io_context *ioc,
					struct request_queue *q)
{
	if (ioc) {
		unsigned long flags;
		struct bfq_io_cq *icq;

		spin_lock_irqsave(q->queue_lock, flags);
		icq = icq_to_bic(ioc_lookup_icq(ioc, q));
		spin_unlock_irqrestore(q->queue_lock, flags);

		return icq;
	}

	return NULL;
}
/*
 * Scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing.
 */
static void bfq_schedule_dispatch(struct bfq_data *bfqd)
{
	if (bfqd->queued != 0) {
		bfq_log(bfqd, "schedule dispatch");
		blk_mq_run_hw_queues(bfqd->queue, true);
	}
}
/*
 * The next two functions release bfqd->lock and put the io context
 * pointed to by bfqd->ioc_to_put. This delayed put is used to avoid
 * taking an ioc->lock while the scheduler lock is being held.
 */
static void bfq_unlock_put_ioc(struct bfq_data *bfqd)
{
	struct io_context *ioc_to_put = bfqd->ioc_to_put;

	bfqd->ioc_to_put = NULL;
	spin_unlock_irq(&bfqd->lock);

	if (ioc_to_put)
		put_io_context(ioc_to_put);
}

static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd,
				       unsigned long flags)
{
	struct io_context *ioc_to_put = bfqd->ioc_to_put;

	bfqd->ioc_to_put = NULL;
	spin_unlock_irqrestore(&bfqd->lock, flags);

	if (ioc_to_put)
		put_io_context(ioc_to_put);
}
/**
 * bfq_gt - compare two timestamps.
 * @a: first ts.
 * @b: second ts.
 *
 * Return @a > @b, dealing with wrapping correctly.
 */
static int bfq_gt(u64 a, u64 b)
{
	return (s64)(a - b) > 0;
}
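
/*
 * Illustrative note: the subtraction above is performed modulo 2^64
 * and the result reinterpreted as a signed value, so the comparison
 * keeps working across timestamp wraparounds; e.g., bfq_gt(2, ULLONG_MAX)
 * is true, because 2 - ULLONG_MAX is 3 as a u64, which is positive
 * when seen as an s64.
 */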
static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
{
	struct rb_node *node = tree->rb_node;

	return rb_entry(node, struct bfq_entity, rb_node);
}
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
/*
 * bfq_update_next_in_service - update sd->next_in_service
 * @sd: sched_data for which to perform the update.
 * @new_entity: if not NULL, pointer to the entity whose activation,
 *		requeueing or repositioning triggered the invocation of
 *		this function.
 *
 * This function is called to update sd->next_in_service, which, in
 * its turn, may change as a consequence of the insertion or
 * extraction of an entity into/from one of the active trees of
 * sd. These insertions/extractions occur as a consequence of
 * activations/deactivations of entities, with some activations being
 * 'true' activations, and other activations being requeueings (i.e.,
 * implementing the second, requeueing phase of the mechanism used to
 * reposition an entity in its active tree; see comments on
 * __bfq_activate_entity and __bfq_requeue_entity for details). In
 * both the last two activation sub-cases, new_entity points to the
 * just activated or requeued entity.
 *
 * Returns true if sd->next_in_service changes in such a way that
 * entity->parent may become the next_in_service for its parent
 * entity.
 */
static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
				       struct bfq_entity *new_entity)
{
	struct bfq_entity *next_in_service = sd->next_in_service;
	bool parent_sched_may_change = false;

	/*
	 * If this update is triggered by the activation, requeueing
	 * or repositioning of an entity that does not coincide with
	 * sd->next_in_service, then a full lookup in the active tree
	 * can be avoided. In fact, it is enough to check whether the
	 * just-modified entity has a higher priority than
	 * sd->next_in_service, or, even if it has the same priority
	 * as sd->next_in_service, is eligible and has a lower virtual
	 * finish time than sd->next_in_service. If this compound
	 * condition holds, then the new entity becomes the new
	 * next_in_service. Otherwise no change is needed.
	 */
	if (new_entity && new_entity != sd->next_in_service) {
		/*
		 * Flag used to decide whether to replace
		 * sd->next_in_service with new_entity. Tentatively
		 * set to true, and left as true if
		 * sd->next_in_service is NULL.
		 */
		bool replace_next = true;

		/*
		 * If there is already a next_in_service candidate
		 * entity, then compare class priorities or timestamps
		 * to decide whether to replace sd->next_in_service
		 * with new_entity.
		 */
		if (next_in_service) {
			unsigned int new_entity_class_idx =
				bfq_class_idx(new_entity);
			struct bfq_service_tree *st =
				sd->service_tree + new_entity_class_idx;

			/*
			 * For efficiency, evaluate the most likely
			 * sub-condition first.
			 */
			replace_next =
				(new_entity_class_idx ==
				 bfq_class_idx(next_in_service)
				 &&
				 !bfq_gt(new_entity->start, st->vtime)
				 &&
				 bfq_gt(next_in_service->finish,
					new_entity->finish))
				||
				new_entity_class_idx <
				bfq_class_idx(next_in_service);
		}

		if (replace_next)
			next_in_service = new_entity;
	} else /* invoked because of a deactivation: lookup needed */
		next_in_service = bfq_lookup_next_entity(sd);

	if (next_in_service) {
		parent_sched_may_change = !sd->next_in_service ||
			bfq_update_parent_budget(next_in_service);
	}

	sd->next_in_service = next_in_service;

	if (!next_in_service)
		return parent_sched_may_change;

	return parent_sched_may_change;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED
/* both next loops stop at one of the child entities of the root group */
#define for_each_entity(entity)	\
	for (; entity ; entity = entity->parent)

/*
 * For each iteration, compute parent in advance, so as to be safe if
 * entity is deallocated during the iteration. Such a deallocation may
 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
 * containing entity.
 */
#define for_each_entity_safe(entity, parent) \
	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
/*
 * Returns true if this budget change may let next_in_service->parent
 * become the next_in_service entity for its parent entity.
 */
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	struct bfq_entity *bfqg_entity;
	struct bfq_group *bfqg;
	struct bfq_sched_data *group_sd;
	bool ret = false;

	group_sd = next_in_service->sched_data;

	bfqg = container_of(group_sd, struct bfq_group, sched_data);
	/*
	 * bfq_group's my_entity field is not NULL only if the group
	 * is not the root group. We must not touch the root entity
	 * as it must never become an in-service entity.
	 */
	bfqg_entity = bfqg->my_entity;
	if (bfqg_entity) {
		if (bfqg_entity->budget > next_in_service->budget)
			ret = true;
		bfqg_entity->budget = next_in_service->budget;
	}

	return ret;
}
/*
 * This function tells whether entity stops being a candidate for next
 * service, according to the following logic.
 *
 * This function is invoked for an entity that is about to be set in
 * service. If such an entity is a queue, then the entity is no longer
 * a candidate for next service (i.e., a candidate entity to serve
 * after the in-service entity is expired). The function then returns
 * true.
 */
static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	if (bfq_entity_to_bfqq(entity))
		return true;

	return false;
}
#else /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * The next two macros are fake loops when cgroups support is not
 * enabled. In fact, in such a case, there is only one level to go up
 * (to reach the root group).
 */
#define for_each_entity(entity)	\
	for (; entity ; entity = NULL)

#define for_each_entity_safe(entity, parent) \
	for (parent = NULL; entity ; entity = parent)

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	return false;
}

static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	return true;
}

#endif /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Shift for timestamp calculations. This actually limits the maximum
 * service allowed in one timestamp delta (small shift values increase it),
 * the maximum total weight that can be used for the queues in the system
 * (big shift values increase it), and the period of virtual time
 * wraparounds.
 */
#define WFQ_SERVICE_SHIFT	22
static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = NULL;

	if (!entity->my_sched_data)
		bfqq = container_of(entity, struct bfq_queue, entity);

	return bfqq;
}
/**
 * bfq_delta - map service into the virtual time domain.
 * @service: amount of service.
 * @weight: scale factor (weight of an entity or weight sum).
 */
static u64 bfq_delta(unsigned long service, unsigned long weight)
{
	u64 d = (u64)service << WFQ_SERVICE_SHIFT;

	do_div(d, weight);
	return d;
}
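
/*
 * Illustrative note: by the formula above, delta = service * 2^22 /
 * weight, so, for the same amount of service, an entity with twice
 * the weight of another sees its timestamps advance by half as much;
 * this is how the weighted fair sharing of B-WF2Q+ is obtained.
 */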
/**
 * bfq_calc_finish - assign the finish time to an entity.
 * @entity: the entity to act upon.
 * @service: the service to be charged to the entity.
 */
static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->finish = entity->start +
		bfq_delta(service, entity->weight);

	if (bfqq) {
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: serv %lu, w %d",
			service, entity->weight);
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: start %llu, finish %llu, delta %llu",
			entity->start, entity->finish,
			bfq_delta(service, entity->weight));
	}
}
/**
 * bfq_entity_of - get an entity from a node.
 * @node: the node field of the entity.
 *
 * Convert a node pointer to the relative entity. This is used only
 * to simplify the logic of some functions and not as the generic
 * conversion mechanism because, e.g., in the tree walking functions,
 * the check for a %NULL value would be redundant.
 */
static struct bfq_entity *bfq_entity_of(struct rb_node *node)
{
	struct bfq_entity *entity = NULL;

	if (node)
		entity = rb_entry(node, struct bfq_entity, rb_node);

	return entity;
}
/**
 * bfq_extract - remove an entity from a tree.
 * @root: the tree root.
 * @entity: the entity to remove.
 */
static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
{
	entity->tree = NULL;
	rb_erase(&entity->rb_node, root);
}
/**
 * bfq_idle_extract - extract an entity from the idle tree.
 * @st: the service tree of the owning @entity.
 * @entity: the entity being removed.
 */
static void bfq_idle_extract(struct bfq_service_tree *st,
			     struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *next;

	if (entity == st->first_idle) {
		next = rb_next(&entity->rb_node);
		st->first_idle = bfq_entity_of(next);
	}

	if (entity == st->last_idle) {
		next = rb_prev(&entity->rb_node);
		st->last_idle = bfq_entity_of(next);
	}

	bfq_extract(&st->idle, entity);

	if (bfqq)
		list_del(&bfqq->bfqq_list);
}
/**
 * bfq_insert - generic tree insertion.
 * @root: the tree root.
 * @entity: entity to insert.
 *
 * This is used for the idle and the active tree, since they are both
 * ordered by finish time.
 */
static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
{
	struct bfq_entity *entry;
	struct rb_node **node = &root->rb_node;
	struct rb_node *parent = NULL;

	while (*node) {
		parent = *node;
		entry = rb_entry(parent, struct bfq_entity, rb_node);

		if (bfq_gt(entry->finish, entity->finish))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}

	rb_link_node(&entity->rb_node, parent, node);
	rb_insert_color(&entity->rb_node, root);

	entity->tree = root;
}
/**
 * bfq_update_min - update the min_start field of an entity.
 * @entity: the entity to update.
 * @node: one of its children.
 *
 * This function is called when @entity may store an invalid value for
 * min_start due to updates to the active tree. The function assumes
 * that the subtree rooted at @node (which may be its left or its right
 * child) has a valid min_start value.
 */
static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
{
	struct bfq_entity *child;

	if (node) {
		child = rb_entry(node, struct bfq_entity, rb_node);
		if (bfq_gt(entity->min_start, child->min_start))
			entity->min_start = child->min_start;
	}
}
/**
 * bfq_update_active_node - recalculate min_start.
 * @node: the node to update.
 *
 * @node may have changed position or one of its children may have moved;
 * this function updates its min_start value. The left and right subtrees
 * are assumed to hold a correct min_start value.
 */
static void bfq_update_active_node(struct rb_node *node)
{
	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);

	entity->min_start = entity->start;
	bfq_update_min(entity, node->rb_right);
	bfq_update_min(entity, node->rb_left);
}
/**
 * bfq_update_active_tree - update min_start for the whole active tree.
 * @node: the starting node.
 *
 * @node must be the deepest modified node after an update. This function
 * updates its min_start using the values held by its children, assuming
 * that they did not change, and then updates all the nodes that may have
 * changed in the path to the root. The only nodes that may have changed
 * are the ones in the path or their siblings.
 */
static void bfq_update_active_tree(struct rb_node *node)
{
	struct rb_node *parent;

up:
	bfq_update_active_node(node);

	parent = rb_parent(node);
	if (!parent)
		return;

	if (node == parent->rb_left && parent->rb_right)
		bfq_update_active_node(parent->rb_right);
	else if (parent->rb_left)
		bfq_update_active_node(parent->rb_left);

	node = parent;
	goto up;
}
/**
 * bfq_active_insert - insert an entity in the active tree of its
 *                     group/device.
 * @st: the service tree of the entity.
 * @entity: the entity being inserted.
 *
 * The active tree is ordered by finish time, but an extra key is kept
 * per each node, containing the minimum value for the start times of
 * its children (and the node itself), so it's possible to search for
 * the eligible node with the lowest finish time in logarithmic time.
 */
static void bfq_active_insert(struct bfq_service_tree *st,
			      struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node = &entity->rb_node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	bfq_insert(&st->active, entity);

	if (node->rb_left)
		node = node->rb_left;
	else if (node->rb_right)
		node = node->rb_right;

	bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
}
/**
 * bfq_ioprio_to_weight - calc a weight from an ioprio.
 * @ioprio: the ioprio value to convert.
 */
static unsigned short bfq_ioprio_to_weight(int ioprio)
{
	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
}
/**
 * bfq_weight_to_ioprio - calc an ioprio from a weight.
 * @weight: the weight value to convert.
 *
 * To preserve as much as possible the old only-ioprio user interface,
 * 0 is used as an escape ioprio value for weights (numerically) equal or
 * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
 */
static unsigned short bfq_weight_to_ioprio(int weight)
{
	return max_t(int, 0,
		     IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
}
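
/*
 * Illustrative note (assuming IOPRIO_BE_NR == 8 and the conversion
 * coefficient of 10 defined above): ioprio 0 maps to weight 80, the
 * default ioprio 4 to weight 40, and ioprio 7 to weight 10; in the
 * opposite direction, any weight >= 80 maps back to the escape
 * ioprio value 0.
 */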
static void bfq_get_entity(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	if (bfqq) {
		bfqq->ref++;
		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
			     bfqq, bfqq->ref);
	}
}
/**
 * bfq_find_deepest - find the deepest node that an extraction can modify.
 * @node: the node being removed.
 *
 * Do the first step of an extraction in an rb tree, looking for the
 * node that will replace @node, and returning the deepest node that
 * the following modifications to the tree can touch. If @node is the
 * last node in the tree return %NULL.
 */
static struct rb_node *bfq_find_deepest(struct rb_node *node)
{
	struct rb_node *deepest;

	if (!node->rb_right && !node->rb_left)
		deepest = rb_parent(node);
	else if (!node->rb_right)
		deepest = node->rb_left;
	else if (!node->rb_left)
		deepest = node->rb_right;
	else {
		deepest = rb_next(node);
		if (deepest->rb_right)
			deepest = deepest->rb_right;
		else if (rb_parent(deepest) != node)
			deepest = rb_parent(deepest);
	}

	return deepest;
}
/**
 * bfq_active_extract - remove an entity from the active tree.
 * @st: the service_tree containing the tree.
 * @entity: the entity being removed.
 */
static void bfq_active_extract(struct bfq_service_tree *st,
			       struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	node = bfq_find_deepest(&entity->rb_node);
	bfq_extract(&st->active, entity);

	if (node)
		bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_del(&bfqq->bfqq_list);
}
/**
 * bfq_idle_insert - insert an entity into the idle tree.
 * @st: the service tree containing the tree.
 * @entity: the entity to insert.
 */
static void bfq_idle_insert(struct bfq_service_tree *st,
			    struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
		st->first_idle = entity;
	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
		st->last_idle = entity;

	bfq_insert(&st->idle, entity);

	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
}
/**
 * bfq_forget_entity - do not consider entity any longer for scheduling
 * @st: the service tree.
 * @entity: the entity being removed.
 * @is_in_service: true if entity is currently the in-service entity.
 *
 * Forget everything about @entity. In addition, if entity represents
 * a queue, and the latter is not in service, then release the service
 * reference to the queue (the one taken through bfq_get_entity). In
 * fact, in this case, there is really no more service reference to
 * the queue, as the latter is also outside any service tree. If,
 * instead, the queue is in service, then __bfq_bfqd_reset_in_service
 * will take care of putting the reference when the queue finally
 * stops being served.
 */
static void bfq_forget_entity(struct bfq_service_tree *st,
			      struct bfq_entity *entity,
			      bool is_in_service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->on_st = false;
	st->wsum -= entity->weight;
	if (bfqq && !is_in_service)
		bfq_put_queue(bfqq);
}
/**
 * bfq_put_idle_entity - release the idle tree ref of an entity.
 * @st: service tree for the entity.
 * @entity: the entity being released.
 */
static void bfq_put_idle_entity(struct bfq_service_tree *st,
				struct bfq_entity *entity)
{
	bfq_idle_extract(st, entity);
	bfq_forget_entity(st, entity,
			  entity == entity->sched_data->in_service_entity);
}
/**
 * bfq_forget_idle - update the idle tree if necessary.
 * @st: the service tree to act upon.
 *
 * To preserve the global O(log N) complexity we only remove one entry here;
 * as the idle tree will not grow indefinitely this can be done safely.
 */
static void bfq_forget_idle(struct bfq_service_tree *st)
{
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
	    !bfq_gt(last_idle->finish, st->vtime)) {
		/*
		 * Forget the whole idle tree, increasing the vtime past
		 * the last finish time of idle entities.
		 */
		st->vtime = last_idle->finish;
	}

	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
		bfq_put_idle_entity(st, first_idle);
}
static struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
				struct bfq_entity *entity)
{
	struct bfq_service_tree *new_st = old_st;

	if (entity->prio_changed) {
		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
		unsigned int prev_weight, new_weight;
		struct bfq_data *bfqd = NULL;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		struct bfq_sched_data *sd;
		struct bfq_group *bfqg;
#endif

		if (bfqq)
			bfqd = bfqq->bfqd;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		else {
			sd = entity->my_sched_data;
			bfqg = container_of(sd, struct bfq_group, sched_data);
			bfqd = (struct bfq_data *)bfqg->bfqd;
		}
#endif

		old_st->wsum -= entity->weight;

		if (entity->new_weight != entity->orig_weight) {
			if (entity->new_weight < BFQ_MIN_WEIGHT ||
			    entity->new_weight > BFQ_MAX_WEIGHT) {
				pr_crit("update_weight_prio: new_weight %d\n",
					entity->new_weight);
				if (entity->new_weight < BFQ_MIN_WEIGHT)
					entity->new_weight = BFQ_MIN_WEIGHT;
				else
					entity->new_weight = BFQ_MAX_WEIGHT;
			}
			entity->orig_weight = entity->new_weight;
			if (bfqq)
				bfqq->ioprio =
				  bfq_weight_to_ioprio(entity->orig_weight);
		}

		if (bfqq)
			bfqq->ioprio_class = bfqq->new_ioprio_class;
		entity->prio_changed = 0;

		/*
		 * NOTE: here we may be changing the weight too early,
		 * this will cause unfairness. The correct approach
		 * would have required additional complexity to defer
		 * weight changes to the proper time instants (i.e.,
		 * when entity->finish <= old_st->vtime).
		 */
		new_st = bfq_entity_service_tree(entity);

		prev_weight = entity->weight;
		new_weight = entity->orig_weight *
			     (bfqq ? bfqq->wr_coeff : 1);
		entity->weight = new_weight;

		new_st->wsum += entity->weight;

		if (new_st != old_st)
			entity->start = new_st->vtime;
	}

	return new_st;
}
1795 static struct bfq_group
*bfqq_group(struct bfq_queue
*bfqq
);
/**
 * bfq_bfqq_served - update the scheduler status after selection for
 *		     service.
 * @bfqq: the queue being served.
 * @served: bytes to transfer.
 *
 * NOTE: this can be optimized, as the timestamps of upper level entities
 * are synchronized every time a new bfqq is selected for service. By now,
 * we keep it to better check consistency.
 */
static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
{
	struct bfq_entity *entity = &bfqq->entity;
	struct bfq_service_tree *st;

	for_each_entity(entity) {
		st = bfq_entity_service_tree(entity);

		entity->service += served;

		st->vtime += bfq_delta(served, st->wsum);
		bfq_forget_idle(st);
	}
	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
}
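
/*
 * Illustrative note: if, e.g., 100 sectors are served while the
 * weight sum of a service tree is 300, the vtime of that tree
 * advances by bfq_delta(100, 300) = 100 * 2^22 / 300; the more total
 * weight is backlogged, the more slowly the system virtual time
 * advances for a given amount of service.
 */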
/**
 * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
 *			  of the time interval during which bfqq has been in
 *			  service.
 * @bfqd: the device
 * @bfqq: the queue that needs a service update.
 * @time_ms: the amount of time during which the queue has received service
 *
 * If a queue does not consume its budget fast enough, then providing
 * the queue with service fairness may impair throughput, more or less
 * severely. For this reason, queues that consume their budget slowly
 * are provided with time fairness instead of service fairness. This
 * goal is achieved through the BFQ scheduling engine, even if such an
 * engine works in the service, and not in the time domain. The trick
 * is charging these queues with an inflated amount of service, equal
 * to the amount of service that they would have received during their
 * service slot if they had been fast, i.e., if their requests had
 * been dispatched at a rate equal to the estimated peak rate.
 *
 * It is worth noting that time fairness can cause important
 * distortions in terms of bandwidth distribution, on devices with
 * internal queueing. The reason is that I/O requests dispatched
 * during the service slot of a queue may be served after that service
 * slot is finished, and may have a total processing time loosely
 * correlated with the duration of the service slot. This is
 * especially true for short service slots.
 */
static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
				 unsigned long time_ms)
{
	struct bfq_entity *entity = &bfqq->entity;
	int tot_serv_to_charge = entity->service;
	unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);

	if (time_ms > 0 && time_ms < timeout_ms)
		tot_serv_to_charge =
			(bfqd->bfq_max_budget * time_ms) / timeout_ms;

	if (tot_serv_to_charge < entity->service)
		tot_serv_to_charge = entity->service;

	/* Increase budget to avoid inconsistencies */
	if (tot_serv_to_charge > entity->budget)
		entity->budget = tot_serv_to_charge;

	bfq_bfqq_served(bfqq,
			max_t(int, 0, tot_serv_to_charge - entity->service));
}
1873 static void bfq_update_fin_time_enqueue(struct bfq_entity
*entity
,
1874 struct bfq_service_tree
*st
,
1877 struct bfq_queue
*bfqq
= bfq_entity_to_bfqq(entity
);
1879 st
= __bfq_entity_update_weight_prio(st
, entity
);
1880 bfq_calc_finish(entity
, entity
->budget
);
1883 * If some queues enjoy backshifting for a while, then their
1884 * (virtual) finish timestamps may happen to become lower and
1885 * lower than the system virtual time. In particular, if
1886 * these queues often happen to be idle for short time
1887 * periods, and during such time periods other queues with
1888 * higher timestamps happen to be busy, then the backshifted
1889 * timestamps of the former queues can become much lower than
1890 * the system virtual time. In fact, to serve the queues with
1891 * higher timestamps while the ones with lower timestamps are
1892 * idle, the system virtual time may be pushed-up to much
1893 * higher values than the finish timestamps of the idle
1894 * queues. As a consequence, the finish timestamps of all new
1895 * or newly activated queues may end up being much larger than
1896 * those of lucky queues with backshifted timestamps. The
1897 * latter queues may then monopolize the device for a lot of
1898 * time. This would simply break service guarantees.
1900 * To reduce this problem, push up a little bit the
1901 * backshifted timestamps of the queue associated with this
1902 * entity (only a queue can happen to have the backshifted
1903 * flag set): just enough to let the finish timestamp of the
1904 * queue be equal to the current value of the system virtual
1905 * time. This may introduce a little unfairness among queues
1906 * with backshifted timestamps, but it does not break
1907 * worst-case fairness guarantees.
1909 * As a special case, if bfqq is weight-raised, push up
1910 * timestamps much less, to keep very low the probability that
1911 * this push up causes the backshifted finish timestamps of
1912 * weight-raised queues to become higher than the backshifted
1913 * finish timestamps of non weight-raised queues.
1915 if (backshifted
&& bfq_gt(st
->vtime
, entity
->finish
)) {
1916 unsigned long delta
= st
->vtime
- entity
->finish
;
1919 delta
/= bfqq
->wr_coeff
;
1921 entity
->start
+= delta
;
1922 entity
->finish
+= delta
;
1925 bfq_active_insert(st
, entity
);
1929 * __bfq_activate_entity - handle activation of entity.
1930 * @entity: the entity being activated.
1931 * @non_blocking_wait_rq: true if entity was waiting for a request
1933 * Called for a 'true' activation, i.e., if entity is not active and
1934 * one of its children receives a new request.
1936 * Basically, this function updates the timestamps of entity and
1937 * inserts entity into its active tree, ater possible extracting it
1938 * from its idle tree.
1940 static void __bfq_activate_entity(struct bfq_entity
*entity
,
1941 bool non_blocking_wait_rq
)
1943 struct bfq_service_tree
*st
= bfq_entity_service_tree(entity
);
1944 bool backshifted
= false;
1945 unsigned long long min_vstart
;
1947 /* See comments on bfq_fqq_update_budg_for_activation */
1948 if (non_blocking_wait_rq
&& bfq_gt(st
->vtime
, entity
->finish
)) {
1950 min_vstart
= entity
->finish
;
1952 min_vstart
= st
->vtime
;
1954 if (entity
->tree
== &st
->idle
) {
1956 * Must be on the idle tree, bfq_idle_extract() will
1959 bfq_idle_extract(st
, entity
);
1960 entity
->start
= bfq_gt(min_vstart
, entity
->finish
) ?
1961 min_vstart
: entity
->finish
;
1964 * The finish time of the entity may be invalid, and
1965 * it is in the past for sure, otherwise the queue
1966 * would have been on the idle tree.
1968 entity
->start
= min_vstart
;
1969 st
->wsum
+= entity
->weight
;
1971 * entity is about to be inserted into a service tree,
1972 * and then set in service: get a reference to make
1973 * sure entity does not disappear until it is no
1974 * longer in service or scheduled for service.
1976 bfq_get_entity(entity
);
1978 entity
->on_st
= true;
1981 bfq_update_fin_time_enqueue(entity
, st
, backshifted
);
1985 * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
1986 * @entity: the entity being requeued or repositioned.
1988 * Requeueing is needed if this entity stops being served, which
1989 * happens if a leaf descendant entity has expired. On the other hand,
1990 * repositioning is needed if the next_inservice_entity for the child
1991 * entity has changed. See the comments inside the function for
1994 * Basically, this function: 1) removes entity from its active tree if
1995 * present there, 2) updates the timestamps of entity and 3) inserts
1996 * entity back into its active tree (in the new, right position for
1997 * the new values of the timestamps).
1999 static void __bfq_requeue_entity(struct bfq_entity
*entity
)
2001 struct bfq_sched_data
*sd
= entity
->sched_data
;
2002 struct bfq_service_tree
*st
= bfq_entity_service_tree(entity
);
2004 if (entity
== sd
->in_service_entity
) {
2006 * We are requeueing the current in-service entity,
2007 * which may have to be done for one of the following
2009 * - entity represents the in-service queue, and the
2010 * in-service queue is being requeued after an
2012 * - entity represents a group, and its budget has
2013 * changed because one of its child entities has
2014 * just been either activated or requeued for some
2015 * reason; the timestamps of the entity need then to
2016 * be updated, and the entity needs to be enqueued
2017 * or repositioned accordingly.
2019 * In particular, before requeueing, the start time of
2020 * the entity must be moved forward to account for the
2021 * service that the entity has received while in
2022 * service. This is done by the next instructions. The
2023 * finish time will then be updated according to this
2024 * new value of the start time, and to the budget of
2027 bfq_calc_finish(entity
, entity
->service
);
2028 entity
->start
= entity
->finish
;
2030 * In addition, if the entity had more than one child
2031 * when set in service, then was not extracted from
2032 * the active tree. This implies that the position of
2033 * the entity in the active tree may need to be
2034 * changed now, because we have just updated the start
2035 * time of the entity, and we will update its finish
2036 * time in a moment (the requeueing is then, more
2037 * precisely, a repositioning in this case). To
2038 * implement this repositioning, we: 1) dequeue the
2039 * entity here, 2) update the finish time and
2040 * requeue the entity according to the new
2044 bfq_active_extract(st
, entity
);
2045 } else { /* The entity is already active, and not in service */
2047 * In this case, this function gets called only if the
2048 * next_in_service entity below this entity has
2049 * changed, and this change has caused the budget of
2050 * this entity to change, which, finally implies that
2051 * the finish time of this entity must be
2052 * updated. Such an update may cause the scheduling,
2053 * i.e., the position in the active tree, of this
2054 * entity to change. We handle this change by: 1)
2055 * dequeueing the entity here, 2) updating the finish
2056 * time and requeueing the entity according to the new
2057 * timestamps below. This is the same approach as the
2058 * non-extracted-entity sub-case above.
2060 bfq_active_extract(st
, entity
);
2063 bfq_update_fin_time_enqueue(entity
, st
, false);
2066 static void __bfq_activate_requeue_entity(struct bfq_entity
*entity
,
2067 struct bfq_sched_data
*sd
,
2068 bool non_blocking_wait_rq
)
2070 struct bfq_service_tree
*st
= bfq_entity_service_tree(entity
);
2072 if (sd
->in_service_entity
== entity
|| entity
->tree
== &st
->active
)
2074 * in service or already queued on the active tree,
2075 * requeue or reposition
2077 __bfq_requeue_entity(entity
);
2080 * Not in service and not queued on its active tree:
2081 * the activity is idle and this is a true activation.
2083 __bfq_activate_entity(entity
, non_blocking_wait_rq
);
2088 * bfq_activate_entity - activate or requeue an entity representing a bfq_queue,
2089 * and activate, requeue or reposition all ancestors
2090 * for which such an update becomes necessary.
2091 * @entity: the entity to activate.
2092 * @non_blocking_wait_rq: true if this entity was waiting for a request
2093 * @requeue: true if this is a requeue, which implies that bfqq is
2094 * being expired; thus ALL its ancestors stop being served and must
2095 * therefore be requeued
2097 static void bfq_activate_requeue_entity(struct bfq_entity
*entity
,
2098 bool non_blocking_wait_rq
,
2101 struct bfq_sched_data
*sd
;
2103 for_each_entity(entity
) {
2104 sd
= entity
->sched_data
;
2105 __bfq_activate_requeue_entity(entity
, sd
, non_blocking_wait_rq
);
2107 if (!bfq_update_next_in_service(sd
, entity
) && !requeue
)
2113 * __bfq_deactivate_entity - deactivate an entity from its service tree.
2114 * @entity: the entity to deactivate.
2115 * @ins_into_idle_tree: if false, the entity will not be put into the
2118 * Deactivates an entity, independently from its previous state. Must
2119 * be invoked only if entity is on a service tree. Extracts the entity
2120 * from that tree, and if necessary and allowed, puts it on the idle
2123 static bool __bfq_deactivate_entity(struct bfq_entity
*entity
,
2124 bool ins_into_idle_tree
)
2126 struct bfq_sched_data
*sd
= entity
->sched_data
;
2127 struct bfq_service_tree
*st
= bfq_entity_service_tree(entity
);
2128 int is_in_service
= entity
== sd
->in_service_entity
;
2130 if (!entity
->on_st
) /* entity never activated, or already inactive */
2134 bfq_calc_finish(entity
, entity
->service
);
2136 if (entity
->tree
== &st
->active
)
2137 bfq_active_extract(st
, entity
);
2138 else if (!is_in_service
&& entity
->tree
== &st
->idle
)
2139 bfq_idle_extract(st
, entity
);
2141 if (!ins_into_idle_tree
|| !bfq_gt(entity
->finish
, st
->vtime
))
2142 bfq_forget_entity(st
, entity
, is_in_service
);
2144 bfq_idle_insert(st
, entity
);
2150 * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
2151 * @entity: the entity to deactivate.
2152 * @ins_into_idle_tree: true if the entity can be put on the idle tree
2154 static void bfq_deactivate_entity(struct bfq_entity
*entity
,
2155 bool ins_into_idle_tree
,
2158 struct bfq_sched_data
*sd
;
2159 struct bfq_entity
*parent
= NULL
;
2161 for_each_entity_safe(entity
, parent
) {
2162 sd
= entity
->sched_data
;
2164 if (!__bfq_deactivate_entity(entity
, ins_into_idle_tree
)) {
2166 * entity is not in any tree any more, so
2167 * this deactivation is a no-op, and there is
2168 * nothing to change for upper-level entities
2169 * (in case of expiration, this can never
2175 if (sd
->next_in_service
== entity
)
2177 * entity was the next_in_service entity,
2178 * then, since entity has just been
2179 * deactivated, a new one must be found.
2181 bfq_update_next_in_service(sd
, NULL
);
2183 if (sd
->next_in_service
)
2185 * The parent entity is still backlogged,
2186 * because next_in_service is not NULL. So, no
2187 * further upwards deactivation must be
2188 * performed. Yet, next_in_service has
2189 * changed. Then the schedule does need to be
2195 * If we get here, then the parent is no more
2196 * backlogged and we need to propagate the
2197 * deactivation upwards. Thus let the loop go on.
2201 * Also let parent be queued into the idle tree on
2202 * deactivation, to preserve service guarantees, and
2203 * assuming that who invoked this function does not
2204 * need parent entities too to be removed completely.
2206 ins_into_idle_tree
= true;
2210 * If the deactivation loop is fully executed, then there are
2211 * no more entities to touch and next loop is not executed at
2212 * all. Otherwise, requeue remaining entities if they are
2213 * about to stop receiving service, or reposition them if this
2217 for_each_entity(entity
) {
2219 * Invoke __bfq_requeue_entity on entity, even if
2220 * already active, to requeue/reposition it in the
2221 * active tree (because sd->next_in_service has
2224 __bfq_requeue_entity(entity
);
2226 sd
= entity
->sched_data
;
2227 if (!bfq_update_next_in_service(sd
, entity
) &&
2230 * next_in_service unchanged or not causing
2231 * any change in entity->parent->sd, and no
2232 * requeueing needed for expiration: stop
2240 * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
2241 * if needed, to have at least one entity eligible.
2242 * @st: the service tree to act upon.
2244 * Assumes that st is not empty.
2246 static u64
bfq_calc_vtime_jump(struct bfq_service_tree
*st
)
2248 struct bfq_entity
*root_entity
= bfq_root_active_entity(&st
->active
);
2250 if (bfq_gt(root_entity
->min_start
, st
->vtime
))
2251 return root_entity
->min_start
;
2256 static void bfq_update_vtime(struct bfq_service_tree
*st
, u64 new_value
)
2258 if (new_value
> st
->vtime
) {
2259 st
->vtime
= new_value
;
2260 bfq_forget_idle(st
);
2265 * bfq_first_active_entity - find the eligible entity with
2266 * the smallest finish time
2267 * @st: the service tree to select from.
2268 * @vtime: the system virtual to use as a reference for eligibility
2270 * This function searches the first schedulable entity, starting from the
2271 * root of the tree and going on the left every time on this side there is
2272 * a subtree with at least one eligible (start >= vtime) entity. The path on
2273 * the right is followed only if a) the left subtree contains no eligible
2274 * entities and b) no eligible entity has been found yet.
2276 static struct bfq_entity
*bfq_first_active_entity(struct bfq_service_tree
*st
,
2279 struct bfq_entity
*entry
, *first
= NULL
;
2280 struct rb_node
*node
= st
->active
.rb_node
;
2283 entry
= rb_entry(node
, struct bfq_entity
, rb_node
);
2285 if (!bfq_gt(entry
->start
, vtime
))
2288 if (node
->rb_left
) {
2289 entry
= rb_entry(node
->rb_left
,
2290 struct bfq_entity
, rb_node
);
2291 if (!bfq_gt(entry
->min_start
, vtime
)) {
2292 node
= node
->rb_left
;
2298 node
= node
->rb_right
;
2305 * __bfq_lookup_next_entity - return the first eligible entity in @st.
2306 * @st: the service tree.
2308 * If there is no in-service entity for the sched_data st belongs to,
2309 * then return the entity that will be set in service if:
2310 * 1) the parent entity this st belongs to is set in service;
2311 * 2) no entity belonging to such parent entity undergoes a state change
2312 * that would influence the timestamps of the entity (e.g., becomes idle,
2313 * becomes backlogged, changes its budget, ...).
2315 * In this first case, update the virtual time in @st too (see the
2316 * comments on this update inside the function).
2318 * In constrast, if there is an in-service entity, then return the
2319 * entity that would be set in service if not only the above
2320 * conditions, but also the next one held true: the currently
2321 * in-service entity, on expiration,
2322 * 1) gets a finish time equal to the current one, or
2323 * 2) is not eligible any more, or
2326 static struct bfq_entity
*
2327 __bfq_lookup_next_entity(struct bfq_service_tree
*st
, bool in_service
)
2329 struct bfq_entity
*entity
;
2332 if (RB_EMPTY_ROOT(&st
->active
))
2336 * Get the value of the system virtual time for which at
2337 * least one entity is eligible.
2339 new_vtime
= bfq_calc_vtime_jump(st
);
2342 * If there is no in-service entity for the sched_data this
2343 * active tree belongs to, then push the system virtual time
2344 * up to the value that guarantees that at least one entity is
2345 * eligible. If, instead, there is an in-service entity, then
2346 * do not make any such update, because there is already an
2347 * eligible entity, namely the in-service one (even if the
2348 * entity is not on st, because it was extracted when set in
2352 bfq_update_vtime(st
, new_vtime
);
2354 entity
= bfq_first_active_entity(st
, new_vtime
);
2360 * bfq_lookup_next_entity - return the first eligible entity in @sd.
2361 * @sd: the sched_data.
2363 * This function is invoked when there has been a change in the trees
2364 * for sd, and we need know what is the new next entity after this
2367 static struct bfq_entity
*bfq_lookup_next_entity(struct bfq_sched_data
*sd
)
2369 struct bfq_service_tree
*st
= sd
->service_tree
;
2370 struct bfq_service_tree
*idle_class_st
= st
+ (BFQ_IOPRIO_CLASSES
- 1);
2371 struct bfq_entity
*entity
= NULL
;
2375 * Choose from idle class, if needed to guarantee a minimum
2376 * bandwidth to this class (and if there is some active entity
2377 * in idle class). This should also mitigate
2378 * priority-inversion problems in case a low priority task is
2379 * holding file system resources.
2381 if (time_is_before_jiffies(sd
->bfq_class_idle_last_service
+
2382 BFQ_CL_IDLE_TIMEOUT
)) {
2383 if (!RB_EMPTY_ROOT(&idle_class_st
->active
))
2384 class_idx
= BFQ_IOPRIO_CLASSES
- 1;
2385 /* About to be served if backlogged, or not yet backlogged */
2386 sd
->bfq_class_idle_last_service
= jiffies
;
2390 * Find the next entity to serve for the highest-priority
2391 * class, unless the idle class needs to be served.
2393 for (; class_idx
< BFQ_IOPRIO_CLASSES
; class_idx
++) {
2394 entity
= __bfq_lookup_next_entity(st
+ class_idx
,
2395 sd
->in_service_entity
);
2407 static bool next_queue_may_preempt(struct bfq_data
*bfqd
)
2409 struct bfq_sched_data
*sd
= &bfqd
->root_group
->sched_data
;
2411 return sd
->next_in_service
!= sd
->in_service_entity
;
2415 * Get next queue for service.
2417 static struct bfq_queue
*bfq_get_next_queue(struct bfq_data
*bfqd
)
2419 struct bfq_entity
*entity
= NULL
;
2420 struct bfq_sched_data
*sd
;
2421 struct bfq_queue
*bfqq
;
2423 if (bfqd
->busy_queues
== 0)
2427 * Traverse the path from the root to the leaf entity to
2428 * serve. Set in service all the entities visited along the
2431 sd
= &bfqd
->root_group
->sched_data
;
2432 for (; sd
; sd
= entity
->my_sched_data
) {
2434 * WARNING. We are about to set the in-service entity
2435 * to sd->next_in_service, i.e., to the (cached) value
2436 * returned by bfq_lookup_next_entity(sd) the last
2437 * time it was invoked, i.e., the last time when the
2438 * service order in sd changed as a consequence of the
2439 * activation or deactivation of an entity. In this
2440 * respect, if we execute bfq_lookup_next_entity(sd)
2441 * in this very moment, it may, although with low
2442 * probability, yield a different entity than that
2443 * pointed to by sd->next_in_service. This rare event
2444 * happens in case there was no CLASS_IDLE entity to
2445 * serve for sd when bfq_lookup_next_entity(sd) was
2446 * invoked for the last time, while there is now one
2449 * If the above event happens, then the scheduling of
2450 * such entity in CLASS_IDLE is postponed until the
2451 * service of the sd->next_in_service entity
2452 * finishes. In fact, when the latter is expired,
2453 * bfq_lookup_next_entity(sd) gets called again,
2454 * exactly to update sd->next_in_service.
2457 /* Make next_in_service entity become in_service_entity */
2458 entity
= sd
->next_in_service
;
2459 sd
->in_service_entity
= entity
;
2462 * Reset the accumulator of the amount of service that
2463 * the entity is about to receive.
2465 entity
->service
= 0;
2468 * If entity is no longer a candidate for next
2469 * service, then we extract it from its active tree,
2470 * for the following reason. To further boost the
2471 * throughput in some special case, BFQ needs to know
2472 * which is the next candidate entity to serve, while
2473 * there is already an entity in service. In this
2474 * respect, to make it easy to compute/update the next
2475 * candidate entity to serve after the current
2476 * candidate has been set in service, there is a case
2477 * where it is necessary to extract the current
2478 * candidate from its service tree. Such a case is
2479 * when the entity just set in service cannot be also
2480 * a candidate for next service. Details about when
2481 * this conditions holds are reported in the comments
2482 * on the function bfq_no_longer_next_in_service()
2485 if (bfq_no_longer_next_in_service(entity
))
2486 bfq_active_extract(bfq_entity_service_tree(entity
),
2490 * For the same reason why we may have just extracted
2491 * entity from its active tree, we may need to update
2492 * next_in_service for the sched_data of entity too,
2493 * regardless of whether entity has been extracted.
2494 * In fact, even if entity has not been extracted, a
2495 * descendant entity may get extracted. Such an event
2496 * would cause a change in next_in_service for the
2497 * level of the descendant entity, and thus possibly
2498 * back to upper levels.
2500 * We cannot perform the resulting needed update
2501 * before the end of this loop, because, to know which
2502 * is the correct next-to-serve candidate entity for
2503 * each level, we need first to find the leaf entity
2504 * to set in service. In fact, only after we know
2505 * which is the next-to-serve leaf entity, we can
2506 * discover whether the parent entity of the leaf
2507 * entity becomes the next-to-serve, and so on.
2512 bfqq
= bfq_entity_to_bfqq(entity
);
2515 * We can finally update all next-to-serve entities along the
2516 * path from the leaf entity just set in service to the root.
2518 for_each_entity(entity
) {
2519 struct bfq_sched_data
*sd
= entity
->sched_data
;
2521 if (!bfq_update_next_in_service(sd
, NULL
))
2528 static void __bfq_bfqd_reset_in_service(struct bfq_data
*bfqd
)
2530 struct bfq_queue
*in_serv_bfqq
= bfqd
->in_service_queue
;
2531 struct bfq_entity
*in_serv_entity
= &in_serv_bfqq
->entity
;
2532 struct bfq_entity
*entity
= in_serv_entity
;
2534 if (bfqd
->in_service_bic
) {
2536 * Schedule the release of a reference to
2537 * bfqd->in_service_bic->icq.ioc to right after the
2538 * scheduler lock is released. This ioc is not
2539 * released immediately, to not risk to possibly take
2540 * an ioc->lock while holding the scheduler lock.
2542 bfqd
->ioc_to_put
= bfqd
->in_service_bic
->icq
.ioc
;
2543 bfqd
->in_service_bic
= NULL
;
2546 bfq_clear_bfqq_wait_request(in_serv_bfqq
);
2547 hrtimer_try_to_cancel(&bfqd
->idle_slice_timer
);
2548 bfqd
->in_service_queue
= NULL
;
2551 * When this function is called, all in-service entities have
2552 * been properly deactivated or requeued, so we can safely
2553 * execute the final step: reset in_service_entity along the
2554 * path from entity to the root.
2556 for_each_entity(entity
)
2557 entity
->sched_data
->in_service_entity
= NULL
;
2560 * in_serv_entity is no longer in service, so, if it is in no
2561 * service tree either, then release the service reference to
2562 * the queue it represents (taken with bfq_get_entity).
2564 if (!in_serv_entity
->on_st
)
2565 bfq_put_queue(in_serv_bfqq
);
2568 static void bfq_deactivate_bfqq(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
2569 bool ins_into_idle_tree
, bool expiration
)
2571 struct bfq_entity
*entity
= &bfqq
->entity
;
2573 bfq_deactivate_entity(entity
, ins_into_idle_tree
, expiration
);
2576 static void bfq_activate_bfqq(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
)
2578 struct bfq_entity
*entity
= &bfqq
->entity
;
2580 bfq_activate_requeue_entity(entity
, bfq_bfqq_non_blocking_wait_rq(bfqq
),
2582 bfq_clear_bfqq_non_blocking_wait_rq(bfqq
);
2585 static void bfq_requeue_bfqq(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
)
2587 struct bfq_entity
*entity
= &bfqq
->entity
;
2589 bfq_activate_requeue_entity(entity
, false,
2590 bfqq
== bfqd
->in_service_queue
);
2593 static void bfqg_stats_update_dequeue(struct bfq_group
*bfqg
);
2596 * Called when the bfqq no longer has requests pending, remove it from
2597 * the service tree. As a special case, it can be invoked during an
2600 static void bfq_del_bfqq_busy(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
2603 bfq_log_bfqq(bfqd
, bfqq
, "del from busy");
2605 bfq_clear_bfqq_busy(bfqq
);
2607 bfqd
->busy_queues
--;
2609 if (bfqq
->wr_coeff
> 1)
2610 bfqd
->wr_busy_queues
--;
2612 bfqg_stats_update_dequeue(bfqq_group(bfqq
));
2614 bfq_deactivate_bfqq(bfqd
, bfqq
, true, expiration
);
2618 * Called when an inactive queue receives a new request.
2620 static void bfq_add_bfqq_busy(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
)
2622 bfq_log_bfqq(bfqd
, bfqq
, "add to busy");
2624 bfq_activate_bfqq(bfqd
, bfqq
);
2626 bfq_mark_bfqq_busy(bfqq
);
2627 bfqd
->busy_queues
++;
2629 if (bfqq
->wr_coeff
> 1)
2630 bfqd
->wr_busy_queues
++;
2633 #ifdef CONFIG_BFQ_GROUP_IOSCHED
2635 /* bfqg stats flags */
2636 enum bfqg_stats_flags
{
2637 BFQG_stats_waiting
= 0,
2642 #define BFQG_FLAG_FNS(name) \
2643 static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
2645 stats->flags |= (1 << BFQG_stats_##name); \
2647 static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
2649 stats->flags &= ~(1 << BFQG_stats_##name); \
2651 static int bfqg_stats_##name(struct bfqg_stats *stats) \
2653 return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
2656 BFQG_FLAG_FNS(waiting)
2657 BFQG_FLAG_FNS(idling
)
2658 BFQG_FLAG_FNS(empty
)
2659 #undef BFQG_FLAG_FNS
2661 /* This should be called with the queue_lock held. */
2662 static void bfqg_stats_update_group_wait_time(struct bfqg_stats
*stats
)
2664 unsigned long long now
;
2666 if (!bfqg_stats_waiting(stats
))
2669 now
= sched_clock();
2670 if (time_after64(now
, stats
->start_group_wait_time
))
2671 blkg_stat_add(&stats
->group_wait_time
,
2672 now
- stats
->start_group_wait_time
);
2673 bfqg_stats_clear_waiting(stats
);
2676 /* This should be called with the queue_lock held. */
2677 static void bfqg_stats_set_start_group_wait_time(struct bfq_group
*bfqg
,
2678 struct bfq_group
*curr_bfqg
)
2680 struct bfqg_stats
*stats
= &bfqg
->stats
;
2682 if (bfqg_stats_waiting(stats
))
2684 if (bfqg
== curr_bfqg
)
2686 stats
->start_group_wait_time
= sched_clock();
2687 bfqg_stats_mark_waiting(stats
);
2690 /* This should be called with the queue_lock held. */
2691 static void bfqg_stats_end_empty_time(struct bfqg_stats
*stats
)
2693 unsigned long long now
;
2695 if (!bfqg_stats_empty(stats
))
2698 now
= sched_clock();
2699 if (time_after64(now
, stats
->start_empty_time
))
2700 blkg_stat_add(&stats
->empty_time
,
2701 now
- stats
->start_empty_time
);
2702 bfqg_stats_clear_empty(stats
);
2705 static void bfqg_stats_update_dequeue(struct bfq_group
*bfqg
)
2707 blkg_stat_add(&bfqg
->stats
.dequeue
, 1);
2710 static void bfqg_stats_set_start_empty_time(struct bfq_group
*bfqg
)
2712 struct bfqg_stats
*stats
= &bfqg
->stats
;
2714 if (blkg_rwstat_total(&stats
->queued
))
2718 * group is already marked empty. This can happen if bfqq got new
2719 * request in parent group and moved to this group while being added
2720 * to service tree. Just ignore the event and move on.
2722 if (bfqg_stats_empty(stats
))
2725 stats
->start_empty_time
= sched_clock();
2726 bfqg_stats_mark_empty(stats
);
2729 static void bfqg_stats_update_idle_time(struct bfq_group
*bfqg
)
2731 struct bfqg_stats
*stats
= &bfqg
->stats
;
2733 if (bfqg_stats_idling(stats
)) {
2734 unsigned long long now
= sched_clock();
2736 if (time_after64(now
, stats
->start_idle_time
))
2737 blkg_stat_add(&stats
->idle_time
,
2738 now
- stats
->start_idle_time
);
2739 bfqg_stats_clear_idling(stats
);
2743 static void bfqg_stats_set_start_idle_time(struct bfq_group
*bfqg
)
2745 struct bfqg_stats
*stats
= &bfqg
->stats
;
2747 stats
->start_idle_time
= sched_clock();
2748 bfqg_stats_mark_idling(stats
);
2751 static void bfqg_stats_update_avg_queue_size(struct bfq_group
*bfqg
)
2753 struct bfqg_stats
*stats
= &bfqg
->stats
;
2755 blkg_stat_add(&stats
->avg_queue_size_sum
,
2756 blkg_rwstat_total(&stats
->queued
));
2757 blkg_stat_add(&stats
->avg_queue_size_samples
, 1);
2758 bfqg_stats_update_group_wait_time(stats
);
2762 * blk-cgroup policy-related handlers
2763 * The following functions help in converting between blk-cgroup
2764 * internal structures and BFQ-specific structures.
2767 static struct bfq_group
*pd_to_bfqg(struct blkg_policy_data
*pd
)
2769 return pd
? container_of(pd
, struct bfq_group
, pd
) : NULL
;
2772 static struct blkcg_gq
*bfqg_to_blkg(struct bfq_group
*bfqg
)
2774 return pd_to_blkg(&bfqg
->pd
);
2777 static struct blkcg_policy blkcg_policy_bfq
;
2779 static struct bfq_group
*blkg_to_bfqg(struct blkcg_gq
*blkg
)
2781 return pd_to_bfqg(blkg_to_pd(blkg
, &blkcg_policy_bfq
));
2785 * bfq_group handlers
2786 * The following functions help in navigating the bfq_group hierarchy
2787 * by allowing to find the parent of a bfq_group or the bfq_group
2788 * associated to a bfq_queue.
2791 static struct bfq_group
*bfqg_parent(struct bfq_group
*bfqg
)
2793 struct blkcg_gq
*pblkg
= bfqg_to_blkg(bfqg
)->parent
;
2795 return pblkg
? blkg_to_bfqg(pblkg
) : NULL
;
2798 static struct bfq_group
*bfqq_group(struct bfq_queue
*bfqq
)
2800 struct bfq_entity
*group_entity
= bfqq
->entity
.parent
;
2802 return group_entity
? container_of(group_entity
, struct bfq_group
,
2804 bfqq
->bfqd
->root_group
;
2808 * The following two functions handle get and put of a bfq_group by
2809 * wrapping the related blk-cgroup hooks.
2812 static void bfqg_get(struct bfq_group
*bfqg
)
2814 return blkg_get(bfqg_to_blkg(bfqg
));
2817 static void bfqg_put(struct bfq_group
*bfqg
)
2819 return blkg_put(bfqg_to_blkg(bfqg
));
2822 static void bfqg_stats_update_io_add(struct bfq_group
*bfqg
,
2823 struct bfq_queue
*bfqq
,
2826 blkg_rwstat_add(&bfqg
->stats
.queued
, op
, 1);
2827 bfqg_stats_end_empty_time(&bfqg
->stats
);
2828 if (!(bfqq
== ((struct bfq_data
*)bfqg
->bfqd
)->in_service_queue
))
2829 bfqg_stats_set_start_group_wait_time(bfqg
, bfqq_group(bfqq
));
2832 static void bfqg_stats_update_io_remove(struct bfq_group
*bfqg
, unsigned int op
)
2834 blkg_rwstat_add(&bfqg
->stats
.queued
, op
, -1);
2837 static void bfqg_stats_update_io_merged(struct bfq_group
*bfqg
, unsigned int op
)
2839 blkg_rwstat_add(&bfqg
->stats
.merged
, op
, 1);
2842 static void bfqg_stats_update_completion(struct bfq_group
*bfqg
,
2843 uint64_t start_time
, uint64_t io_start_time
,
2846 struct bfqg_stats
*stats
= &bfqg
->stats
;
2847 unsigned long long now
= sched_clock();
2849 if (time_after64(now
, io_start_time
))
2850 blkg_rwstat_add(&stats
->service_time
, op
,
2851 now
- io_start_time
);
2852 if (time_after64(io_start_time
, start_time
))
2853 blkg_rwstat_add(&stats
->wait_time
, op
,
2854 io_start_time
- start_time
);
2858 static void bfqg_stats_reset(struct bfqg_stats
*stats
)
2860 /* queued stats shouldn't be cleared */
2861 blkg_rwstat_reset(&stats
->merged
);
2862 blkg_rwstat_reset(&stats
->service_time
);
2863 blkg_rwstat_reset(&stats
->wait_time
);
2864 blkg_stat_reset(&stats
->time
);
2865 blkg_stat_reset(&stats
->avg_queue_size_sum
);
2866 blkg_stat_reset(&stats
->avg_queue_size_samples
);
2867 blkg_stat_reset(&stats
->dequeue
);
2868 blkg_stat_reset(&stats
->group_wait_time
);
2869 blkg_stat_reset(&stats
->idle_time
);
2870 blkg_stat_reset(&stats
->empty_time
);
2874 static void bfqg_stats_add_aux(struct bfqg_stats
*to
, struct bfqg_stats
*from
)
2879 /* queued stats shouldn't be cleared */
2880 blkg_rwstat_add_aux(&to
->merged
, &from
->merged
);
2881 blkg_rwstat_add_aux(&to
->service_time
, &from
->service_time
);
2882 blkg_rwstat_add_aux(&to
->wait_time
, &from
->wait_time
);
2883 blkg_stat_add_aux(&from
->time
, &from
->time
);
2884 blkg_stat_add_aux(&to
->avg_queue_size_sum
, &from
->avg_queue_size_sum
);
2885 blkg_stat_add_aux(&to
->avg_queue_size_samples
,
2886 &from
->avg_queue_size_samples
);
2887 blkg_stat_add_aux(&to
->dequeue
, &from
->dequeue
);
2888 blkg_stat_add_aux(&to
->group_wait_time
, &from
->group_wait_time
);
2889 blkg_stat_add_aux(&to
->idle_time
, &from
->idle_time
);
2890 blkg_stat_add_aux(&to
->empty_time
, &from
->empty_time
);
2894 * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
2895 * recursive stats can still account for the amount used by this bfqg after
2898 static void bfqg_stats_xfer_dead(struct bfq_group
*bfqg
)
2900 struct bfq_group
*parent
;
2902 if (!bfqg
) /* root_group */
2905 parent
= bfqg_parent(bfqg
);
2907 lockdep_assert_held(bfqg_to_blkg(bfqg
)->q
->queue_lock
);
2909 if (unlikely(!parent
))
2912 bfqg_stats_add_aux(&parent
->stats
, &bfqg
->stats
);
2913 bfqg_stats_reset(&bfqg
->stats
);
2916 static void bfq_init_entity(struct bfq_entity
*entity
,
2917 struct bfq_group
*bfqg
)
2919 struct bfq_queue
*bfqq
= bfq_entity_to_bfqq(entity
);
2921 entity
->weight
= entity
->new_weight
;
2922 entity
->orig_weight
= entity
->new_weight
;
2924 bfqq
->ioprio
= bfqq
->new_ioprio
;
2925 bfqq
->ioprio_class
= bfqq
->new_ioprio_class
;
2928 entity
->parent
= bfqg
->my_entity
; /* NULL for root group */
2929 entity
->sched_data
= &bfqg
->sched_data
;
2932 static void bfqg_stats_exit(struct bfqg_stats
*stats
)
2934 blkg_rwstat_exit(&stats
->merged
);
2935 blkg_rwstat_exit(&stats
->service_time
);
2936 blkg_rwstat_exit(&stats
->wait_time
);
2937 blkg_rwstat_exit(&stats
->queued
);
2938 blkg_stat_exit(&stats
->time
);
2939 blkg_stat_exit(&stats
->avg_queue_size_sum
);
2940 blkg_stat_exit(&stats
->avg_queue_size_samples
);
2941 blkg_stat_exit(&stats
->dequeue
);
2942 blkg_stat_exit(&stats
->group_wait_time
);
2943 blkg_stat_exit(&stats
->idle_time
);
2944 blkg_stat_exit(&stats
->empty_time
);
2947 static int bfqg_stats_init(struct bfqg_stats
*stats
, gfp_t gfp
)
2949 if (blkg_rwstat_init(&stats
->merged
, gfp
) ||
2950 blkg_rwstat_init(&stats
->service_time
, gfp
) ||
2951 blkg_rwstat_init(&stats
->wait_time
, gfp
) ||
2952 blkg_rwstat_init(&stats
->queued
, gfp
) ||
2953 blkg_stat_init(&stats
->time
, gfp
) ||
2954 blkg_stat_init(&stats
->avg_queue_size_sum
, gfp
) ||
2955 blkg_stat_init(&stats
->avg_queue_size_samples
, gfp
) ||
2956 blkg_stat_init(&stats
->dequeue
, gfp
) ||
2957 blkg_stat_init(&stats
->group_wait_time
, gfp
) ||
2958 blkg_stat_init(&stats
->idle_time
, gfp
) ||
2959 blkg_stat_init(&stats
->empty_time
, gfp
)) {
2960 bfqg_stats_exit(stats
);
2967 static struct bfq_group_data
*cpd_to_bfqgd(struct blkcg_policy_data
*cpd
)
2969 return cpd
? container_of(cpd
, struct bfq_group_data
, pd
) : NULL
;
2972 static struct bfq_group_data
*blkcg_to_bfqgd(struct blkcg
*blkcg
)
2974 return cpd_to_bfqgd(blkcg_to_cpd(blkcg
, &blkcg_policy_bfq
));
2977 static struct blkcg_policy_data
*bfq_cpd_alloc(gfp_t gfp
)
2979 struct bfq_group_data
*bgd
;
2981 bgd
= kzalloc(sizeof(*bgd
), gfp
);
2987 static void bfq_cpd_init(struct blkcg_policy_data
*cpd
)
2989 struct bfq_group_data
*d
= cpd_to_bfqgd(cpd
);
2991 d
->weight
= cgroup_subsys_on_dfl(io_cgrp_subsys
) ?
2992 CGROUP_WEIGHT_DFL
: BFQ_WEIGHT_LEGACY_DFL
;
2995 static void bfq_cpd_free(struct blkcg_policy_data
*cpd
)
2997 kfree(cpd_to_bfqgd(cpd
));
3000 static struct blkg_policy_data
*bfq_pd_alloc(gfp_t gfp
, int node
)
3002 struct bfq_group
*bfqg
;
3004 bfqg
= kzalloc_node(sizeof(*bfqg
), gfp
, node
);
3008 if (bfqg_stats_init(&bfqg
->stats
, gfp
)) {
3016 static void bfq_pd_init(struct blkg_policy_data
*pd
)
3018 struct blkcg_gq
*blkg
= pd_to_blkg(pd
);
3019 struct bfq_group
*bfqg
= blkg_to_bfqg(blkg
);
3020 struct bfq_data
*bfqd
= blkg
->q
->elevator
->elevator_data
;
3021 struct bfq_entity
*entity
= &bfqg
->entity
;
3022 struct bfq_group_data
*d
= blkcg_to_bfqgd(blkg
->blkcg
);
3024 entity
->orig_weight
= entity
->weight
= entity
->new_weight
= d
->weight
;
3025 entity
->my_sched_data
= &bfqg
->sched_data
;
3026 bfqg
->my_entity
= entity
; /*
3027 * the root_group's will be set to NULL
3028 * in bfq_init_queue()
3031 bfqg
->rq_pos_tree
= RB_ROOT
;
3034 static void bfq_pd_free(struct blkg_policy_data
*pd
)
3036 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
3038 bfqg_stats_exit(&bfqg
->stats
);
3042 static void bfq_pd_reset_stats(struct blkg_policy_data
*pd
)
3044 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
3046 bfqg_stats_reset(&bfqg
->stats
);
3049 static void bfq_group_set_parent(struct bfq_group
*bfqg
,
3050 struct bfq_group
*parent
)
3052 struct bfq_entity
*entity
;
3054 entity
= &bfqg
->entity
;
3055 entity
->parent
= parent
->my_entity
;
3056 entity
->sched_data
= &parent
->sched_data
;
3059 static struct bfq_group
*bfq_lookup_bfqg(struct bfq_data
*bfqd
,
3060 struct blkcg
*blkcg
)
3062 struct blkcg_gq
*blkg
;
3064 blkg
= blkg_lookup(blkcg
, bfqd
->queue
);
3066 return blkg_to_bfqg(blkg
);
3070 static struct bfq_group
*bfq_find_set_group(struct bfq_data
*bfqd
,
3071 struct blkcg
*blkcg
)
3073 struct bfq_group
*bfqg
, *parent
;
3074 struct bfq_entity
*entity
;
3076 bfqg
= bfq_lookup_bfqg(bfqd
, blkcg
);
3078 if (unlikely(!bfqg
))
3082 * Update chain of bfq_groups as we might be handling a leaf group
3083 * which, along with some of its relatives, has not been hooked yet
3084 * to the private hierarchy of BFQ.
3086 entity
= &bfqg
->entity
;
3087 for_each_entity(entity
) {
3088 bfqg
= container_of(entity
, struct bfq_group
, entity
);
3089 if (bfqg
!= bfqd
->root_group
) {
3090 parent
= bfqg_parent(bfqg
);
3092 parent
= bfqd
->root_group
;
3093 bfq_group_set_parent(bfqg
, parent
);
3100 static void bfq_pos_tree_add_move(struct bfq_data
*bfqd
,
3101 struct bfq_queue
*bfqq
);
3102 static void bfq_bfqq_expire(struct bfq_data
*bfqd
,
3103 struct bfq_queue
*bfqq
,
3105 enum bfqq_expiration reason
);
3108 * bfq_bfqq_move - migrate @bfqq to @bfqg.
3109 * @bfqd: queue descriptor.
3110 * @bfqq: the queue to move.
3111 * @bfqg: the group to move to.
3113 * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
3114 * it on the new one. Avoid putting the entity on the old group idle tree.
3116 * Must be called under the queue lock; the cgroup owning @bfqg must
3117 * not disappear (by now this just means that we are called under
3120 static void bfq_bfqq_move(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
3121 struct bfq_group
*bfqg
)
3123 struct bfq_entity
*entity
= &bfqq
->entity
;
3125 /* If bfqq is empty, then bfq_bfqq_expire also invokes
3126 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
3127 * from data structures related to current group. Otherwise we
3128 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
3131 if (bfqq
== bfqd
->in_service_queue
)
3132 bfq_bfqq_expire(bfqd
, bfqd
->in_service_queue
,
3133 false, BFQQE_PREEMPTED
);
3135 if (bfq_bfqq_busy(bfqq
))
3136 bfq_deactivate_bfqq(bfqd
, bfqq
, false, false);
3137 else if (entity
->on_st
)
3138 bfq_put_idle_entity(bfq_entity_service_tree(entity
), entity
);
3139 bfqg_put(bfqq_group(bfqq
));
3142 * Here we use a reference to bfqg. We don't need a refcounter
3143 * as the cgroup reference will not be dropped, so that its
3144 * destroy() callback will not be invoked.
3146 entity
->parent
= bfqg
->my_entity
;
3147 entity
->sched_data
= &bfqg
->sched_data
;
3150 if (bfq_bfqq_busy(bfqq
)) {
3151 bfq_pos_tree_add_move(bfqd
, bfqq
);
3152 bfq_activate_bfqq(bfqd
, bfqq
);
3155 if (!bfqd
->in_service_queue
&& !bfqd
->rq_in_driver
)
3156 bfq_schedule_dispatch(bfqd
);
3160 * __bfq_bic_change_cgroup - move @bic to @cgroup.
3161 * @bfqd: the queue descriptor.
3162 * @bic: the bic to move.
3163 * @blkcg: the blk-cgroup to move to.
3165 * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
3166 * has to make sure that the reference to cgroup is valid across the call.
3168 * NOTE: an alternative approach might have been to store the current
3169 * cgroup in bfqq and getting a reference to it, reducing the lookup
3170 * time here, at the price of slightly more complex code.
3172 static struct bfq_group
*__bfq_bic_change_cgroup(struct bfq_data
*bfqd
,
3173 struct bfq_io_cq
*bic
,
3174 struct blkcg
*blkcg
)
3176 struct bfq_queue
*async_bfqq
= bic_to_bfqq(bic
, 0);
3177 struct bfq_queue
*sync_bfqq
= bic_to_bfqq(bic
, 1);
3178 struct bfq_group
*bfqg
;
3179 struct bfq_entity
*entity
;
3181 bfqg
= bfq_find_set_group(bfqd
, blkcg
);
3183 if (unlikely(!bfqg
))
3184 bfqg
= bfqd
->root_group
;
3187 entity
= &async_bfqq
->entity
;
3189 if (entity
->sched_data
!= &bfqg
->sched_data
) {
3190 bic_set_bfqq(bic
, NULL
, 0);
3191 bfq_log_bfqq(bfqd
, async_bfqq
,
3192 "bic_change_group: %p %d",
3193 async_bfqq
, async_bfqq
->ref
);
3194 bfq_put_queue(async_bfqq
);
3199 entity
= &sync_bfqq
->entity
;
3200 if (entity
->sched_data
!= &bfqg
->sched_data
)
3201 bfq_bfqq_move(bfqd
, sync_bfqq
, bfqg
);
3207 static void bfq_bic_update_cgroup(struct bfq_io_cq
*bic
, struct bio
*bio
)
3209 struct bfq_data
*bfqd
= bic_to_bfqd(bic
);
3210 struct bfq_group
*bfqg
= NULL
;
3214 serial_nr
= bio_blkcg(bio
)->css
.serial_nr
;
3217 * Check whether blkcg has changed. The condition may trigger
3218 * spuriously on a newly created cic but there's no harm.
3220 if (unlikely(!bfqd
) || likely(bic
->blkcg_serial_nr
== serial_nr
))
3223 bfqg
= __bfq_bic_change_cgroup(bfqd
, bic
, bio_blkcg(bio
));
3224 bic
->blkcg_serial_nr
= serial_nr
;
3230 * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
3231 * @st: the service tree being flushed.
3233 static void bfq_flush_idle_tree(struct bfq_service_tree
*st
)
3235 struct bfq_entity
*entity
= st
->first_idle
;
3237 for (; entity
; entity
= st
->first_idle
)
3238 __bfq_deactivate_entity(entity
, false);
3242 * bfq_reparent_leaf_entity - move leaf entity to the root_group.
3243 * @bfqd: the device data structure with the root group.
3244 * @entity: the entity to move.
3246 static void bfq_reparent_leaf_entity(struct bfq_data
*bfqd
,
3247 struct bfq_entity
*entity
)
3249 struct bfq_queue
*bfqq
= bfq_entity_to_bfqq(entity
);
3251 bfq_bfqq_move(bfqd
, bfqq
, bfqd
->root_group
);
3255 * bfq_reparent_active_entities - move to the root group all active
3257 * @bfqd: the device data structure with the root group.
3258 * @bfqg: the group to move from.
3259 * @st: the service tree with the entities.
3261 * Needs queue_lock to be taken and reference to be valid over the call.
3263 static void bfq_reparent_active_entities(struct bfq_data
*bfqd
,
3264 struct bfq_group
*bfqg
,
3265 struct bfq_service_tree
*st
)
3267 struct rb_root
*active
= &st
->active
;
3268 struct bfq_entity
*entity
= NULL
;
3270 if (!RB_EMPTY_ROOT(&st
->active
))
3271 entity
= bfq_entity_of(rb_first(active
));
3273 for (; entity
; entity
= bfq_entity_of(rb_first(active
)))
3274 bfq_reparent_leaf_entity(bfqd
, entity
);
3276 if (bfqg
->sched_data
.in_service_entity
)
3277 bfq_reparent_leaf_entity(bfqd
,
3278 bfqg
->sched_data
.in_service_entity
);
3282 * bfq_pd_offline - deactivate the entity associated with @pd,
3283 * and reparent its children entities.
3284 * @pd: descriptor of the policy going offline.
3286 * blkio already grabs the queue_lock for us, so no need to use
3289 static void bfq_pd_offline(struct blkg_policy_data
*pd
)
3291 struct bfq_service_tree
*st
;
3292 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
3293 struct bfq_data
*bfqd
= bfqg
->bfqd
;
3294 struct bfq_entity
*entity
= bfqg
->my_entity
;
3295 unsigned long flags
;
3298 if (!entity
) /* root group */
3301 spin_lock_irqsave(&bfqd
->lock
, flags
);
3303 * Empty all service_trees belonging to this group before
3304 * deactivating the group itself.
3306 for (i
= 0; i
< BFQ_IOPRIO_CLASSES
; i
++) {
3307 st
= bfqg
->sched_data
.service_tree
+ i
;
3310 * The idle tree may still contain bfq_queues belonging
3311 * to exited task because they never migrated to a different
3312 * cgroup from the one being destroyed now. No one else
3313 * can access them so it's safe to act without any lock.
3315 bfq_flush_idle_tree(st
);
3318 * It may happen that some queues are still active
3319 * (busy) upon group destruction (if the corresponding
3320 * processes have been forced to terminate). We move
3321 * all the leaf entities corresponding to these queues
3322 * to the root_group.
3323 * Also, it may happen that the group has an entity
3324 * in service, which is disconnected from the active
3325 * tree: it must be moved, too.
3326 * There is no need to put the sync queues, as the
3327 * scheduler has taken no reference.
3329 bfq_reparent_active_entities(bfqd
, bfqg
, st
);
3332 __bfq_deactivate_entity(entity
, false);
3333 bfq_put_async_queues(bfqd
, bfqg
);
3335 bfq_unlock_put_ioc_restore(bfqd
, flags
);
3337 * @blkg is going offline and will be ignored by
3338 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
3339 * that they don't get lost. If IOs complete after this point, the
3340 * stats for them will be lost. Oh well...
3342 bfqg_stats_xfer_dead(bfqg
);
3345 static void bfq_end_wr_async(struct bfq_data
*bfqd
)
3347 struct blkcg_gq
*blkg
;
3349 list_for_each_entry(blkg
, &bfqd
->queue
->blkg_list
, q_node
) {
3350 struct bfq_group
*bfqg
= blkg_to_bfqg(blkg
);
3352 bfq_end_wr_async_queues(bfqd
, bfqg
);
3354 bfq_end_wr_async_queues(bfqd
, bfqd
->root_group
);
3357 static int bfq_io_show_weight(struct seq_file
*sf
, void *v
)
3359 struct blkcg
*blkcg
= css_to_blkcg(seq_css(sf
));
3360 struct bfq_group_data
*bfqgd
= blkcg_to_bfqgd(blkcg
);
3361 unsigned int val
= 0;
3364 val
= bfqgd
->weight
;
3366 seq_printf(sf
, "%u\n", val
);
3371 static int bfq_io_set_weight_legacy(struct cgroup_subsys_state
*css
,
3372 struct cftype
*cftype
,
3375 struct blkcg
*blkcg
= css_to_blkcg(css
);
3376 struct bfq_group_data
*bfqgd
= blkcg_to_bfqgd(blkcg
);
3377 struct blkcg_gq
*blkg
;
3380 if (val
< BFQ_MIN_WEIGHT
|| val
> BFQ_MAX_WEIGHT
)
3384 spin_lock_irq(&blkcg
->lock
);
3385 bfqgd
->weight
= (unsigned short)val
;
3386 hlist_for_each_entry(blkg
, &blkcg
->blkg_list
, blkcg_node
) {
3387 struct bfq_group
*bfqg
= blkg_to_bfqg(blkg
);
3392 * Setting the prio_changed flag of the entity
3393 * to 1 with new_weight == weight would re-set
3394 * the value of the weight to its ioprio mapping.
3395 * Set the flag only if necessary.
3397 if ((unsigned short)val
!= bfqg
->entity
.new_weight
) {
3398 bfqg
->entity
.new_weight
= (unsigned short)val
;
3400 * Make sure that the above new value has been
3401 * stored in bfqg->entity.new_weight before
3402 * setting the prio_changed flag. In fact,
3403 * this flag may be read asynchronously (in
3404 * critical sections protected by a different
3405 * lock than that held here), and finding this
3406 * flag set may cause the execution of the code
3407 * for updating parameters whose value may
3408 * depend also on bfqg->entity.new_weight (in
3409 * __bfq_entity_update_weight_prio).
3410 * This barrier makes sure that the new value
3411 * of bfqg->entity.new_weight is correctly
3412 * seen in that code.
3415 bfqg
->entity
.prio_changed
= 1;
3418 spin_unlock_irq(&blkcg
->lock
);
3423 static ssize_t
bfq_io_set_weight(struct kernfs_open_file
*of
,
3424 char *buf
, size_t nbytes
,
3428 /* First unsigned long found in the file is used */
3429 int ret
= kstrtoull(strim(buf
), 0, &weight
);
3434 return bfq_io_set_weight_legacy(of_css(of
), NULL
, weight
);
3437 static int bfqg_print_stat(struct seq_file
*sf
, void *v
)
3439 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)), blkg_prfill_stat
,
3440 &blkcg_policy_bfq
, seq_cft(sf
)->private, false);
3444 static int bfqg_print_rwstat(struct seq_file
*sf
, void *v
)
3446 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)), blkg_prfill_rwstat
,
3447 &blkcg_policy_bfq
, seq_cft(sf
)->private, true);
3451 static u64
bfqg_prfill_stat_recursive(struct seq_file
*sf
,
3452 struct blkg_policy_data
*pd
, int off
)
3454 u64 sum
= blkg_stat_recursive_sum(pd_to_blkg(pd
),
3455 &blkcg_policy_bfq
, off
);
3456 return __blkg_prfill_u64(sf
, pd
, sum
);
3459 static u64
bfqg_prfill_rwstat_recursive(struct seq_file
*sf
,
3460 struct blkg_policy_data
*pd
, int off
)
3462 struct blkg_rwstat sum
= blkg_rwstat_recursive_sum(pd_to_blkg(pd
),
3465 return __blkg_prfill_rwstat(sf
, pd
, &sum
);
3468 static int bfqg_print_stat_recursive(struct seq_file
*sf
, void *v
)
3470 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
3471 bfqg_prfill_stat_recursive
, &blkcg_policy_bfq
,
3472 seq_cft(sf
)->private, false);
3476 static int bfqg_print_rwstat_recursive(struct seq_file
*sf
, void *v
)
3478 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
3479 bfqg_prfill_rwstat_recursive
, &blkcg_policy_bfq
,
3480 seq_cft(sf
)->private, true);
3484 static u64
bfqg_prfill_sectors(struct seq_file
*sf
, struct blkg_policy_data
*pd
,
3487 u64 sum
= blkg_rwstat_total(&pd
->blkg
->stat_bytes
);
3489 return __blkg_prfill_u64(sf
, pd
, sum
>> 9);
3492 static int bfqg_print_stat_sectors(struct seq_file
*sf
, void *v
)
3494 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
3495 bfqg_prfill_sectors
, &blkcg_policy_bfq
, 0, false);
3499 static u64
bfqg_prfill_sectors_recursive(struct seq_file
*sf
,
3500 struct blkg_policy_data
*pd
, int off
)
3502 struct blkg_rwstat tmp
= blkg_rwstat_recursive_sum(pd
->blkg
, NULL
,
3503 offsetof(struct blkcg_gq
, stat_bytes
));
3504 u64 sum
= atomic64_read(&tmp
.aux_cnt
[BLKG_RWSTAT_READ
]) +
3505 atomic64_read(&tmp
.aux_cnt
[BLKG_RWSTAT_WRITE
]);
3507 return __blkg_prfill_u64(sf
, pd
, sum
>> 9);
3510 static int bfqg_print_stat_sectors_recursive(struct seq_file
*sf
, void *v
)
3512 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
3513 bfqg_prfill_sectors_recursive
, &blkcg_policy_bfq
, 0,
3518 static u64
bfqg_prfill_avg_queue_size(struct seq_file
*sf
,
3519 struct blkg_policy_data
*pd
, int off
)
3521 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
3522 u64 samples
= blkg_stat_read(&bfqg
->stats
.avg_queue_size_samples
);
3526 v
= blkg_stat_read(&bfqg
->stats
.avg_queue_size_sum
);
3527 v
= div64_u64(v
, samples
);
3529 __blkg_prfill_u64(sf
, pd
, v
);
3533 /* print avg_queue_size */
3534 static int bfqg_print_avg_queue_size(struct seq_file
*sf
, void *v
)
3536 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
3537 bfqg_prfill_avg_queue_size
, &blkcg_policy_bfq
,
3542 static struct bfq_group
*
3543 bfq_create_group_hierarchy(struct bfq_data
*bfqd
, int node
)
3547 ret
= blkcg_activate_policy(bfqd
->queue
, &blkcg_policy_bfq
);
3551 return blkg_to_bfqg(bfqd
->queue
->root_blkg
);
3554 static struct cftype bfq_blkcg_legacy_files
[] = {
3556 .name
= "bfq.weight",
3557 .flags
= CFTYPE_NOT_ON_ROOT
,
3558 .seq_show
= bfq_io_show_weight
,
3559 .write_u64
= bfq_io_set_weight_legacy
,
3562 /* statistics, covers only the tasks in the bfqg */
3565 .private = offsetof(struct bfq_group
, stats
.time
),
3566 .seq_show
= bfqg_print_stat
,
3569 .name
= "bfq.sectors",
3570 .seq_show
= bfqg_print_stat_sectors
,
3573 .name
= "bfq.io_service_bytes",
3574 .private = (unsigned long)&blkcg_policy_bfq
,
3575 .seq_show
= blkg_print_stat_bytes
,
3578 .name
= "bfq.io_serviced",
3579 .private = (unsigned long)&blkcg_policy_bfq
,
3580 .seq_show
= blkg_print_stat_ios
,
3583 .name
= "bfq.io_service_time",
3584 .private = offsetof(struct bfq_group
, stats
.service_time
),
3585 .seq_show
= bfqg_print_rwstat
,
3588 .name
= "bfq.io_wait_time",
3589 .private = offsetof(struct bfq_group
, stats
.wait_time
),
3590 .seq_show
= bfqg_print_rwstat
,
3593 .name
= "bfq.io_merged",
3594 .private = offsetof(struct bfq_group
, stats
.merged
),
3595 .seq_show
= bfqg_print_rwstat
,
3598 .name
= "bfq.io_queued",
3599 .private = offsetof(struct bfq_group
, stats
.queued
),
3600 .seq_show
= bfqg_print_rwstat
,
3603 /* the same statictics which cover the bfqg and its descendants */
3605 .name
= "bfq.time_recursive",
3606 .private = offsetof(struct bfq_group
, stats
.time
),
3607 .seq_show
= bfqg_print_stat_recursive
,
3610 .name
= "bfq.sectors_recursive",
3611 .seq_show
= bfqg_print_stat_sectors_recursive
,
3614 .name
= "bfq.io_service_bytes_recursive",
3615 .private = (unsigned long)&blkcg_policy_bfq
,
3616 .seq_show
= blkg_print_stat_bytes_recursive
,
3619 .name
= "bfq.io_serviced_recursive",
3620 .private = (unsigned long)&blkcg_policy_bfq
,
3621 .seq_show
= blkg_print_stat_ios_recursive
,
3624 .name
= "bfq.io_service_time_recursive",
3625 .private = offsetof(struct bfq_group
, stats
.service_time
),
3626 .seq_show
= bfqg_print_rwstat_recursive
,
3629 .name
= "bfq.io_wait_time_recursive",
3630 .private = offsetof(struct bfq_group
, stats
.wait_time
),
3631 .seq_show
= bfqg_print_rwstat_recursive
,
3634 .name
= "bfq.io_merged_recursive",
3635 .private = offsetof(struct bfq_group
, stats
.merged
),
3636 .seq_show
= bfqg_print_rwstat_recursive
,
3639 .name
= "bfq.io_queued_recursive",
3640 .private = offsetof(struct bfq_group
, stats
.queued
),
3641 .seq_show
= bfqg_print_rwstat_recursive
,
3644 .name
= "bfq.avg_queue_size",
3645 .seq_show
= bfqg_print_avg_queue_size
,
3648 .name
= "bfq.group_wait_time",
3649 .private = offsetof(struct bfq_group
, stats
.group_wait_time
),
3650 .seq_show
= bfqg_print_stat
,
3653 .name
= "bfq.idle_time",
3654 .private = offsetof(struct bfq_group
, stats
.idle_time
),
3655 .seq_show
= bfqg_print_stat
,
3658 .name
= "bfq.empty_time",
3659 .private = offsetof(struct bfq_group
, stats
.empty_time
),
3660 .seq_show
= bfqg_print_stat
,
3663 .name
= "bfq.dequeue",
3664 .private = offsetof(struct bfq_group
, stats
.dequeue
),
3665 .seq_show
= bfqg_print_stat
,
3670 static struct cftype bfq_blkg_files
[] = {
3672 .name
= "bfq.weight",
3673 .flags
= CFTYPE_NOT_ON_ROOT
,
3674 .seq_show
= bfq_io_show_weight
,
3675 .write
= bfq_io_set_weight
,
#else	/* CONFIG_BFQ_GROUP_IOSCHED */

static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
			struct bfq_queue *bfqq, unsigned int op) { }
static inline void
bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
static inline void
bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
			uint64_t start_time, uint64_t io_start_time,
			unsigned int op) { }
static inline void
bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
				     struct bfq_group *curr_bfqg) { }
static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }

static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_group *bfqg) {}

static void bfq_init_entity(struct bfq_entity *entity,
			    struct bfq_group *bfqg)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->weight = entity->new_weight;
	entity->orig_weight = entity->new_weight;

	bfqq->ioprio = bfqq->new_ioprio;
	bfqq->ioprio_class = bfqq->new_ioprio_class;

	entity->sched_data = &bfqg->sched_data;
}

static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}

static void bfq_end_wr_async(struct bfq_data *bfqd)
{
	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
}

static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
					    struct blkcg *blkcg)
{
	return bfqd->root_group;
}

static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
{
	return bfqq->bfqd->root_group;
}

static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd,
						    int node)
{
	struct bfq_group *bfqg;
	int i;

	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);

	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

	return bfqg;
}
#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define bfq_sample_valid(samples)	((samples) > 80)
/*
 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
 * We choose the request that is closest to the head right now. Distance
 * behind the head is penalized and only allowed to a certain extent.
 */
static struct request *bfq_choose_req(struct bfq_data *bfqd,
				      struct request *rq1,
				      struct request *rq2,
				      sector_t last)
{
	sector_t s1, s2, d1 = 0, d2 = 0;
	unsigned long back_max;
#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */
#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */
	unsigned int wrap = 0; /* bit mask: requests behind the disk head? */

	if (!rq1 || rq1 == rq2)
		return rq2;

	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
		return rq1;
	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
		return rq2;
	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
		return rq1;
	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
		return rq2;

	s1 = blk_rq_pos(rq1);
	s2 = blk_rq_pos(rq2);

	/*
	 * By definition, 1KiB is 2 sectors.
	 */
	back_max = bfqd->bfq_back_max * 2;

	/*
	 * Strict one way elevator _except_ in the case where we allow
	 * short backward seeks which are biased as twice the cost of a
	 * similar forward seek.
	 */
	if (s1 >= last)
		d1 = s1 - last;
	else if (s1 + back_max >= last)
		d1 = (last - s1) * bfqd->bfq_back_penalty;
	else
		wrap |= BFQ_RQ1_WRAP;

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2 + back_max >= last)
		d2 = (last - s2) * bfqd->bfq_back_penalty;
	else
		wrap |= BFQ_RQ2_WRAP;

	/* Found required data */

	/*
	 * By doing switch() on the bit mask "wrap" we avoid having to
	 * check two variables for all permutations: --> faster!
	 */
	switch (wrap) {
	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */

	case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */
	default:
		/*
		 * Since both rqs are wrapped,
		 * start with the one that's further behind head
		 * (--> only *one* back seek required),
		 * since back seek takes more time than forward.
		 */
		if (s1 <= s2)
			return rq1;
		else
			return rq2;
	}
}
static struct bfq_queue *
bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
		       sector_t sector, struct rb_node **ret_parent,
		       struct rb_node ***rb_link)
{
	struct rb_node **p, *parent;
	struct bfq_queue *bfqq = NULL;

	bfqq = rb_entry(parent, struct bfq_queue, pos_node);

	/*
	 * Sort strictly based on sector. Smallest to the left,
	 * largest to the right.
	 */
	if (sector > blk_rq_pos(bfqq->next_rq))
		n = &(*p)->rb_right;
	else if (sector < blk_rq_pos(bfqq->next_rq))
		n = &(*p)->rb_left;

	*ret_parent = parent;

	bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
		(unsigned long long)sector,
		bfqq ? bfqq->pid : 0);

	return bfqq;
}
static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct rb_node **p, *parent;
	struct bfq_queue *__bfqq;

	if (bfqq->pos_root) {
		rb_erase(&bfqq->pos_node, bfqq->pos_root);
		bfqq->pos_root = NULL;
	}

	if (bfq_class_idle(bfqq))
		return;

	bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
	__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
			blk_rq_pos(bfqq->next_rq), &parent, &p);
	if (!__bfqq) {
		rb_link_node(&bfqq->pos_node, parent, p);
		rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
	} else
		bfqq->pos_root = NULL;
}
/*
 * Return expired entry, or NULL to just start from scratch in rbtree.
 */
static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
				      struct request *last)
{
	struct request *rq;

	if (bfq_bfqq_fifo_expire(bfqq))
		return NULL;

	bfq_mark_bfqq_fifo_expire(bfqq);

	rq = rq_entry_fifo(bfqq->fifo.next);

	if (rq == last || ktime_get_ns() < rq->fifo_time)
		return NULL;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
	return rq;
}
static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
					struct bfq_queue *bfqq,
					struct request *last)
{
	struct rb_node *rbnext = rb_next(&last->rb_node);
	struct rb_node *rbprev = rb_prev(&last->rb_node);
	struct request *next, *prev = NULL;

	/* Follow expired path, else get first next available. */
	next = bfq_check_fifo(bfqq, last);
	if (next)
		return next;

	if (rbprev)
		prev = rb_entry_rq(rbprev);

	if (rbnext)
		next = rb_entry_rq(rbnext);
	else {
		rbnext = rb_first(&bfqq->sort_list);
		if (rbnext && rbnext != &last->rb_node)
			next = rb_entry_rq(rbnext);
	}

	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
}
/* see the definition of bfq_async_charge_factor for details */
static unsigned long bfq_serv_to_charge(struct request *rq,
					struct bfq_queue *bfqq)
{
	if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
		return blk_rq_sectors(rq);

	/*
	 * If there are no weight-raised queues, then amplify service
	 * by just the async charge factor; otherwise amplify service
	 * by twice the async charge factor, to further reduce latency
	 * for weight-raised queues.
	 */
	if (bfqq->bfqd->wr_busy_queues == 0)
		return blk_rq_sectors(rq) * bfq_async_charge_factor;

	return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
}
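
/*
 * Illustrative example (not from the original source; the exact value of
 * bfq_async_charge_factor is defined earlier in this file and is only
 * assumed here to be 10): a 256-sector request from a sync or
 * weight-raised queue is charged exactly 256 sectors of budget. The same
 * request from an async, non-weight-raised queue is charged
 * 256 * 10 = 2560 sectors when no queue is weight-raised, and
 * 256 * 2 * 10 = 5120 sectors otherwise, so async I/O consumes its budget
 * (and hence its share of the device) much faster than sync I/O.
 */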
/*
 * bfq_updated_next_req - update the queue after a new next_rq selection.
 * @bfqd: the device data the queue belongs to.
 * @bfqq: the queue to update.
 *
 * If the first request of a queue changes we make sure that the queue
 * has enough budget to serve at least its first request (if the
 * request has grown). We do this because if the queue has not enough
 * budget for its first request, it has to go through two dispatch
 * rounds to actually get it dispatched.
 */
static void bfq_updated_next_req(struct bfq_data *bfqd,
				 struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;
	struct request *next_rq = bfqq->next_rq;
	unsigned long new_budget;

	if (bfqq == bfqd->in_service_queue)
		/*
		 * In order not to break guarantees, budgets cannot be
		 * changed after an entity has been selected.
		 */
		return;

	new_budget = max_t(unsigned long, bfqq->max_budget,
			   bfq_serv_to_charge(next_rq, bfqq));
	if (entity->budget != new_budget) {
		entity->budget = new_budget;
		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
			     new_budget);
		bfq_requeue_bfqq(bfqd, bfqq);
	}
}
static void
bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
{
	if (bic->saved_idle_window)
		bfq_mark_bfqq_idle_window(bfqq);
	else
		bfq_clear_bfqq_idle_window(bfqq);

	if (bic->saved_IO_bound)
		bfq_mark_bfqq_IO_bound(bfqq);
	else
		bfq_clear_bfqq_IO_bound(bfqq);

	bfqq->ttime = bic->saved_ttime;
	bfqq->wr_coeff = bic->saved_wr_coeff;
	bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
	bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
	bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;

	if (bfqq->wr_coeff > 1 &&
	    time_is_before_jiffies(bfqq->last_wr_start_finish +
				   bfqq->wr_cur_max_time)) {
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			     "resume state: switching off wr");

		bfqq->wr_coeff = 1;
	}

	/* make sure weight will be updated, however we got here */
	bfqq->entity.prio_changed = 1;
}
static int bfqq_process_refs(struct bfq_queue *bfqq)
{
	return bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
}

static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	return entity->budget - entity->service;
}

/*
 * If enough samples have been computed, return the current max budget
 * stored in bfqd, which is dynamically updated according to the
 * estimated disk peak rate; otherwise return the default max budget.
 */
static int bfq_max_budget(struct bfq_data *bfqd)
{
	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
		return bfq_default_max_budget;

	return bfqd->bfq_max_budget;
}

/*
 * Return min budget, which is a fraction of the current or default
 * max budget (trying with 1/32).
 */
static int bfq_min_budget(struct bfq_data *bfqd)
{
	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
		return bfq_default_max_budget / 32;

	return bfqd->bfq_max_budget / 32;
}

static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason);
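
/*
 * Illustrative example (not from the original source; the default max
 * budget is the bfq_default_max_budget constant defined earlier in this
 * file, assumed here to be 16384 sectors): until enough budgets have been
 * assigned, bfq_max_budget() returns 16384 sectors (8 MiB with 512-byte
 * sectors) and bfq_min_budget() returns 16384 / 32 = 512 sectors
 * (256 KiB). Once the peak rate has been sampled enough, both switch to
 * the corresponding fractions of the auto-tuned bfqd->bfq_max_budget.
 */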
/*
 * The next function, invoked after the input queue bfqq switches from
 * idle to busy, updates the budget of bfqq. The function also tells
 * whether the in-service queue should be expired, by returning
 * true. The purpose of expiring the in-service queue is to give bfqq
 * the chance to possibly preempt the in-service queue, and the reason
 * for preempting the in-service queue is to achieve one of the two
 * goals below.
 *
 * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
 * expired because it has remained idle. In particular, bfqq may have
 * expired for one of the following two reasons:
 *
 * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
 *   and did not make it to issue a new request before its last
 *   request was served;
 *
 * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
 *   a new request before the expiration of the idling-time.
 *
 * Even if bfqq has expired for one of the above reasons, the process
 * associated with the queue may be however issuing requests greedily,
 * and thus be sensitive to the bandwidth it receives (bfqq may have
 * remained idle for other reasons: CPU high load, bfqq not enjoying
 * idling, I/O throttling somewhere in the path from the process to
 * the I/O scheduler, ...). But if, after every expiration for one of
 * the above two reasons, bfqq has to wait for the service of at least
 * one full budget of another queue before being served again, then
 * bfqq is likely to get a much lower bandwidth or resource time than
 * its reserved ones. To address this issue, two countermeasures need
 * to be taken.
 *
 * First, the budget and the timestamps of bfqq need to be updated in
 * a special way on bfqq reactivation: they need to be updated as if
 * bfqq did not remain idle and did not expire. In fact, if they are
 * computed as if bfqq expired and remained idle until reactivation,
 * then the process associated with bfqq is treated as if, instead of
 * being greedy, it stopped issuing requests when bfqq remained idle,
 * and restarts issuing requests only on this reactivation. In other
 * words, the scheduler does not help the process recover the "service
 * hole" between bfqq expiration and reactivation. As a consequence,
 * the process receives a lower bandwidth than its reserved one. In
 * contrast, to recover this hole, the budget must be updated as if
 * bfqq was not expired at all before this reactivation, i.e., it must
 * be set to the value of the remaining budget when bfqq was
 * expired. Along the same line, timestamps need to be assigned the
 * value they had the last time bfqq was selected for service, i.e.,
 * before last expiration. Thus timestamps need to be back-shifted
 * with respect to their normal computation (see [1] for more details
 * on this tricky aspect).
 *
 * Secondly, to allow the process to recover the hole, the in-service
 * queue must be expired too, to give bfqq the chance to preempt it
 * immediately. In fact, if bfqq has to wait for a full budget of the
 * in-service queue to be completed, then it may become impossible to
 * let the process recover the hole, even if the back-shifted
 * timestamps of bfqq are lower than those of the in-service queue. If
 * this happens for most or all of the holes, then the process may not
 * receive its reserved bandwidth. In this respect, it is worth noting
 * that, being the service of outstanding requests unpreemptible, a
 * little fraction of the holes may however be unrecoverable, thereby
 * causing a little loss of bandwidth.
 *
 * The last important point is detecting whether bfqq does need this
 * bandwidth recovery. In this respect, the next function deems the
 * process associated with bfqq greedy, and thus allows it to recover
 * the hole, if: 1) the process is waiting for the arrival of a new
 * request (which implies that bfqq expired for one of the above two
 * reasons), and 2) such a request has arrived soon. The first
 * condition is controlled through the flag non_blocking_wait_rq,
 * while the second through the flag arrived_in_time. If both
 * conditions hold, then the function computes the budget in the
 * above-described special way, and signals that the in-service queue
 * should be expired. Timestamp back-shifting is done later in
 * __bfq_activate_entity.
 *
 * 2. Reduce latency. Even if timestamps are not backshifted to let
 * the process associated with bfqq recover a service hole, bfqq may
 * however happen to have, after being (re)activated, a lower finish
 * timestamp than the in-service queue. That is, the next budget of
 * bfqq may have to be completed before the one of the in-service
 * queue. If this is the case, then preempting the in-service queue
 * allows this goal to be achieved, apart from the unpreemptible,
 * outstanding requests mentioned above.
 *
 * Unfortunately, regardless of which of the above two goals one wants
 * to achieve, service trees need first to be updated to know whether
 * the in-service queue must be preempted. To have service trees
 * correctly updated, the in-service queue must be expired and
 * rescheduled, and bfqq must be scheduled too. This is one of the
 * most costly operations (in future versions, the scheduling
 * mechanism may be re-designed in such a way to make it possible to
 * know whether preemption is needed without needing to update service
 * trees). In addition, queue preemptions almost always cause random
 * I/O, and thus loss of throughput. Because of these facts, the next
 * function adopts the following simple scheme to avoid both costly
 * operations and too frequent preemptions: it requests the expiration
 * of the in-service queue (unconditionally) only for queues that need
 * to recover a hole, or that either are weight-raised or deserve to
 * be weight-raised.
 */
*bfqd
,
4200 struct bfq_queue
*bfqq
,
4201 bool arrived_in_time
,
4202 bool wr_or_deserves_wr
)
4204 struct bfq_entity
*entity
= &bfqq
->entity
;
4206 if (bfq_bfqq_non_blocking_wait_rq(bfqq
) && arrived_in_time
) {
4208 * We do not clear the flag non_blocking_wait_rq here, as
4209 * the latter is used in bfq_activate_bfqq to signal
4210 * that timestamps need to be back-shifted (and is
4211 * cleared right after).
4215 * In next assignment we rely on that either
4216 * entity->service or entity->budget are not updated
4217 * on expiration if bfqq is empty (see
4218 * __bfq_bfqq_recalc_budget). Thus both quantities
4219 * remain unchanged after such an expiration, and the
4220 * following statement therefore assigns to
4221 * entity->budget the remaining budget on such an
4222 * expiration. For clarity, entity->service is not
4223 * updated on expiration in any case, and, in normal
4224 * operation, is reset only when bfqq is selected for
4225 * service (see bfq_get_next_queue).
4227 entity
->budget
= min_t(unsigned long,
4228 bfq_bfqq_budget_left(bfqq
),
4234 entity
->budget
= max_t(unsigned long, bfqq
->max_budget
,
4235 bfq_serv_to_charge(bfqq
->next_rq
, bfqq
));
4236 bfq_clear_bfqq_non_blocking_wait_rq(bfqq
);
4237 return wr_or_deserves_wr
;
static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
{
	u64 dur;

	if (bfqd->bfq_wr_max_time > 0)
		return bfqd->bfq_wr_max_time;

	dur = bfqd->RT_prod;
	do_div(dur, bfqd->peak_rate);

	/*
	 * Limit duration between 3 and 13 seconds. Tests show that
	 * higher values than 13 seconds often yield the opposite of
	 * the desired result, i.e., worsen responsiveness by letting
	 * non-interactive and non-soft-real-time applications
	 * preserve weight raising for a too long time interval.
	 *
	 * On the other end, lower values than 3 seconds make it
	 * difficult for most interactive tasks to complete their jobs
	 * before weight-raising finishes.
	 */
	if (dur > msecs_to_jiffies(13000))
		dur = msecs_to_jiffies(13000);
	else if (dur < msecs_to_jiffies(3000))
		dur = msecs_to_jiffies(3000);

	return dur;
}
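
/*
 * Illustrative example (not from the original source; the figures are
 * assumptions, not measurements): with RT_prod = R_slow * T_slow for a
 * slow rotational device, a measured peak rate close to the reference
 * rate R_slow yields dur ~= T_slow, i.e., a few seconds of weight
 * raising. If the estimated peak rate were so low that the division
 * produced, say, 20 seconds, the clamp above would cap the duration at
 * 13 seconds; if it produced 1 second, the duration would be raised to
 * the 3-second floor.
 */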
static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
					     struct bfq_queue *bfqq,
					     unsigned int old_wr_coeff,
					     bool wr_or_deserves_wr,
					     bool interactive,
					     bool soft_rt)
{
	if (old_wr_coeff == 1 && wr_or_deserves_wr) {
		/* start a weight-raising period */
		if (interactive) {
			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
		} else {
			bfqq->wr_start_at_switch_to_srt = jiffies;
			bfqq->wr_coeff = bfqd->bfq_wr_coeff *
				BFQ_SOFTRT_WEIGHT_FACTOR;
			bfqq->wr_cur_max_time =
				bfqd->bfq_wr_rt_max_time;
		}

		/*
		 * If needed, further reduce budget to make sure it is
		 * close to bfqq's backlog, so as to reduce the
		 * scheduling-error component due to a too large
		 * budget. Do not care about throughput consequences,
		 * but only about latency. Finally, do not assign a
		 * too small budget either, to avoid increasing
		 * latency by causing too frequent expirations.
		 */
		bfqq->entity.budget = min_t(unsigned long,
					    bfqq->entity.budget,
					    2 * bfq_min_budget(bfqd));
	} else if (old_wr_coeff > 1) {
		if (interactive) { /* update wr coeff and duration */
			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
		} else if (soft_rt) {
			/*
			 * The application is now or still meeting the
			 * requirements for being deemed soft rt. We
			 * can then correctly and safely (re)charge
			 * the weight-raising duration for the
			 * application with the weight-raising
			 * duration for soft rt applications.
			 *
			 * In particular, doing this recharge now, i.e.,
			 * before the weight-raising period for the
			 * application finishes, reduces the probability
			 * of the following negative scenario:
			 * 1) the weight of a soft rt application is
			 *    raised at startup (as for any newly
			 *    created application),
			 * 2) since the application is not interactive,
			 *    at a certain time weight-raising is
			 *    stopped for the application,
			 * 3) at that time the application happens to
			 *    still have pending requests, and hence
			 *    is destined to not have a chance to be
			 *    deemed soft rt before these requests are
			 *    completed (see the comments to the
			 *    function bfq_bfqq_softrt_next_start()
			 *    for details on soft rt detection),
			 * 4) these pending requests experience a high
			 *    latency because the application is not
			 *    weight-raised while they are pending.
			 */
			if (bfqq->wr_cur_max_time !=
			    bfqd->bfq_wr_rt_max_time) {
				bfqq->wr_start_at_switch_to_srt =
					bfqq->last_wr_start_finish;

				bfqq->wr_cur_max_time =
					bfqd->bfq_wr_rt_max_time;
				bfqq->wr_coeff = bfqd->bfq_wr_coeff *
					BFQ_SOFTRT_WEIGHT_FACTOR;
			}
			bfqq->last_wr_start_finish = jiffies;
		}
	}
}
static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
					struct bfq_queue *bfqq)
{
	return bfqq->dispatched == 0 &&
		time_is_before_jiffies(
			bfqq->budget_timeout +
			bfqd->bfq_wr_min_idle_time);
}
static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
					     struct bfq_queue *bfqq,
					     int old_wr_coeff,
					     struct request *rq,
					     bool *interactive)
{
	bool soft_rt, wr_or_deserves_wr, bfqq_wants_to_preempt,
		idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
		/*
		 * See the comments on
		 * bfq_bfqq_update_budg_for_activation for
		 * details on the usage of the next variable.
		 */
		arrived_in_time = ktime_get_ns() <=
			bfqq->ttime.last_end_request +
			bfqd->bfq_slice_idle * 3;

	bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);

	/*
	 * bfqq deserves to be weight-raised if:
	 * - it has been idle for enough time or is soft real-time,
	 * - is linked to a bfq_io_cq (it is not shared in any sense).
	 */
	soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
		time_is_before_jiffies(bfqq->soft_rt_next_start);
	*interactive = idle_for_long_time;
	wr_or_deserves_wr = bfqd->low_latency &&
		(bfqq->wr_coeff > 1 ||
		 (bfq_bfqq_sync(bfqq) &&
		  bfqq->bic && (*interactive || soft_rt)));

	/*
	 * Using the last flag, update budget and check whether bfqq
	 * may want to preempt the in-service queue.
	 */
	bfqq_wants_to_preempt =
		bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
						    arrived_in_time,
						    wr_or_deserves_wr);

	if (!bfq_bfqq_IO_bound(bfqq)) {
		if (arrived_in_time) {
			bfqq->requests_within_timer++;
			if (bfqq->requests_within_timer >=
			    bfqd->bfq_requests_within_timer)
				bfq_mark_bfqq_IO_bound(bfqq);
		} else
			bfqq->requests_within_timer = 0;
	}

	if (bfqd->low_latency) {
		if (unlikely(time_is_after_jiffies(bfqq->split_time)))
			bfqq->split_time =
				jiffies - bfqd->bfq_wr_min_idle_time - 1;

		if (time_is_before_jiffies(bfqq->split_time +
					   bfqd->bfq_wr_min_idle_time)) {
			bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
							 old_wr_coeff,
							 wr_or_deserves_wr,
							 *interactive,
							 soft_rt);

			if (old_wr_coeff != bfqq->wr_coeff)
				bfqq->entity.prio_changed = 1;
		}
	}

	bfqq->last_idle_bklogged = jiffies;
	bfqq->service_from_backlogged = 0;
	bfq_clear_bfqq_softrt_update(bfqq);

	bfq_add_bfqq_busy(bfqd, bfqq);

	/*
	 * Expire in-service queue only if preemption may be needed
	 * for guarantees. In this respect, the function
	 * next_queue_may_preempt just checks a simple, necessary
	 * condition, and not a sufficient condition based on
	 * timestamps. In fact, for the latter condition to be
	 * evaluated, timestamps would need first to be updated, and
	 * this operation is quite costly (see the comments on the
	 * function bfq_bfqq_update_budg_for_activation).
	 */
	if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
	    bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
	    next_queue_may_preempt(bfqd))
		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
				false, BFQQE_PREEMPTED);
}
static void bfq_add_request(struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;
	struct request *next_rq, *prev;
	unsigned int old_wr_coeff = bfqq->wr_coeff;
	bool interactive = false;

	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
	bfqq->queued[rq_is_sync(rq)]++;

	elv_rb_add(&bfqq->sort_list, rq);

	/*
	 * Check if this request is a better next-serve candidate.
	 */
	prev = bfqq->next_rq;
	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
	bfqq->next_rq = next_rq;

	/*
	 * Adjust priority tree position, if next_rq changes.
	 */
	if (prev != bfqq->next_rq)
		bfq_pos_tree_add_move(bfqd, bfqq);

	if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
		bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
						 rq, &interactive);

	if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
	    time_is_before_jiffies(
				bfqq->last_wr_start_finish +
				bfqd->bfq_wr_min_inter_arr_async)) {
		bfqq->wr_coeff = bfqd->bfq_wr_coeff;
		bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

		bfqd->wr_busy_queues++;
		bfqq->entity.prio_changed = 1;
	}
	if (prev != bfqq->next_rq)
		bfq_updated_next_req(bfqd, bfqq);

	/*
	 * Assign jiffies to last_wr_start_finish in the following
	 * cases:
	 *
	 * . if bfqq is not going to be weight-raised, because, for
	 *   non weight-raised queues, last_wr_start_finish stores the
	 *   arrival time of the last request; as of now, this piece
	 *   of information is used only for deciding whether to
	 *   weight-raise async queues
	 *
	 * . if bfqq is not weight-raised, because, if bfqq is now
	 *   switching to weight-raised, then last_wr_start_finish
	 *   stores the time when weight-raising starts
	 *
	 * . if bfqq is interactive, because, regardless of whether
	 *   bfqq is currently weight-raised, the weight-raising
	 *   period must start or restart (this case is considered
	 *   separately because it is not detected by the above
	 *   conditions, if bfqq is already weight-raised)
	 *
	 * last_wr_start_finish has to be updated also if bfqq is soft
	 * real-time, because the weight-raising period is constantly
	 * restarted on idle-to-busy transitions for these queues, but
	 * this is already done in bfq_bfqq_handle_idle_busy_switch if
	 * needed.
	 */
	if (bfqd->low_latency &&
	    (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
		bfqq->last_wr_start_finish = jiffies;
}
static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
					  struct bio *bio,
					  struct request_queue *q)
{
	struct bfq_queue *bfqq = bfqd->bio_bfqq;

	if (bfqq)
		return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));

	return NULL;
}
static sector_t get_sdist(sector_t last_pos, struct request *rq)
{
	return abs(blk_rq_pos(rq) - last_pos);
}
#if 0 /* Still not clear if we can do without next two functions */
static void bfq_activate_request(struct request_queue *q, struct request *rq)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;

	bfqd->rq_in_driver++;
}

static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;

	bfqd->rq_in_driver--;
}
#endif
static void bfq_remove_request(struct request_queue *q,
			       struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;
	const int sync = rq_is_sync(rq);

	if (bfqq->next_rq == rq) {
		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
		bfq_updated_next_req(bfqd, bfqq);
	}

	if (rq->queuelist.prev != &rq->queuelist)
		list_del_init(&rq->queuelist);
	bfqq->queued[sync]--;
	elv_rb_del(&bfqq->sort_list, rq);

	elv_rqhash_del(q, rq);
	if (q->last_merge == rq)
		q->last_merge = NULL;

	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
		bfqq->next_rq = NULL;

		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
			bfq_del_bfqq_busy(bfqd, bfqq, false);
			/*
			 * bfqq emptied. In normal operation, when
			 * bfqq is empty, bfqq->entity.service and
			 * bfqq->entity.budget must contain,
			 * respectively, the service received and the
			 * budget used last time bfqq emptied. These
			 * facts do not hold in this case, as at least
			 * this last removal occurred while bfqq is
			 * not in service. To avoid inconsistencies,
			 * reset both bfqq->entity.service and
			 * bfqq->entity.budget, if bfqq has still a
			 * process that may issue I/O requests to it.
			 */
			bfqq->entity.budget = bfqq->entity.service = 0;
		}

		/*
		 * Remove queue from request-position tree as it is empty.
		 */
		if (bfqq->pos_root) {
			rb_erase(&bfqq->pos_node, bfqq->pos_root);
			bfqq->pos_root = NULL;
		}
	}

	if (rq->cmd_flags & REQ_META)
		bfqq->meta_pending--;

	bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
}
static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
{
	struct request_queue *q = hctx->queue;
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct request *free = NULL;
	/*
	 * bfq_bic_lookup grabs the queue_lock: invoke it now and
	 * store its return value for later use, to avoid nesting
	 * queue_lock inside the bfqd->lock. We assume that the bic
	 * returned by bfq_bic_lookup does not go away before
	 * bfqd->lock is taken.
	 */
	struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
	bool ret;

	spin_lock_irq(&bfqd->lock);

	if (bic)
		bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
	else
		bfqd->bio_bfqq = NULL;
	bfqd->bio_bic = bic;

	ret = blk_mq_sched_try_merge(q, bio, &free);

	if (free)
		blk_mq_free_request(free);
	spin_unlock_irq(&bfqd->lock);

	return ret;
}
static int bfq_request_merge(struct request_queue *q, struct request **req,
			     struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct request *__rq;

	__rq = bfq_find_rq_fmerge(bfqd, bio, q);
	if (__rq && elv_bio_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_FRONT_MERGE;
	}

	return ELEVATOR_NO_MERGE;
}
static void bfq_request_merged(struct request_queue *q, struct request *req,
			       enum elv_merge type)
{
	if (type == ELEVATOR_FRONT_MERGE &&
	    rb_prev(&req->rb_node) &&
	    blk_rq_pos(req) <
	    blk_rq_pos(container_of(rb_prev(&req->rb_node),
				    struct request, rb_node))) {
		struct bfq_queue *bfqq = RQ_BFQQ(req);
		struct bfq_data *bfqd = bfqq->bfqd;
		struct request *prev, *next_rq;

		/* Reposition request in its sort_list */
		elv_rb_del(&bfqq->sort_list, req);
		elv_rb_add(&bfqq->sort_list, req);

		/* Choose next request to be served for bfqq */
		prev = bfqq->next_rq;
		next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
					 bfqd->last_position);
		bfqq->next_rq = next_rq;

		/*
		 * If next_rq changes, update both the queue's budget to
		 * fit the new request and the queue's position in its
		 * rq_pos_tree.
		 */
		if (prev != bfqq->next_rq) {
			bfq_updated_next_req(bfqd, bfqq);
			bfq_pos_tree_add_move(bfqd, bfqq);
		}
	}
}
static void bfq_requests_merged(struct request_queue *q, struct request *rq,
				struct request *next)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);

	if (!RB_EMPTY_NODE(&rq->rb_node))
		return;
	spin_lock_irq(&bfqq->bfqd->lock);

	/*
	 * If next and rq belong to the same bfq_queue and next is older
	 * than rq, then reposition rq in the fifo (by substituting next
	 * with rq). Otherwise, if next and rq belong to different
	 * bfq_queues, never reposition rq: in fact, we would have to
	 * reposition it with respect to next's position in its own fifo,
	 * which would most certainly be too expensive with respect to
	 * the benefits.
	 */
	if (bfqq == next_bfqq &&
	    !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
	    next->fifo_time < rq->fifo_time) {
		list_del_init(&rq->queuelist);
		list_replace_init(&next->queuelist, &rq->queuelist);
		rq->fifo_time = next->fifo_time;
	}

	if (bfqq->next_rq == next)
		bfqq->next_rq = rq;

	bfq_remove_request(q, next);

	spin_unlock_irq(&bfqq->bfqd->lock);

	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
}
/* Must be called with bfqq != NULL */
static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
{
	if (bfq_bfqq_busy(bfqq))
		bfqq->bfqd->wr_busy_queues--;
	bfqq->wr_coeff = 1;
	bfqq->wr_cur_max_time = 0;
	bfqq->last_wr_start_finish = jiffies;
	/*
	 * Trigger a weight change on the next invocation of
	 * __bfq_entity_update_weight_prio.
	 */
	bfqq->entity.prio_changed = 1;
}
static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
				    struct bfq_group *bfqg)
{
	int i, j;

	for (i = 0; i < 2; i++)
		for (j = 0; j < IOPRIO_BE_NR; j++)
			if (bfqg->async_bfqq[i][j])
				bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
	if (bfqg->async_idle_bfqq)
		bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
}
static void bfq_end_wr(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq;

	spin_lock_irq(&bfqd->lock);

	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
		bfq_bfqq_end_wr(bfqq);
	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
		bfq_bfqq_end_wr(bfqq);
	bfq_end_wr_async(bfqd);

	spin_unlock_irq(&bfqd->lock);
}
static sector_t bfq_io_struct_pos(void *io_struct, bool request)
{
	if (request)
		return blk_rq_pos(io_struct);
	else
		return ((struct bio *)io_struct)->bi_iter.bi_sector;
}
static int bfq_rq_close_to_sector(void *io_struct, bool request,
				  sector_t sector)
{
	return abs(bfq_io_struct_pos(io_struct, request) - sector) <=
	       BFQQ_CLOSE_THR;
}
static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
					 struct bfq_queue *bfqq,
					 sector_t sector)
{
	struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
	struct rb_node *parent, *node;
	struct bfq_queue *__bfqq;

	if (RB_EMPTY_ROOT(root))
		return NULL;

	/*
	 * First, if we find a request starting at the end of the last
	 * request, choose it.
	 */
	__bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
	if (__bfqq)
		return __bfqq;

	/*
	 * If the exact sector wasn't found, the parent of the NULL leaf
	 * will contain the closest sector (rq_pos_tree sorted by
	 * next_request position).
	 */
	__bfqq = rb_entry(parent, struct bfq_queue, pos_node);
	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
		return __bfqq;

	if (blk_rq_pos(__bfqq->next_rq) < sector)
		node = rb_next(&__bfqq->pos_node);
	else
		node = rb_prev(&__bfqq->pos_node);
	if (!node)
		return NULL;

	__bfqq = rb_entry(node, struct bfq_queue, pos_node);
	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
		return __bfqq;

	return NULL;
}
static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,
						   struct bfq_queue *cur_bfqq,
						   sector_t sector)
{
	struct bfq_queue *bfqq;

	/*
	 * We shall notice if some of the queues are cooperating,
	 * e.g., working closely on the same area of the device. In
	 * that case, we can group them together and: 1) don't waste
	 * time idling, and 2) serve the union of their requests in
	 * the best possible order for throughput.
	 */
	bfqq = bfqq_find_close(bfqd, cur_bfqq, sector);
	if (!bfqq || bfqq == cur_bfqq)
		return NULL;

	return bfqq;
}
static struct bfq_queue *
bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
{
	int process_refs, new_process_refs;
	struct bfq_queue *__bfqq;

	/*
	 * If there are no process references on the new_bfqq, then it is
	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
	 * may have dropped their last reference (not just their last process
	 * reference).
	 */
	if (!bfqq_process_refs(new_bfqq))
		return NULL;

	/* Avoid a circular list and skip interim queue merges. */
	while ((__bfqq = new_bfqq->new_bfqq)) {
		if (__bfqq == bfqq)
			return NULL;
		new_bfqq = __bfqq;
	}

	process_refs = bfqq_process_refs(bfqq);
	new_process_refs = bfqq_process_refs(new_bfqq);
	/*
	 * If the process for the bfqq has gone away, there is no
	 * sense in merging the queues.
	 */
	if (process_refs == 0 || new_process_refs == 0)
		return NULL;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
		new_bfqq->pid);

	/*
	 * Merging is just a redirection: the requests of the process
	 * owning one of the two queues are redirected to the other queue.
	 * The latter queue, in its turn, is set as shared if this is the
	 * first time that the requests of some process are redirected to
	 * it.
	 *
	 * We redirect bfqq to new_bfqq and not the opposite, because we
	 * are in the context of the process owning bfqq, hence we have
	 * the io_cq of this process. So we can immediately configure this
	 * io_cq to redirect the requests of the process to new_bfqq.
	 *
	 * NOTE, even if new_bfqq coincides with the in-service queue, the
	 * io_cq of new_bfqq is not available, because, if the in-service
	 * queue is shared, bfqd->in_service_bic may not point to the
	 * io_cq of the in-service queue.
	 * Redirecting the requests of the process owning bfqq to the
	 * currently in-service queue is in any case the best option, as
	 * we feed the in-service queue with new requests close to the
	 * last request served and, by doing so, hopefully increase the
	 * throughput.
	 */
	bfqq->new_bfqq = new_bfqq;
	new_bfqq->ref += process_refs;
	return new_bfqq;
}
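
/*
 * Illustrative example (not from the original source): if three processes
 * currently hold references on bfqq (bfqq_process_refs(bfqq) == 3), the
 * two statements above make bfqq point to new_bfqq and transfer those
 * three process references to new_bfqq->ref, so that new_bfqq cannot be
 * freed while any of the redirected processes may still reach it through
 * bfqq->new_bfqq.
 */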
static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
					struct bfq_queue *new_bfqq)
{
	if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
	    (bfqq->ioprio_class != new_bfqq->ioprio_class))
		return false;

	/*
	 * If either of the queues has already been detected as seeky,
	 * then merging it with the other queue is unlikely to lead to
	 * any benefit.
	 */
	if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))
		return false;

	/*
	 * Interleaved I/O is known to be done by (some) applications
	 * only for reads, so it does not make sense to merge async
	 * queues.
	 */
	if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq))
		return false;

	return true;
}
/*
 * If this function returns true, then bfqq cannot be merged. The idea
 * is that true cooperation happens very early after processes start
 * to do I/O. Usually, late cooperations are just accidental false
 * positives. In case bfqq is weight-raised, such false positives
 * would evidently degrade latency guarantees for bfqq.
 */
static bool wr_from_too_long(struct bfq_queue *bfqq)
{
	return bfqq->wr_coeff > 1 &&
		time_is_before_jiffies(bfqq->last_wr_start_finish +
				       msecs_to_jiffies(100));
}
/*
 * Attempt to schedule a merge of bfqq with the currently in-service
 * queue or with a close queue among the scheduled queues. Return
 * NULL if no merge was scheduled, a pointer to the shared bfq_queue
 * structure otherwise.
 *
 * The OOM queue is not allowed to participate to cooperation: in fact, since
 * the requests temporarily redirected to the OOM queue could be redirected
 * again to dedicated queues at any time, the state needed to correctly
 * handle merging with the OOM queue would be quite complex and expensive
 * to maintain. Besides, in such a critical condition as an out of memory,
 * the benefits of queue merging may be little relevant, or even negligible.
 *
 * Weight-raised queues can be merged only if their weight-raising
 * period has just started. In fact cooperating processes are usually
 * started together. Thus, with this filter we avoid false positives
 * that would jeopardize low-latency guarantees.
 *
 * WARNING: queue merging may impair fairness among non-weight raised
 * queues, for at least two reasons: 1) the original weight of a
 * merged queue may change during the merged state, 2) even being the
 * weight the same, a merged queue may be bloated with many more
 * requests than the ones produced by its originally-associated
 * process.
 */
static struct bfq_queue *
bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		     void *io_struct, bool request)
{
	struct bfq_queue *in_service_bfqq, *new_bfqq;

	if (bfqq->new_bfqq)
		return bfqq->new_bfqq;

	if (!io_struct ||
	    wr_from_too_long(bfqq) ||
	    unlikely(bfqq == &bfqd->oom_bfqq))
		return NULL;

	/* If there is only one backlogged queue, don't search. */
	if (bfqd->busy_queues == 1)
		return NULL;

	in_service_bfqq = bfqd->in_service_queue;

	if (!in_service_bfqq || in_service_bfqq == bfqq ||
	    !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) ||
	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))
		goto check_scheduled;

	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
	    bfqq->entity.parent == in_service_bfqq->entity.parent &&
	    bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
	}
	/*
	 * Check whether there is a cooperator among currently scheduled
	 * queues. The only thing we need is that the bio/request is not
	 * NULL, as we need it to establish whether a cooperator exists.
	 */
check_scheduled:
	new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
			bfq_io_struct_pos(io_struct, request));

	if (new_bfqq && !wr_from_too_long(new_bfqq) &&
	    likely(new_bfqq != &bfqd->oom_bfqq) &&
	    bfq_may_be_close_cooperator(bfqq, new_bfqq))
		return bfq_setup_merge(bfqq, new_bfqq);

	return NULL;
}
static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
{
	struct bfq_io_cq *bic = bfqq->bic;

	/*
	 * If !bfqq->bic, the queue is already shared or its requests
	 * have already been redirected to a shared queue; both idle window
	 * and weight raising state have already been saved. Do nothing.
	 */
	if (!bic)
		return;

	bic->saved_ttime = bfqq->ttime;
	bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
	bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
	bic->saved_wr_coeff = bfqq->wr_coeff;
	bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
	bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
	bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
}
static void bfq_get_bic_reference(struct bfq_queue *bfqq)
{
	/*
	 * If bfqq->bic has a non-NULL value, the bic to which it belongs
	 * is about to begin using a shared bfq_queue.
	 */
	if (bfqq->bic)
		atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
}
static void
bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
		struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
{
	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
		(unsigned long)new_bfqq->pid);
	/* Save weight raising and idle window of the merged queues */
	bfq_bfqq_save_state(bfqq);
	bfq_bfqq_save_state(new_bfqq);
	if (bfq_bfqq_IO_bound(bfqq))
		bfq_mark_bfqq_IO_bound(new_bfqq);
	bfq_clear_bfqq_IO_bound(bfqq);

	/*
	 * If bfqq is weight-raised, then let new_bfqq inherit
	 * weight-raising. To reduce false positives, neglect the case
	 * where bfqq has just been created, but has not yet made it
	 * to be weight-raised (which may happen because EQM may merge
	 * bfqq even before bfq_add_request is executed for the first
	 * time for bfqq).
	 */
	if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {
		new_bfqq->wr_coeff = bfqq->wr_coeff;
		new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time;
		new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish;
		new_bfqq->wr_start_at_switch_to_srt =
			bfqq->wr_start_at_switch_to_srt;
		if (bfq_bfqq_busy(new_bfqq))
			bfqd->wr_busy_queues++;
		new_bfqq->entity.prio_changed = 1;
	}

	if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */
		bfqq->wr_coeff = 1;
		bfqq->entity.prio_changed = 1;
		if (bfq_bfqq_busy(bfqq))
			bfqd->wr_busy_queues--;
	}

	bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d",
		     bfqd->wr_busy_queues);

	/*
	 * Grab a reference to the bic, to prevent it from being destroyed
	 * before being possibly touched by a bfq_split_bfqq().
	 */
	bfq_get_bic_reference(bfqq);
	bfq_get_bic_reference(new_bfqq);
	/*
	 * Merge queues (that is, let bic redirect its requests to new_bfqq)
	 */
	bic_set_bfqq(bic, new_bfqq, 1);
	bfq_mark_bfqq_coop(new_bfqq);
	/*
	 * new_bfqq now belongs to at least two bics (it is a shared queue):
	 * set new_bfqq->bic to NULL. bfqq either:
	 * - does not belong to any bic any more, and hence bfqq->bic must
	 *   be set to NULL, or
	 * - is a queue whose owning bics have already been redirected to a
	 *   different queue, hence the queue is destined to not belong to
	 *   any bic soon and bfqq->bic is already NULL (therefore the next
	 *   assignment causes no harm).
	 */
	new_bfqq->bic = NULL;
	bfqq->bic = NULL;
	/* release process reference to bfqq */
	bfq_put_queue(bfqq);
}
static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
				struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	bool is_sync = op_is_sync(bio->bi_opf);
	struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq;

	/*
	 * Disallow merge of a sync bio into an async request.
	 */
	if (is_sync && !rq_is_sync(rq))
		return false;

	/*
	 * Lookup the bfqq that this bio will be queued with. Allow
	 * merge only if rq is queued there.
	 */
	if (!bfqq)
		return false;

	/*
	 * We take advantage of this function to perform an early merge
	 * of the queues of possible cooperating processes.
	 */
	new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
	if (new_bfqq) {
		/*
		 * bic still points to bfqq, then it has not yet been
		 * redirected to some other bfq_queue, and a queue
		 * merge between bfqq and new_bfqq can be safely
		 * fulfilled, i.e., bic can be redirected to new_bfqq
		 * and bfqq can be put.
		 */
		bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
				new_bfqq);
		/*
		 * If we get here, bio will be queued into new_queue,
		 * so use new_bfqq to decide whether bio and rq can be
		 * merged.
		 */
		bfqq = new_bfqq;

		/*
		 * Change also bfqd->bio_bfqq, as
		 * bfqd->bio_bic now points to new_bfqq, and
		 * this function may be invoked again (and then may
		 * use again bfqd->bio_bfqq).
		 */
		bfqd->bio_bfqq = bfqq;
	}

	return bfqq == RQ_BFQQ(rq);
}
/*
 * Set the maximum time for the in-service queue to consume its
 * budget. This prevents seeky processes from lowering the throughput.
 * In practice, a time-slice service scheme is used with seeky
 * processes.
 */
static void bfq_set_budget_timeout(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq)
{
	unsigned int timeout_coeff;

	if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
		timeout_coeff = 1;
	else
		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;

	bfqd->last_budget_start = ktime_get();

	bfqq->budget_timeout = jiffies +
		bfqd->bfq_timeout * timeout_coeff;
}
static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
				       struct bfq_queue *bfqq)
{
	bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
	bfq_clear_bfqq_fifo_expire(bfqq);

	bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;

	if (time_is_before_jiffies(bfqq->last_wr_start_finish) &&
	    bfqq->wr_coeff > 1 &&
	    bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
	    time_is_before_jiffies(bfqq->budget_timeout)) {
		/*
		 * For soft real-time queues, move the start
		 * of the weight-raising period forward by the
		 * time the queue has not received any
		 * service. Otherwise, a relatively long
		 * service delay is likely to cause the
		 * weight-raising period of the queue to end,
		 * because of the short duration of the
		 * weight-raising period of a soft real-time
		 * queue. It is worth noting that this move
		 * is not so dangerous for the other queues,
		 * because soft real-time queues are not
		 * greedy.
		 *
		 * To not add a further variable, we use the
		 * overloaded field budget_timeout to
		 * determine for how long the queue has not
		 * received service, i.e., how much time has
		 * elapsed since the queue expired. However,
		 * this is a little imprecise, because
		 * budget_timeout is set to jiffies if bfqq
		 * not only expires, but also remains with no
		 * request.
		 */
		if (time_after(bfqq->budget_timeout,
			       bfqq->last_wr_start_finish))
			bfqq->last_wr_start_finish +=
				jiffies - bfqq->budget_timeout;
		else
			bfqq->last_wr_start_finish = jiffies;
	}

	bfq_set_budget_timeout(bfqd, bfqq);
	bfq_log_bfqq(bfqd, bfqq,
		     "set_in_service_queue, cur-budget = %d",
		     bfqq->entity.budget);

	bfqd->in_service_queue = bfqq;
}
/*
 * Get and set a new queue for service.
 */
static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);

	__bfq_set_in_service_queue(bfqd, bfqq);
	return bfqq;
}
static void bfq_arm_slice_timer(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfqd->in_service_queue;
	struct bfq_io_cq *bic;
	u64 sl;

	/* Processes have exited, don't wait. */
	bic = bfqd->in_service_bic;
	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
		return;

	bfq_mark_bfqq_wait_request(bfqq);

	/*
	 * We don't want to idle for seeks, but we do want to allow
	 * fair distribution of slice time for a process doing back-to-back
	 * seeks. So allow a little bit of time for him to submit a new rq.
	 */
	sl = bfqd->bfq_slice_idle;
	/*
	 * Unless the queue is being weight-raised, grant only minimum
	 * idle time if the queue is seeky. A long idling is preserved
	 * for a weight-raised queue, because it is needed for
	 * guaranteeing to the queue its reserved share of the
	 * throughput.
	 */
	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1)
		sl = min_t(u64, sl, BFQ_MIN_TT);

	bfqd->last_idling_start = ktime_get();
	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
		      HRTIMER_MODE_REL);
	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
}
/*
 * In autotuning mode, max_budget is dynamically recomputed as the
 * amount of sectors transferred in timeout at the estimated peak
 * rate. This enables BFQ to utilize a full timeslice with a full
 * budget, even if the in-service queue is served at peak rate. And
 * this maximises throughput with sequential workloads.
 */
static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
{
	return (u64)bfqd->peak_rate * USEC_PER_MSEC *
	       jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
}
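
/*
 * Illustrative example (not from the original source; the numbers are
 * assumptions): peak_rate is kept in sectors/usec, left-shifted by
 * BFQ_RATE_SHIFT. For an estimated peak rate of about 200 MB/s
 * (~0.4 sectors/usec) and an assumed 125 ms value of bfq_timeout, the
 * formula above yields roughly 0.4 * 1000 * 125 = 50000 sectors
 * (~24 MiB) as the maximum budget, i.e., the amount of service a queue
 * can receive in one full timeslice at peak rate.
 */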
/*
 * Update parameters related to throughput and responsiveness, as a
 * function of the estimated peak rate. See comments on
 * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
 */
static void update_thr_responsiveness_params(struct bfq_data *bfqd)
{
	int dev_type = blk_queue_nonrot(bfqd->queue);

	if (bfqd->bfq_user_max_budget == 0)
		bfqd->bfq_max_budget =
			bfq_calc_max_budget(bfqd);

	if (bfqd->device_speed == BFQ_BFQD_FAST &&
	    bfqd->peak_rate < device_speed_thresh[dev_type]) {
		bfqd->device_speed = BFQ_BFQD_SLOW;
		bfqd->RT_prod = R_slow[dev_type] *
			T_slow[dev_type];
	} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
		   bfqd->peak_rate > device_speed_thresh[dev_type]) {
		bfqd->device_speed = BFQ_BFQD_FAST;
		bfqd->RT_prod = R_fast[dev_type] *
			T_fast[dev_type];
	}

	bfq_log(bfqd,
		"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu sects/sec",
		dev_type == 0 ? "ROT" : "NONROT",
		bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
		bfqd->device_speed == BFQ_BFQD_FAST ?
		(USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
		(USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
		(USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
		BFQ_RATE_SHIFT);
}
static void bfq_reset_rate_computation(struct bfq_data *bfqd,
				       struct request *rq)
{
	if (rq != NULL) { /* new rq dispatch now, reset accordingly */
		bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
		bfqd->peak_rate_samples = 1;
		bfqd->sequential_samples = 0;
		bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
			blk_rq_sectors(rq);
	} else /* no new rq dispatched, just reset the number of samples */
		bfqd->peak_rate_samples = 0; /* full re-init on next disp. */

	bfq_log(bfqd,
		"reset_rate_computation at end, sample %u/%u tot_sects %llu",
		bfqd->peak_rate_samples, bfqd->sequential_samples,
		bfqd->tot_sectors_dispatched);
}
static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
{
	u32 rate, weight, divisor;

	/*
	 * For the convergence property to hold (see comments on
	 * bfq_update_peak_rate()) and for the assessment to be
	 * reliable, a minimum number of samples must be present, and
	 * a minimum amount of time must have elapsed. If not so, do
	 * not compute new rate. Just reset parameters, to get ready
	 * for a new evaluation attempt.
	 */
	if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
	    bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
		goto reset_computation;

	/*
	 * If a new request completion has occurred after last
	 * dispatch, then, to approximate the rate at which requests
	 * have been served by the device, it is more precise to
	 * extend the observation interval to the last completion.
	 */
	bfqd->delta_from_first =
		max_t(u64, bfqd->delta_from_first,
		      bfqd->last_completion - bfqd->first_dispatch);

	/*
	 * Rate computed in sects/usec, and not sects/nsec, for
	 * precision issues.
	 */
	rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
			div_u64(bfqd->delta_from_first, NSEC_PER_USEC));

	/*
	 * Peak rate not updated if:
	 * - the percentage of sequential dispatches is below 3/4 of the
	 *   total, and rate is below the current estimated peak rate
	 * - rate is unreasonably high (> 20M sectors/sec)
	 */
	if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
	     rate <= bfqd->peak_rate) ||
		rate > 20<<BFQ_RATE_SHIFT)
		goto reset_computation;

	/*
	 * We have to update the peak rate, at last! To this purpose,
	 * we use a low-pass filter. We compute the smoothing constant
	 * of the filter as a function of the 'weight' of the new
	 * measured rate.
	 *
	 * As can be seen in next formulas, we define this weight as a
	 * quantity proportional to how sequential the workload is,
	 * and to how long the observation time interval is.
	 *
	 * The weight runs from 0 to 8. The maximum value of the
	 * weight, 8, yields the minimum value for the smoothing
	 * constant. At this minimum value for the smoothing constant,
	 * the measured rate contributes for half of the next value of
	 * the estimated peak rate.
	 *
	 * So, the first step is to compute the weight as a function
	 * of how sequential the workload is. Note that the weight
	 * cannot reach 9, because bfqd->sequential_samples cannot
	 * become equal to bfqd->peak_rate_samples, which, in its
	 * turn, holds true because bfqd->sequential_samples is not
	 * incremented for the first sample.
	 */
	weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;

	/*
	 * Second step: further refine the weight as a function of the
	 * duration of the observation interval.
	 */
	weight = min_t(u32, 8,
		       div_u64(weight * bfqd->delta_from_first,
			       BFQ_RATE_REF_INTERVAL));

	/*
	 * Divisor ranging from 10, for minimum weight, to 2, for
	 * maximum weight.
	 */
	divisor = 10 - weight;

	/*
	 * Finally, update peak rate:
	 *
	 * peak_rate = peak_rate * (divisor-1) / divisor  +  rate / divisor
	 */
	bfqd->peak_rate *= divisor-1;
	bfqd->peak_rate /= divisor;
	rate /= divisor; /* smoothing constant alpha = 1/divisor */

	bfqd->peak_rate += rate;
	update_thr_responsiveness_params(bfqd);

reset_computation:
	bfq_reset_rate_computation(bfqd, rq);
}
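
/*
 * Illustrative example (not from the original source): with a fully
 * sequential workload observed over a full reference interval, weight
 * reaches its maximum of 8, so divisor = 10 - 8 = 2 and the update above
 * becomes peak_rate = peak_rate/2 + rate/2, i.e., the new sample
 * contributes half of the estimate. With weight 0 (mostly random I/O or
 * a very short interval), divisor = 10 and the update is
 * peak_rate = 0.9 * peak_rate + 0.1 * rate, so the estimate reacts much
 * more slowly.
 */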
/*
 * Update the read/write peak rate (the main quantity used for
 * auto-tuning, see update_thr_responsiveness_params()).
 *
 * It is not trivial to estimate the peak rate (correctly): because of
 * the presence of sw and hw queues between the scheduler and the
 * device components that finally serve I/O requests, it is hard to
 * say exactly when a given dispatched request is served inside the
 * device, and for how long. As a consequence, it is hard to know
 * precisely at what rate a given set of requests is actually served
 * by the device.
 *
 * On the opposite end, the dispatch time of any request is trivially
 * available, and, from this piece of information, the "dispatch rate"
 * of requests can be immediately computed. So, the idea in the next
 * function is to use what is known, namely request dispatch times
 * (plus, when useful, request completion times), to estimate what is
 * unknown, namely in-device request service rate.
 *
 * The main issue is that, because of the above facts, the rate at
 * which a certain set of requests is dispatched over a certain time
 * interval can vary greatly with respect to the rate at which the
 * same requests are then served. But, since the size of any
 * intermediate queue is limited, and the service scheme is lossless
 * (no request is silently dropped), the following obvious convergence
 * property holds: the number of requests dispatched MUST become
 * closer and closer to the number of requests completed as the
 * observation interval grows. This is the key property used in
 * the next function to estimate the peak service rate as a function
 * of the observed dispatch rate. The function is assumed to be
 * invoked on every request dispatch.
 */
static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
{
	u64 now_ns = ktime_get_ns();

	if (bfqd->peak_rate_samples == 0) { /* first dispatch */
		bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
			bfqd->peak_rate_samples);
		bfq_reset_rate_computation(bfqd, rq);
		goto update_last_values; /* will add one sample */
	}

	/*
	 * Device idle for very long: the observation interval lasting
	 * up to this dispatch cannot be a valid observation interval
	 * for computing a new peak rate (similarly to the late-
	 * completion event in bfq_completed_request()). Go to
	 * update_rate_and_reset to have the following three steps
	 * taken:
	 * - close the observation interval at the last (previous)
	 *   request dispatch or completion
	 * - compute rate, if possible, for that observation interval
	 * - start a new observation interval with this dispatch
	 */
	if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
	    bfqd->rq_in_driver == 0)
		goto update_rate_and_reset;

	/* Update sampling information */
	bfqd->peak_rate_samples++;

	if ((bfqd->rq_in_driver > 0 ||
	     now_ns - bfqd->last_completion < BFQ_MIN_TT)
	    && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
		bfqd->sequential_samples++;

	bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);

	/* Reset max observed rq size every 32 dispatches */
	if (likely(bfqd->peak_rate_samples % 32))
		bfqd->last_rq_max_size =
			max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
	else
		bfqd->last_rq_max_size = blk_rq_sectors(rq);

	bfqd->delta_from_first = now_ns - bfqd->first_dispatch;

	/* Target observation interval not yet reached, go on sampling */
	if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
		goto update_last_values;

update_rate_and_reset:
	bfq_update_rate_reset(bfqd, rq);
update_last_values:
	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
	bfqd->last_dispatch = now_ns;
}

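/*
 * Rough numerical sketch of the quantity this sampling feeds
 * (illustrative values only): if, over an observation interval of
 * delta_from_first = 100 ms, tot_sectors_dispatched = 20480 sectors
 * (10 MiB) have been dispatched, the measured dispatch rate is
 *
 *	20480 sectors / 0.1 s = 204800 sectors/s (~100 MiB/s)
 *
 * By the convergence property described above, the longer the
 * interval, the closer this dispatch rate gets to the actual device
 * service rate, which bfq_update_rate_reset() then folds into
 * bfqd->peak_rate.
 */
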
/*
 * Remove request from internal lists.
 */
static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);

	/*
	 * For consistency, the next instruction should have been
	 * executed after removing the request from the queue and
	 * dispatching it. We execute instead this instruction before
	 * bfq_remove_request() (and hence introduce a temporary
	 * inconsistency), for efficiency. In fact, should this
	 * dispatch occur for a non in-service bfqq, this anticipated
	 * increment prevents two counters related to bfqq->dispatched
	 * from risking to be, first, uselessly decremented, and then
	 * incremented again when the (new) value of bfqq->dispatched
	 * happens to be taken into account.
	 */
	bfqq->dispatched++;
	bfq_update_peak_rate(q->elevator->elevator_data, rq);

	bfq_remove_request(q, rq);
}

static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	/*
	 * If this bfqq is shared between multiple processes, check
	 * to make sure that those processes are still issuing I/Os
	 * within the mean seek distance. If not, it may be time to
	 * break the queues apart again.
	 */
	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
		bfq_mark_bfqq_split_coop(bfqq);

	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
		if (bfqq->dispatched == 0)
			/*
			 * Overloading budget_timeout field to store
			 * the time at which the queue remains with no
			 * backlog and no outstanding request; used by
			 * the weight-raising mechanism.
			 */
			bfqq->budget_timeout = jiffies;

		bfq_del_bfqq_busy(bfqd, bfqq, true);
	} else {
		bfq_requeue_bfqq(bfqd, bfqq);
		/*
		 * Resort priority tree of potential close cooperators.
		 */
		bfq_pos_tree_add_move(bfqd, bfqq);
	}

	/*
	 * All in-service entities must have been properly deactivated
	 * or requeued before executing the next function, which
	 * resets all in-service entities as no longer in service.
	 */
	__bfq_bfqd_reset_in_service(bfqd);
}

/**
 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
 * @bfqd: device data.
 * @bfqq: queue to update.
 * @reason: reason for expiration.
 *
 * Handle the feedback on @bfqq budget at queue expiration.
 * See the body for detailed comments.
 */
static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
				     struct bfq_queue *bfqq,
				     enum bfqq_expiration reason)
{
	struct request *next_rq;
	int budget, min_budget;

	min_budget = bfq_min_budget(bfqd);

	if (bfqq->wr_coeff == 1)
		budget = bfqq->max_budget;
	else /*
	      * Use a constant, low budget for weight-raised queues,
	      * to help achieve a low latency. Keep it slightly higher
	      * than the minimum possible budget, to cause a little
	      * bit fewer expirations.
	      */
		budget = 2 * min_budget;

	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
		budget, bfq_min_budget(bfqd));
	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));

	if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
		switch (reason) {
		/*
		 * Caveat: in all the following cases we trade latency
		 * for throughput.
		 */
		case BFQQE_TOO_IDLE:
			/*
			 * This is the only case where we may reduce
			 * the budget: if there is no request of the
			 * process still waiting for completion, then
			 * we assume (tentatively) that the timer has
			 * expired because the batch of requests of
			 * the process could have been served with a
			 * smaller budget. Hence, betting that the
			 * process will behave in the same way when it
			 * becomes backlogged again, we reduce its
			 * next budget. As long as we guess right,
			 * this budget cut reduces the latency
			 * experienced by the process.
			 *
			 * However, if there are still outstanding
			 * requests, then the process may have not yet
			 * issued its next request just because it is
			 * still waiting for the completion of some of
			 * the still outstanding ones. So in this
			 * subcase we do not reduce its budget, on the
			 * contrary we increase it to possibly boost
			 * the throughput, as discussed in the
			 * comments to the BUDGET_TIMEOUT case.
			 */
			if (bfqq->dispatched > 0) /* still outstanding reqs */
				budget = min(budget * 2, bfqd->bfq_max_budget);
			else {
				if (budget > 5 * min_budget)
					budget -= 4 * min_budget;
				else
					budget = min_budget;
			}
			break;
		case BFQQE_BUDGET_TIMEOUT:
			/*
			 * We double the budget here because it gives
			 * the chance to boost the throughput if this
			 * is not a seeky process (and has bumped into
			 * this timeout because of, e.g., ZBR).
			 */
			budget = min(budget * 2, bfqd->bfq_max_budget);
			break;
		case BFQQE_BUDGET_EXHAUSTED:
			/*
			 * The process still has backlog, and did not
			 * let either the budget timeout or the disk
			 * idling timeout expire. Hence it is not
			 * seeky, has a short thinktime and may be
			 * happy with a higher budget too. So
			 * definitely increase the budget of this good
			 * candidate to boost the disk throughput.
			 */
			budget = min(budget * 4, bfqd->bfq_max_budget);
			break;
		case BFQQE_NO_MORE_REQUESTS:
			/*
			 * For queues that expire for this reason, it
			 * is particularly important to keep the
			 * budget close to the actual service they
			 * need. Doing so reduces the timestamp
			 * misalignment problem described in the
			 * comments in the body of
			 * __bfq_activate_entity. In fact, suppose
			 * that a queue systematically expires for
			 * BFQQE_NO_MORE_REQUESTS and presents a
			 * new request in time to enjoy timestamp
			 * back-shifting. The larger the budget of the
			 * queue is with respect to the service the
			 * queue actually requests in each service
			 * slot, the more times the queue can be
			 * reactivated with the same virtual finish
			 * time. It follows that, even if this finish
			 * time is pushed to the system virtual time
			 * to reduce the consequent timestamp
			 * misalignment, the queue unjustly enjoys for
			 * many re-activations a lower finish time
			 * than all newly activated queues.
			 *
			 * The service needed by bfqq is measured
			 * quite precisely by bfqq->entity.service.
			 * Since bfqq does not enjoy device idling,
			 * bfqq->entity.service is equal to the number
			 * of sectors that the process associated with
			 * bfqq requested to read/write before waiting
			 * for request completions, or blocking for
			 * other reasons.
			 */
			budget = max_t(int, bfqq->entity.service, min_budget);
			break;
		default:
			return;
		}
	} else if (!bfq_bfqq_sync(bfqq)) {
		/*
		 * Async queues get always the maximum possible
		 * budget, as for them we do not care about latency
		 * (in addition, their ability to dispatch is limited
		 * by the charging factor).
		 */
		budget = bfqd->bfq_max_budget;
	}

	bfqq->max_budget = budget;

	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
	    !bfqd->bfq_user_max_budget)
		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);

	/*
	 * If there is still backlog, then assign a new budget, making
	 * sure that it is large enough for the next request. Since
	 * the finish time of bfqq must be kept in sync with the
	 * budget, be sure to call __bfq_bfqq_expire() *after* this
	 * update.
	 *
	 * If there is no backlog, then no need to update the budget;
	 * it will be updated on the arrival of a new request.
	 */
	next_rq = bfqq->next_rq;
	if (next_rq)
		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
					    bfq_serv_to_charge(next_rq, bfqq));

	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
			next_rq ? blk_rq_sectors(next_rq) : 0,
			bfqq->entity.budget);
}

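/*
 * Numerical sketch of the budget feedback above (illustrative values
 * only, in sectors): assume min_budget = 256 and bfq_max_budget =
 * 16384. A sync, non-weight-raised queue that keeps exhausting its
 * budget has it multiplied by 4 at each expiration, e.g.
 * 2048 -> 8192 -> 16384 (capped), whereas a queue that repeatedly
 * expires for BFQQE_TOO_IDLE with no outstanding requests has it cut
 * by 4 * min_budget, e.g. 2048 -> 1024 -> 256, down to min_budget.
 */
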
/*
 * Return true if the process associated with bfqq is "slow". The slow
 * flag is used, in addition to the budget timeout, to reduce the
 * amount of service provided to seeky processes, and thus reduce
 * their chances to lower the throughput. More details in the comments
 * on the function bfq_bfqq_expire().
 *
 * An important observation is in order: as discussed in the comments
 * on the function bfq_update_peak_rate(), with devices with internal
 * queues, it is hard, if ever possible, to know when and for how long
 * an I/O request is processed by the device (apart from the trivial
 * I/O pattern where a new request is dispatched only after the
 * previous one has been completed). This makes it hard to evaluate
 * the real rate at which the I/O requests of each bfq_queue are
 * served. In fact, for an I/O scheduler like BFQ, serving a
 * bfq_queue means just dispatching its requests during its service
 * slot (i.e., until the budget of the queue is exhausted, or the
 * queue remains idle, or, finally, a timeout fires). But, during the
 * service slot of a bfq_queue, around 100 ms at most, the device may
 * be even still processing requests of bfq_queues served in previous
 * service slots. On the opposite end, the requests of the in-service
 * bfq_queue may be completed after the service slot of the queue
 * finishes.
 *
 * Anyway, unless more sophisticated solutions are used
 * (where possible), the sum of the sizes of the requests dispatched
 * during the service slot of a bfq_queue is probably the only
 * approximation available for the service received by the bfq_queue
 * during its service slot. And this sum is the quantity used in this
 * function to evaluate the I/O speed of a process.
 */
static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			     bool compensate, enum bfqq_expiration reason,
			     unsigned long *delta_ms)
{
	ktime_t delta_ktime;
	u32 delta_usecs;
	bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */

	if (!bfq_bfqq_sync(bfqq))
		return false;

	if (compensate)
		delta_ktime = bfqd->last_idling_start;
	else
		delta_ktime = ktime_get();
	delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
	delta_usecs = ktime_to_us(delta_ktime);

	/* don't use too short time intervals */
	if (delta_usecs < 1000) {
		if (blk_queue_nonrot(bfqd->queue))
			 /*
			  * give same worst-case guarantees as idling
			  * for seeky
			  */
			*delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
		else /* charge at least one seek */
			*delta_ms = bfq_slice_idle / NSEC_PER_MSEC;

		return slow;
	}

	*delta_ms = delta_usecs / USEC_PER_MSEC;

	/*
	 * Use only long (> 20ms) intervals to filter out excessive
	 * spikes in service rate estimation.
	 */
	if (delta_usecs > 20000) {
		/*
		 * Caveat for rotational devices: processes doing I/O
		 * in the slower disk zones tend to be slow(er) even
		 * if not seeky. In this respect, the estimated peak
		 * rate is likely to be an average over the disk
		 * surface. Accordingly, to not be too harsh with
		 * unlucky processes, a process is deemed slow only if
		 * its rate has been lower than half of the estimated
		 * peak rate.
		 */
		slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
	}

	bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);

	return slow;
}

/*
 * To be deemed as soft real-time, an application must meet two
 * requirements. First, the application must not require an average
 * bandwidth higher than the approximate bandwidth required to playback or
 * record a compressed high-definition video.
 * The next function is invoked on the completion of the last request of a
 * batch, to compute the next-start time instant, soft_rt_next_start, such
 * that, if the next request of the application does not arrive before
 * soft_rt_next_start, then the above requirement on the bandwidth is met.
 *
 * The second requirement is that the request pattern of the application is
 * isochronous, i.e., that, after issuing a request or a batch of requests,
 * the application stops issuing new requests until all its pending requests
 * have been completed. After that, the application may issue a new batch,
 * and so on.
 * For this reason the next function is invoked to compute
 * soft_rt_next_start only for applications that meet this requirement,
 * whereas soft_rt_next_start is set to infinity for applications that do
 * not.
 *
 * Unfortunately, even a greedy application may happen to behave in an
 * isochronous way if the CPU load is high. In fact, the application may
 * stop issuing requests while the CPUs are busy serving other processes,
 * then restart, then stop again for a while, and so on. In addition, if
 * the disk achieves a low enough throughput with the request pattern
 * issued by the application (e.g., because the request pattern is random
 * and/or the device is slow), then the application may meet the above
 * bandwidth requirement too. To prevent such a greedy application from
 * being deemed soft real-time, a further rule is used in the computation of
 * soft_rt_next_start: soft_rt_next_start must be higher than the current
 * time plus the maximum time for which the arrival of a request is waited
 * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
 * This filters out greedy applications, as the latter issue instead their
 * next request as soon as possible after the last one has been completed
 * (in contrast, when a batch of requests is completed, a soft real-time
 * application spends some time processing data).
 *
 * Unfortunately, the last filter may easily generate false positives if
 * only bfqd->bfq_slice_idle is used as a reference time interval and one
 * or both the following cases occur:
 * 1) HZ is so low that the duration of a jiffy is comparable to or higher
 *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
 *    HZ=100.
 * 2) jiffies, instead of increasing at a constant rate, may stop increasing
 *    for a while, then suddenly 'jump' by several units to recover the lost
 *    increments. This seems to happen, e.g., inside virtual machines.
 * To address this issue, we do not use as a reference time interval just
 * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
 * particular we add the minimum number of jiffies for which the filter
 * seems to be quite precise also in embedded systems and KVM/QEMU virtual
 * machines.
 */
static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
						struct bfq_queue *bfqq)
{
	return max(bfqq->last_idle_bklogged +
		   HZ * bfqq->service_from_backlogged /
		   bfqd->bfq_wr_max_softrt_rate,
		   jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
}

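/*
 * Illustrative instance of the formula above (made-up numbers): with
 * HZ = 250, bfq_wr_max_softrt_rate = 7000 sectors/s and
 * service_from_backlogged = 1400 sectors, the first operand of max()
 * is last_idle_bklogged + 250 * 1400 / 7000 = last_idle_bklogged + 50
 * jiffies (200 ms), i.e., the earliest instant at which issuing the
 * next batch still keeps the average bandwidth within the soft
 * real-time threshold. The second operand enforces the greedy-
 * application filter: at least bfq_slice_idle plus four jiffies in
 * the future.
 */
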
/*
 * Return the farthest future time instant according to jiffies.
 */
static unsigned long bfq_greatest_from_now(void)
{
	return jiffies + MAX_JIFFY_OFFSET;
}

/*
 * Return the farthest past time instant according to jiffies.
 */
static unsigned long bfq_smallest_from_now(void)
{
	return jiffies - MAX_JIFFY_OFFSET;
}

/**
 * bfq_bfqq_expire - expire a queue.
 * @bfqd: device owning the queue.
 * @bfqq: the queue to expire.
 * @compensate: if true, compensate for the time spent idling.
 * @reason: the reason causing the expiration.
 *
 * If the process associated with bfqq does slow I/O (e.g., because it
 * issues random requests), we charge bfqq with the time it has been
 * in service instead of the service it has received (see
 * bfq_bfqq_charge_time for details on how this goal is achieved). As
 * a consequence, bfqq will typically get higher timestamps upon
 * reactivation, and hence it will be rescheduled as if it had
 * received more service than what it has actually received. In the
 * end, bfqq receives less service in proportion to how slowly its
 * associated process consumes its budgets (and hence how seriously it
 * tends to lower the throughput). In addition, this time-charging
 * strategy guarantees time fairness among slow processes. In
 * contrast, if the process associated with bfqq is not slow, we
 * charge bfqq exactly with the service it has received.
 *
 * Charging time to the first type of queues and the exact service to
 * the other has the effect of using the WF2Q+ policy to schedule the
 * former on a timeslice basis, without violating service domain
 * guarantees among the latter.
 */
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason)
{
	bool slow;
	unsigned long delta = 0;
	struct bfq_entity *entity = &bfqq->entity;
	int ref;

	/*
	 * Check whether the process is slow (see bfq_bfqq_is_slow).
	 */
	slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);

	/*
	 * Increase service_from_backlogged before next statement,
	 * because the possible next invocation of
	 * bfq_bfqq_charge_time would likely inflate
	 * entity->service. In contrast, service_from_backlogged must
	 * contain real service, to enable the soft real-time
	 * heuristic to correctly compute the bandwidth consumed by
	 * bfqq.
	 */
	bfqq->service_from_backlogged += entity->service;

	/*
	 * As above explained, charge slow (typically seeky) and
	 * timed-out queues with the time and not the service
	 * received, to favor sequential workloads.
	 *
	 * Processes doing I/O in the slower disk zones will tend to
	 * be slow(er) even if not seeky. Therefore, since the
	 * estimated peak rate is actually an average over the disk
	 * surface, these processes may timeout just for bad luck. To
	 * avoid punishing them, do not charge time to processes that
	 * succeeded in consuming at least 2/3 of their budget. This
	 * allows BFQ to preserve enough elasticity to still perform
	 * bandwidth, and not time, distribution with little unlucky
	 * or quasi-sequential processes.
	 */
	if (bfqq->wr_coeff == 1 &&
	    (slow ||
	     (reason == BFQQE_BUDGET_TIMEOUT &&
	      bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
		bfq_bfqq_charge_time(bfqd, bfqq, delta);

	if (reason == BFQQE_TOO_IDLE &&
	    entity->service <= 2 * entity->budget / 10)
		bfq_clear_bfqq_IO_bound(bfqq);

	if (bfqd->low_latency && bfqq->wr_coeff == 1)
		bfqq->last_wr_start_finish = jiffies;

	if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
	    RB_EMPTY_ROOT(&bfqq->sort_list)) {
		/*
		 * If we get here, and there are no outstanding
		 * requests, then the request pattern is isochronous
		 * (see the comments on the function
		 * bfq_bfqq_softrt_next_start()). Thus we can compute
		 * soft_rt_next_start. If, instead, the queue still
		 * has outstanding requests, then we have to wait for
		 * the completion of all the outstanding requests to
		 * discover whether the request pattern is actually
		 * isochronous.
		 */
		if (bfqq->dispatched == 0)
			bfqq->soft_rt_next_start =
				bfq_bfqq_softrt_next_start(bfqd, bfqq);
		else {
			/*
			 * The application is still waiting for the
			 * completion of one or more requests:
			 * prevent it from possibly being incorrectly
			 * deemed as soft real-time by setting its
			 * soft_rt_next_start to infinity. In fact,
			 * without this assignment, the application
			 * would be incorrectly deemed as soft
			 * real-time if:
			 * 1) it issued a new request before the
			 *    completion of all its in-flight
			 *    requests, and
			 * 2) at that time, its soft_rt_next_start
			 *    happened to be in the past.
			 */
			bfqq->soft_rt_next_start =
				bfq_greatest_from_now();
			/*
			 * Schedule an update of soft_rt_next_start to when
			 * the task may be discovered to be isochronous.
			 */
			bfq_mark_bfqq_softrt_update(bfqq);
		}
	}

	bfq_log_bfqq(bfqd, bfqq,
		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));

	/*
	 * Increase, decrease or leave budget unchanged according to
	 * reason.
	 */
	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
	ref = bfqq->ref;
	__bfq_bfqq_expire(bfqd, bfqq);

	/* mark bfqq as waiting a request only if a bic still points to it */
	if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
	    reason != BFQQE_BUDGET_TIMEOUT &&
	    reason != BFQQE_BUDGET_EXHAUSTED)
		bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
}

/*
 * Budget timeout is not implemented through a dedicated timer, but
 * just checked on request arrivals and completions, as well as on
 * idle timer expirations.
 */
static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
{
	return time_is_before_eq_jiffies(bfqq->budget_timeout);
}

/*
 * If we expire a queue that is actively waiting (i.e., with the
 * device idled) for the arrival of a new request, then we may incur
 * the timestamp misalignment problem described in the body of the
 * function __bfq_activate_entity. Hence we return true only if this
 * condition does not hold, or if the queue is slow enough to deserve
 * only to be kicked off for preserving a high throughput.
 */
static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqq->bfqd, bfqq,
		"may_budget_timeout: wait_request %d left %d timeout %d",
		bfq_bfqq_wait_request(bfqq),
		bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
		bfq_bfqq_budget_timeout(bfqq));

	return (!bfq_bfqq_wait_request(bfqq) ||
		bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
		&&
		bfq_bfqq_budget_timeout(bfqq);
}

/*
 * For a queue that becomes empty, device idling is allowed only if
 * this function returns true for the queue. As a consequence, since
 * device idling plays a critical role in both throughput boosting and
 * service guarantees, the return value of this function plays a
 * critical role in both these aspects as well.
 *
 * In a nutshell, this function returns true only if idling is
 * beneficial for throughput or, even if detrimental for throughput,
 * idling is however necessary to preserve service guarantees (low
 * latency, desired throughput distribution, ...). In particular, on
 * NCQ-capable devices, this function tries to return false, so as to
 * help keep the drives' internal queues full, whenever this helps the
 * device boost the throughput without causing any service-guarantee
 * issue.
 *
 * In more detail, the return value of this function is obtained by,
 * first, computing a number of boolean variables that take into
 * account throughput and service-guarantee issues, and, then,
 * combining these variables in a logical expression. Most of the
 * issues taken into account are not trivial. We discuss these issues
 * individually while introducing the variables.
 */
static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	bool idling_boosts_thr, idling_boosts_thr_without_issues,
		asymmetric_scenario;

	if (bfqd->strict_guarantees)
		return true;

	/*
	 * The next variable takes into account the cases where idling
	 * boosts the throughput.
	 *
	 * The value of the variable is computed considering that
	 * idling is usually beneficial for the throughput if:
	 * (a) the device is not NCQ-capable, or
	 * (b) regardless of the presence of NCQ, the request pattern
	 *     for bfqq is I/O-bound (possible throughput losses
	 *     caused by granting idling to seeky queues are mitigated
	 *     by the fact that, in all scenarios where boosting
	 *     throughput is the best thing to do, i.e., in all
	 *     symmetric scenarios, only a minimal idle time is
	 *     allowed to seeky queues).
	 */
	idling_boosts_thr = !bfqd->hw_tag || bfq_bfqq_IO_bound(bfqq);

	/*
	 * The value of the next variable,
	 * idling_boosts_thr_without_issues, is equal to that of
	 * idling_boosts_thr, unless a special case holds. In this
	 * special case, described below, idling may cause problems to
	 * weight-raised queues.
	 *
	 * When the request pool is saturated (e.g., in the presence
	 * of write hogs), if the processes associated with
	 * non-weight-raised queues ask for requests at a lower rate,
	 * then processes associated with weight-raised queues have a
	 * higher probability to get a request from the pool
	 * immediately (or at least soon) when they need one. Thus
	 * they have a higher probability to actually get a fraction
	 * of the device throughput proportional to their high
	 * weight. This is especially true with NCQ-capable drives,
	 * which enqueue several requests in advance, and further
	 * reorder internally-queued requests.
	 *
	 * For this reason, we force to false the value of
	 * idling_boosts_thr_without_issues if there are weight-raised
	 * busy queues. In this case, and if bfqq is not weight-raised,
	 * this guarantees that the device is not idled for bfqq (if,
	 * instead, bfqq is weight-raised, then idling will be
	 * guaranteed by another variable, see below). Combined with
	 * the timestamping rules of BFQ (see [1] for details), this
	 * behavior causes bfqq, and hence any sync non-weight-raised
	 * queue, to get a lower number of requests served, and thus
	 * to ask for a lower number of requests from the request
	 * pool, before the busy weight-raised queues get served
	 * again. This often mitigates starvation problems in the
	 * presence of heavy write workloads and NCQ, thereby
	 * guaranteeing a higher application and system responsiveness
	 * in these hostile scenarios.
	 */
	idling_boosts_thr_without_issues = idling_boosts_thr &&
		bfqd->wr_busy_queues == 0;

	/*
	 * There is then a case where idling must be performed not for
	 * throughput concerns, but to preserve service guarantees. To
	 * introduce it, we can note that allowing the drive to
	 * enqueue more than one request at a time, and hence
	 * delegating de facto final scheduling decisions to the
	 * drive's internal scheduler, causes loss of control on the
	 * actual request service order. In particular, the critical
	 * situation is when requests from different processes happen
	 * to be present, at the same time, in the internal queue(s)
	 * of the drive. In such a situation, the drive, by deciding
	 * the service order of the internally-queued requests, does
	 * determine also the actual throughput distribution among
	 * these processes. But the drive typically has no notion or
	 * concern about per-process throughput distribution, and
	 * makes its decisions only on a per-request basis. Therefore,
	 * the service distribution enforced by the drive's internal
	 * scheduler is likely to coincide with the desired
	 * device-throughput distribution only in a completely
	 * symmetric scenario where: (i) each of these processes must
	 * get the same throughput as the others; (ii) all these
	 * processes have the same I/O pattern (either sequential or
	 * random). In fact, in such a scenario, the drive will tend
	 * to treat the requests of each of these processes in about
	 * the same way as the requests of the others, and thus to
	 * provide each of these processes with about the same
	 * throughput (which is exactly the desired throughput
	 * distribution). In contrast, in any asymmetric scenario,
	 * device idling is certainly needed to guarantee that bfqq
	 * receives its assigned fraction of the device throughput
	 * (see [1] for details).
	 *
	 * As for sub-condition (i), actually we check only whether
	 * bfqq is being weight-raised. In fact, if bfqq is not being
	 * weight-raised, we have that:
	 * - if the process associated with bfqq is not I/O-bound, then
	 *   it is not either latency- or throughput-critical; therefore
	 *   idling is not needed for bfqq;
	 * - if the process associated with bfqq is I/O-bound, then
	 *   idling is already granted with bfqq (see the comments on
	 *   idling_boosts_thr).
	 *
	 * We do not check sub-condition (ii) at all, i.e., the next
	 * variable is true if and only if bfqq is being
	 * weight-raised. We do not need to control sub-condition (ii)
	 * for the following reason:
	 * - if bfqq is being weight-raised, then idling is already
	 *   guaranteed to bfqq by sub-condition (i);
	 * - if bfqq is not being weight-raised, then idling is
	 *   already guaranteed to bfqq (only) if it matters, i.e., if
	 *   bfqq is associated to a currently I/O-bound process (see
	 *   the above comment on sub-condition (i)).
	 *
	 * As a side note, it is worth considering that the above
	 * device-idling countermeasures may however fail in the
	 * following unlucky scenario: if idling is (correctly)
	 * disabled in a time period during which the symmetry
	 * sub-condition holds, and hence the device is allowed to
	 * enqueue many requests, but at some later point in time some
	 * sub-condition stops holding, then it may become impossible
	 * to let requests be served in the desired order until all
	 * the requests already queued in the device have been served.
	 */
	asymmetric_scenario = bfqq->wr_coeff > 1;

	/*
	 * We have now all the components we need to compute the return
	 * value of the function, which is true only if both the following
	 * conditions hold:
	 * 1) bfqq is sync, because idling makes sense only for sync queues;
	 * 2) idling either boosts the throughput (without issues), or
	 *    is necessary to preserve service guarantees.
	 */
	return bfq_bfqq_sync(bfqq) &&
		(idling_boosts_thr_without_issues || asymmetric_scenario);
}

/*
 * If the in-service queue is empty but the function bfq_bfqq_may_idle
 * returns true, then:
 * 1) the queue must remain in service and cannot be expired, and
 * 2) the device must be idled to wait for the possible arrival of a new
 *    request for the queue.
 * See the comments on the function bfq_bfqq_may_idle for the reasons
 * why performing device idling is the best choice to boost the throughput
 * and preserve service guarantees when bfq_bfqq_may_idle itself
 * returns true.
 */
static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;

	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
	       bfq_bfqq_may_idle(bfqq);
}

/*
 * Select a queue for service. If we have a current queue in service,
 * check whether to continue servicing it, or retrieve and set a new one.
 */
static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq;
	struct request *next_rq;
	enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;

	bfqq = bfqd->in_service_queue;
	if (!bfqq)
		goto new_queue;

	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

	if (bfq_may_expire_for_budg_timeout(bfqq) &&
	    !bfq_bfqq_wait_request(bfqq) &&
	    !bfq_bfqq_must_idle(bfqq))
		goto expire;

check_queue:
	/*
	 * This loop is rarely executed more than once. Even when it
	 * happens, it is much more convenient to re-execute this loop
	 * than to return NULL and trigger a new dispatch to get a
	 * request served.
	 */
	next_rq = bfqq->next_rq;
	/*
	 * If bfqq has requests queued and it has enough budget left to
	 * serve them, keep the queue, otherwise expire it.
	 */
	if (next_rq) {
		if (bfq_serv_to_charge(next_rq, bfqq) >
			bfq_bfqq_budget_left(bfqq)) {
			/*
			 * Expire the queue for budget exhaustion,
			 * which makes sure that the next budget is
			 * enough to serve the next request, even if
			 * it comes from the fifo expired path.
			 */
			reason = BFQQE_BUDGET_EXHAUSTED;
			goto expire;
		} else {
			/*
			 * The idle timer may be pending because we may
			 * not disable disk idling even when a new request
			 * arrives.
			 */
			if (bfq_bfqq_wait_request(bfqq)) {
				/*
				 * If we get here: 1) at least a new request
				 * has arrived but we have not disabled the
				 * timer because the request was too small,
				 * 2) then the block layer has unplugged
				 * the device, causing the dispatch to be
				 * invoked.
				 *
				 * Since the device is unplugged, now the
				 * requests are probably large enough to
				 * provide a reasonable throughput.
				 * So we disable idling.
				 */
				bfq_clear_bfqq_wait_request(bfqq);
				hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
				bfqg_stats_update_idle_time(bfqq_group(bfqq));
			}
			goto keep_queue;
		}
	}

	/*
	 * No requests pending. However, if the in-service queue is idling
	 * for a new request, or has requests waiting for a completion and
	 * may idle after their completion, then keep it anyway.
	 */
	if (bfq_bfqq_wait_request(bfqq) ||
	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
		bfqq = NULL;
		goto keep_queue;
	}

	reason = BFQQE_NO_MORE_REQUESTS;
expire:
	bfq_bfqq_expire(bfqd, bfqq, false, reason);
new_queue:
	bfqq = bfq_set_in_service_queue(bfqd);
	if (bfqq) {
		bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
		goto check_queue;
	}
keep_queue:
	if (bfqq)
		bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
	else
		bfq_log(bfqd, "select_queue: no queue returned");

	return bfqq;
}

static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
		bfq_log_bfqq(bfqd, bfqq,
			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
			jiffies_to_msecs(bfqq->wr_cur_max_time),
			bfqq->wr_coeff,
			bfqq->entity.weight, bfqq->entity.orig_weight);

		if (entity->prio_changed)
			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");

		/*
		 * If too much time has elapsed from the beginning of
		 * this weight-raising period, then end weight raising.
		 */
		if (time_is_before_jiffies(bfqq->last_wr_start_finish +
					   bfqq->wr_cur_max_time)) {
			if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
			time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
					       bfq_wr_duration(bfqd)))
				bfq_bfqq_end_wr(bfqq);
			else {
				/* switch back to interactive wr */
				bfqq->wr_coeff = bfqd->bfq_wr_coeff;
				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
				bfqq->last_wr_start_finish =
					bfqq->wr_start_at_switch_to_srt;
				bfqq->entity.prio_changed = 1;
			}
		}
	}
	/* Update weight both if it must be raised and if it must be lowered */
	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
		__bfq_entity_update_weight_prio(
			bfq_entity_service_tree(entity),
			entity);
}

/*
 * Dispatch next request from bfqq.
 */
static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
						 struct bfq_queue *bfqq)
{
	struct request *rq = bfqq->next_rq;
	unsigned long service_to_charge;

	service_to_charge = bfq_serv_to_charge(rq, bfqq);

	bfq_bfqq_served(bfqq, service_to_charge);

	bfq_dispatch_remove(bfqd->queue, rq);

	/*
	 * If weight raising has to terminate for bfqq, then next
	 * function causes an immediate update of bfqq's weight,
	 * without waiting for next activation. As a consequence, on
	 * expiration, bfqq will be timestamped as if it had never been
	 * weight-raised during this service slot, even if it has
	 * received part or even most of the service as a
	 * weight-raised queue. This inflates bfqq's timestamps, which
	 * is beneficial, as bfqq is then more willing to leave the
	 * device immediately to possible other weight-raised queues.
	 */
	bfq_update_wr_data(bfqd, bfqq);

	if (!bfqd->in_service_bic) {
		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
		bfqd->in_service_bic = RQ_BIC(rq);
	}

	/*
	 * Expire bfqq, pretending that its budget expired, if bfqq
	 * belongs to CLASS_IDLE and other queues are waiting for
	 * service.
	 */
	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
		goto expire;

	return rq;

expire:
	bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
	return rq;
}

static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;

	/*
	 * Avoiding lock: a race on bfqd->busy_queues should cause at
	 * most a call to dispatch for nothing
	 */
	return !list_empty_careful(&bfqd->dispatch) ||
		bfqd->busy_queues > 0;
}

static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq = NULL;
	struct bfq_queue *bfqq = NULL;

	if (!list_empty(&bfqd->dispatch)) {
		rq = list_first_entry(&bfqd->dispatch, struct request,
				      queuelist);
		list_del_init(&rq->queuelist);

		bfqq = RQ_BFQQ(rq);

		if (bfqq) {
			/*
			 * Increment counters here, because this
			 * dispatch does not follow the standard
			 * dispatch flow (where counters are
			 * incremented)
			 */
			bfqq->dispatched++;

			goto inc_in_driver_start_rq;
		}

		/*
		 * We exploit the put_rq_private hook to decrement
		 * rq_in_driver, but put_rq_private will not be
		 * invoked on this request. So, to avoid unbalance,
		 * just start this request, without incrementing
		 * rq_in_driver. As a negative consequence,
		 * rq_in_driver is deceptively lower than it should be
		 * while this request is in service. This may cause
		 * bfq_schedule_dispatch to be invoked uselessly.
		 *
		 * As for implementing an exact solution, the
		 * put_request hook, if defined, is probably invoked
		 * also on this request. So, by exploiting this hook,
		 * we could 1) increment rq_in_driver here, and 2)
		 * decrement it in put_request. Such a solution would
		 * let the value of the counter be always accurate,
		 * but it would entail using an extra interface
		 * function. This cost seems higher than the benefit,
		 * given that the frequency of non-elevator-private
		 * requests is very low.
		 */
		goto start_rq;
	}

	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);

	if (bfqd->busy_queues == 0)
		goto exit;

	/*
	 * Force device to serve one request at a time if
	 * strict_guarantees is true. Forcing this service scheme is
	 * currently the ONLY way to guarantee that the request
	 * service order enforced by the scheduler is respected by a
	 * queueing device. Otherwise the device is free even to make
	 * some unlucky request wait for as long as the device
	 * wishes.
	 *
	 * Of course, serving one request at a time may cause loss of
	 * throughput.
	 */
	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
		goto exit;

	bfqq = bfq_select_queue(bfqd);
	if (!bfqq)
		goto exit;

	rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);

	if (rq) {
inc_in_driver_start_rq:
		bfqd->rq_in_driver++;
start_rq:
		rq->rq_flags |= RQF_STARTED;
	}
exit:
	return rq;
}

static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq;

	spin_lock_irq(&bfqd->lock);

	rq = __bfq_dispatch_request(hctx);
	bfq_unlock_put_ioc(bfqd);

	return rq;
}

/*
 * Task holds one reference to the queue, dropped when task exits. Each rq
 * in-flight on this queue also holds a reference, dropped when rq is freed.
 *
 * Scheduler lock must be held here. Recall not to use bfqq after calling
 * this function on it.
 */
static void bfq_put_queue(struct bfq_queue *bfqq)
{
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_group *bfqg = bfqq_group(bfqq);
#endif

	if (bfqq->bfqd)
		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
			     bfqq, bfqq->ref);

	bfqq->ref--;
	if (bfqq->ref)
		return;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);

	kmem_cache_free(bfq_pool, bfqq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	bfqg_put(bfqg);
#endif
}

static void bfq_put_cooperator(struct bfq_queue *bfqq)
{
	struct bfq_queue *__bfqq, *next;

	/*
	 * If this queue was scheduled to merge with another queue, be
	 * sure to drop the reference taken on that queue (and others in
	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
	 */
	__bfqq = bfqq->new_bfqq;
	while (__bfqq) {
		if (__bfqq == bfqq)
			break;
		next = __bfqq->new_bfqq;
		bfq_put_queue(__bfqq);
		__bfqq = next;
	}
}

static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	if (bfqq == bfqd->in_service_queue) {
		__bfq_bfqq_expire(bfqd, bfqq);
		bfq_schedule_dispatch(bfqd);
	}

	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);

	bfq_put_cooperator(bfqq);

	bfq_put_queue(bfqq); /* release process reference */
}

static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
	struct bfq_data *bfqd;

	if (bfqq)
		bfqd = bfqq->bfqd; /* NULL if scheduler already exited */

	if (bfqq && bfqd) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);
		/*
		 * If the bic is using a shared queue, put the
		 * reference taken on the io_context when the bic
		 * started using a shared bfq_queue. This put cannot
		 * make ioc->ref_count reach 0, so no ioc->lock risks
		 * being taken (leading to possible deadlock
		 * scenarios).
		 */
		if (is_sync && bfq_bfqq_coop(bfqq))
			put_io_context(bic->icq.ioc);

		bfq_exit_bfqq(bfqd, bfqq);
		bic_set_bfqq(bic, NULL, is_sync);
		bfq_unlock_put_ioc_restore(bfqd, flags);
	}
}

static void bfq_exit_icq(struct io_cq *icq)
{
	struct bfq_io_cq *bic = icq_to_bic(icq);

	bfq_exit_icq_bfqq(bic, true);
	bfq_exit_icq_bfqq(bic, false);
}

/*
 * Update the entity prio values; note that the new values will not
 * be used until the next (re)activation.
 */
static void
bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
{
	struct task_struct *tsk = current;
	int ioprio_class;
	struct bfq_data *bfqd = bfqq->bfqd;

	if (!bfqd)
		return;

	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	switch (ioprio_class) {
	default:
		dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
			"bfq: bad prio class %d\n", ioprio_class);
	case IOPRIO_CLASS_NONE:
		/*
		 * No prio set, inherit CPU scheduling settings.
		 */
		bfqq->new_ioprio = task_nice_ioprio(tsk);
		bfqq->new_ioprio_class = task_nice_ioclass(tsk);
		break;
	case IOPRIO_CLASS_RT:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
		break;
	case IOPRIO_CLASS_BE:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
		break;
	case IOPRIO_CLASS_IDLE:
		bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
		bfqq->new_ioprio = 7;
		bfq_clear_bfqq_idle_window(bfqq);
		break;
	}

	if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
		pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
			bfqq->new_ioprio);
		bfqq->new_ioprio = IOPRIO_BE_NR;
	}

	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
	bfqq->entity.prio_changed = 1;
}

static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
{
	struct bfq_data *bfqd = bic_to_bfqd(bic);
	struct bfq_queue *bfqq;
	int ioprio = bic->icq.ioc->ioprio;

	/*
	 * This condition may trigger on a newly created bic, be sure to
	 * drop the lock before returning.
	 */
	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
		return;

	bic->ioprio = ioprio;

	bfqq = bic_to_bfqq(bic, false);
	if (bfqq) {
		/* release process reference on this queue */
		bfq_put_queue(bfqq);
		bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
		bic_set_bfqq(bic, bfqq, false);
	}

	bfqq = bic_to_bfqq(bic, true);
	if (bfqq)
		bfq_set_next_ioprio_data(bfqq, bic);
}

static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_io_cq *bic, pid_t pid, int is_sync)
{
	RB_CLEAR_NODE(&bfqq->entity.rb_node);
	INIT_LIST_HEAD(&bfqq->fifo);

	bfqq->ref = 0;
	bfqq->bfqd = bfqd;

	if (bic)
		bfq_set_next_ioprio_data(bfqq, bic);

	if (is_sync) {
		if (!bfq_class_idle(bfqq))
			bfq_mark_bfqq_idle_window(bfqq);
		bfq_mark_bfqq_sync(bfqq);
	} else
		bfq_clear_bfqq_sync(bfqq);

	/* set end request to minus infinity from now */
	bfqq->ttime.last_end_request = ktime_get_ns() + 1;

	bfq_mark_bfqq_IO_bound(bfqq);

	bfqq->pid = pid;

	/* Tentative initial value to trade off between thr and lat */
	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
	bfqq->budget_timeout = bfq_smallest_from_now();

	bfqq->wr_coeff = 1;
	bfqq->last_wr_start_finish = jiffies;
	bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
	bfqq->split_time = bfq_smallest_from_now();

	/*
	 * Set to the value for which bfqq will not be deemed as
	 * soft rt when it becomes backlogged.
	 */
	bfqq->soft_rt_next_start = bfq_greatest_from_now();

	/* first request is almost certainly seeky */
	bfqq->seek_history = 1;
}

static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
					       struct bfq_group *bfqg,
					       int ioprio_class, int ioprio)
{
	switch (ioprio_class) {
	case IOPRIO_CLASS_RT:
		return &bfqg->async_bfqq[0][ioprio];
	case IOPRIO_CLASS_NONE:
		ioprio = IOPRIO_NORM;
		/* fall through */
	case IOPRIO_CLASS_BE:
		return &bfqg->async_bfqq[1][ioprio];
	case IOPRIO_CLASS_IDLE:
		return &bfqg->async_idle_bfqq;
	default:
		return NULL;
	}
}

static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic)
{
	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	struct bfq_queue **async_bfqq = NULL;
	struct bfq_queue *bfqq;
	struct bfq_group *bfqg;

	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
	if (!bfqg) {
		bfqq = &bfqd->oom_bfqq;
		goto out;
	}

	if (!is_sync) {
		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
						  ioprio);
		bfqq = *async_bfqq;
		if (bfqq)
			goto out;
	}

	bfqq = kmem_cache_alloc_node(bfq_pool,
				     GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
				     bfqd->queue->node);

	if (bfqq) {
		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
			      is_sync);
		bfq_init_entity(&bfqq->entity, bfqg);
		bfq_log_bfqq(bfqd, bfqq, "allocated");
	} else {
		bfqq = &bfqd->oom_bfqq;
		bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
		goto out;
	}

	/*
	 * Pin the queue now that it's allocated, scheduler exit will
	 * prune it.
	 */
	if (async_bfqq) {
		bfqq->ref++; /*
			      * Extra group reference, w.r.t. sync
			      * queue. This extra reference is removed
			      * only if bfqq->bfqg disappears, to
			      * guarantee that this queue is not freed
			      * until its group goes away.
			      */
		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
			     bfqq, bfqq->ref);
		*async_bfqq = bfqq;
	}

out:
	bfqq->ref++; /* get a process reference to this queue */
	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
	return bfqq;
}

static void bfq_update_io_thinktime(struct bfq_data *bfqd,
				    struct bfq_queue *bfqq)
{
	struct bfq_ttime *ttime = &bfqq->ttime;
	u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;

	elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);

	ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
	ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
				     ttime->ttime_samples);
}

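/*
 * Illustrative arithmetic for the decayed averages above (made-up
 * numbers): with a previous ttime_total of 8000 and a new elapsed
 * think time of 1600, the update gives
 *
 *	(7 * 8000 + 256 * 1600) / 8 = (56000 + 409600) / 8 = 58200
 *
 * i.e., each new sample enters with weight 256 while the history
 * decays by a factor 7/8; ttime_mean then divides ttime_total by the
 * similarly decayed sample count.
 */
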
static void
bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		       struct request *rq)
{
	bfqq->seek_history <<= 1;
	bfqq->seek_history |=
		get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
		(!blk_queue_nonrot(bfqd->queue) ||
		 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
}

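/*
 * Sketch of how the bitmap above is typically consumed (the exact
 * constants live in the BFQQ_SEEKY() macro and may differ from the
 * hypothetical names used here): seek_history is a shift register
 * with one bit per recently enqueued request, set when that request
 * was far from the previous one. A check of the form
 *
 *	hweight32(bfqq->seek_history) > SEEKY_THRESHOLD
 *
 * then classifies the queue as seeky when most of its recent requests
 * required a seek.
 */
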
/*
 * Disable idle window if the process thinks too long or seeks so much that
 * it doesn't matter.
 */
static void bfq_update_idle_window(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq,
				   struct bfq_io_cq *bic)
{
	int enable_idle;

	/* Don't idle for async or idle io prio class. */
	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
		return;

	/* Idle window just restored, statistics are meaningless. */
	if (time_is_after_eq_jiffies(bfqq->split_time +
				     bfqd->bfq_wr_min_idle_time))
		return;

	enable_idle = bfq_bfqq_idle_window(bfqq);

	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
	    bfqd->bfq_slice_idle == 0 ||
	    (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
	     bfqq->wr_coeff == 1))
		enable_idle = 0;
	else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
		if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle &&
		    bfqq->wr_coeff == 1)
			enable_idle = 0;
		else
			enable_idle = 1;
	}
	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
		enable_idle);

	if (enable_idle)
		bfq_mark_bfqq_idle_window(bfqq);
	else
		bfq_clear_bfqq_idle_window(bfqq);
}

/*
 * Called when a new fs request (rq) is added to bfqq. Check if there's
 * something we should do about it.
 */
static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			    struct request *rq)
{
	struct bfq_io_cq *bic = RQ_BIC(rq);

	if (rq->cmd_flags & REQ_META)
		bfqq->meta_pending++;

	bfq_update_io_thinktime(bfqd, bfqq);
	bfq_update_io_seektime(bfqd, bfqq, rq);
	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
	    !BFQQ_SEEKY(bfqq))
		bfq_update_idle_window(bfqd, bfqq, bic);

	bfq_log_bfqq(bfqd, bfqq,
		     "rq_enqueued: idle_window=%d (seeky %d)",
		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));

	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
		bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
				 blk_rq_sectors(rq) < 32;
		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);

		/*
		 * There is just this request queued: if the request
		 * is small and the queue is not to be expired, then
		 * just exit.
		 *
		 * In this way, if the device is being idled to wait
		 * for a new request from the in-service queue, we
		 * avoid unplugging the device and committing the
		 * device to serve just a small request. On the
		 * contrary, we wait for the block layer to decide
		 * when to unplug the device: hopefully, new requests
		 * will be merged to this one quickly, then the device
		 * will be unplugged and larger requests will be
		 * dispatched.
		 */
		if (small_req && !budget_timeout)
			return;

		/*
		 * A large enough request arrived, or the queue is to
		 * be expired: in both cases disk idling is to be
		 * stopped, so clear wait_request flag and reset
		 * timer.
		 */
		bfq_clear_bfqq_wait_request(bfqq);
		hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
		bfqg_stats_update_idle_time(bfqq_group(bfqq));

		/*
		 * The queue is not empty, because a new request just
		 * arrived. Hence we can safely expire the queue, in
		 * case of budget timeout, without risking that the
		 * timestamps of the queue are not updated correctly.
		 * See [1] for more details.
		 */
		if (budget_timeout)
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_BUDGET_TIMEOUT);
	}
}

static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq),
		*new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);

	if (new_bfqq) {
		if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
			new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
		/*
		 * Release the request's reference to the old bfqq
		 * and make sure one is taken to the shared queue.
		 */
		new_bfqq->allocated++;
		bfqq->allocated--;
		new_bfqq->ref++;
		/*
		 * If the bic associated with the process
		 * issuing this request still points to bfqq
		 * (and thus has not been already redirected
		 * to new_bfqq or even some other bfq_queue),
		 * then complete the merge and redirect it to
		 * new_bfqq.
		 */
		if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
			bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
					bfqq, new_bfqq);
		/*
		 * rq is about to be enqueued into new_bfqq,
		 * release rq reference on bfqq
		 */
		bfq_put_queue(bfqq);
		rq->elv.priv[1] = new_bfqq;
		bfqq = new_bfqq;
	}

	bfq_add_request(rq);

	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
	list_add_tail(&rq->queuelist, &bfqq->fifo);

	bfq_rq_enqueued(bfqd, bfqq, rq);
}

static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
			       bool at_head)
{
	struct request_queue *q = hctx->queue;
	struct bfq_data *bfqd = q->elevator->elevator_data;

	spin_lock_irq(&bfqd->lock);
	if (blk_mq_sched_try_insert_merge(q, rq)) {
		spin_unlock_irq(&bfqd->lock);
		return;
	}

	spin_unlock_irq(&bfqd->lock);

	blk_mq_sched_request_inserted(rq);

	spin_lock_irq(&bfqd->lock);
	if (at_head || blk_rq_is_passthrough(rq)) {
		if (at_head)
			list_add(&rq->queuelist, &bfqd->dispatch);
		else
			list_add_tail(&rq->queuelist, &bfqd->dispatch);
	} else {
		__bfq_insert_request(bfqd, rq);

		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}
	}

	bfq_unlock_put_ioc(bfqd);
}

static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
				struct list_head *list, bool at_head)
{
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		bfq_insert_request(hctx, rq, at_head);
	}
}

static void bfq_update_hw_tag(struct bfq_data *bfqd)
{
	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
				       bfqd->rq_in_driver);

	if (bfqd->hw_tag == 1)
		return;

	/*
	 * This sample is valid if the number of outstanding requests
	 * is large enough to allow a queueing behavior. Note that the
	 * sum is not exact, as it's not taking into account deactivated
	 * requests.
	 */
	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
		return;

	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
		return;

	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
	bfqd->max_rq_in_driver = 0;
	bfqd->hw_tag_samples = 0;
}

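/*
 * Hypothetical trace of the detection above (the thresholds are the
 * ones BFQ_HW_QUEUE_THRESHOLD and BFQ_HW_QUEUE_SAMPLES stand for,
 * assumed here to be 4 and 32): only completions observed while at
 * least 4 requests are pending contribute samples; after 32 such
 * samples, hw_tag becomes 1 iff the maximum number of requests ever
 * seen in the driver exceeded 4, i.e., iff the device demonstrably
 * queues several commands internally. The counters are then reset so
 * that the estimate keeps adapting.
 */
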
static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
{
	u64 now_ns;
	u32 delta_us;

	bfq_update_hw_tag(bfqd);

	bfqd->rq_in_driver--;
	bfqq->dispatched--;

	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
		/*
		 * Set budget_timeout (which we overload to store the
		 * time at which the queue remains with no backlog and
		 * no outstanding request; used by the weight-raising
		 * mechanism).
		 */
		bfqq->budget_timeout = jiffies;
	}

	now_ns = ktime_get_ns();

	bfqq->ttime.last_end_request = now_ns;

	/*
	 * Using us instead of ns, to get a reasonable precision in
	 * computing rate in next check.
	 */
	delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);

	/*
	 * If the request took rather long to complete, and, according
	 * to the maximum request size recorded, this completion latency
	 * implies that the request was certainly served at a very low
	 * rate (less than 1M sectors/sec), then the whole observation
	 * interval that lasts up to this time instant cannot be a
	 * valid time interval for computing a new peak rate. Invoke
	 * bfq_update_rate_reset to have the following three steps
	 * taken:
	 * - close the observation interval at the last (previous)
	 *   request dispatch or completion
	 * - compute rate, if possible, for that observation interval
	 * - reset to zero samples, which will trigger a proper
	 *   re-initialization of the observation interval on next
	 *   dispatch
	 */
	if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
	   (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
			1UL<<(BFQ_RATE_SHIFT - 10))
		bfq_update_rate_reset(bfqd, NULL);
	bfqd->last_completion = now_ns;

	/*
	 * If we are waiting to discover whether the request pattern
	 * of the task associated with the queue is actually
	 * isochronous, and both requisites for this condition to hold
	 * are now satisfied, then compute soft_rt_next_start (see the
	 * comments on the function bfq_bfqq_softrt_next_start()). We
	 * schedule this delayed check when bfqq expires, if it still
	 * has in-flight requests.
	 */
	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
	    RB_EMPTY_ROOT(&bfqq->sort_list))
		bfqq->soft_rt_next_start =
			bfq_bfqq_softrt_next_start(bfqd, bfqq);

	/*
	 * If this is the in-service queue, check if it needs to be expired,
	 * or if we want to idle in case it has no pending requests.
	 */
	if (bfqd->in_service_queue == bfqq) {
		if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
			bfq_arm_slice_timer(bfqd);
			return;
		} else if (bfq_may_expire_for_budg_timeout(bfqq))
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_BUDGET_TIMEOUT);
		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
			 (bfqq->dispatched == 0 ||
			  !bfq_bfqq_may_idle(bfqq)))
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_NO_MORE_REQUESTS);
	}
}

static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
{
	bfqq->allocated--;

	bfq_put_queue(bfqq);
}

static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;

	if (rq->rq_flags & RQF_STARTED)
		bfqg_stats_update_completion(bfqq_group(bfqq),
					     rq_start_time_ns(rq),
					     rq_io_start_time_ns(rq),
					     rq->cmd_flags);

	if (likely(rq->rq_flags & RQF_STARTED)) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);

		bfq_completed_request(bfqq, bfqd);
		bfq_put_rq_priv_body(bfqq);

		bfq_unlock_put_ioc_restore(bfqd, flags);
	} else {
		/*
		 * Request rq may be still/already in the scheduler,
		 * in which case we need to remove it. And we cannot
		 * defer such a check and removal, to avoid
		 * inconsistencies in the time interval from the end
		 * of this function to the start of the deferred work.
		 * This situation seems to occur only in process
		 * context, as a consequence of a merge. In the
		 * current version of the code, this implies that the
		 * lock is held.
		 */

		if (!RB_EMPTY_NODE(&rq->rb_node))
			bfq_remove_request(q, rq);
		bfq_put_rq_priv_body(bfqq);
	}

	rq->elv.priv[0] = NULL;
	rq->elv.priv[1] = NULL;
}

/*
 * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
 * was the last process referring to that bfqq.
 */
static struct bfq_queue *
bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
{
        bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");

        if (bfqq_process_refs(bfqq) == 1) {
                bfqq->pid = current->pid;
                bfq_clear_bfqq_coop(bfqq);
                bfq_clear_bfqq_split_coop(bfqq);
                return bfqq;
        }

        bic_set_bfqq(bic, NULL, 1);

        bfq_put_cooperator(bfqq);

        bfq_put_queue(bfqq);
        return NULL;
}
static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
                                                   struct bfq_io_cq *bic,
                                                   struct bio *bio,
                                                   bool split, bool is_sync,
                                                   bool *new_queue)
{
        struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);

        if (likely(bfqq && bfqq != &bfqd->oom_bfqq))
                return bfqq;

        if (new_queue)
                *new_queue = true;

        if (bfqq)
                bfq_put_queue(bfqq);
        bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);

        bic_set_bfqq(bic, bfqq, is_sync);
        if (split && is_sync)
                bfqq->split_time = jiffies;

        return bfqq;
}
/*
 * Allocate bfq data structures associated with this request.
 */
static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
                              struct bio *bio)
{
        struct bfq_data *bfqd = q->elevator->elevator_data;
        struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
        const int is_sync = rq_is_sync(rq);
        struct bfq_queue *bfqq;
        bool new_queue = false;

        spin_lock_irq(&bfqd->lock);

        bfq_check_ioprio_change(bic, bio);

        if (!bic)
                goto queue_fail;

        bfq_bic_update_cgroup(bic, bio);

        bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
                                         &new_queue);

        if (likely(!new_queue)) {
                /* If the queue was seeky for too long, break it apart. */
                if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
                        bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
                        bfqq = bfq_split_bfqq(bic, bfqq);
                        /*
                         * A reference to bic->icq.ioc needs to be
                         * released after a queue split. Do not do it
                         * immediately, to not risk to possibly take
                         * an ioc->lock while holding the scheduler
                         * lock.
                         */
                        bfqd->ioc_to_put = bic->icq.ioc;

                        if (!bfqq)
                                bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
                                                                 true, is_sync,
                                                                 NULL);
                }
        }

        bfqq->ref++;
        bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
                     rq, bfqq, bfqq->ref);

        rq->elv.priv[0] = bic;
        rq->elv.priv[1] = bfqq;

        /*
         * If a bfq_queue has only one process reference, it is owned
         * by only this bic: we can then set bfqq->bic = bic. In
         * addition, if the queue has also just been split, we have to
         * resume its state.
         */
        if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
                bfqq->bic = bic;
                if (bfqd->ioc_to_put) { /* if true, there has been a split */
                        /*
                         * The queue has just been split from a shared
                         * queue: restore the idle window and the
                         * possible weight raising period.
                         */
                        bfq_bfqq_resume_state(bfqq, bic);
                }
        }

        bfq_unlock_put_ioc(bfqd);

        return 0;

queue_fail:
        spin_unlock_irq(&bfqd->lock);

        return 1;
}
static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
{
        struct bfq_data *bfqd = bfqq->bfqd;
        enum bfqq_expiration reason;
        unsigned long flags;

        spin_lock_irqsave(&bfqd->lock, flags);
        bfq_clear_bfqq_wait_request(bfqq);

        if (bfqq != bfqd->in_service_queue) {
                spin_unlock_irqrestore(&bfqd->lock, flags);
                return;
        }

        if (bfq_bfqq_budget_timeout(bfqq))
                /*
                 * Also here the queue can be safely expired
                 * for budget timeout without wasting
                 * guarantees
                 */
                reason = BFQQE_BUDGET_TIMEOUT;
        else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
                /*
                 * The queue may not be empty upon timer expiration,
                 * because we may not disable the timer when the
                 * first request of the in-service queue arrives
                 * during disk idling.
                 */
                reason = BFQQE_TOO_IDLE;
        else
                goto schedule_dispatch;

        bfq_bfqq_expire(bfqd, bfqq, true, reason);

schedule_dispatch:
        bfq_unlock_put_ioc_restore(bfqd, flags);
        bfq_schedule_dispatch(bfqd);
}
/*
 * Handler of the expiration of the timer running if the in-service queue
 * is idling inside its time slice.
 */
static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
{
        struct bfq_data *bfqd = container_of(timer, struct bfq_data,
                                             idle_slice_timer);
        struct bfq_queue *bfqq = bfqd->in_service_queue;

        /*
         * Theoretical race here: the in-service queue can be NULL or
         * different from the queue that was idling if a new request
         * arrives for the current queue and there is a full dispatch
         * cycle that changes the in-service queue. This can hardly
         * happen, but in the worst case we just expire a queue too
         * early.
         */
        if (bfqq)
                bfq_idle_slice_timer_body(bfqq);

        return HRTIMER_NORESTART;
}
static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
                                 struct bfq_queue **bfqq_ptr)
{
        struct bfq_queue *bfqq = *bfqq_ptr;

        bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
        if (bfqq) {
                bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);

                bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
                             bfqq, bfqq->ref);
                bfq_put_queue(bfqq);
                *bfqq_ptr = NULL;
        }
}
/*
 * Release all the bfqg references to its async queues. If we are
 * deallocating the group these queues may still contain requests, so
 * we reparent them to the root cgroup (i.e., the only one that will
 * exist for sure until all the requests on a device are gone).
 */
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
{
        int i, j;

        for (i = 0; i < 2; i++)
                for (j = 0; j < IOPRIO_BE_NR; j++)
                        __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);

        __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
}
static void bfq_exit_queue(struct elevator_queue *e)
{
        struct bfq_data *bfqd = e->elevator_data;
        struct bfq_queue *bfqq, *n;

        hrtimer_cancel(&bfqd->idle_slice_timer);

        spin_lock_irq(&bfqd->lock);
        list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
                bfq_deactivate_bfqq(bfqd, bfqq, false, false);
        spin_unlock_irq(&bfqd->lock);

        hrtimer_cancel(&bfqd->idle_slice_timer);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
        blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
#else
        spin_lock_irq(&bfqd->lock);
        bfq_put_async_queues(bfqd, bfqd->root_group);
        kfree(bfqd->root_group);
        spin_unlock_irq(&bfqd->lock);
#endif

        kfree(bfqd);
}
static void bfq_init_root_group(struct bfq_group *root_group,
                                struct bfq_data *bfqd)
{
        int i;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
        root_group->entity.parent = NULL;
        root_group->my_entity = NULL;
        root_group->bfqd = bfqd;
#endif
        root_group->rq_pos_tree = RB_ROOT;
        for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
                root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
        root_group->sched_data.bfq_class_idle_last_service = jiffies;
}
static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
        struct bfq_data *bfqd;
        struct elevator_queue *eq;

        eq = elevator_alloc(q, e);
        if (!eq)
                return -ENOMEM;

        bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
        if (!bfqd) {
                kobject_put(&eq->kobj);
                return -ENOMEM;
        }
        eq->elevator_data = bfqd;

        spin_lock_irq(q->queue_lock);
        q->elevator = eq;
        spin_unlock_irq(q->queue_lock);

        /*
         * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
         * Grab a permanent reference to it, so that the normal code flow
         * will not attempt to free it.
         */
        bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
        bfqd->oom_bfqq.ref++;
        bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
        bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
        bfqd->oom_bfqq.entity.new_weight =
                bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
        /*
         * Trigger weight initialization, according to ioprio, at the
         * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
         * class won't be changed any more.
         */
        bfqd->oom_bfqq.entity.prio_changed = 1;

        bfqd->queue = q;

        INIT_LIST_HEAD(&bfqd->dispatch);

        hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
                     HRTIMER_MODE_REL);
        bfqd->idle_slice_timer.function = bfq_idle_slice_timer;

        INIT_LIST_HEAD(&bfqd->active_list);
        INIT_LIST_HEAD(&bfqd->idle_list);

        bfqd->hw_tag = -1;

        bfqd->bfq_max_budget = bfq_default_max_budget;

        bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
        bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
        bfqd->bfq_back_max = bfq_back_max;
        bfqd->bfq_back_penalty = bfq_back_penalty;
        bfqd->bfq_slice_idle = bfq_slice_idle;
        bfqd->bfq_timeout = bfq_timeout;

        bfqd->bfq_requests_within_timer = 120;

        bfqd->low_latency = true;

        /*
         * Trade-off between responsiveness and fairness.
         */
        bfqd->bfq_wr_coeff = 30;
        bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
        bfqd->bfq_wr_max_time = 0;
        bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
        bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
        bfqd->bfq_wr_max_softrt_rate = 7000; /*
                                              * Approximate rate required
                                              * to playback or record a
                                              * high-definition compressed
                                              * video.
                                              */
        bfqd->wr_busy_queues = 0;
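
        /*
         * Illustrative note: bfq_wr_coeff is the factor by which the
         * weight of a queue is multiplied while the queue is
         * weight-raised. With the value 30 set above, a raised queue
         * with a base weight of, say, 100 competes as if its weight
         * were 3000 until its weight-raising period ends.
         */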
        /*
         * Begin by assuming, optimistically, that the device is a
         * high-speed one, and that its peak rate is equal to 2/3 of
         * the highest reference rate.
         */
        bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
                        T_fast[blk_queue_nonrot(bfqd->queue)];
        bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
        bfqd->device_speed = BFQ_BFQD_FAST;

        spin_lock_init(&bfqd->lock);

        /*
         * The invocation of the next bfq_create_group_hierarchy
         * function is the head of a chain of function calls
         * (bfq_create_group_hierarchy->blkcg_activate_policy->
         * blk_mq_freeze_queue) that may lead to the invocation of the
         * has_work hook function. For this reason,
         * bfq_create_group_hierarchy is invoked only after all
         * scheduler data has been initialized, apart from the fields
         * that can be initialized only after invoking
         * bfq_create_group_hierarchy. This, in particular, enables
         * has_work to correctly return false. Of course, to avoid
         * other inconsistencies, the blk-mq stack must then refrain
         * from invoking further scheduler hooks before this init
         * function is finished.
         */
        bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
        if (!bfqd->root_group)
                goto out_free;
        bfq_init_root_group(bfqd->root_group, bfqd);
        bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);

        return 0;

out_free:
        kfree(bfqd);
        kobject_put(&eq->kobj);
        return -ENOMEM;
}
static void bfq_slab_kill(void)
{
        kmem_cache_destroy(bfq_pool);
}

static int __init bfq_slab_setup(void)
{
        bfq_pool = KMEM_CACHE(bfq_queue, 0);
        if (!bfq_pool)
                return -ENOMEM;
        return 0;
}
static ssize_t bfq_var_show(unsigned int var, char *page)
{
        return sprintf(page, "%u\n", var);
}

static ssize_t bfq_var_store(unsigned long *var, const char *page,
                             size_t count)
{
        unsigned long new_val;
        int ret = kstrtoul(page, 10, &new_val);

        if (ret == 0)
                *var = new_val;

        return count;
}
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
static ssize_t __FUNC(struct elevator_queue *e, char *page)             \
{                                                                       \
        struct bfq_data *bfqd = e->elevator_data;                       \
        u64 __data = __VAR;                                             \
        if (__CONV == 1)                                                \
                __data = jiffies_to_msecs(__data);                      \
        else if (__CONV == 2)                                           \
                __data = div_u64(__data, NSEC_PER_MSEC);                \
        return bfq_var_show(__data, (page));                            \
}
SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
#undef SHOW_FUNCTION
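
/*
 * For example, SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2)
 * above defines a show method that reports bfq_slice_idle converted from
 * nanoseconds to milliseconds; a __CONV of 1 converts from jiffies to
 * milliseconds, and a __CONV of 0 reports the stored value unchanged.
 */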
#define USEC_SHOW_FUNCTION(__FUNC, __VAR)                               \
static ssize_t __FUNC(struct elevator_queue *e, char *page)             \
{                                                                       \
        struct bfq_data *bfqd = e->elevator_data;                       \
        u64 __data = __VAR;                                             \
        __data = div_u64(__data, NSEC_PER_USEC);                        \
        return bfq_var_show(__data, (page));                            \
}
USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
#undef USEC_SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
static ssize_t                                                          \
__FUNC(struct elevator_queue *e, const char *page, size_t count)        \
{                                                                       \
        struct bfq_data *bfqd = e->elevator_data;                       \
        unsigned long uninitialized_var(__data);                        \
        int ret = bfq_var_store(&__data, (page), count);                \
        if (__data < (MIN))                                             \
                __data = (MIN);                                         \
        else if (__data > (MAX))                                        \
                __data = (MAX);                                         \
        if (__CONV == 1)                                                \
                *(__PTR) = msecs_to_jiffies(__data);                    \
        else if (__CONV == 2)                                           \
                *(__PTR) = (u64)__data * NSEC_PER_MSEC;                 \
        else                                                            \
                *(__PTR) = __data;                                      \
        return ret;                                                     \
}
STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
                INT_MAX, 2);
STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
                INT_MAX, 2);
STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
                INT_MAX, 0);
STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
#undef STORE_FUNCTION
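
/*
 * For example, through the method defined by
 * STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2),
 * writing "5" first clamps the value to [0, INT_MAX] and then stores
 * 5 * NSEC_PER_MSEC nanoseconds in bfqd->bfq_slice_idle.
 */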
#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)                    \
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
{                                                                       \
        struct bfq_data *bfqd = e->elevator_data;                       \
        unsigned long uninitialized_var(__data);                        \
        int ret = bfq_var_store(&__data, (page), count);                \
        if (__data < (MIN))                                             \
                __data = (MIN);                                         \
        else if (__data > (MAX))                                        \
                __data = (MAX);                                         \
        *(__PTR) = (u64)__data * NSEC_PER_USEC;                         \
        return ret;                                                     \
}
USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
                    UINT_MAX);
#undef USEC_STORE_FUNCTION
static ssize_t bfq_max_budget_store(struct elevator_queue *e,
                                    const char *page, size_t count)
{
        struct bfq_data *bfqd = e->elevator_data;
        unsigned long uninitialized_var(__data);
        int ret = bfq_var_store(&__data, (page), count);

        if (__data == 0)
                bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
        else {
                if (__data > INT_MAX)
                        __data = INT_MAX;
                bfqd->bfq_max_budget = __data;
        }

        bfqd->bfq_user_max_budget = __data;

        return ret;
}
/*
 * Leaving this name to preserve name compatibility with cfq
 * parameters, but this timeout is used for both sync and async.
 */
static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
                                      const char *page, size_t count)
{
        struct bfq_data *bfqd = e->elevator_data;
        unsigned long uninitialized_var(__data);
        int ret = bfq_var_store(&__data, (page), count);

        if (__data < 1)
                __data = 1;
        else if (__data > INT_MAX)
                __data = INT_MAX;

        bfqd->bfq_timeout = msecs_to_jiffies(__data);
        if (bfqd->bfq_user_max_budget == 0)
                bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);

        return ret;
}
static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
                                           const char *page, size_t count)
{
        struct bfq_data *bfqd = e->elevator_data;
        unsigned long uninitialized_var(__data);
        int ret = bfq_var_store(&__data, (page), count);

        if (__data > 1)
                __data = 1;
        if (!bfqd->strict_guarantees && __data == 1
            && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
                bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;

        bfqd->strict_guarantees = __data;

        return ret;
}
static ssize_t bfq_low_latency_store(struct elevator_queue *e,
                                     const char *page, size_t count)
{
        struct bfq_data *bfqd = e->elevator_data;
        unsigned long uninitialized_var(__data);
        int ret = bfq_var_store(&__data, (page), count);

        if (__data > 1)
                __data = 1;
        if (__data == 0 && bfqd->low_latency != 0)
                bfq_end_wr(bfqd);
        bfqd->low_latency = __data;

        return ret;
}
#define BFQ_ATTR(name) \
        __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)

static struct elv_fs_entry bfq_attrs[] = {
        BFQ_ATTR(fifo_expire_sync),
        BFQ_ATTR(fifo_expire_async),
        BFQ_ATTR(back_seek_max),
        BFQ_ATTR(back_seek_penalty),
        BFQ_ATTR(slice_idle),
        BFQ_ATTR(slice_idle_us),
        BFQ_ATTR(max_budget),
        BFQ_ATTR(timeout_sync),
        BFQ_ATTR(strict_guarantees),
        BFQ_ATTR(low_latency),
        __ATTR_NULL
};
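
/*
 * These attributes are exposed through the elevator sysfs directory of each
 * device using bfq (for blk-mq devices this is expected to be
 * /sys/block/<dev>/queue/iosched/). Illustrative usage, assuming a device
 * named sda:
 *
 *      cat /sys/block/sda/queue/iosched/low_latency
 *      echo 0 > /sys/block/sda/queue/iosched/low_latency
 */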
static struct elevator_type iosched_bfq_mq = {
        .ops.mq = {
                .get_rq_priv            = bfq_get_rq_private,
                .put_rq_priv            = bfq_put_rq_private,
                .exit_icq               = bfq_exit_icq,
                .insert_requests        = bfq_insert_requests,
                .dispatch_request       = bfq_dispatch_request,
                .next_request           = elv_rb_latter_request,
                .former_request         = elv_rb_former_request,
                .allow_merge            = bfq_allow_bio_merge,
                .bio_merge              = bfq_bio_merge,
                .request_merge          = bfq_request_merge,
                .requests_merged        = bfq_requests_merged,
                .request_merged         = bfq_request_merged,
                .has_work               = bfq_has_work,
                .init_sched             = bfq_init_queue,
                .exit_sched             = bfq_exit_queue,
        },

        .uses_mq                = true,
        .icq_size               = sizeof(struct bfq_io_cq),
        .icq_align              = __alignof__(struct bfq_io_cq),
        .elevator_attrs         = bfq_attrs,
        .elevator_name          = "bfq",
        .elevator_owner         = THIS_MODULE,
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct blkcg_policy blkcg_policy_bfq = {
        .dfl_cftypes            = bfq_blkg_files,
        .legacy_cftypes         = bfq_blkcg_legacy_files,

        .cpd_alloc_fn           = bfq_cpd_alloc,
        .cpd_init_fn            = bfq_cpd_init,
        .cpd_bind_fn            = bfq_cpd_init,
        .cpd_free_fn            = bfq_cpd_free,

        .pd_alloc_fn            = bfq_pd_alloc,
        .pd_init_fn             = bfq_pd_init,
        .pd_offline_fn          = bfq_pd_offline,
        .pd_free_fn             = bfq_pd_free,
        .pd_reset_stats_fn      = bfq_pd_reset_stats,
};
#endif
static int __init bfq_init(void)
{
        int ret;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
        ret = blkcg_policy_register(&blkcg_policy_bfq);
        if (ret)
                return ret;
#endif

        ret = -ENOMEM;
        if (bfq_slab_setup())
                goto err_pol_unreg;

        /*
         * Times to load large popular applications for the typical
         * systems installed on the reference devices (see the
         * comments before the definitions of the next two
         * arrays). Actually, we use slightly slower values, as the
         * estimated peak rate tends to be smaller than the actual
         * peak rate. The reason for this last fact is that estimates
         * are computed over much shorter time intervals than the long
         * intervals typically used for benchmarking. Why? First, to
         * adapt more quickly to variations. Second, because an I/O
         * scheduler cannot rely on a peak-rate-evaluation workload to
         * be run for a long time.
         */
        T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
        T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
        T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
        T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */

        /*
         * Thresholds that determine the switch between speed classes
         * (see the comments before the definition of the array
         * device_speed_thresh). These thresholds are biased towards
         * transitions to the fast class. This is safer than the
         * opposite bias. In fact, a wrong transition to the slow
         * class results in short weight-raising periods, because the
         * speed of the device then tends to be higher than the
         * reference peak rate. On the opposite end, a wrong
         * transition to the fast class tends to increase
         * weight-raising periods, because of the opposite reason.
         */
        device_speed_thresh[0] = (4 * R_slow[0]) / 3;
        device_speed_thresh[1] = (4 * R_slow[1]) / 3;
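
        /*
         * Example with hypothetical numbers: if R_slow[0] were 1000
         * sectors/sec, device_speed_thresh[0] would be ~1333 sectors/sec,
         * i.e., a rotational device would be classified as fast only once
         * its estimated peak rate exceeded the slow-device reference rate
         * by at least one third.
         */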

        ret = elv_register(&iosched_bfq_mq);
        if (ret)
                goto err_pol_unreg;

        return 0;

err_pol_unreg:
#ifdef CONFIG_BFQ_GROUP_IOSCHED
        blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
        return ret;
}
static void __exit bfq_exit(void)
{
        elv_unregister(&iosched_bfq_mq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
        blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
        bfq_slab_kill();
}

module_init(bfq_init);
module_exit(bfq_exit);

MODULE_AUTHOR("Paolo Valente");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");