/*
 * Budget Fair Queueing (BFQ) I/O scheduler.
 *
 * Based on ideas and code from CFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini@google.com>
 *
 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BFQ is a proportional-share I/O scheduler, with some extra
 * low-latency capabilities. BFQ also supports full hierarchical
 * scheduling through cgroups. The next paragraphs provide an
 * introduction to BFQ's inner workings. Details on BFQ's benefits,
 * usage and limitations can be found in
 * Documentation/block/bfq-iosched.txt.
 *
 * BFQ is a proportional-share storage-I/O scheduling algorithm based
 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
 * budgets, measured in number of sectors, to processes instead of
 * time slices. The device is not granted to the in-service process
 * for a given time slice, but until it has exhausted its assigned
 * budget. This change from the time to the service domain enables BFQ
 * to distribute the device throughput among processes as desired,
 * without any distortion due to throughput fluctuations, or to device
 * internal queueing. BFQ uses an ad hoc internal scheduler, called
 * B-WF2Q+, to schedule processes according to their budgets. More
 * precisely, BFQ schedules queues associated with processes. Each
 * process/queue is assigned a user-configurable weight, and B-WF2Q+
 * guarantees that each queue receives a fraction of the throughput
 * proportional to its weight. Thanks to the accurate policy of
 * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
 * processes issuing sequential requests (to boost the throughput),
 * and yet guarantee a low latency to interactive and soft real-time
 * applications.
 *
 * In particular, to provide these low-latency guarantees, BFQ
 * explicitly privileges the I/O of two classes of time-sensitive
 * applications: interactive and soft real-time. This feature enables
 * BFQ to provide applications in these classes with a very low
 * latency. Finally, BFQ also features additional heuristics for
 * preserving both a low latency and a high throughput on NCQ-capable,
 * rotational or flash-based devices, and to get the job done quickly
 * for applications consisting of many I/O-bound processes.
 *
 * BFQ is described in [1], where also a reference to the initial,
 * more theoretical paper on BFQ can be found. The interested reader
 * can find in the latter paper full details on the main algorithm, as
 * well as formulas of the guarantees and formal proofs of all the
 * properties. With respect to the version of BFQ presented in these
 * papers, this implementation adds a few more heuristics, such as the
 * one that guarantees a low latency to soft real-time applications,
 * and a hierarchical extension based on H-WF2Q+.
 *
 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
 * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
 * with O(log N) complexity derives from the one introduced with EEVDF
 * in [3].
 *
 * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage-I/O
 *     Scheduler", Proceedings of the First Workshop on Mobile System
 *     Technologies (MST-2015), May 2015.
 *     http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
 *
 * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
 *     Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
 *     Oct 1997.
 *     http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
 *
 * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
 *     First: A Flexible and Accurate Mechanism for Proportional Share
 *     Resource Allocation", technical report.
 *     http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
 */
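
/*
 * Illustrative example of the service-domain guarantee above (the
 * numbers are made up): with two continuously backlogged queues of
 * weights 100 and 300, BFQ hands the device to each queue until the
 * queue exhausts its budget, so over any long enough interval the
 * first queue receives
 *
 *	100 / (100 + 300) = 25%
 *
 * of the sectors transferred and the second one 75%, regardless of
 * how the device throughput fluctuates while each queue is served.
 */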
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>
#include <linux/blk-cgroup.h>
#define BFQ_IOPRIO_CLASSES	3
#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

#define BFQ_MIN_WEIGHT			1
#define BFQ_MAX_WEIGHT			1000
#define BFQ_WEIGHT_CONVERSION_COEFF	10

#define BFQ_DEFAULT_QUEUE_IOPRIO	4

#define BFQ_WEIGHT_LEGACY_DFL	100
#define BFQ_DEFAULT_GRP_IOPRIO	0
#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

/*
 * Soft real-time applications are far more latency-sensitive than
 * interactive ones. Over-raise the weight of the former to privilege
 * them against the latter.
 */
#define BFQ_SOFTRT_WEIGHT_FACTOR	100
/**
 * struct bfq_service_tree - per ioprio_class service tree.
 *
 * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
 * ioprio_class has its own independent scheduler, and so its own
 * bfq_service_tree.  All the fields are protected by the queue lock
 * of the containing bfqd.
 */
struct bfq_service_tree {
	/* tree for active entities (i.e., those backlogged) */
	struct rb_root active;
	/* tree for idle entities (i.e., not backlogged, with V <= F_i) */
	struct rb_root idle;

	/* idle entity with minimum F_i */
	struct bfq_entity *first_idle;
	/* idle entity with maximum F_i */
	struct bfq_entity *last_idle;

	/* scheduler virtual time */
	u64 vtime;
	/* scheduler weight sum; active and idle entities contribute to it */
	unsigned long wsum;
};
/**
 * struct bfq_sched_data - multi-class scheduler.
 *
 * bfq_sched_data is the basic scheduler queue.  It supports three
 * ioprio_classes, and can be used either as a toplevel queue or as an
 * intermediate queue in a hierarchical setup.  @next_in_service
 * points to the active entity of the sched_data service trees that
 * will be scheduled next. It is used to reduce the number of steps
 * needed for each hierarchical-schedule update.
 *
 * The supported ioprio_classes are the same as in CFQ, in descending
 * priority order: IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
 * Requests from higher-priority queues are served before all the
 * requests from lower-priority queues; among requests of the same
 * queue, requests are served according to B-WF2Q+.
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_sched_data {
	/* entity in service */
	struct bfq_entity *in_service_entity;
	/* head-of-line entity (see comments above) */
	struct bfq_entity *next_in_service;
	/* array of service trees, one per ioprio_class */
	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
	/* last time CLASS_IDLE was served */
	unsigned long bfq_class_idle_last_service;
};
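
/*
 * Minimal sketch, for illustration only (the helper name below is
 * hypothetical and not used by the code in this file): service_tree[]
 * is indexed by class, with lower indexes served first, and
 * bfq_class_idx(), defined further down, maps RT->0, BE->1, IDLE->2.
 * Picking the highest-priority class with backlogged entities then
 * amounts to a walk like this.
 */
static inline struct bfq_service_tree *
bfq_first_nonempty_st_sketch(struct bfq_sched_data *sd)
{
	int i;

	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
		if (!RB_EMPTY_ROOT(&sd->service_tree[i].active))
			return &sd->service_tree[i];

	return NULL;
}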
/**
 * struct bfq_weight_counter - counter of the number of all active entities
 *                             with a given weight.
 */
struct bfq_weight_counter {
	unsigned int weight; /* weight of the entities this counter refers to */
	unsigned int num_active; /* nr of active entities with this weight */
	/*
	 * Weights tree member (see bfq_data's @queue_weights_tree and
	 * @group_weights_tree)
	 */
	struct rb_node weights_node;
};
/**
 * struct bfq_entity - schedulable entity.
 *
 * A bfq_entity is used to represent either a bfq_queue (leaf node in the
 * cgroup hierarchy) or a bfq_group in the upper-level scheduler.  Each
 * entity belongs to the sched_data of the parent group in the cgroup
 * hierarchy.  Non-leaf entities have also their own sched_data, stored
 * in @my_sched_data.
 *
 * Each entity stores independently its priority values; this would
 * allow different weights on different devices, but this
 * functionality is not exported to userspace by now.  Priorities and
 * weights are updated lazily, first storing the new values into the
 * new_* fields, then setting the @prio_changed flag.  As soon as
 * there is a transition in the entity state that allows the priority
 * update to take place, the effective and the requested priority
 * values are synchronized.
 *
 * Unless cgroups are used, the weight value is calculated from the
 * ioprio to export the same interface as CFQ.  When dealing with
 * "well-behaved" queues (i.e., queues that do not spend too much
 * time to consume their budget and have true sequential behavior, and
 * when there are no external factors breaking anticipation) the
 * relative weights at each level of the cgroups hierarchy should be
 * guaranteed.  All the fields are protected by the queue lock of the
 * containing bfqd.
 */
struct bfq_entity {
	/* service_tree member */
	struct rb_node rb_node;
	/* pointer to the weight counter associated with this entity */
	struct bfq_weight_counter *weight_counter;

	/*
	 * Flag, true if the entity is on a tree (either the active or
	 * the idle one of its service_tree) or is in service.
	 */
	bool on_st;

	/* B-WF2Q+ start and finish timestamps [sectors/weight] */
	u64 start, finish;

	/* tree the entity is enqueued into; %NULL if not on a tree */
	struct rb_root *tree;

	/*
	 * minimum start time of the (active) subtree rooted at this
	 * entity; used for O(log N) lookups into active trees
	 */
	u64 min_start;

	/* amount of service received during the last service slot */
	int service;

	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
	int budget;

	/* weight of the queue */
	int weight;
	/* next weight if a change is in progress */
	int new_weight;

	/* original weight, used to implement weight boosting */
	int orig_weight;

	/* parent entity, for hierarchical scheduling */
	struct bfq_entity *parent;

	/*
	 * For non-leaf nodes in the hierarchy, the associated
	 * scheduler queue, %NULL on leaf nodes.
	 */
	struct bfq_sched_data *my_sched_data;
	/* the scheduler queue this entity belongs to */
	struct bfq_sched_data *sched_data;

	/* flag, set to request a weight, ioprio or ioprio_class change  */
	int prio_changed;
};
/**
 * struct bfq_ttime - per process thinktime stats.
 */
struct bfq_ttime {
	/* completion time of the last request */
	u64 last_end_request;

	/* total process thinktime */
	u64 ttime_total;
	/* number of thinktime samples */
	unsigned long ttime_samples;
	/* average process thinktime */
	u64 ttime_mean;
};
/**
 * struct bfq_queue - leaf schedulable entity.
 *
 * A bfq_queue is a leaf request queue; it can be associated with an
 * io_context or more, if it is async or shared between cooperating
 * processes. @cgroup holds a reference to the cgroup, to be sure that it
 * does not disappear while a bfqq still references it (mostly to avoid
 * races between request issuing and task migration followed by cgroup
 * destruction).
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_queue {
	/* reference counter */
	int ref;
	/* parent bfq_data */
	struct bfq_data *bfqd;

	/* current ioprio and ioprio class */
	unsigned short ioprio, ioprio_class;
	/* next ioprio and ioprio class if a change is in progress */
	unsigned short new_ioprio, new_ioprio_class;

	/*
	 * Shared bfq_queue if queue is cooperating with one or more
	 * other queues.
	 */
	struct bfq_queue *new_bfqq;
	/* request-position tree member (see bfq_group's @rq_pos_tree) */
	struct rb_node pos_node;
	/* request-position tree root (see bfq_group's @rq_pos_tree) */
	struct rb_root *pos_root;

	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* number of sync and async requests queued */
	int queued[2];
	/* number of requests currently allocated */
	int allocated;
	/* number of pending metadata requests */
	int meta_pending;
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* entity representing this queue in the scheduler */
	struct bfq_entity entity;

	/* maximum budget allowed from the feedback mechanism */
	int max_budget;
	/* budget expiration (in jiffies) */
	unsigned long budget_timeout;

	/* number of requests on the dispatch list or inside the driver */
	int dispatched;

	/* status flags */
	unsigned long flags;

	/* node for active/idle bfqq list inside parent bfqd */
	struct list_head bfqq_list;

	/* associated @bfq_ttime struct */
	struct bfq_ttime ttime;

	/* bit vector: a 1 for each seeky request in history */
	u32 seek_history;
	/* position of the last request enqueued */
	sector_t last_request_pos;

	/* Number of consecutive pairs of request completion and
	 * arrival, such that the queue becomes idle after the
	 * completion, but the next request arrives within an idle
	 * time slice; used only if the queue's IO_bound flag has been
	 * cleared.
	 */
	unsigned int requests_within_timer;

	/* pid of the process owning the queue, used for logging purposes */
	pid_t pid;

	/*
	 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
	 * if the queue is shared.
	 */
	struct bfq_io_cq *bic;

	/* current maximum weight-raising time for this queue */
	unsigned long wr_cur_max_time;
	/*
	 * Minimum time instant such that, only if a new request is
	 * enqueued after this time instant in an idle @bfq_queue with
	 * no outstanding requests, then the task associated with the
	 * queue is deemed as soft real-time (see the comments on
	 * the function bfq_bfqq_softrt_next_start())
	 */
	unsigned long soft_rt_next_start;
	/*
	 * Start time of the current weight-raising period if
	 * the @bfq_queue is being weight-raised, otherwise
	 * finish time of the last weight-raising period.
	 */
	unsigned long last_wr_start_finish;
	/* factor by which the weight of this queue is multiplied */
	unsigned int wr_coeff;
	/*
	 * Time of the last transition of the @bfq_queue from idle to
	 * backlogged.
	 */
	unsigned long last_idle_bklogged;
	/*
	 * Cumulative service received from the @bfq_queue since the
	 * last transition from idle to backlogged.
	 */
	unsigned long service_from_backlogged;

	/*
	 * Value of wr start time when switching to soft rt
	 */
	unsigned long wr_start_at_switch_to_srt;

	unsigned long split_time; /* time of last split */
};
/**
 * struct bfq_io_cq - per (request_queue, io_context) structure.
 */
struct bfq_io_cq {
	/* associated io_cq structure */
	struct io_cq icq; /* must be the first member */
	/* array of two process queues, the sync and the async */
	struct bfq_queue *bfqq[2];
	/* per (request_queue, blkcg) ioprio */
	int ioprio;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	uint64_t blkcg_serial_nr; /* the current blkcg serial */
#endif

	/*
	 * Snapshot of the idle window before merging; taken to
	 * remember this value while the queue is merged, so as to be
	 * able to restore it in case of split.
	 */
	bool saved_idle_window;
	/*
	 * Same purpose as the previous two fields for the I/O bound
	 * classification of a queue.
	 */
	bool saved_IO_bound;

	/*
	 * Similar to previous fields: save wr information.
	 */
	unsigned long saved_wr_coeff;
	unsigned long saved_last_wr_start_finish;
	unsigned long saved_wr_start_at_switch_to_srt;
	unsigned int saved_wr_cur_max_time;
	struct bfq_ttime saved_ttime;
};
enum bfq_device_speed {
	BFQ_BFQD_FAST,
	BFQ_BFQD_SLOW,
};
/**
 * struct bfq_data - per-device data structure.
 *
 * All the fields are protected by @lock.
 */
struct bfq_data {
	/* device request queue */
	struct request_queue *queue;

	struct list_head dispatch;

	/* root bfq_group for the device */
	struct bfq_group *root_group;

	/*
	 * rbtree of weight counters of @bfq_queues, sorted by
	 * weight. Used to keep track of whether all @bfq_queues have
	 * the same weight. The tree contains one counter for each
	 * distinct weight associated to some active and not
	 * weight-raised @bfq_queue (see the comments to the functions
	 * bfq_weights_tree_[add|remove] for further details).
	 */
	struct rb_root queue_weights_tree;
	/*
	 * rbtree of non-queue @bfq_entity weight counters, sorted by
	 * weight. Used to keep track of whether all @bfq_groups have
	 * the same weight. The tree contains one counter for each
	 * distinct weight associated to some active @bfq_group (see
	 * the comments to the functions bfq_weights_tree_[add|remove]
	 * for further details).
	 */
	struct rb_root group_weights_tree;

	/*
	 * Number of bfq_queues containing requests (including the
	 * queue in service, even if it is idling).
	 */
	int busy_queues;
	/* number of weight-raised busy @bfq_queues */
	int wr_busy_queues;
	/* number of queued requests */
	int queued;
	/* number of requests dispatched and waiting for completion */
	int rq_in_driver;

	/*
	 * Maximum number of requests in driver in the last
	 * @hw_tag_samples completed requests.
	 */
	int max_rq_in_driver;
	/* number of samples used to calculate hw_tag */
	int hw_tag_samples;
	/* flag set to one if the driver is showing a queueing behavior */
	int hw_tag;

	/* number of budgets assigned */
	int budgets_assigned;

	/*
	 * Timer set when idling (waiting) for the next request from
	 * the queue in service.
	 */
	struct hrtimer idle_slice_timer;

	/* bfq_queue in service */
	struct bfq_queue *in_service_queue;
	/* bfq_io_cq (bic) associated with the @in_service_queue */
	struct bfq_io_cq *in_service_bic;

	/* on-disk position of the last served request */
	sector_t last_position;

	/* time of last request completion (ns) */
	u64 last_completion;

	/* time of first rq dispatch in current observation interval (ns) */
	u64 first_dispatch;
	/* time of last rq dispatch in current observation interval (ns) */
	u64 last_dispatch;

	/* beginning of the last budget */
	ktime_t last_budget_start;
	/* beginning of the last idle slice */
	ktime_t last_idling_start;

	/* number of samples in current observation interval */
	int peak_rate_samples;
	/* num of samples of seq dispatches in current observation interval */
	u32 sequential_samples;
	/* total num of sectors transferred in current observation interval */
	u64 tot_sectors_dispatched;
	/* max rq size seen during current observation interval (sectors) */
	u32 last_rq_max_size;
	/* time elapsed from first dispatch in current observ. interval (us) */
	u64 delta_from_first;
	/*
	 * Current estimate of the device peak rate, measured in
	 * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
	 * BFQ_RATE_SHIFT is performed to increase precision in
	 * fixed-point calculations.
	 */
	u32 peak_rate;

	/* maximum budget allotted to a bfq_queue before rescheduling */
	int bfq_max_budget;

	/* list of all the bfq_queues active on the device */
	struct list_head active_list;
	/* list of all the bfq_queues idle on the device */
	struct list_head idle_list;

	/*
	 * Timeout for async/sync requests; when it fires, requests
	 * are served in fifo order.
	 */
	u64 bfq_fifo_expire[2];
	/* weight of backward seeks wrt forward ones */
	unsigned int bfq_back_penalty;
	/* maximum allowed backward seek */
	unsigned int bfq_back_max;
	/* maximum idling time */
	u32 bfq_slice_idle;

	/* user-configured max budget value (0 for auto-tuning) */
	int bfq_user_max_budget;
	/*
	 * Timeout for bfq_queues to consume their budget; used to
	 * prevent seeky queues from imposing long latencies to
	 * sequential or quasi-sequential ones (this also implies that
	 * seeky queues cannot receive guarantees in the service
	 * domain; after a timeout they are charged for the time they
	 * have been in service, to preserve fairness among them, but
	 * without service-domain guarantees).
	 */
	unsigned int bfq_timeout;

	/*
	 * Number of consecutive requests that must be issued within
	 * the idle time slice to set again idling to a queue which
	 * was marked as non-I/O-bound (see the definition of the
	 * IO_bound flag for further details).
	 */
	unsigned int bfq_requests_within_timer;

	/*
	 * Force device idling whenever needed to provide accurate
	 * service guarantees, without caring about throughput
	 * issues. CAVEAT: this may even increase latencies, in case
	 * of useless idling for processes that did stop doing I/O.
	 */
	bool strict_guarantees;

	/* if set to true, low-latency heuristics are enabled */
	bool low_latency;
	/*
	 * Maximum factor by which the weight of a weight-raised queue
	 * is multiplied.
	 */
	unsigned int bfq_wr_coeff;
	/* maximum duration of a weight-raising period (jiffies) */
	unsigned int bfq_wr_max_time;

	/* Maximum weight-raising duration for soft real-time processes */
	unsigned int bfq_wr_rt_max_time;
	/*
	 * Minimum idle period after which weight-raising may be
	 * reactivated for a queue (in jiffies).
	 */
	unsigned int bfq_wr_min_idle_time;
	/*
	 * Minimum period between request arrivals after which
	 * weight-raising may be reactivated for an already busy async
	 * queue (in jiffies).
	 */
	unsigned long bfq_wr_min_inter_arr_async;

	/* Max service-rate for a soft real-time queue, in sectors/sec */
	unsigned int bfq_wr_max_softrt_rate;
	/*
	 * Cached value of the product R*T, used for computing the
	 * maximum duration of weight raising automatically.
	 */
	u64 RT_prod;
	/* device-speed class for the low-latency heuristic */
	enum bfq_device_speed device_speed;

	/* fallback dummy bfqq for extreme OOM conditions */
	struct bfq_queue oom_bfqq;

	spinlock_t lock;

	/*
	 * bic associated with the task issuing current bio for
	 * merging. This and the next field are used as a support to
	 * be able to perform the bic lookup, needed by bio-merge
	 * functions, before the scheduler lock is taken, and thus
	 * avoid taking the request-queue lock while the scheduler
	 * lock is being held.
	 */
	struct bfq_io_cq *bio_bic;
	/* bfqq associated with the task issuing current bio for merging */
	struct bfq_queue *bio_bfqq;

	/*
	 * io context to put right after bfqd->lock is released. This
	 * field is used to perform put_io_context, when needed, only
	 * after the scheduler lock has been released, and thus
	 * prevent an ioc->lock from being possibly taken while the
	 * scheduler lock is being held.
	 */
	struct io_context *ioc_to_put;
};
enum bfqq_state_flags {
	BFQQF_busy = 0,		/* has requests or is in service */
	BFQQF_wait_request,	/* waiting for a request */
	BFQQF_non_blocking_wait_rq, /*
				     * waiting for a request
				     * without idling the device
				     */
	BFQQF_fifo_expire,	/* FIFO checked in this slice */
	BFQQF_idle_window,	/* slice idling enabled */
	BFQQF_sync,		/* synchronous queue */
	BFQQF_IO_bound,		/*
				 * bfqq has timed-out at least once
				 * having consumed at most 2/10 of
				 * its budget
				 */
	BFQQF_softrt_update,	/*
				 * may need softrt-next-start
				 * update
				 */
	BFQQF_coop,		/* bfqq is shared */
	BFQQF_split_coop	/* shared bfqq will be split */
};
#define BFQ_BFQQ_FNS(name)						\
static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__set_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__clear_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
{									\
	return test_bit(BFQQF_##name, &(bfqq)->flags);			\
}

BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(non_blocking_wait_rq);
BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(idle_window);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(IO_bound);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
#undef BFQ_BFQQ_FNS
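
/*
 * For reference (shown here only as an illustration of the macro
 * above), BFQ_BFQQ_FNS(busy) expands to the three helpers
 *
 *	static void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
 *	{ __set_bit(BFQQF_busy, &(bfqq)->flags); }
 *	static void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
 *	{ __clear_bit(BFQQF_busy, &(bfqq)->flags); }
 *	static int bfq_bfqq_busy(const struct bfq_queue *bfqq)
 *	{ return test_bit(BFQQF_busy, &(bfqq)->flags); }
 *
 * which the rest of the code uses to mark, clear and test each flag.
 */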
/* Logging facilities. */
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
			  __pbuf, ##args);				\
} while (0)

#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
} while (0)

#else /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\
			bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)

#endif /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log(bfqd, fmt, args...) \
	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
/* Expiration reasons. */
enum bfqq_expiration {
	BFQQE_TOO_IDLE = 0,		/*
					 * queue has been idling for
					 * too long
					 */
	BFQQE_BUDGET_TIMEOUT,	/* budget took too long to be used */
	BFQQE_BUDGET_EXHAUSTED,	/* budget consumed */
	BFQQE_NO_MORE_REQUESTS,	/* the queue has no more requests */
	BFQQE_PREEMPTED		/* preemption in progress */
};
struct bfqg_stats {
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	/* number of ios merged */
	struct blkg_rwstat merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat queued;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	uint64_t start_group_wait_time;
	uint64_t start_idle_time;
	uint64_t start_empty_time;
	uint16_t flags;
#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED

/*
 * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
 *
 * @ps: @blkcg_policy_storage that this structure inherits
 * @weight: weight of the bfq_group
 */
struct bfq_group_data {
	/* must be the first member */
	struct blkcg_policy_data pd;

	unsigned int weight;
};
/**
 * struct bfq_group - per (device, cgroup) data structure.
 * @entity: schedulable entity to insert into the parent group sched_data.
 * @sched_data: own sched_data, to contain child entities (they may be
 *              both bfq_queues and bfq_groups).
 * @bfqd: the bfq_data for the device this group acts upon.
 * @async_bfqq: array of async queues for all the tasks belonging to
 *              the group, one queue per ioprio value per ioprio_class,
 *              except for the idle class that has only one queue.
 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
 *             to avoid too many special cases during group creation/
 *             migration.
 * @stats: stats for this bfqg.
 * @active_entities: number of active entities belonging to the group;
 *                   unused for the root group. Used to know whether there
 *                   are groups with more than one active @bfq_entity
 *                   (see the comments to the function
 *                   bfq_bfqq_may_idle()).
 * @rq_pos_tree: rbtree sorted by next_request position, used when
 *               determining if two or more queues have interleaving
 *               requests (see bfq_find_close_cooperator()).
 *
 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
 * there is a set of bfq_groups, each one collecting the lower-level
 * entities belonging to the group that are acting on the same device.
 *
 * Locking works as follows:
 * o @bfqd is protected by the queue lock, RCU is used to access it
 *   from the readers.
 * o All the other fields are protected by the @bfqd queue lock.
 */
struct bfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	struct bfq_entity entity;
	struct bfq_sched_data sched_data;

	void *bfqd;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct bfq_entity *my_entity;

	int active_entities;

	struct rb_root rq_pos_tree;

	struct bfqg_stats stats;
};

#else
struct bfq_group {
	struct bfq_sched_data sched_data;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct rb_root rq_pos_tree;
};
#endif
static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);

static unsigned int bfq_class_idx(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	return bfqq ? bfqq->ioprio_class - 1 :
		BFQ_DEFAULT_GRP_CLASS - 1;
}
static struct bfq_service_tree *
bfq_entity_service_tree(struct bfq_entity *entity)
{
	struct bfq_sched_data *sched_data = entity->sched_data;
	unsigned int idx = bfq_class_idx(entity);

	return sched_data->service_tree + idx;
}
static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	return bic->bfqq[is_sync];
}

static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
			 bool is_sync)
{
	bic->bfqq[is_sync] = bfqq;
}

static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
{
	return bic->icq.q->elevator->elevator_data;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED

static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
{
	struct bfq_entity *group_entity = bfqq->entity.parent;

	if (!group_entity)
		group_entity = &bfqq->bfqd->root_group->entity;

	return container_of(group_entity, struct bfq_group, entity);
}

#else

static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
{
	return bfqq->bfqd->root_group;
}

#endif
static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
static void bfq_put_queue(struct bfq_queue *bfqq);
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic);
static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
				    struct bfq_group *bfqg);
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
/* Expiration time of sync (0) and async (1) requests, in ns. */
static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };

/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
static const int bfq_back_max = 16 * 1024;

/* Penalty of a backwards seek, in number of sectors. */
static const int bfq_back_penalty = 2;

/* Idling period duration, in ns. */
static u64 bfq_slice_idle = NSEC_PER_SEC / 125;

/* Minimum number of assigned budgets for which stats are safe to compute. */
static const int bfq_stats_min_budgets = 194;

/* Default maximum budget values, in sectors and number of requests. */
static const int bfq_default_max_budget = 16 * 1024;

/*
 * Async to sync throughput distribution is controlled as follows:
 * when an async request is served, the entity is charged the number
 * of sectors of the request, multiplied by the factor below
 */
static const int bfq_async_charge_factor = 10;

/* Default timeout values, in jiffies, approximating CFQ defaults. */
static const int bfq_timeout = HZ / 8;

static struct kmem_cache *bfq_pool;

/* Below this threshold (in ns), we consider thinktime immediate. */
#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)

/* hw_tag detection: parallel requests threshold and min samples needed. */
#define BFQ_HW_QUEUE_THRESHOLD	4
#define BFQ_HW_QUEUE_SAMPLES	32

#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)

/* Min number of samples required to perform peak-rate update */
#define BFQ_RATE_MIN_SAMPLES	32
/* Min observation time interval required to perform a peak-rate update (ns) */
#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
/* Target observation time interval for a peak-rate update (ns) */
#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC

/* Shift used for peak-rate fixed-precision calculations. */
#define BFQ_RATE_SHIFT		16
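
/*
 * Worked example for the fixed-point rate representation (the device
 * figure below is assumed, not measured): a peak rate of 200 MiB/s is
 * 409600 sectors/s, i.e. about 0.41 sectors/usec; shifted left by
 * BFQ_RATE_SHIFT it is stored in bfqd->peak_rate as roughly
 * 0.4096 * 2^16 ~= 26844, so that sub-sector-per-usec rates keep
 * enough precision in integer arithmetic.
 */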
/*
 * By default, BFQ computes the duration of the weight raising for
 * interactive applications automatically, using the following formula:
 * duration = (R / r) * T, where r is the peak rate of the device, and
 * R and T are two reference parameters.
 * In particular, R is the peak rate of the reference device (see below),
 * and T is a reference time: given the systems that are likely to be
 * installed on the reference device according to its speed class, T is
 * about the maximum time needed, under BFQ and while reading two files in
 * parallel, to load typical large applications on these systems.
 * In practice, the slower/faster the device at hand is, the more/less it
 * takes to load applications with respect to the reference device.
 * Accordingly, the longer/shorter BFQ grants weight raising to interactive
 * applications.
 *
 * BFQ uses four different reference pairs (R, T), depending on:
 * . whether the device is rotational or non-rotational;
 * . whether the device is slow, such as old or portable HDDs, as well as
 *   SD cards, or fast, such as newer HDDs and SSDs.
 *
 * The device's speed class is dynamically (re)detected in
 * bfq_update_peak_rate() every time the estimated peak rate is updated.
 *
 * In the following definitions, R_slow[0]/R_fast[0] and
 * T_slow[0]/T_fast[0] are the reference values for a slow/fast
 * rotational device, whereas R_slow[1]/R_fast[1] and
 * T_slow[1]/T_fast[1] are the reference values for a slow/fast
 * non-rotational device. Finally, device_speed_thresh are the
 * thresholds used to switch between speed classes. The reference
 * rates are not the actual peak rates of the devices used as a
 * reference, but slightly lower values. The reason for using these
 * slightly lower values is that the peak-rate estimator tends to
 * yield slightly lower values than the actual peak rate (it can yield
 * the actual peak rate only if there is only one process doing I/O,
 * and the process does sequential I/O).
 *
 * Both the reference peak rates and the thresholds are measured in
 * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
 */
static int R_slow[2] = {1000, 10700};
static int R_fast[2] = {14000, 33000};

/*
 * To improve readability, a conversion function is used to initialize the
 * following arrays, which entails that they can be initialized only in a
 * function.
 */
static int T_slow[2];
static int T_fast[2];
static int device_speed_thresh[2];
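
/*
 * Sketch of the duration = (R / r) * T computation described above.
 * The helper name is hypothetical and only illustrative: it shows the
 * arithmetic, assuming R*T is cached in bfqd->RT_prod with R (and the
 * peak rate r) left-shifted by BFQ_RATE_SHIFT and T in jiffies, so
 * that the shifts cancel and the quotient is directly in jiffies.
 */
static inline unsigned int bfq_wr_duration_sketch(struct bfq_data *bfqd)
{
	u64 dur = bfqd->RT_prod;	/* cached R * T, see struct bfq_data */

	do_div(dur, bfqd->peak_rate);	/* divide by the estimated rate r */

	return dur;
}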
#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
#define RQ_BFQQ(rq)		((rq)->elv.priv[1])

/**
 * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
 * @icq: the iocontext queue.
 */
static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
{
	/* bic->icq is the first member, %NULL will convert to %NULL */
	return container_of(icq, struct bfq_io_cq, icq);
}

/**
 * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
 * @bfqd: the lookup key.
 * @ioc: the io_context of the process doing I/O.
 * @q: the request queue.
 */
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
					struct io_context *ioc,
					struct request_queue *q)
{
	if (ioc) {
		unsigned long flags;
		struct bfq_io_cq *icq;

		spin_lock_irqsave(q->queue_lock, flags);
		icq = icq_to_bic(ioc_lookup_icq(ioc, q));
		spin_unlock_irqrestore(q->queue_lock, flags);

		return icq;
	}

	return NULL;
}
/*
 * Scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing.
 */
static void bfq_schedule_dispatch(struct bfq_data *bfqd)
{
	if (bfqd->queued != 0) {
		bfq_log(bfqd, "schedule dispatch");
		blk_mq_run_hw_queues(bfqd->queue, true);
	}
}

/*
 * The next two functions release bfqd->lock and put the io_context
 * pointed to by bfqd->ioc_to_put. This delayed put avoids taking an
 * ioc->lock while the scheduler lock is being held.
 */
static void bfq_unlock_put_ioc(struct bfq_data *bfqd)
{
	struct io_context *ioc_to_put = bfqd->ioc_to_put;

	bfqd->ioc_to_put = NULL;
	spin_unlock_irq(&bfqd->lock);

	if (ioc_to_put)
		put_io_context(ioc_to_put);
}

static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd,
				       unsigned long flags)
{
	struct io_context *ioc_to_put = bfqd->ioc_to_put;

	bfqd->ioc_to_put = NULL;
	spin_unlock_irqrestore(&bfqd->lock, flags);

	if (ioc_to_put)
		put_io_context(ioc_to_put);
}
/**
 * bfq_gt - compare two timestamps.
 * @a: first ts.
 * @b: second ts.
 *
 * Return @a > @b, dealing with wrapping correctly.
 */
static int bfq_gt(u64 a, u64 b)
{
	return (s64)(a - b) > 0;
}
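
/*
 * Example of why the signed cast matters (numbers purely illustrative):
 * if the timestamps have wrapped so that a = 3 and b = ULLONG_MAX - 2,
 * then a - b = 6 modulo 2^64, and (s64)6 > 0 correctly reports that @a
 * is the more recent timestamp, whereas a plain "a > b" would not.
 */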
static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
{
	struct rb_node *node = tree->rb_node;

	return rb_entry(node, struct bfq_entity, rb_node);
}

static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
/*
 * bfq_update_next_in_service - update sd->next_in_service
 * @sd: sched_data for which to perform the update.
 * @new_entity: if not NULL, pointer to the entity whose activation,
 *		requeueing or repositioning triggered the invocation of
 *		this function.
 *
 * This function is called to update sd->next_in_service, which, in
 * its turn, may change as a consequence of the insertion or
 * extraction of an entity into/from one of the active trees of
 * sd. These insertions/extractions occur as a consequence of
 * activations/deactivations of entities, with some activations being
 * 'true' activations, and other activations being requeueings (i.e.,
 * implementing the second, requeueing phase of the mechanism used to
 * reposition an entity in its active tree; see comments on
 * __bfq_activate_entity and __bfq_requeue_entity for details). In
 * both the last two activation sub-cases, new_entity points to the
 * just activated or requeued entity.
 *
 * Returns true if sd->next_in_service changes in such a way that
 * entity->parent may become the next_in_service for its parent
 * entity.
 */
static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
				       struct bfq_entity *new_entity)
{
	struct bfq_entity *next_in_service = sd->next_in_service;
	bool parent_sched_may_change = false;

	/*
	 * If this update is triggered by the activation, requeueing
	 * or repositioning of an entity that does not coincide with
	 * sd->next_in_service, then a full lookup in the active tree
	 * can be avoided. In fact, it is enough to check whether the
	 * just-modified entity has a higher priority than
	 * sd->next_in_service, or, even if it has the same priority
	 * as sd->next_in_service, is eligible and has a lower virtual
	 * finish time than sd->next_in_service. If this compound
	 * condition holds, then the new entity becomes the new
	 * next_in_service. Otherwise no change is needed.
	 */
	if (new_entity && new_entity != sd->next_in_service) {
		/*
		 * Flag used to decide whether to replace
		 * sd->next_in_service with new_entity. Tentatively
		 * set to true, and left as true if
		 * sd->next_in_service is NULL.
		 */
		bool replace_next = true;

		/*
		 * If there is already a next_in_service candidate
		 * entity, then compare class priorities or timestamps
		 * to decide whether to replace sd->next_in_service
		 * with new_entity.
		 */
		if (next_in_service) {
			unsigned int new_entity_class_idx =
				bfq_class_idx(new_entity);
			struct bfq_service_tree *st =
				sd->service_tree + new_entity_class_idx;

			/*
			 * For efficiency, evaluate the most likely
			 * sub-condition first.
			 */
			replace_next =
				(new_entity_class_idx ==
				 bfq_class_idx(next_in_service)
				 &&
				 !bfq_gt(new_entity->start, st->vtime)
				 &&
				 bfq_gt(next_in_service->finish,
					new_entity->finish))
				||
				new_entity_class_idx <
				bfq_class_idx(next_in_service);
		}

		if (replace_next)
			next_in_service = new_entity;
	} else /* invoked because of a deactivation: lookup needed */
		next_in_service = bfq_lookup_next_entity(sd);

	if (next_in_service) {
		parent_sched_may_change = !sd->next_in_service ||
			bfq_update_parent_budget(next_in_service);
	}

	sd->next_in_service = next_in_service;

	if (!next_in_service)
		return parent_sched_may_change;

	return parent_sched_may_change;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED
/* both next loops stop at one of the child entities of the root group */
#define for_each_entity(entity)	\
	for (; entity ; entity = entity->parent)

/*
 * For each iteration, compute parent in advance, so as to be safe if
 * entity is deallocated during the iteration. Such a deallocation may
 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
 * containing entity.
 */
#define for_each_entity_safe(entity, parent) \
	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)

/*
 * Returns true if this budget changes may let next_in_service->parent
 * become the next_in_service entity for its parent entity.
 */
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	struct bfq_entity *bfqg_entity;
	struct bfq_group *bfqg;
	struct bfq_sched_data *group_sd;
	bool ret = false;

	group_sd = next_in_service->sched_data;

	bfqg = container_of(group_sd, struct bfq_group, sched_data);
	/*
	 * bfq_group's my_entity field is not NULL only if the group
	 * is not the root group. We must not touch the root entity
	 * as it must never become an in-service entity.
	 */
	bfqg_entity = bfqg->my_entity;
	if (bfqg_entity) {
		if (bfqg_entity->budget > next_in_service->budget)
			ret = true;
		bfqg_entity->budget = next_in_service->budget;
	}

	return ret;
}

/*
 * This function tells whether entity stops being a candidate for next
 * service, according to the following logic.
 *
 * This function is invoked for an entity that is about to be set in
 * service. If such an entity is a queue, then the entity is no longer
 * a candidate for next service (i.e, a candidate entity to serve
 * after the in-service entity is expired). The function then returns
 * true.
 *
 * In contrast, the entity could still be a candidate for next service
 * if it is not a queue, and has more than one child. In fact, even if
 * one of its children is about to be set in service, other children
 * may still be the next to serve. As a consequence, a non-queue
 * entity is not a candidate for next-service only if it has only one
 * child. And only if this condition holds, then the function returns
 * true for a non-queue entity.
 */
static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	struct bfq_group *bfqg;

	if (bfq_entity_to_bfqq(entity))
		return true;

	bfqg = container_of(entity, struct bfq_group, entity);

	if (bfqg->active_entities == 1)
		return true;

	return false;
}

#else /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Next two macros are fake loops when cgroups support is not
 * enabled. In fact, in such a case, there is only one level to go up
 * (to reach the root group).
 */
#define for_each_entity(entity)	\
	for (; entity ; entity = NULL)

#define for_each_entity_safe(entity, parent) \
	for (parent = NULL; entity ; entity = parent)

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	return false;
}

static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	return true;
}

#endif /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Shift for timestamp calculations.  This actually limits the maximum
 * service allowed in one timestamp delta (small shift values increase it),
 * the maximum total weight that can be used for the queues in the system
 * (big shift values increase it), and the period of virtual time
 * wraparounds.
 */
#define WFQ_SERVICE_SHIFT	22

static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = NULL;

	if (!entity->my_sched_data)
		bfqq = container_of(entity, struct bfq_queue, entity);

	return bfqq;
}
/**
 * bfq_delta - map service into the virtual time domain.
 * @service: amount of service.
 * @weight: scale factor (weight of an entity or weight sum).
 */
static u64 bfq_delta(unsigned long service, unsigned long weight)
{
	u64 d = (u64)service << WFQ_SERVICE_SHIFT;

	do_div(d, weight);
	return d;
}
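
/*
 * Worked example (illustrative numbers only): charging service = 8
 * sectors to an entity of weight 100 advances its timestamps by
 *
 *	(8 << WFQ_SERVICE_SHIFT) / 100 = 33554432 / 100 ~= 335544
 *
 * units of virtual time, while the same service charged to a
 * weight-200 entity advances them by half as much; this is what makes
 * the virtual-time domain weight-proportional.
 */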
/**
 * bfq_calc_finish - assign the finish time to an entity.
 * @entity: the entity to act upon.
 * @service: the service to be charged to the entity.
 */
static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->finish = entity->start +
		bfq_delta(service, entity->weight);

	if (bfqq) {
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: serv %lu, w %d",
			service, entity->weight);
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: start %llu, finish %llu, delta %llu",
			entity->start, entity->finish,
			bfq_delta(service, entity->weight));
	}
}
/**
 * bfq_entity_of - get an entity from a node.
 * @node: the node field of the entity.
 *
 * Convert a node pointer to the relative entity.  This is used only
 * to simplify the logic of some functions and not as the generic
 * conversion mechanism because, e.g., in the tree walking functions,
 * the check for a %NULL value would be redundant.
 */
static struct bfq_entity *bfq_entity_of(struct rb_node *node)
{
	struct bfq_entity *entity = NULL;

	if (node)
		entity = rb_entry(node, struct bfq_entity, rb_node);

	return entity;
}

/**
 * bfq_extract - remove an entity from a tree.
 * @root: the tree root.
 * @entity: the entity to remove.
 */
static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
{
	entity->tree = NULL;
	rb_erase(&entity->rb_node, root);
}
/**
 * bfq_idle_extract - extract an entity from the idle tree.
 * @st: the service tree of the owning @entity.
 * @entity: the entity being removed.
 */
static void bfq_idle_extract(struct bfq_service_tree *st,
			     struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *next;

	if (entity == st->first_idle) {
		next = rb_next(&entity->rb_node);
		st->first_idle = bfq_entity_of(next);
	}

	if (entity == st->last_idle) {
		next = rb_prev(&entity->rb_node);
		st->last_idle = bfq_entity_of(next);
	}

	bfq_extract(&st->idle, entity);

	if (bfqq)
		list_del(&bfqq->bfqq_list);
}
/**
 * bfq_insert - generic tree insertion.
 * @root: the tree root.
 * @entity: entity to insert.
 *
 * This is used for the idle and the active tree, since they are both
 * ordered by finish time.
 */
static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
{
	struct bfq_entity *entry;
	struct rb_node **node = &root->rb_node;
	struct rb_node *parent = NULL;

	while (*node) {
		parent = *node;
		entry = rb_entry(parent, struct bfq_entity, rb_node);

		if (bfq_gt(entry->finish, entity->finish))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}

	rb_link_node(&entity->rb_node, parent, node);
	rb_insert_color(&entity->rb_node, root);

	entity->tree = root;
}
/**
 * bfq_update_min - update the min_start field of an entity.
 * @entity: the entity to update.
 * @node: one of its children.
 *
 * This function is called when @entity may store an invalid value for
 * min_start due to updates to the active tree.  The function assumes
 * that the subtree rooted at @node (which may be its left or its right
 * child) has a valid min_start value.
 */
static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
{
	struct bfq_entity *child;

	if (node) {
		child = rb_entry(node, struct bfq_entity, rb_node);
		if (bfq_gt(entity->min_start, child->min_start))
			entity->min_start = child->min_start;
	}
}

/**
 * bfq_update_active_node - recalculate min_start.
 * @node: the node to update.
 *
 * @node may have changed position or one of its children may have moved,
 * this function updates its min_start value.  The left and right subtrees
 * are assumed to hold a correct min_start value.
 */
static void bfq_update_active_node(struct rb_node *node)
{
	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);

	entity->min_start = entity->start;
	bfq_update_min(entity, node->rb_right);
	bfq_update_min(entity, node->rb_left);
}

/**
 * bfq_update_active_tree - update min_start for the whole active tree.
 * @node: the starting node.
 *
 * @node must be the deepest modified node after an update.  This function
 * updates its min_start using the values held by its children, assuming
 * that they did not change, and then updates all the nodes that may have
 * changed in the path to the root.  The only nodes that may have changed
 * are the ones in the path or their siblings.
 */
static void bfq_update_active_tree(struct rb_node *node)
{
	struct rb_node *parent;

up:
	bfq_update_active_node(node);

	parent = rb_parent(node);
	if (!parent)
		return;

	if (node == parent->rb_left && parent->rb_right)
		bfq_update_active_node(parent->rb_right);
	else if (parent->rb_left)
		bfq_update_active_node(parent->rb_left);

	node = parent;
	goto up;
}
static void bfq_weights_tree_add(struct bfq_data *bfqd,
				 struct bfq_entity *entity,
				 struct rb_root *root);

static void bfq_weights_tree_remove(struct bfq_data *bfqd,
				    struct bfq_entity *entity,
				    struct rb_root *root);


/**
 * bfq_active_insert - insert an entity in the active tree of its
 *                     group/device.
 * @st: the service tree of the entity.
 * @entity: the entity being inserted.
 *
 * The active tree is ordered by finish time, but an extra key is kept
 * per each node, containing the minimum value for the start times of
 * its children (and the node itself), so it's possible to search for
 * the eligible node with the lowest finish time in logarithmic time.
 */
static void bfq_active_insert(struct bfq_service_tree *st,
			      struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node = &entity->rb_node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	bfq_insert(&st->active, entity);

	if (node->rb_left)
		node = node->rb_left;
	else if (node->rb_right)
		node = node->rb_right;

	bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	else /* bfq_group */
		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);

	if (bfqg != bfqd->root_group)
		bfqg->active_entities++;
#endif
}
/**
 * bfq_ioprio_to_weight - calc a weight from an ioprio.
 * @ioprio: the ioprio value to convert.
 */
static unsigned short bfq_ioprio_to_weight(int ioprio)
{
	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
}

/**
 * bfq_weight_to_ioprio - calc an ioprio from a weight.
 * @weight: the weight value to convert.
 *
 * To preserve as much as possible the old only-ioprio user interface,
 * 0 is used as an escape ioprio value for weights (numerically) equal or
 * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
 */
static unsigned short bfq_weight_to_ioprio(int weight)
{
	return max_t(int, 0,
		     IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
}
static void bfq_get_entity(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	if (bfqq) {
		bfqq->ref++;
		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
			     bfqq, bfqq->ref);
	}
}
/**
 * bfq_find_deepest - find the deepest node that an extraction can modify.
 * @node: the node being removed.
 *
 * Do the first step of an extraction in an rb tree, looking for the
 * node that will replace @node, and returning the deepest node that
 * the following modifications to the tree can touch.  If @node is the
 * last node in the tree return %NULL.
 */
static struct rb_node *bfq_find_deepest(struct rb_node *node)
{
	struct rb_node *deepest;

	if (!node->rb_right && !node->rb_left)
		deepest = rb_parent(node);
	else if (!node->rb_right)
		deepest = node->rb_left;
	else if (!node->rb_left)
		deepest = node->rb_right;
	else {
		deepest = rb_next(node);
		if (deepest->rb_right)
			deepest = deepest->rb_right;
		else if (rb_parent(deepest) != node)
			deepest = rb_parent(deepest);
	}

	return deepest;
}
/**
 * bfq_active_extract - remove an entity from the active tree.
 * @st: the service_tree containing the tree.
 * @entity: the entity being removed.
 */
static void bfq_active_extract(struct bfq_service_tree *st,
			       struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	node = bfq_find_deepest(&entity->rb_node);
	bfq_extract(&st->active, entity);

	if (node)
		bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_del(&bfqq->bfqq_list);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	else /* bfq_group */
		bfq_weights_tree_remove(bfqd, entity,
					&bfqd->group_weights_tree);

	if (bfqg != bfqd->root_group)
		bfqg->active_entities--;
#endif
}
/**
 * bfq_idle_insert - insert an entity into the idle tree.
 * @st: the service tree containing the tree.
 * @entity: the entity to insert.
 */
static void bfq_idle_insert(struct bfq_service_tree *st,
			    struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
		st->first_idle = entity;
	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
		st->last_idle = entity;

	bfq_insert(&st->idle, entity);

	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
}
/**
 * bfq_forget_entity - do not consider entity any longer for scheduling
 * @st: the service tree.
 * @entity: the entity being removed.
 * @is_in_service: true if entity is currently the in-service entity.
 *
 * Forget everything about @entity. In addition, if entity represents
 * a queue, and the latter is not in service, then release the service
 * reference to the queue (the one taken through bfq_get_entity). In
 * fact, in this case, there is really no more service reference to
 * the queue, as the latter is also outside any service tree. If,
 * instead, the queue is in service, then __bfq_bfqd_reset_in_service
 * will take care of putting the reference when the queue finally
 * stops being served.
 */
static void bfq_forget_entity(struct bfq_service_tree *st,
			      struct bfq_entity *entity,
			      bool is_in_service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->on_st = false;
	st->wsum -= entity->weight;
	if (bfqq && !is_in_service)
		bfq_put_queue(bfqq);
}
/**
 * bfq_put_idle_entity - release the idle tree ref of an entity.
 * @st: service tree for the entity.
 * @entity: the entity being released.
 */
static void bfq_put_idle_entity(struct bfq_service_tree *st,
				struct bfq_entity *entity)
{
	bfq_idle_extract(st, entity);
	bfq_forget_entity(st, entity,
			  entity == entity->sched_data->in_service_entity);
}
/**
 * bfq_forget_idle - update the idle tree if necessary.
 * @st: the service tree to act upon.
 *
 * To preserve the global O(log N) complexity we only remove one entry here;
 * as the idle tree will not grow indefinitely this can be done safely.
 */
static void bfq_forget_idle(struct bfq_service_tree *st)
{
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
	    !bfq_gt(last_idle->finish, st->vtime)) {
		/*
		 * Forget the whole idle tree, increasing the vtime past
		 * the last finish time of idle entities.
		 */
		st->vtime = last_idle->finish;
	}

	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
		bfq_put_idle_entity(st, first_idle);
}
static struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
				struct bfq_entity *entity)
{
	struct bfq_service_tree *new_st = old_st;

	if (entity->prio_changed) {
		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
		unsigned int prev_weight, new_weight;
		struct bfq_data *bfqd = NULL;
		struct rb_root *root;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		struct bfq_sched_data *sd;
		struct bfq_group *bfqg;
#endif

		if (bfqq)
			bfqd = bfqq->bfqd;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		else {
			sd = entity->my_sched_data;
			bfqg = container_of(sd, struct bfq_group, sched_data);
			bfqd = (struct bfq_data *)bfqg->bfqd;
		}
#endif

		old_st->wsum -= entity->weight;

		if (entity->new_weight != entity->orig_weight) {
			if (entity->new_weight < BFQ_MIN_WEIGHT ||
			    entity->new_weight > BFQ_MAX_WEIGHT) {
				pr_crit("update_weight_prio: new_weight %d\n",
					entity->new_weight);
				if (entity->new_weight < BFQ_MIN_WEIGHT)
					entity->new_weight = BFQ_MIN_WEIGHT;
				else
					entity->new_weight = BFQ_MAX_WEIGHT;
			}
			entity->orig_weight = entity->new_weight;
			if (bfqq)
				bfqq->ioprio =
				  bfq_weight_to_ioprio(entity->orig_weight);
		}

		if (bfqq)
			bfqq->ioprio_class = bfqq->new_ioprio_class;
		entity->prio_changed = 0;

		/*
		 * NOTE: here we may be changing the weight too early,
		 * this will cause unfairness. The correct approach
		 * would have required additional complexity to defer
		 * weight changes to the proper time instants (i.e.,
		 * when entity->finish <= old_st->vtime).
		 */
		new_st = bfq_entity_service_tree(entity);

		prev_weight = entity->weight;
		new_weight = entity->orig_weight *
			     (bfqq ? bfqq->wr_coeff : 1);
		/*
		 * If the weight of the entity changes, remove the entity
		 * from its old weight counter (if there is a counter
		 * associated with the entity), and add it to the counter
		 * associated with its new weight.
		 */
		if (prev_weight != new_weight) {
			root = bfqq ? &bfqd->queue_weights_tree :
				      &bfqd->group_weights_tree;
			bfq_weights_tree_remove(bfqd, entity, root);
		}
		entity->weight = new_weight;
		/*
		 * Add the entity to its weights tree only if it is
		 * not associated with a weight-raised queue.
		 */
		if (prev_weight != new_weight &&
		    (bfqq ? bfqq->wr_coeff == 1 : 1))
			/* If we get here, root has been initialized. */
			bfq_weights_tree_add(bfqd, entity, root);

		new_st->wsum += entity->weight;

		if (new_st != old_st)
			entity->start = new_st->vtime;
	}

	return new_st;
}
static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
/**
 * bfq_bfqq_served - update the scheduler status after selection for
 *		     service.
 * @bfqq: the queue being served.
 * @served: bytes to transfer.
 *
 * NOTE: this can be optimized, as the timestamps of upper level entities
 * are synchronized every time a new bfqq is selected for service. By now,
 * we keep it to better check consistency.
 */
static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
{
	struct bfq_entity *entity = &bfqq->entity;
	struct bfq_service_tree *st;

	for_each_entity(entity) {
		st = bfq_entity_service_tree(entity);

		entity->service += served;

		st->vtime += bfq_delta(served, st->wsum);
		bfq_forget_idle(st);
	}
	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
}
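/*
 * For illustration (example values, not constants from the code): if the
 * leaf service tree has wsum == 200 and the in-service queue is charged
 * served == 8 sectors, then bfq_delta(8, 200) advances st->vtime in
 * proportion to 8/200, i.e., the virtual time grows as the service
 * received divided by the total weight of the active entities.
 */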
/**
 * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
 *			  of the time interval during which bfqq has been in
 *			  service.
 * @bfqd: the device
 * @bfqq: the queue that needs a service update.
 * @time_ms: the amount of time during which the queue has received service
 *
 * If a queue does not consume its budget fast enough, then providing
 * the queue with service fairness may impair throughput, more or less
 * severely. For this reason, queues that consume their budget slowly
 * are provided with time fairness instead of service fairness. This
 * goal is achieved through the BFQ scheduling engine, even if such an
 * engine works in the service, and not in the time domain. The trick
 * is charging these queues with an inflated amount of service, equal
 * to the amount of service that they would have received during their
 * service slot if they had been fast, i.e., if their requests had
 * been dispatched at a rate equal to the estimated peak rate.
 *
 * It is worth noting that time fairness can cause important
 * distortions in terms of bandwidth distribution, on devices with
 * internal queueing. The reason is that I/O requests dispatched
 * during the service slot of a queue may be served after that service
 * slot is finished, and may have a total processing time loosely
 * correlated with the duration of the service slot. This is
 * especially true for short service slots.
 */
static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
				 unsigned long time_ms)
{
	struct bfq_entity *entity = &bfqq->entity;
	int tot_serv_to_charge = entity->service;
	unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);

	if (time_ms > 0 && time_ms < timeout_ms)
		tot_serv_to_charge =
			(bfqd->bfq_max_budget * time_ms) / timeout_ms;

	if (tot_serv_to_charge < entity->service)
		tot_serv_to_charge = entity->service;

	/* Increase budget to avoid inconsistencies */
	if (tot_serv_to_charge > entity->budget)
		entity->budget = tot_serv_to_charge;

	bfq_bfqq_served(bfqq,
			max_t(int, 0, tot_serv_to_charge - entity->service));
}
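/*
 * Example of the charge computed above, with made-up values: if
 * bfq_max_budget is 16384 sectors, the budget timeout is 125 ms and the
 * queue has been in service for time_ms == 25 ms, then
 * tot_serv_to_charge = (16384 * 25) / 125 = 3276 sectors (integer
 * division), i.e., one fifth of the maximum budget, as if the queue had
 * consumed its slot at the estimated peak rate.
 */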
static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
					struct bfq_service_tree *st,
					bool backshifted)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	st = __bfq_entity_update_weight_prio(st, entity);
	bfq_calc_finish(entity, entity->budget);

	/*
	 * If some queues enjoy backshifting for a while, then their
	 * (virtual) finish timestamps may happen to become lower and
	 * lower than the system virtual time. In particular, if
	 * these queues often happen to be idle for short time
	 * periods, and during such time periods other queues with
	 * higher timestamps happen to be busy, then the backshifted
	 * timestamps of the former queues can become much lower than
	 * the system virtual time. In fact, to serve the queues with
	 * higher timestamps while the ones with lower timestamps are
	 * idle, the system virtual time may be pushed-up to much
	 * higher values than the finish timestamps of the idle
	 * queues. As a consequence, the finish timestamps of all new
	 * or newly activated queues may end up being much larger than
	 * those of lucky queues with backshifted timestamps. The
	 * latter queues may then monopolize the device for a lot of
	 * time. This would simply break service guarantees.
	 *
	 * To reduce this problem, push up a little bit the
	 * backshifted timestamps of the queue associated with this
	 * entity (only a queue can happen to have the backshifted
	 * flag set): just enough to let the finish timestamp of the
	 * queue be equal to the current value of the system virtual
	 * time. This may introduce a little unfairness among queues
	 * with backshifted timestamps, but it does not break
	 * worst-case fairness guarantees.
	 *
	 * As a special case, if bfqq is weight-raised, push up
	 * timestamps much less, to keep very low the probability that
	 * this push up causes the backshifted finish timestamps of
	 * weight-raised queues to become higher than the backshifted
	 * finish timestamps of non weight-raised queues.
	 */
	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
		unsigned long delta = st->vtime - entity->finish;

		if (bfqq)
			delta /= bfqq->wr_coeff;

		entity->start += delta;
		entity->finish += delta;
	}

	bfq_active_insert(st, entity);
}
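/*
 * Numerical example of the push-up above (illustrative values only):
 * with st->vtime == 1000 and entity->finish == 400, delta is 600. For a
 * non-weight-raised queue both start and finish are moved forward by
 * 600; for a queue with wr_coeff == 30 the shift shrinks to 600/30 ==
 * 20, so the raised queue keeps almost all of its timestamp advantage.
 */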
/**
 * __bfq_activate_entity - handle activation of entity.
 * @entity: the entity being activated.
 * @non_blocking_wait_rq: true if entity was waiting for a request
 *
 * Called for a 'true' activation, i.e., if entity is not active and
 * one of its children receives a new request.
 *
 * Basically, this function updates the timestamps of entity and
 * inserts entity into its active tree, after possibly extracting it
 * from its idle tree.
 */
static void __bfq_activate_entity(struct bfq_entity *entity,
				  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	bool backshifted = false;
	unsigned long long min_vstart;

	/* See comments on bfq_bfqq_update_budg_for_activation */
	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
		backshifted = true;
		min_vstart = entity->finish;
	} else
		min_vstart = st->vtime;

	if (entity->tree == &st->idle) {
		/*
		 * Must be on the idle tree, bfq_idle_extract() will
		 * check for that.
		 */
		bfq_idle_extract(st, entity);
		entity->start = bfq_gt(min_vstart, entity->finish) ?
			min_vstart : entity->finish;
	} else {
		/*
		 * The finish time of the entity may be invalid, and
		 * it is in the past for sure, otherwise the queue
		 * would have been on the idle tree.
		 */
		entity->start = min_vstart;
		st->wsum += entity->weight;
		/*
		 * entity is about to be inserted into a service tree,
		 * and then set in service: get a reference to make
		 * sure entity does not disappear until it is no
		 * longer in service or scheduled for service.
		 */
		bfq_get_entity(entity);

		entity->on_st = true;
	}

	bfq_update_fin_time_enqueue(entity, st, backshifted);
}
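/*
 * Example for the choice of min_vstart above (values are illustrative):
 * if the queue was waiting for a request (non_blocking_wait_rq) and its
 * old finish time, say 700, is lower than st->vtime, say 900, then
 * min_vstart is set to 700 and the activation is treated as
 * backshifted; otherwise the entity simply restarts from the current
 * virtual time.
 */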
/**
 * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
 * @entity: the entity being requeued or repositioned.
 *
 * Requeueing is needed if this entity stops being served, which
 * happens if a leaf descendant entity has expired. On the other hand,
 * repositioning is needed if the next_in_service entity for the child
 * entity has changed. See the comments inside the function for
 * details.
 *
 * Basically, this function: 1) removes entity from its active tree if
 * present there, 2) updates the timestamps of entity and 3) inserts
 * entity back into its active tree (in the new, right position for
 * the new values of the timestamps).
 */
static void __bfq_requeue_entity(struct bfq_entity *entity)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (entity == sd->in_service_entity) {
		/*
		 * We are requeueing the current in-service entity,
		 * which may have to be done for one of the following
		 * reasons:
		 * - entity represents the in-service queue, and the
		 *   in-service queue is being requeued after an
		 *   expiration;
		 * - entity represents a group, and its budget has
		 *   changed because one of its child entities has
		 *   just been either activated or requeued for some
		 *   reason; the timestamps of the entity need then to
		 *   be updated, and the entity needs to be enqueued
		 *   or repositioned accordingly.
		 *
		 * In particular, before requeueing, the start time of
		 * the entity must be moved forward to account for the
		 * service that the entity has received while in
		 * service. This is done by the next instructions. The
		 * finish time will then be updated according to this
		 * new value of the start time, and to the budget of
		 * the entity.
		 */
		bfq_calc_finish(entity, entity->service);
		entity->start = entity->finish;
		/*
		 * In addition, if the entity had more than one child
		 * when set in service, then was not extracted from
		 * the active tree. This implies that the position of
		 * the entity in the active tree may need to be
		 * changed now, because we have just updated the start
		 * time of the entity, and we will update its finish
		 * time in a moment (the requeueing is then, more
		 * precisely, a repositioning in this case). To
		 * implement this repositioning, we: 1) dequeue the
		 * entity here, 2) update the finish time and
		 * requeue the entity according to the new
		 * timestamps below.
		 */
		if (entity->tree)
			bfq_active_extract(st, entity);
	} else { /* The entity is already active, and not in service */
		/*
		 * In this case, this function gets called only if the
		 * next_in_service entity below this entity has
		 * changed, and this change has caused the budget of
		 * this entity to change, which, finally implies that
		 * the finish time of this entity must be
		 * updated. Such an update may cause the scheduling,
		 * i.e., the position in the active tree, of this
		 * entity to change. We handle this change by: 1)
		 * dequeueing the entity here, 2) updating the finish
		 * time and requeueing the entity according to the new
		 * timestamps below. This is the same approach as the
		 * non-extracted-entity sub-case above.
		 */
		bfq_active_extract(st, entity);
	}

	bfq_update_fin_time_enqueue(entity, st, false);
}
static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
					  struct bfq_sched_data *sd,
					  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (sd->in_service_entity == entity || entity->tree == &st->active)
		/*
		 * in service or already queued on the active tree,
		 * requeue or reposition
		 */
		__bfq_requeue_entity(entity);
	else
		/*
		 * Not in service and not queued on its active tree:
		 * the activity is idle and this is a true activation.
		 */
		__bfq_activate_entity(entity, non_blocking_wait_rq);
}

/**
 * bfq_activate_requeue_entity - activate or requeue an entity representing a
 *				 bfq_queue, and activate, requeue or reposition
 *				 all ancestors for which such an update becomes
 *				 necessary.
 * @entity: the entity to activate.
 * @non_blocking_wait_rq: true if this entity was waiting for a request
 * @requeue: true if this is a requeue, which implies that bfqq is
 *	     being expired; thus ALL its ancestors stop being served and must
 *	     therefore be requeued
 */
static void bfq_activate_requeue_entity(struct bfq_entity *entity,
					bool non_blocking_wait_rq,
					bool requeue)
{
	struct bfq_sched_data *sd;

	for_each_entity(entity) {
		sd = entity->sched_data;
		__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);

		if (!bfq_update_next_in_service(sd, entity) && !requeue)
			break;
	}
}
/**
 * __bfq_deactivate_entity - deactivate an entity from its service tree.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: if false, the entity will not be put into the
 *			idle tree.
 *
 * Deactivates an entity, independently from its previous state. Must
 * be invoked only if entity is on a service tree. Extracts the entity
 * from that tree, and if necessary and allowed, puts it on the idle
 * tree.
 */
static bool __bfq_deactivate_entity(struct bfq_entity *entity,
				    bool ins_into_idle_tree)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	int is_in_service = entity == sd->in_service_entity;

	if (!entity->on_st) /* entity never activated, or already inactive */
		return false;

	if (is_in_service)
		bfq_calc_finish(entity, entity->service);

	if (entity->tree == &st->active)
		bfq_active_extract(st, entity);
	else if (!is_in_service && entity->tree == &st->idle)
		bfq_idle_extract(st, entity);

	if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
		bfq_forget_entity(st, entity, is_in_service);
	else
		bfq_idle_insert(st, entity);

	return true;
}
/**
 * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: true if the entity can be put on the idle tree
 */
static void bfq_deactivate_entity(struct bfq_entity *entity,
				  bool ins_into_idle_tree,
				  bool expiration)
{
	struct bfq_sched_data *sd;
	struct bfq_entity *parent = NULL;

	for_each_entity_safe(entity, parent) {
		sd = entity->sched_data;

		if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
			/*
			 * entity is not in any tree any more, so
			 * this deactivation is a no-op, and there is
			 * nothing to change for upper-level entities
			 * (in case of expiration, this can never
			 * happen).
			 */
			return;
		}

		if (sd->next_in_service == entity)
			/*
			 * entity was the next_in_service entity,
			 * then, since entity has just been
			 * deactivated, a new one must be found.
			 */
			bfq_update_next_in_service(sd, NULL);

		if (sd->next_in_service)
			/*
			 * The parent entity is still backlogged,
			 * because next_in_service is not NULL. So, no
			 * further upwards deactivation must be
			 * performed. Yet, next_in_service has
			 * changed. Then the schedule does need to be
			 * updated upwards.
			 */
			break;

		/*
		 * If we get here, then the parent is no more
		 * backlogged and we need to propagate the
		 * deactivation upwards. Thus let the loop go on.
		 */

		/*
		 * Also let parent be queued into the idle tree on
		 * deactivation, to preserve service guarantees, and
		 * assuming that who invoked this function does not
		 * need parent entities too to be removed completely.
		 */
		ins_into_idle_tree = true;
	}

	/*
	 * If the deactivation loop is fully executed, then there are
	 * no more entities to touch and next loop is not executed at
	 * all. Otherwise, requeue remaining entities if they are
	 * about to stop receiving service, or reposition them if this
	 * is not the case.
	 */
	entity = parent;
	for_each_entity(entity) {
		/*
		 * Invoke __bfq_requeue_entity on entity, even if
		 * already active, to requeue/reposition it in the
		 * active tree (because sd->next_in_service has
		 * changed)
		 */
		__bfq_requeue_entity(entity);

		sd = entity->sched_data;
		if (!bfq_update_next_in_service(sd, entity) &&
		    !expiration)
			/*
			 * next_in_service unchanged or not causing
			 * any change in entity->parent->sd, and no
			 * requeueing needed for expiration: stop
			 * here.
			 */
			break;
	}
}
/**
 * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
 *			 if needed, to have at least one entity eligible.
 * @st: the service tree to act upon.
 *
 * Assumes that st is not empty.
 */
static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
{
	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);

	if (bfq_gt(root_entity->min_start, st->vtime))
		return root_entity->min_start;

	return st->vtime;
}

static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
{
	if (new_value > st->vtime) {
		st->vtime = new_value;
		bfq_forget_idle(st);
	}
}
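/*
 * For instance (illustrative values), if the root of the active tree
 * has min_start == 500 while st->vtime == 300, no entity is eligible
 * yet; bfq_calc_vtime_jump() returns 500 and bfq_update_vtime() makes
 * the virtual time jump forward to that value, so that at least one
 * entity becomes eligible. A jump backwards is never performed.
 */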
/**
 * bfq_first_active_entity - find the eligible entity with
 *			     the smallest finish time
 * @st: the service tree to select from.
 * @vtime: the system virtual to use as a reference for eligibility
 *
 * This function searches the first schedulable entity, starting from the
 * root of the tree and going on the left every time on this side there is
 * a subtree with at least one eligible (start >= vtime) entity. The path on
 * the right is followed only if a) the left subtree contains no eligible
 * entities and b) no eligible entity has been found yet.
 */
static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
						  u64 vtime)
{
	struct bfq_entity *entry, *first = NULL;
	struct rb_node *node = st->active.rb_node;

	while (node) {
		entry = rb_entry(node, struct bfq_entity, rb_node);
left:
		if (!bfq_gt(entry->start, vtime))
			first = entry;

		if (node->rb_left) {
			entry = rb_entry(node->rb_left,
					 struct bfq_entity, rb_node);
			if (!bfq_gt(entry->min_start, vtime)) {
				node = node->rb_left;
				goto left;
			}
		}
		if (first)
			break;
		node = node->rb_right;
	}

	return first;
}
/**
 * __bfq_lookup_next_entity - return the first eligible entity in @st.
 * @st: the service tree.
 *
 * If there is no in-service entity for the sched_data st belongs to,
 * then return the entity that will be set in service if:
 * 1) the parent entity this st belongs to is set in service;
 * 2) no entity belonging to such parent entity undergoes a state change
 * that would influence the timestamps of the entity (e.g., becomes idle,
 * becomes backlogged, changes its budget, ...).
 *
 * In this first case, update the virtual time in @st too (see the
 * comments on this update inside the function).
 *
 * In contrast, if there is an in-service entity, then return the
 * entity that would be set in service if not only the above
 * conditions, but also the next one held true: the currently
 * in-service entity, on expiration,
 * 1) gets a finish time equal to the current one, or
 * 2) is not eligible any more, or
 * 3) is idle.
 */
static struct bfq_entity *
__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
{
	struct bfq_entity *entity;
	u64 new_vtime;

	if (RB_EMPTY_ROOT(&st->active))
		return NULL;

	/*
	 * Get the value of the system virtual time for which at
	 * least one entity is eligible.
	 */
	new_vtime = bfq_calc_vtime_jump(st);

	/*
	 * If there is no in-service entity for the sched_data this
	 * active tree belongs to, then push the system virtual time
	 * up to the value that guarantees that at least one entity is
	 * eligible. If, instead, there is an in-service entity, then
	 * do not make any such update, because there is already an
	 * eligible entity, namely the in-service one (even if the
	 * entity is not on st, because it was extracted when set in
	 * service).
	 */
	if (!in_service)
		bfq_update_vtime(st, new_vtime);

	entity = bfq_first_active_entity(st, new_vtime);

	return entity;
}
/**
 * bfq_lookup_next_entity - return the first eligible entity in @sd.
 * @sd: the sched_data.
 *
 * This function is invoked when there has been a change in the trees
 * for sd, and we need know what is the new next entity after this
 * change.
 */
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
{
	struct bfq_service_tree *st = sd->service_tree;
	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
	struct bfq_entity *entity = NULL;
	int class_idx = 0;

	/*
	 * Choose from idle class, if needed to guarantee a minimum
	 * bandwidth to this class (and if there is some active entity
	 * in idle class). This should also mitigate
	 * priority-inversion problems in case a low priority task is
	 * holding file system resources.
	 */
	if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
				   BFQ_CL_IDLE_TIMEOUT)) {
		if (!RB_EMPTY_ROOT(&idle_class_st->active))
			class_idx = BFQ_IOPRIO_CLASSES - 1;
		/* About to be served if backlogged, or not yet backlogged */
		sd->bfq_class_idle_last_service = jiffies;
	}

	/*
	 * Find the next entity to serve for the highest-priority
	 * class, unless the idle class needs to be served.
	 */
	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
		entity = __bfq_lookup_next_entity(st + class_idx,
						  sd->in_service_entity);

		if (entity)
			break;
	}

	return entity;
}

static bool next_queue_may_preempt(struct bfq_data *bfqd)
{
	struct bfq_sched_data *sd = &bfqd->root_group->sched_data;

	return sd->next_in_service != sd->in_service_entity;
}
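/*
 * Concretely, the CLASS_IDLE guarantee above works as follows: if more
 * than BFQ_CL_IDLE_TIMEOUT jiffies have elapsed since the last time an
 * idle-class entity was served, and the idle-class tree is backlogged,
 * the lookup starts directly from the idle class instead of the
 * highest-priority one, so idle-class I/O cannot be starved forever.
 */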
/*
 * Get next queue for service.
 */
static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
{
	struct bfq_entity *entity = NULL;
	struct bfq_sched_data *sd;
	struct bfq_queue *bfqq;

	if (bfqd->busy_queues == 0)
		return NULL;

	/*
	 * Traverse the path from the root to the leaf entity to
	 * serve. Set in service all the entities visited along the
	 * way.
	 */
	sd = &bfqd->root_group->sched_data;
	for (; sd ; sd = entity->my_sched_data) {
		/*
		 * WARNING. We are about to set the in-service entity
		 * to sd->next_in_service, i.e., to the (cached) value
		 * returned by bfq_lookup_next_entity(sd) the last
		 * time it was invoked, i.e., the last time when the
		 * service order in sd changed as a consequence of the
		 * activation or deactivation of an entity. In this
		 * respect, if we execute bfq_lookup_next_entity(sd)
		 * in this very moment, it may, although with low
		 * probability, yield a different entity than that
		 * pointed to by sd->next_in_service. This rare event
		 * happens in case there was no CLASS_IDLE entity to
		 * serve for sd when bfq_lookup_next_entity(sd) was
		 * invoked for the last time, while there is now one
		 * such entity.
		 *
		 * If the above event happens, then the scheduling of
		 * such entity in CLASS_IDLE is postponed until the
		 * service of the sd->next_in_service entity
		 * finishes. In fact, when the latter is expired,
		 * bfq_lookup_next_entity(sd) gets called again,
		 * exactly to update sd->next_in_service.
		 */

		/* Make next_in_service entity become in_service_entity */
		entity = sd->next_in_service;
		sd->in_service_entity = entity;

		/*
		 * Reset the accumulator of the amount of service that
		 * the entity is about to receive.
		 */
		entity->service = 0;

		/*
		 * If entity is no longer a candidate for next
		 * service, then we extract it from its active tree,
		 * for the following reason. To further boost the
		 * throughput in some special case, BFQ needs to know
		 * which is the next candidate entity to serve, while
		 * there is already an entity in service. In this
		 * respect, to make it easy to compute/update the next
		 * candidate entity to serve after the current
		 * candidate has been set in service, there is a case
		 * where it is necessary to extract the current
		 * candidate from its service tree. Such a case is
		 * when the entity just set in service cannot be also
		 * a candidate for next service. Details about when
		 * this condition holds are reported in the comments
		 * on the function bfq_no_longer_next_in_service()
		 * invoked below.
		 */
		if (bfq_no_longer_next_in_service(entity))
			bfq_active_extract(bfq_entity_service_tree(entity),
					   entity);

		/*
		 * For the same reason why we may have just extracted
		 * entity from its active tree, we may need to update
		 * next_in_service for the sched_data of entity too,
		 * regardless of whether entity has been extracted.
		 * In fact, even if entity has not been extracted, a
		 * descendant entity may get extracted. Such an event
		 * would cause a change in next_in_service for the
		 * level of the descendant entity, and thus possibly
		 * back to upper levels.
		 *
		 * We cannot perform the resulting needed update
		 * before the end of this loop, because, to know which
		 * is the correct next-to-serve candidate entity for
		 * each level, we need first to find the leaf entity
		 * to set in service. In fact, only after we know
		 * which is the next-to-serve leaf entity, we can
		 * discover whether the parent entity of the leaf
		 * entity becomes the next-to-serve, and so on.
		 */
	}

	bfqq = bfq_entity_to_bfqq(entity);

	/*
	 * We can finally update all next-to-serve entities along the
	 * path from the leaf entity just set in service to the root.
	 */
	for_each_entity(entity) {
		struct bfq_sched_data *sd = entity->sched_data;

		if (!bfq_update_next_in_service(sd, NULL))
			break;
	}

	return bfqq;
}
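/*
 * As a concrete (hypothetical) two-level example of the traversal
 * above: with a group entity G under the root and a queue Q under G,
 * the first loop sets G and then Q in service, walking sched_data from
 * the root down to the leaf; the second loop then walks back from Q up
 * to the root, recomputing next_in_service at each level now that the
 * leaf to serve is known.
 */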
static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
{
	struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
	struct bfq_entity *entity = in_serv_entity;

	if (bfqd->in_service_bic) {
		/*
		 * Schedule the release of a reference to
		 * bfqd->in_service_bic->icq.ioc to right after the
		 * scheduler lock is released. This ioc is not
		 * released immediately, to not risk to possibly take
		 * an ioc->lock while holding the scheduler lock.
		 */
		bfqd->ioc_to_put = bfqd->in_service_bic->icq.ioc;
		bfqd->in_service_bic = NULL;
	}

	bfq_clear_bfqq_wait_request(in_serv_bfqq);
	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
	bfqd->in_service_queue = NULL;

	/*
	 * When this function is called, all in-service entities have
	 * been properly deactivated or requeued, so we can safely
	 * execute the final step: reset in_service_entity along the
	 * path from entity to the root.
	 */
	for_each_entity(entity)
		entity->sched_data->in_service_entity = NULL;

	/*
	 * in_serv_entity is no longer in service, so, if it is in no
	 * service tree either, then release the service reference to
	 * the queue it represents (taken with bfq_get_entity).
	 */
	if (!in_serv_entity->on_st)
		bfq_put_queue(in_serv_bfqq);
}
static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
				bool ins_into_idle_tree, bool expiration)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
}

static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
				    false);
	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
}

static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_activate_requeue_entity(entity, false,
				    bfqq == bfqd->in_service_queue);
}
static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);

/*
 * Called when the bfqq no longer has requests pending, remove it from
 * the service tree. As a special case, it can be invoked during an
 * expiration.
 */
static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			      bool expiration)
{
	bfq_log_bfqq(bfqd, bfqq, "del from busy");

	bfq_clear_bfqq_busy(bfqq);

	bfqd->busy_queues--;

	if (!bfqq->dispatched)
		bfq_weights_tree_remove(bfqd, &bfqq->entity,
					&bfqd->queue_weights_tree);

	if (bfqq->wr_coeff > 1)
		bfqd->wr_busy_queues--;

	bfqg_stats_update_dequeue(bfqq_group(bfqq));

	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
}

/*
 * Called when an inactive queue receives a new request.
 */
static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqd, bfqq, "add to busy");

	bfq_activate_bfqq(bfqd, bfqq);

	bfq_mark_bfqq_busy(bfqq);
	bfqd->busy_queues++;

	if (!bfqq->dispatched)
		if (bfqq->wr_coeff == 1)
			bfq_weights_tree_add(bfqd, &bfqq->entity,
					     &bfqd->queue_weights_tree);

	if (bfqq->wr_coeff > 1)
		bfqd->wr_busy_queues++;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED

/* bfqg stats flags */
enum bfqg_stats_flags {
	BFQG_stats_waiting = 0,
	BFQG_stats_idling,
	BFQG_stats_empty,
};

#define BFQG_FLAG_FNS(name)						\
static void bfqg_stats_mark_##name(struct bfqg_stats *stats)		\
{									\
	stats->flags |= (1 << BFQG_stats_##name);			\
}									\
static void bfqg_stats_clear_##name(struct bfqg_stats *stats)		\
{									\
	stats->flags &= ~(1 << BFQG_stats_##name);			\
}									\
static int bfqg_stats_##name(struct bfqg_stats *stats)			\
{									\
	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\
}									\

BFQG_FLAG_FNS(waiting)
BFQG_FLAG_FNS(idling)
BFQG_FLAG_FNS(empty)
#undef BFQG_FLAG_FNS
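/*
 * For reference, BFQG_FLAG_FNS(waiting) above expands, roughly, to
 * three trivial helpers operating on the BFQG_stats_waiting bit of
 * stats->flags: bfqg_stats_mark_waiting(), bfqg_stats_clear_waiting()
 * and bfqg_stats_waiting(), which respectively set, clear and test the
 * flag.
 */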
/* This should be called with the queue_lock held. */
static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
{
	unsigned long long now;

	if (!bfqg_stats_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		blkg_stat_add(&stats->group_wait_time,
			      now - stats->start_group_wait_time);
	bfqg_stats_clear_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
						 struct bfq_group *curr_bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (bfqg_stats_waiting(stats))
		return;
	if (bfqg == curr_bfqg)
		return;
	stats->start_group_wait_time = sched_clock();
	bfqg_stats_mark_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
{
	unsigned long long now;

	if (!bfqg_stats_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		blkg_stat_add(&stats->empty_time,
			      now - stats->start_empty_time);
	bfqg_stats_clear_empty(stats);
}

static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
{
	blkg_stat_add(&bfqg->stats.dequeue, 1);
}
static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (blkg_rwstat_total(&stats->queued))
		return;

	/*
	 * group is already marked empty. This can happen if bfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (bfqg_stats_empty(stats))
		return;

	stats->start_empty_time = sched_clock();
	bfqg_stats_mark_empty(stats);
}

static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (bfqg_stats_idling(stats)) {
		unsigned long long now = sched_clock();

		if (time_after64(now, stats->start_idle_time))
			blkg_stat_add(&stats->idle_time,
				      now - stats->start_idle_time);
		bfqg_stats_clear_idling(stats);
	}
}

static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	stats->start_idle_time = sched_clock();
	bfqg_stats_mark_idling(stats);
}

static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	blkg_stat_add(&stats->avg_queue_size_sum,
		      blkg_rwstat_total(&stats->queued));
	blkg_stat_add(&stats->avg_queue_size_samples, 1);
	bfqg_stats_update_group_wait_time(stats);
}
/*
 * blk-cgroup policy-related handlers
 * The following functions help in converting between blk-cgroup
 * internal structures and BFQ-specific structures.
 */

static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
}

static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
{
	return pd_to_blkg(&bfqg->pd);
}

static struct blkcg_policy blkcg_policy_bfq;

static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
{
	return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
}

/*
 * bfq_group handlers
 * The following functions help in navigating the bfq_group hierarchy
 * by allowing to find the parent of a bfq_group or the bfq_group
 * associated to a bfq_queue.
 */

static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
{
	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;

	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
}

static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
{
	struct bfq_entity *group_entity = bfqq->entity.parent;

	return group_entity ? container_of(group_entity, struct bfq_group,
					   entity) :
			      bfqq->bfqd->root_group;
}

/*
 * The following two functions handle get and put of a bfq_group by
 * wrapping the related blk-cgroup hooks.
 */

static void bfqg_get(struct bfq_group *bfqg)
{
	return blkg_get(bfqg_to_blkg(bfqg));
}

static void bfqg_put(struct bfq_group *bfqg)
{
	return blkg_put(bfqg_to_blkg(bfqg));
}
static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
				     struct bfq_queue *bfqq,
				     unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.queued, op, 1);
	bfqg_stats_end_empty_time(&bfqg->stats);
	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
}

static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.queued, op, -1);
}

static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.merged, op, 1);
}

static void bfqg_stats_update_completion(struct bfq_group *bfqg,
					 uint64_t start_time, uint64_t io_start_time,
					 unsigned int op)
{
	struct bfqg_stats *stats = &bfqg->stats;
	unsigned long long now = sched_clock();

	if (time_after64(now, io_start_time))
		blkg_rwstat_add(&stats->service_time, op,
				now - io_start_time);
	if (time_after64(io_start_time, start_time))
		blkg_rwstat_add(&stats->wait_time, op,
				io_start_time - start_time);
}
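/*
 * In the completion hook above, the two intervals are accounted
 * separately: io_start_time - start_time is the time the request spent
 * queued in the scheduler (wait_time), while now - io_start_time is the
 * time it spent in the device (service_time). E.g., a request inserted
 * at t=0, dispatched at t=2 ms and completed at t=5 ms contributes 2 ms
 * of wait time and 3 ms of service time (example figures).
 */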
static void bfqg_stats_reset(struct bfqg_stats *stats)
{
	/* queued stats shouldn't be cleared */
	blkg_rwstat_reset(&stats->merged);
	blkg_rwstat_reset(&stats->service_time);
	blkg_rwstat_reset(&stats->wait_time);
	blkg_stat_reset(&stats->time);
	blkg_stat_reset(&stats->avg_queue_size_sum);
	blkg_stat_reset(&stats->avg_queue_size_samples);
	blkg_stat_reset(&stats->dequeue);
	blkg_stat_reset(&stats->group_wait_time);
	blkg_stat_reset(&stats->idle_time);
	blkg_stat_reset(&stats->empty_time);
}
static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
{
	if (!to || !from)
		return;

	/* queued stats shouldn't be cleared */
	blkg_rwstat_add_aux(&to->merged, &from->merged);
	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
	blkg_stat_add_aux(&to->time, &from->time);
	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
	blkg_stat_add_aux(&to->avg_queue_size_samples,
			  &from->avg_queue_size_samples);
	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
}
/*
 * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
 * recursive stats can still account for the amount used by this bfqg after
 * it's gone.
 */
static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
{
	struct bfq_group *parent;

	if (!bfqg) /* root_group */
		return;

	parent = bfqg_parent(bfqg);

	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);

	if (unlikely(!parent))
		return;

	bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
	bfqg_stats_reset(&bfqg->stats);
}
static void bfq_init_entity(struct bfq_entity *entity,
			    struct bfq_group *bfqg)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->weight = entity->new_weight;
	entity->orig_weight = entity->new_weight;
	if (bfqq) {
		bfqq->ioprio = bfqq->new_ioprio;
		bfqq->ioprio_class = bfqq->new_ioprio_class;
		bfqg_get(bfqg);
	}
	entity->parent = bfqg->my_entity; /* NULL for root group */
	entity->sched_data = &bfqg->sched_data;
}
static void bfqg_stats_exit(struct bfqg_stats *stats)
{
	blkg_rwstat_exit(&stats->merged);
	blkg_rwstat_exit(&stats->service_time);
	blkg_rwstat_exit(&stats->wait_time);
	blkg_rwstat_exit(&stats->queued);
	blkg_stat_exit(&stats->time);
	blkg_stat_exit(&stats->avg_queue_size_sum);
	blkg_stat_exit(&stats->avg_queue_size_samples);
	blkg_stat_exit(&stats->dequeue);
	blkg_stat_exit(&stats->group_wait_time);
	blkg_stat_exit(&stats->idle_time);
	blkg_stat_exit(&stats->empty_time);
}

static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
	if (blkg_rwstat_init(&stats->merged, gfp) ||
	    blkg_rwstat_init(&stats->service_time, gfp) ||
	    blkg_rwstat_init(&stats->wait_time, gfp) ||
	    blkg_rwstat_init(&stats->queued, gfp) ||
	    blkg_stat_init(&stats->time, gfp) ||
	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
	    blkg_stat_init(&stats->dequeue, gfp) ||
	    blkg_stat_init(&stats->group_wait_time, gfp) ||
	    blkg_stat_init(&stats->idle_time, gfp) ||
	    blkg_stat_init(&stats->empty_time, gfp)) {
		bfqg_stats_exit(stats);
		return -ENOMEM;
	}

	return 0;
}
static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
{
	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
}

static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
{
	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
}

static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
{
	struct bfq_group_data *bgd;

	bgd = kzalloc(sizeof(*bgd), gfp);
	if (!bgd)
		return NULL;
	return &bgd->pd;
}

static void bfq_cpd_init(struct blkcg_policy_data *cpd)
{
	struct bfq_group_data *d = cpd_to_bfqgd(cpd);

	d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
		CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
}

static void bfq_cpd_free(struct blkcg_policy_data *cpd)
{
	kfree(cpd_to_bfqgd(cpd));
}
static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
{
	struct bfq_group *bfqg;

	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
	if (!bfqg)
		return NULL;

	if (bfqg_stats_init(&bfqg->stats, gfp)) {
		kfree(bfqg);
		return NULL;
	}

	return &bfqg->pd;
}

static void bfq_pd_init(struct blkg_policy_data *pd)
{
	struct blkcg_gq *blkg = pd_to_blkg(pd);
	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
	struct bfq_entity *entity = &bfqg->entity;
	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);

	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
	entity->my_sched_data = &bfqg->sched_data;
	bfqg->my_entity = entity; /*
				   * the root_group's will be set to NULL
				   * in bfq_init_queue()
				   */
	bfqg->bfqd = bfqd;
	bfqg->active_entities = 0;
	bfqg->rq_pos_tree = RB_ROOT;
}

static void bfq_pd_free(struct blkg_policy_data *pd)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);

	bfqg_stats_exit(&bfqg->stats);
	kfree(bfqg);
}

static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);

	bfqg_stats_reset(&bfqg->stats);
}
static void bfq_group_set_parent(struct bfq_group *bfqg,
				 struct bfq_group *parent)
{
	struct bfq_entity *entity;

	entity = &bfqg->entity;
	entity->parent = parent->my_entity;
	entity->sched_data = &parent->sched_data;
}

static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
					 struct blkcg *blkcg)
{
	struct blkcg_gq *blkg;

	blkg = blkg_lookup(blkcg, bfqd->queue);
	if (likely(blkg))
		return blkg_to_bfqg(blkg);
	return NULL;
}

static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
					    struct blkcg *blkcg)
{
	struct bfq_group *bfqg, *parent;
	struct bfq_entity *entity;

	bfqg = bfq_lookup_bfqg(bfqd, blkcg);

	if (unlikely(!bfqg))
		return NULL;

	/*
	 * Update chain of bfq_groups as we might be handling a leaf group
	 * which, along with some of its relatives, has not been hooked yet
	 * to the private hierarchy of BFQ.
	 */
	entity = &bfqg->entity;
	for_each_entity(entity) {
		bfqg = container_of(entity, struct bfq_group, entity);
		if (bfqg != bfqd->root_group) {
			parent = bfqg_parent(bfqg);
			if (!parent)
				parent = bfqd->root_group;
			bfq_group_set_parent(bfqg, parent);
		}
	}

	return bfqg;
}

static void bfq_pos_tree_add_move(struct bfq_data *bfqd,
				  struct bfq_queue *bfqq);
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason);
/**
 * bfq_bfqq_move - migrate @bfqq to @bfqg.
 * @bfqd: queue descriptor.
 * @bfqq: the queue to move.
 * @bfqg: the group to move to.
 *
 * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
 * it on the new one. Avoid putting the entity on the old group idle tree.
 *
 * Must be called under the queue lock; the cgroup owning @bfqg must
 * not disappear (by now this just means that we are called under
 * rcu_read_lock()).
 */
static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_group *bfqg)
{
	struct bfq_entity *entity = &bfqq->entity;

	/* If bfqq is empty, then bfq_bfqq_expire also invokes
	 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
	 * from data structures related to current group. Otherwise we
	 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
	 * we do below.
	 */
	if (bfqq == bfqd->in_service_queue)
		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
				false, BFQQE_PREEMPTED);

	if (bfq_bfqq_busy(bfqq))
		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
	else if (entity->on_st)
		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
	bfqg_put(bfqq_group(bfqq));

	/*
	 * Here we use a reference to bfqg. We don't need a refcounter
	 * as the cgroup reference will not be dropped, so that its
	 * destroy() callback will not be invoked.
	 */
	entity->parent = bfqg->my_entity;
	entity->sched_data = &bfqg->sched_data;
	bfqg_get(bfqg);

	if (bfq_bfqq_busy(bfqq)) {
		bfq_pos_tree_add_move(bfqd, bfqq);
		bfq_activate_bfqq(bfqd, bfqq);
	}

	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
		bfq_schedule_dispatch(bfqd);
}
/**
 * __bfq_bic_change_cgroup - move @bic to @cgroup.
 * @bfqd: the queue descriptor.
 * @bic: the bic to move.
 * @blkcg: the blk-cgroup to move to.
 *
 * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
 * has to make sure that the reference to cgroup is valid across the call.
 *
 * NOTE: an alternative approach might have been to store the current
 * cgroup in bfqq and getting a reference to it, reducing the lookup
 * time here, at the price of slightly more complex code.
 */
static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
						 struct bfq_io_cq *bic,
						 struct blkcg *blkcg)
{
	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
	struct bfq_group *bfqg;
	struct bfq_entity *entity;

	bfqg = bfq_find_set_group(bfqd, blkcg);

	if (unlikely(!bfqg))
		bfqg = bfqd->root_group;

	if (async_bfqq) {
		entity = &async_bfqq->entity;

		if (entity->sched_data != &bfqg->sched_data) {
			bic_set_bfqq(bic, NULL, 0);
			bfq_log_bfqq(bfqd, async_bfqq,
				     "bic_change_group: %p %d",
				     async_bfqq, async_bfqq->ref);
			bfq_put_queue(async_bfqq);
		}
	}

	if (sync_bfqq) {
		entity = &sync_bfqq->entity;
		if (entity->sched_data != &bfqg->sched_data)
			bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
	}

	return bfqg;
}

static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
{
	struct bfq_data *bfqd = bic_to_bfqd(bic);
	struct bfq_group *bfqg = NULL;
	uint64_t serial_nr;

	rcu_read_lock();
	serial_nr = bio_blkcg(bio)->css.serial_nr;

	/*
	 * Check whether blkcg has changed. The condition may trigger
	 * spuriously on a newly created cic but there's no harm.
	 */
	if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
		goto out;

	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
	bic->blkcg_serial_nr = serial_nr;
out:
	rcu_read_unlock();
}
/**
 * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
 * @st: the service tree being flushed.
 */
static void bfq_flush_idle_tree(struct bfq_service_tree *st)
{
	struct bfq_entity *entity = st->first_idle;

	for (; entity ; entity = st->first_idle)
		__bfq_deactivate_entity(entity, false);
}

/**
 * bfq_reparent_leaf_entity - move leaf entity to the root_group.
 * @bfqd: the device data structure with the root group.
 * @entity: the entity to move.
 */
static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
				     struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
}

/**
 * bfq_reparent_active_entities - move to the root group all active
 *				  entities.
 * @bfqd: the device data structure with the root group.
 * @bfqg: the group to move from.
 * @st: the service tree with the entities.
 *
 * Needs queue_lock to be taken and reference to be valid over the call.
 */
static void bfq_reparent_active_entities(struct bfq_data *bfqd,
					 struct bfq_group *bfqg,
					 struct bfq_service_tree *st)
{
	struct rb_root *active = &st->active;
	struct bfq_entity *entity = NULL;

	if (!RB_EMPTY_ROOT(&st->active))
		entity = bfq_entity_of(rb_first(active));

	for (; entity ; entity = bfq_entity_of(rb_first(active)))
		bfq_reparent_leaf_entity(bfqd, entity);

	if (bfqg->sched_data.in_service_entity)
		bfq_reparent_leaf_entity(bfqd,
					 bfqg->sched_data.in_service_entity);
}
/**
 * bfq_pd_offline - deactivate the entity associated with @pd,
 *		    and reparent its children entities.
 * @pd: descriptor of the policy going offline.
 *
 * blkio already grabs the queue_lock for us, so no need to use
 * RCU-based magic
 */
static void bfq_pd_offline(struct blkg_policy_data *pd)
{
	struct bfq_service_tree *st;
	struct bfq_group *bfqg = pd_to_bfqg(pd);
	struct bfq_data *bfqd = bfqg->bfqd;
	struct bfq_entity *entity = bfqg->my_entity;
	unsigned long flags;
	int i;

	if (!entity) /* root group */
		return;

	spin_lock_irqsave(&bfqd->lock, flags);
	/*
	 * Empty all service_trees belonging to this group before
	 * deactivating the group itself.
	 */
	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
		st = bfqg->sched_data.service_tree + i;

		/*
		 * The idle tree may still contain bfq_queues belonging
		 * to exited task because they never migrated to a different
		 * cgroup from the one being destroyed now. No one else
		 * can access them so it's safe to act without any lock.
		 */
		bfq_flush_idle_tree(st);

		/*
		 * It may happen that some queues are still active
		 * (busy) upon group destruction (if the corresponding
		 * processes have been forced to terminate). We move
		 * all the leaf entities corresponding to these queues
		 * to the root_group.
		 * Also, it may happen that the group has an entity
		 * in service, which is disconnected from the active
		 * tree: it must be moved, too.
		 * There is no need to put the sync queues, as the
		 * scheduler has taken no reference.
		 */
		bfq_reparent_active_entities(bfqd, bfqg, st);
	}

	__bfq_deactivate_entity(entity, false);
	bfq_put_async_queues(bfqd, bfqg);

	bfq_unlock_put_ioc_restore(bfqd, flags);
	/*
	 * @blkg is going offline and will be ignored by
	 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
	 * that they don't get lost. If IOs complete after this point, the
	 * stats for them will be lost. Oh well...
	 */
	bfqg_stats_xfer_dead(bfqg);
}

static void bfq_end_wr_async(struct bfq_data *bfqd)
{
	struct blkcg_gq *blkg;

	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
		struct bfq_group *bfqg = blkg_to_bfqg(blkg);

		bfq_end_wr_async_queues(bfqd, bfqg);
	}
	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
}
static int bfq_io_show_weight(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
	unsigned int val = 0;

	if (bfqgd)
		val = bfqgd->weight;

	seq_printf(sf, "%u\n", val);

	return 0;
}

static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
				    struct cftype *cftype,
				    u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
	struct blkcg_gq *blkg;
	int ret = -ERANGE;

	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
		return ret;

	ret = 0;
	spin_lock_irq(&blkcg->lock);
	bfqgd->weight = (unsigned short)val;
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		struct bfq_group *bfqg = blkg_to_bfqg(blkg);

		if (!bfqg)
			continue;
		/*
		 * Setting the prio_changed flag of the entity
		 * to 1 with new_weight == weight would re-set
		 * the value of the weight to its ioprio mapping.
		 * Set the flag only if necessary.
		 */
		if ((unsigned short)val != bfqg->entity.new_weight) {
			bfqg->entity.new_weight = (unsigned short)val;
			/*
			 * Make sure that the above new value has been
			 * stored in bfqg->entity.new_weight before
			 * setting the prio_changed flag. In fact,
			 * this flag may be read asynchronously (in
			 * critical sections protected by a different
			 * lock than that held here), and finding this
			 * flag set may cause the execution of the code
			 * for updating parameters whose value may
			 * depend also on bfqg->entity.new_weight (in
			 * __bfq_entity_update_weight_prio).
			 * This barrier makes sure that the new value
			 * of bfqg->entity.new_weight is correctly
			 * seen in that code.
			 */
			smp_wmb();
			bfqg->entity.prio_changed = 1;
		}
	}
	spin_unlock_irq(&blkcg->lock);

	return ret;
}

static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
				 char *buf, size_t nbytes,
				 loff_t off)
{
	u64 weight;
	/* First unsigned long found in the file is used */
	int ret = kstrtoull(strim(buf), 0, &weight);

	if (ret)
		return ret;

	return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
}
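/*
 * The two weight interfaces above are typically used as follows (paths
 * are examples and depend on where the cgroup hierarchy is mounted):
 * on the legacy (v1) blkio hierarchy, something like
 *	echo 300 > /sys/fs/cgroup/blkio/<group>/blkio.bfq.weight
 * goes through bfq_io_set_weight_legacy(), while on the unified (v2)
 * hierarchy the same value is written to io.bfq.weight and parsed by
 * bfq_io_set_weight(). Values outside [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT]
 * are rejected with -ERANGE.
 */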
static int bfqg_print_stat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
			  &blkcg_policy_bfq, seq_cft(sf)->private, false);
	return 0;
}

static int bfqg_print_rwstat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
			  &blkcg_policy_bfq, seq_cft(sf)->private, true);
	return 0;
}

static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
					  &blkcg_policy_bfq, off);
	return __blkg_prfill_u64(sf, pd, sum);
}

static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
					struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
							   &blkcg_policy_bfq,
							   off);
	return __blkg_prfill_rwstat(sf, pd, &sum);
}

static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
			  seq_cft(sf)->private, false);
	return 0;
}

static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
			  seq_cft(sf)->private, true);
	return 0;
}

static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
			       int off)
{
	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);

	return __blkg_prfill_u64(sf, pd, sum >> 9);
}

static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
	return 0;
}

static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
					 struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
					offsetof(struct blkcg_gq, stat_bytes));
	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);

	return __blkg_prfill_u64(sf, pd, sum >> 9);
}

static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
			  false);
	return 0;
}
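/*
 * The ">> 9" in the two sectors helpers above converts a byte count
 * into 512-byte sectors: e.g., 1048576 bytes of transferred data are
 * reported as 2048 sectors.
 */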
static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);
	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
	u64 v = 0;

	if (samples) {
		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
		v = div64_u64(v, samples);
	}
	__blkg_prfill_u64(sf, pd, v);
	return 0;
}

/* print avg_queue_size */
static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
			  0, false);
	return 0;
}

static struct bfq_group *
bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
{
	int ret;

	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
	if (ret)
		return NULL;

	return blkg_to_bfqg(bfqd->queue->root_blkg);
}
static struct cftype bfq_blkcg_legacy_files[] = {
	{
		.name = "bfq.weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = bfq_io_show_weight,
		.write_u64 = bfq_io_set_weight_legacy,
	},

	/* statistics, covers only the tasks in the bfqg */
	{
		.name = "bfq.time",
		.private = offsetof(struct bfq_group, stats.time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.sectors",
		.seq_show = bfqg_print_stat_sectors,
	},
	{
		.name = "bfq.io_service_bytes",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_bytes,
	},
	{
		.name = "bfq.io_serviced",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_ios,
	},
	{
		.name = "bfq.io_service_time",
		.private = offsetof(struct bfq_group, stats.service_time),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_wait_time",
		.private = offsetof(struct bfq_group, stats.wait_time),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_merged",
		.private = offsetof(struct bfq_group, stats.merged),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_queued",
		.private = offsetof(struct bfq_group, stats.queued),
		.seq_show = bfqg_print_rwstat,
	},

	/* the same statistics which cover the bfqg and its descendants */
	{
		.name = "bfq.time_recursive",
		.private = offsetof(struct bfq_group, stats.time),
		.seq_show = bfqg_print_stat_recursive,
	},
	{
		.name = "bfq.sectors_recursive",
		.seq_show = bfqg_print_stat_sectors_recursive,
	},
	{
		.name = "bfq.io_service_bytes_recursive",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_bytes_recursive,
	},
	{
		.name = "bfq.io_serviced_recursive",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_ios_recursive,
	},
	{
		.name = "bfq.io_service_time_recursive",
		.private = offsetof(struct bfq_group, stats.service_time),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_wait_time_recursive",
		.private = offsetof(struct bfq_group, stats.wait_time),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_merged_recursive",
		.private = offsetof(struct bfq_group, stats.merged),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_queued_recursive",
		.private = offsetof(struct bfq_group, stats.queued),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.avg_queue_size",
		.seq_show = bfqg_print_avg_queue_size,
	},
	{
		.name = "bfq.group_wait_time",
		.private = offsetof(struct bfq_group, stats.group_wait_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.idle_time",
		.private = offsetof(struct bfq_group, stats.idle_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.empty_time",
		.private = offsetof(struct bfq_group, stats.empty_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.dequeue",
		.private = offsetof(struct bfq_group, stats.dequeue),
		.seq_show = bfqg_print_stat,
	},
	{ }	/* terminate */
};

static struct cftype bfq_blkg_files[] = {
	{
		.name = "bfq.weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = bfq_io_show_weight,
		.write = bfq_io_set_weight,
	},
	{ } /* terminate */
};
3791 #else /* CONFIG_BFQ_GROUP_IOSCHED */
3793 static inline void bfqg_stats_update_io_add(struct bfq_group
*bfqg
,
3794 struct bfq_queue
*bfqq
, unsigned int op
) { }
3796 bfqg_stats_update_io_remove(struct bfq_group
*bfqg
, unsigned int op
) { }
3798 bfqg_stats_update_io_merged(struct bfq_group
*bfqg
, unsigned int op
) { }
3799 static inline void bfqg_stats_update_completion(struct bfq_group
*bfqg
,
3800 uint64_t start_time
, uint64_t io_start_time
,
3801 unsigned int op
) { }
3803 bfqg_stats_set_start_group_wait_time(struct bfq_group
*bfqg
,
3804 struct bfq_group
*curr_bfqg
) { }
3805 static inline void bfqg_stats_end_empty_time(struct bfqg_stats
*stats
) { }
3806 static inline void bfqg_stats_update_dequeue(struct bfq_group
*bfqg
) { }
3807 static inline void bfqg_stats_set_start_empty_time(struct bfq_group
*bfqg
) { }
3808 static inline void bfqg_stats_update_idle_time(struct bfq_group
*bfqg
) { }
3809 static inline void bfqg_stats_set_start_idle_time(struct bfq_group
*bfqg
) { }
3810 static inline void bfqg_stats_update_avg_queue_size(struct bfq_group
*bfqg
) { }
3812 static void bfq_bfqq_move(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
3813 struct bfq_group
*bfqg
) {}
3815 static void bfq_init_entity(struct bfq_entity
*entity
,
3816 struct bfq_group
*bfqg
)
3818 struct bfq_queue
*bfqq
= bfq_entity_to_bfqq(entity
);
3820 entity
->weight
= entity
->new_weight
;
3821 entity
->orig_weight
= entity
->new_weight
;
3823 bfqq
->ioprio
= bfqq
->new_ioprio
;
3824 bfqq
->ioprio_class
= bfqq
->new_ioprio_class
;
3826 entity
->sched_data
= &bfqg
->sched_data
;
3829 static void bfq_bic_update_cgroup(struct bfq_io_cq
*bic
, struct bio
*bio
) {}
3831 static void bfq_end_wr_async(struct bfq_data
*bfqd
)
3833 bfq_end_wr_async_queues(bfqd
, bfqd
->root_group
);
3836 static struct bfq_group
*bfq_find_set_group(struct bfq_data
*bfqd
,
3837 struct blkcg
*blkcg
)
3839 return bfqd
->root_group
;
3842 static struct bfq_group
*bfqq_group(struct bfq_queue
*bfqq
)
3844 return bfqq
->bfqd
->root_group
;
3847 static struct bfq_group
*bfq_create_group_hierarchy(struct bfq_data
*bfqd
,
3850 struct bfq_group
*bfqg
;
3853 bfqg
= kmalloc_node(sizeof(*bfqg
), GFP_KERNEL
| __GFP_ZERO
, node
);
3857 for (i
= 0; i
< BFQ_IOPRIO_CLASSES
; i
++)
3858 bfqg
->sched_data
.service_tree
[i
] = BFQ_SERVICE_TREE_INIT
;
3862 #endif /* CONFIG_BFQ_GROUP_IOSCHED */
3864 #define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
3865 #define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
3867 #define bfq_sample_valid(samples) ((samples) > 80)
3870 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
3871 * We choose the request that is closesr to the head right now. Distance
3872 * behind the head is penalized and only allowed to a certain extent.
3874 static struct request
*bfq_choose_req(struct bfq_data
*bfqd
,
3875 struct request
*rq1
,
3876 struct request
*rq2
,
3879 sector_t s1
, s2
, d1
= 0, d2
= 0;
3880 unsigned long back_max
;
3881 #define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
3882 #define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
3883 unsigned int wrap
= 0; /* bit mask: requests behind the disk head? */
3885 if (!rq1
|| rq1
== rq2
)
3890 if (rq_is_sync(rq1
) && !rq_is_sync(rq2
))
3892 else if (rq_is_sync(rq2
) && !rq_is_sync(rq1
))
3894 if ((rq1
->cmd_flags
& REQ_META
) && !(rq2
->cmd_flags
& REQ_META
))
3896 else if ((rq2
->cmd_flags
& REQ_META
) && !(rq1
->cmd_flags
& REQ_META
))
3899 s1
= blk_rq_pos(rq1
);
3900 s2
= blk_rq_pos(rq2
);
3903 * By definition, 1KiB is 2 sectors.
3905 back_max
= bfqd
->bfq_back_max
* 2;
3908 * Strict one way elevator _except_ in the case where we allow
3909 * short backward seeks which are biased as twice the cost of a
3910 * similar forward seek.
3914 else if (s1
+ back_max
>= last
)
3915 d1
= (last
- s1
) * bfqd
->bfq_back_penalty
;
3917 wrap
|= BFQ_RQ1_WRAP
;
3921 else if (s2
+ back_max
>= last
)
3922 d2
= (last
- s2
) * bfqd
->bfq_back_penalty
;
3924 wrap
|= BFQ_RQ2_WRAP
;
3926 /* Found required data */
3929 * By doing switch() on the bit mask "wrap" we avoid having to
3930 * check two variables for all permutations: --> faster!
3933 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
3948 case BFQ_RQ1_WRAP
|BFQ_RQ2_WRAP
: /* both rqs wrapped */
3951 * Since both rqs are wrapped,
3952 * start with the one that's further behind head
3953 * (--> only *one* back seek required),
3954 * since back seek takes more time than forward.
3963 static struct bfq_queue
*
3964 bfq_rq_pos_tree_lookup(struct bfq_data
*bfqd
, struct rb_root
*root
,
3965 sector_t sector
, struct rb_node
**ret_parent
,
3966 struct rb_node
***rb_link
)
3968 struct rb_node
**p
, *parent
;
3969 struct bfq_queue
*bfqq
= NULL
;
3977 bfqq
= rb_entry(parent
, struct bfq_queue
, pos_node
);
3980 * Sort strictly based on sector. Smallest to the left,
3981 * largest to the right.
3983 if (sector
> blk_rq_pos(bfqq
->next_rq
))
3984 n
= &(*p
)->rb_right
;
3985 else if (sector
< blk_rq_pos(bfqq
->next_rq
))
3993 *ret_parent
= parent
;
3997 bfq_log(bfqd
, "rq_pos_tree_lookup %llu: returning %d",
3998 (unsigned long long)sector
,
3999 bfqq
? bfqq
->pid
: 0);
4004 static void bfq_pos_tree_add_move(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
)
4006 struct rb_node
**p
, *parent
;
4007 struct bfq_queue
*__bfqq
;
4009 if (bfqq
->pos_root
) {
4010 rb_erase(&bfqq
->pos_node
, bfqq
->pos_root
);
4011 bfqq
->pos_root
= NULL
;
4014 if (bfq_class_idle(bfqq
))
4019 bfqq
->pos_root
= &bfq_bfqq_to_bfqg(bfqq
)->rq_pos_tree
;
4020 __bfqq
= bfq_rq_pos_tree_lookup(bfqd
, bfqq
->pos_root
,
4021 blk_rq_pos(bfqq
->next_rq
), &parent
, &p
);
4023 rb_link_node(&bfqq
->pos_node
, parent
, p
);
4024 rb_insert_color(&bfqq
->pos_node
, bfqq
->pos_root
);
4026 bfqq
->pos_root
= NULL
;
4030 * Tell whether there are active queues or groups with differentiated weights.
4032 static bool bfq_differentiated_weights(struct bfq_data
*bfqd
)
4035 * For weights to differ, at least one of the trees must contain
4036 * at least two nodes.
4038 return (!RB_EMPTY_ROOT(&bfqd
->queue_weights_tree
) &&
4039 (bfqd
->queue_weights_tree
.rb_node
->rb_left
||
4040 bfqd
->queue_weights_tree
.rb_node
->rb_right
)
4041 #ifdef CONFIG_BFQ_GROUP_IOSCHED
4043 (!RB_EMPTY_ROOT(&bfqd
->group_weights_tree
) &&
4044 (bfqd
->group_weights_tree
.rb_node
->rb_left
||
4045 bfqd
->group_weights_tree
.rb_node
->rb_right
)
4051 * The following function returns true if every queue must receive the
4052 * same share of the throughput (this condition is used when deciding
4053 * whether idling may be disabled, see the comments in the function
4054 * bfq_bfqq_may_idle()).
4056 * Such a scenario occurs when:
4057 * 1) all active queues have the same weight,
4058 * 2) all active groups at the same level in the groups tree have the same
4060 * 3) all active groups at the same level in the groups tree have the same
4061 * number of children.
4063 * Unfortunately, keeping the necessary state for evaluating exactly the
4064 * above symmetry conditions would be quite complex and time-consuming.
4065 * Therefore this function evaluates, instead, the following stronger
4066 * sub-conditions, for which it is much easier to maintain the needed
4068 * 1) all active queues have the same weight,
4069 * 2) all active groups have the same weight,
4070 * 3) all active groups have at most one active child each.
4071 * In particular, the last two conditions are always true if hierarchical
4072 * support and the cgroups interface are not enabled, thus no state needs
4073 * to be maintained in this case.
4075 static bool bfq_symmetric_scenario(struct bfq_data
*bfqd
)
4077 return !bfq_differentiated_weights(bfqd
);
4081 * If the weight-counter tree passed as input contains no counter for
4082 * the weight of the input entity, then add that counter; otherwise just
4083 * increment the existing counter.
4085 * Note that weight-counter trees contain few nodes in mostly symmetric
4086 * scenarios. For example, if all queues have the same weight, then the
4087 * weight-counter tree for the queues may contain at most one node.
4088 * This holds even if low_latency is on, because weight-raised queues
4089 * are not inserted in the tree.
4090 * In most scenarios, the rate at which nodes are created/destroyed
4091 * should be low too.
4093 static void bfq_weights_tree_add(struct bfq_data
*bfqd
,
4094 struct bfq_entity
*entity
,
4095 struct rb_root
*root
)
4097 struct rb_node
**new = &(root
->rb_node
), *parent
= NULL
;
4100 * Do not insert if the entity is already associated with a
4101 * counter, which happens if:
4102 * 1) the entity is associated with a queue,
4103 * 2) a request arrival has caused the queue to become both
4104 * non-weight-raised, and hence change its weight, and
4105 * backlogged; in this respect, each of the two events
4106 * causes an invocation of this function,
4107 * 3) this is the invocation of this function caused by the
4108 * second event. This second invocation is actually useless,
4109 * and we handle this fact by exiting immediately. More
4110 * efficient or clearer solutions might possibly be adopted.
4112 if (entity
->weight_counter
)
4116 struct bfq_weight_counter
*__counter
= container_of(*new,
4117 struct bfq_weight_counter
,
4121 if (entity
->weight
== __counter
->weight
) {
4122 entity
->weight_counter
= __counter
;
4125 if (entity
->weight
< __counter
->weight
)
4126 new = &((*new)->rb_left
);
4128 new = &((*new)->rb_right
);
4131 entity
->weight_counter
= kzalloc(sizeof(struct bfq_weight_counter
),
4135 * In the unlucky event of an allocation failure, we just
4136 * exit. This will cause the weight of entity to not be
4137 * considered in bfq_differentiated_weights, which, in its
4138 * turn, causes the scenario to be deemed wrongly symmetric in
4139 * case entity's weight would have been the only weight making
4140 * the scenario asymmetric. On the bright side, no unbalance
4141 * will however occur when entity becomes inactive again (the
4142 * invocation of this function is triggered by an activation
4143 * of entity). In fact, bfq_weights_tree_remove does nothing
4144 * if !entity->weight_counter.
4146 if (unlikely(!entity
->weight_counter
))
4149 entity
->weight_counter
->weight
= entity
->weight
;
4150 rb_link_node(&entity
->weight_counter
->weights_node
, parent
, new);
4151 rb_insert_color(&entity
->weight_counter
->weights_node
, root
);
4154 entity
->weight_counter
->num_active
++;
4158 * Decrement the weight counter associated with the entity, and, if the
4159 * counter reaches 0, remove the counter from the tree.
4160 * See the comments to the function bfq_weights_tree_add() for considerations
4163 static void bfq_weights_tree_remove(struct bfq_data
*bfqd
,
4164 struct bfq_entity
*entity
,
4165 struct rb_root
*root
)
4167 if (!entity
->weight_counter
)
4170 entity
->weight_counter
->num_active
--;
4171 if (entity
->weight_counter
->num_active
> 0)
4172 goto reset_entity_pointer
;
4174 rb_erase(&entity
->weight_counter
->weights_node
, root
);
4175 kfree(entity
->weight_counter
);
4177 reset_entity_pointer
:
4178 entity
->weight_counter
= NULL
;
4182 * Return expired entry, or NULL to just start from scratch in rbtree.
4184 static struct request
*bfq_check_fifo(struct bfq_queue
*bfqq
,
4185 struct request
*last
)
4189 if (bfq_bfqq_fifo_expire(bfqq
))
4192 bfq_mark_bfqq_fifo_expire(bfqq
);
4194 rq
= rq_entry_fifo(bfqq
->fifo
.next
);
4196 if (rq
== last
|| ktime_get_ns() < rq
->fifo_time
)
4199 bfq_log_bfqq(bfqq
->bfqd
, bfqq
, "check_fifo: returned %p", rq
);
4203 static struct request
*bfq_find_next_rq(struct bfq_data
*bfqd
,
4204 struct bfq_queue
*bfqq
,
4205 struct request
*last
)
4207 struct rb_node
*rbnext
= rb_next(&last
->rb_node
);
4208 struct rb_node
*rbprev
= rb_prev(&last
->rb_node
);
4209 struct request
*next
, *prev
= NULL
;
4211 /* Follow expired path, else get first next available. */
4212 next
= bfq_check_fifo(bfqq
, last
);
4217 prev
= rb_entry_rq(rbprev
);
4220 next
= rb_entry_rq(rbnext
);
4222 rbnext
= rb_first(&bfqq
->sort_list
);
4223 if (rbnext
&& rbnext
!= &last
->rb_node
)
4224 next
= rb_entry_rq(rbnext
);
4227 return bfq_choose_req(bfqd
, next
, prev
, blk_rq_pos(last
));
4230 /* see the definition of bfq_async_charge_factor for details */
4231 static unsigned long bfq_serv_to_charge(struct request
*rq
,
4232 struct bfq_queue
*bfqq
)
4234 if (bfq_bfqq_sync(bfqq
) || bfqq
->wr_coeff
> 1)
4235 return blk_rq_sectors(rq
);
4238 * If there are no weight-raised queues, then amplify service
4239 * by just the async charge factor; otherwise amplify service
4240 * by twice the async charge factor, to further reduce latency
4241 * for weight-raised queues.
4243 if (bfqq
->bfqd
->wr_busy_queues
== 0)
4244 return blk_rq_sectors(rq
) * bfq_async_charge_factor
;
4246 return blk_rq_sectors(rq
) * 2 * bfq_async_charge_factor
;
4250 * bfq_updated_next_req - update the queue after a new next_rq selection.
4251 * @bfqd: the device data the queue belongs to.
4252 * @bfqq: the queue to update.
4254 * If the first request of a queue changes we make sure that the queue
4255 * has enough budget to serve at least its first request (if the
4256 * request has grown). We do this because if the queue has not enough
4257 * budget for its first request, it has to go through two dispatch
4258 * rounds to actually get it dispatched.
4260 static void bfq_updated_next_req(struct bfq_data
*bfqd
,
4261 struct bfq_queue
*bfqq
)
4263 struct bfq_entity
*entity
= &bfqq
->entity
;
4264 struct request
*next_rq
= bfqq
->next_rq
;
4265 unsigned long new_budget
;
4270 if (bfqq
== bfqd
->in_service_queue
)
4272 * In order not to break guarantees, budgets cannot be
4273 * changed after an entity has been selected.
4277 new_budget
= max_t(unsigned long, bfqq
->max_budget
,
4278 bfq_serv_to_charge(next_rq
, bfqq
));
4279 if (entity
->budget
!= new_budget
) {
4280 entity
->budget
= new_budget
;
4281 bfq_log_bfqq(bfqd
, bfqq
, "updated next rq: new budget %lu",
4283 bfq_requeue_bfqq(bfqd
, bfqq
);
4288 bfq_bfqq_resume_state(struct bfq_queue
*bfqq
, struct bfq_io_cq
*bic
)
4290 if (bic
->saved_idle_window
)
4291 bfq_mark_bfqq_idle_window(bfqq
);
4293 bfq_clear_bfqq_idle_window(bfqq
);
4295 if (bic
->saved_IO_bound
)
4296 bfq_mark_bfqq_IO_bound(bfqq
);
4298 bfq_clear_bfqq_IO_bound(bfqq
);
4300 bfqq
->ttime
= bic
->saved_ttime
;
4301 bfqq
->wr_coeff
= bic
->saved_wr_coeff
;
4302 bfqq
->wr_start_at_switch_to_srt
= bic
->saved_wr_start_at_switch_to_srt
;
4303 bfqq
->last_wr_start_finish
= bic
->saved_last_wr_start_finish
;
4304 bfqq
->wr_cur_max_time
= bic
->saved_wr_cur_max_time
;
4306 if (bfqq
->wr_coeff
> 1 &&
4307 time_is_before_jiffies(bfqq
->last_wr_start_finish
+
4308 bfqq
->wr_cur_max_time
)) {
4309 bfq_log_bfqq(bfqq
->bfqd
, bfqq
,
4310 "resume state: switching off wr");
4315 /* make sure weight will be updated, however we got here */
4316 bfqq
->entity
.prio_changed
= 1;
4319 static int bfqq_process_refs(struct bfq_queue
*bfqq
)
4321 return bfqq
->ref
- bfqq
->allocated
- bfqq
->entity
.on_st
;
4324 static int bfq_bfqq_budget_left(struct bfq_queue
*bfqq
)
4326 struct bfq_entity
*entity
= &bfqq
->entity
;
4328 return entity
->budget
- entity
->service
;
4332 * If enough samples have been computed, return the current max budget
4333 * stored in bfqd, which is dynamically updated according to the
4334 * estimated disk peak rate; otherwise return the default max budget
4336 static int bfq_max_budget(struct bfq_data
*bfqd
)
4338 if (bfqd
->budgets_assigned
< bfq_stats_min_budgets
)
4339 return bfq_default_max_budget
;
4341 return bfqd
->bfq_max_budget
;
4345 * Return min budget, which is a fraction of the current or default
4346 * max budget (trying with 1/32)
4348 static int bfq_min_budget(struct bfq_data
*bfqd
)
4350 if (bfqd
->budgets_assigned
< bfq_stats_min_budgets
)
4351 return bfq_default_max_budget
/ 32;
4353 return bfqd
->bfq_max_budget
/ 32;
4356 static void bfq_bfqq_expire(struct bfq_data
*bfqd
,
4357 struct bfq_queue
*bfqq
,
4359 enum bfqq_expiration reason
);
4362 * The next function, invoked after the input queue bfqq switches from
4363 * idle to busy, updates the budget of bfqq. The function also tells
4364 * whether the in-service queue should be expired, by returning
4365 * true. The purpose of expiring the in-service queue is to give bfqq
4366 * the chance to possibly preempt the in-service queue, and the reason
4367 * for preempting the in-service queue is to achieve one of the two
4370 * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
4371 * expired because it has remained idle. In particular, bfqq may have
4372 * expired for one of the following two reasons:
4374 * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
4375 * and did not make it to issue a new request before its last
4376 * request was served;
4378 * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
4379 * a new request before the expiration of the idling-time.
4381 * Even if bfqq has expired for one of the above reasons, the process
4382 * associated with the queue may be however issuing requests greedily,
4383 * and thus be sensitive to the bandwidth it receives (bfqq may have
4384 * remained idle for other reasons: CPU high load, bfqq not enjoying
4385 * idling, I/O throttling somewhere in the path from the process to
4386 * the I/O scheduler, ...). But if, after every expiration for one of
4387 * the above two reasons, bfqq has to wait for the service of at least
4388 * one full budget of another queue before being served again, then
4389 * bfqq is likely to get a much lower bandwidth or resource time than
4390 * its reserved ones. To address this issue, two countermeasures need
4393 * First, the budget and the timestamps of bfqq need to be updated in
4394 * a special way on bfqq reactivation: they need to be updated as if
4395 * bfqq did not remain idle and did not expire. In fact, if they are
4396 * computed as if bfqq expired and remained idle until reactivation,
4397 * then the process associated with bfqq is treated as if, instead of
4398 * being greedy, it stopped issuing requests when bfqq remained idle,
4399 * and restarts issuing requests only on this reactivation. In other
4400 * words, the scheduler does not help the process recover the "service
4401 * hole" between bfqq expiration and reactivation. As a consequence,
4402 * the process receives a lower bandwidth than its reserved one. In
4403 * contrast, to recover this hole, the budget must be updated as if
4404 * bfqq was not expired at all before this reactivation, i.e., it must
4405 * be set to the value of the remaining budget when bfqq was
4406 * expired. Along the same line, timestamps need to be assigned the
4407 * value they had the last time bfqq was selected for service, i.e.,
4408 * before last expiration. Thus timestamps need to be back-shifted
4409 * with respect to their normal computation (see [1] for more details
4410 * on this tricky aspect).
4412 * Secondly, to allow the process to recover the hole, the in-service
4413 * queue must be expired too, to give bfqq the chance to preempt it
4414 * immediately. In fact, if bfqq has to wait for a full budget of the
4415 * in-service queue to be completed, then it may become impossible to
4416 * let the process recover the hole, even if the back-shifted
4417 * timestamps of bfqq are lower than those of the in-service queue. If
4418 * this happens for most or all of the holes, then the process may not
4419 * receive its reserved bandwidth. In this respect, it is worth noting
4420 * that, being the service of outstanding requests unpreemptible, a
4421 * little fraction of the holes may however be unrecoverable, thereby
4422 * causing a little loss of bandwidth.
4424 * The last important point is detecting whether bfqq does need this
4425 * bandwidth recovery. In this respect, the next function deems the
4426 * process associated with bfqq greedy, and thus allows it to recover
4427 * the hole, if: 1) the process is waiting for the arrival of a new
4428 * request (which implies that bfqq expired for one of the above two
4429 * reasons), and 2) such a request has arrived soon. The first
4430 * condition is controlled through the flag non_blocking_wait_rq,
4431 * while the second through the flag arrived_in_time. If both
4432 * conditions hold, then the function computes the budget in the
4433 * above-described special way, and signals that the in-service queue
4434 * should be expired. Timestamp back-shifting is done later in
4435 * __bfq_activate_entity.
4437 * 2. Reduce latency. Even if timestamps are not backshifted to let
4438 * the process associated with bfqq recover a service hole, bfqq may
4439 * however happen to have, after being (re)activated, a lower finish
4440 * timestamp than the in-service queue. That is, the next budget of
4441 * bfqq may have to be completed before the one of the in-service
4442 * queue. If this is the case, then preempting the in-service queue
4443 * allows this goal to be achieved, apart from the unpreemptible,
4444 * outstanding requests mentioned above.
4446 * Unfortunately, regardless of which of the above two goals one wants
4447 * to achieve, service trees need first to be updated to know whether
4448 * the in-service queue must be preempted. To have service trees
4449 * correctly updated, the in-service queue must be expired and
4450 * rescheduled, and bfqq must be scheduled too. This is one of the
4451 * most costly operations (in future versions, the scheduling
4452 * mechanism may be re-designed in such a way to make it possible to
4453 * know whether preemption is needed without needing to update service
4454 * trees). In addition, queue preemptions almost always cause random
4455 * I/O, and thus loss of throughput. Because of these facts, the next
4456 * function adopts the following simple scheme to avoid both costly
4457 * operations and too frequent preemptions: it requests the expiration
4458 * of the in-service queue (unconditionally) only for queues that need
4459 * to recover a hole, or that either are weight-raised or deserve to
4462 static bool bfq_bfqq_update_budg_for_activation(struct bfq_data
*bfqd
,
4463 struct bfq_queue
*bfqq
,
4464 bool arrived_in_time
,
4465 bool wr_or_deserves_wr
)
4467 struct bfq_entity
*entity
= &bfqq
->entity
;
4469 if (bfq_bfqq_non_blocking_wait_rq(bfqq
) && arrived_in_time
) {
4471 * We do not clear the flag non_blocking_wait_rq here, as
4472 * the latter is used in bfq_activate_bfqq to signal
4473 * that timestamps need to be back-shifted (and is
4474 * cleared right after).
4478 * In next assignment we rely on that either
4479 * entity->service or entity->budget are not updated
4480 * on expiration if bfqq is empty (see
4481 * __bfq_bfqq_recalc_budget). Thus both quantities
4482 * remain unchanged after such an expiration, and the
4483 * following statement therefore assigns to
4484 * entity->budget the remaining budget on such an
4485 * expiration. For clarity, entity->service is not
4486 * updated on expiration in any case, and, in normal
4487 * operation, is reset only when bfqq is selected for
4488 * service (see bfq_get_next_queue).
4490 entity
->budget
= min_t(unsigned long,
4491 bfq_bfqq_budget_left(bfqq
),
4497 entity
->budget
= max_t(unsigned long, bfqq
->max_budget
,
4498 bfq_serv_to_charge(bfqq
->next_rq
, bfqq
));
4499 bfq_clear_bfqq_non_blocking_wait_rq(bfqq
);
4500 return wr_or_deserves_wr
;
4503 static unsigned int bfq_wr_duration(struct bfq_data
*bfqd
)
4507 if (bfqd
->bfq_wr_max_time
> 0)
4508 return bfqd
->bfq_wr_max_time
;
4510 dur
= bfqd
->RT_prod
;
4511 do_div(dur
, bfqd
->peak_rate
);
4514 * Limit duration between 3 and 13 seconds. Tests show that
4515 * higher values than 13 seconds often yield the opposite of
4516 * the desired result, i.e., worsen responsiveness by letting
4517 * non-interactive and non-soft-real-time applications
4518 * preserve weight raising for a too long time interval.
4520 * On the other end, lower values than 3 seconds make it
4521 * difficult for most interactive tasks to complete their jobs
4522 * before weight-raising finishes.
4524 if (dur
> msecs_to_jiffies(13000))
4525 dur
= msecs_to_jiffies(13000);
4526 else if (dur
< msecs_to_jiffies(3000))
4527 dur
= msecs_to_jiffies(3000);
4532 static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data
*bfqd
,
4533 struct bfq_queue
*bfqq
,
4534 unsigned int old_wr_coeff
,
4535 bool wr_or_deserves_wr
,
4539 if (old_wr_coeff
== 1 && wr_or_deserves_wr
) {
4540 /* start a weight-raising period */
4542 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
;
4543 bfqq
->wr_cur_max_time
= bfq_wr_duration(bfqd
);
4545 bfqq
->wr_start_at_switch_to_srt
= jiffies
;
4546 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
*
4547 BFQ_SOFTRT_WEIGHT_FACTOR
;
4548 bfqq
->wr_cur_max_time
=
4549 bfqd
->bfq_wr_rt_max_time
;
4553 * If needed, further reduce budget to make sure it is
4554 * close to bfqq's backlog, so as to reduce the
4555 * scheduling-error component due to a too large
4556 * budget. Do not care about throughput consequences,
4557 * but only about latency. Finally, do not assign a
4558 * too small budget either, to avoid increasing
4559 * latency by causing too frequent expirations.
4561 bfqq
->entity
.budget
= min_t(unsigned long,
4562 bfqq
->entity
.budget
,
4563 2 * bfq_min_budget(bfqd
));
4564 } else if (old_wr_coeff
> 1) {
4565 if (interactive
) { /* update wr coeff and duration */
4566 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
;
4567 bfqq
->wr_cur_max_time
= bfq_wr_duration(bfqd
);
4568 } else if (soft_rt
) {
4570 * The application is now or still meeting the
4571 * requirements for being deemed soft rt. We
4572 * can then correctly and safely (re)charge
4573 * the weight-raising duration for the
4574 * application with the weight-raising
4575 * duration for soft rt applications.
4577 * In particular, doing this recharge now, i.e.,
4578 * before the weight-raising period for the
4579 * application finishes, reduces the probability
4580 * of the following negative scenario:
4581 * 1) the weight of a soft rt application is
4582 * raised at startup (as for any newly
4583 * created application),
4584 * 2) since the application is not interactive,
4585 * at a certain time weight-raising is
4586 * stopped for the application,
4587 * 3) at that time the application happens to
4588 * still have pending requests, and hence
4589 * is destined to not have a chance to be
4590 * deemed soft rt before these requests are
4591 * completed (see the comments to the
4592 * function bfq_bfqq_softrt_next_start()
4593 * for details on soft rt detection),
4594 * 4) these pending requests experience a high
4595 * latency because the application is not
4596 * weight-raised while they are pending.
4598 if (bfqq
->wr_cur_max_time
!=
4599 bfqd
->bfq_wr_rt_max_time
) {
4600 bfqq
->wr_start_at_switch_to_srt
=
4601 bfqq
->last_wr_start_finish
;
4603 bfqq
->wr_cur_max_time
=
4604 bfqd
->bfq_wr_rt_max_time
;
4605 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
*
4606 BFQ_SOFTRT_WEIGHT_FACTOR
;
4608 bfqq
->last_wr_start_finish
= jiffies
;
4613 static bool bfq_bfqq_idle_for_long_time(struct bfq_data
*bfqd
,
4614 struct bfq_queue
*bfqq
)
4616 return bfqq
->dispatched
== 0 &&
4617 time_is_before_jiffies(
4618 bfqq
->budget_timeout
+
4619 bfqd
->bfq_wr_min_idle_time
);
4622 static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data
*bfqd
,
4623 struct bfq_queue
*bfqq
,
4628 bool soft_rt
, wr_or_deserves_wr
, bfqq_wants_to_preempt
,
4629 idle_for_long_time
= bfq_bfqq_idle_for_long_time(bfqd
, bfqq
),
4631 * See the comments on
4632 * bfq_bfqq_update_budg_for_activation for
4633 * details on the usage of the next variable.
4635 arrived_in_time
= ktime_get_ns() <=
4636 bfqq
->ttime
.last_end_request
+
4637 bfqd
->bfq_slice_idle
* 3;
4639 bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq
)), bfqq
, rq
->cmd_flags
);
4642 * bfqq deserves to be weight-raised if:
4644 * - it has been idle for enough time or is soft real-time,
4645 * - is linked to a bfq_io_cq (it is not shared in any sense).
4647 soft_rt
= bfqd
->bfq_wr_max_softrt_rate
> 0 &&
4648 time_is_before_jiffies(bfqq
->soft_rt_next_start
);
4649 *interactive
= idle_for_long_time
;
4650 wr_or_deserves_wr
= bfqd
->low_latency
&&
4651 (bfqq
->wr_coeff
> 1 ||
4652 (bfq_bfqq_sync(bfqq
) &&
4653 bfqq
->bic
&& (*interactive
|| soft_rt
)));
4656 * Using the last flag, update budget and check whether bfqq
4657 * may want to preempt the in-service queue.
4659 bfqq_wants_to_preempt
=
4660 bfq_bfqq_update_budg_for_activation(bfqd
, bfqq
,
4664 if (!bfq_bfqq_IO_bound(bfqq
)) {
4665 if (arrived_in_time
) {
4666 bfqq
->requests_within_timer
++;
4667 if (bfqq
->requests_within_timer
>=
4668 bfqd
->bfq_requests_within_timer
)
4669 bfq_mark_bfqq_IO_bound(bfqq
);
4671 bfqq
->requests_within_timer
= 0;
4674 if (bfqd
->low_latency
) {
4675 if (unlikely(time_is_after_jiffies(bfqq
->split_time
)))
4678 jiffies
- bfqd
->bfq_wr_min_idle_time
- 1;
4680 if (time_is_before_jiffies(bfqq
->split_time
+
4681 bfqd
->bfq_wr_min_idle_time
)) {
4682 bfq_update_bfqq_wr_on_rq_arrival(bfqd
, bfqq
,
4688 if (old_wr_coeff
!= bfqq
->wr_coeff
)
4689 bfqq
->entity
.prio_changed
= 1;
4693 bfqq
->last_idle_bklogged
= jiffies
;
4694 bfqq
->service_from_backlogged
= 0;
4695 bfq_clear_bfqq_softrt_update(bfqq
);
4697 bfq_add_bfqq_busy(bfqd
, bfqq
);
4700 * Expire in-service queue only if preemption may be needed
4701 * for guarantees. In this respect, the function
4702 * next_queue_may_preempt just checks a simple, necessary
4703 * condition, and not a sufficient condition based on
4704 * timestamps. In fact, for the latter condition to be
4705 * evaluated, timestamps would need first to be updated, and
4706 * this operation is quite costly (see the comments on the
4707 * function bfq_bfqq_update_budg_for_activation).
4709 if (bfqd
->in_service_queue
&& bfqq_wants_to_preempt
&&
4710 bfqd
->in_service_queue
->wr_coeff
< bfqq
->wr_coeff
&&
4711 next_queue_may_preempt(bfqd
))
4712 bfq_bfqq_expire(bfqd
, bfqd
->in_service_queue
,
4713 false, BFQQE_PREEMPTED
);
4716 static void bfq_add_request(struct request
*rq
)
4718 struct bfq_queue
*bfqq
= RQ_BFQQ(rq
);
4719 struct bfq_data
*bfqd
= bfqq
->bfqd
;
4720 struct request
*next_rq
, *prev
;
4721 unsigned int old_wr_coeff
= bfqq
->wr_coeff
;
4722 bool interactive
= false;
4724 bfq_log_bfqq(bfqd
, bfqq
, "add_request %d", rq_is_sync(rq
));
4725 bfqq
->queued
[rq_is_sync(rq
)]++;
4728 elv_rb_add(&bfqq
->sort_list
, rq
);
4731 * Check if this request is a better next-serve candidate.
4733 prev
= bfqq
->next_rq
;
4734 next_rq
= bfq_choose_req(bfqd
, bfqq
->next_rq
, rq
, bfqd
->last_position
);
4735 bfqq
->next_rq
= next_rq
;
4738 * Adjust priority tree position, if next_rq changes.
4740 if (prev
!= bfqq
->next_rq
)
4741 bfq_pos_tree_add_move(bfqd
, bfqq
);
4743 if (!bfq_bfqq_busy(bfqq
)) /* switching to busy ... */
4744 bfq_bfqq_handle_idle_busy_switch(bfqd
, bfqq
, old_wr_coeff
,
4747 if (bfqd
->low_latency
&& old_wr_coeff
== 1 && !rq_is_sync(rq
) &&
4748 time_is_before_jiffies(
4749 bfqq
->last_wr_start_finish
+
4750 bfqd
->bfq_wr_min_inter_arr_async
)) {
4751 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
;
4752 bfqq
->wr_cur_max_time
= bfq_wr_duration(bfqd
);
4754 bfqd
->wr_busy_queues
++;
4755 bfqq
->entity
.prio_changed
= 1;
4757 if (prev
!= bfqq
->next_rq
)
4758 bfq_updated_next_req(bfqd
, bfqq
);
4762 * Assign jiffies to last_wr_start_finish in the following
4765 * . if bfqq is not going to be weight-raised, because, for
4766 * non weight-raised queues, last_wr_start_finish stores the
4767 * arrival time of the last request; as of now, this piece
4768 * of information is used only for deciding whether to
4769 * weight-raise async queues
4771 * . if bfqq is not weight-raised, because, if bfqq is now
4772 * switching to weight-raised, then last_wr_start_finish
4773 * stores the time when weight-raising starts
4775 * . if bfqq is interactive, because, regardless of whether
4776 * bfqq is currently weight-raised, the weight-raising
4777 * period must start or restart (this case is considered
4778 * separately because it is not detected by the above
4779 * conditions, if bfqq is already weight-raised)
4781 * last_wr_start_finish has to be updated also if bfqq is soft
4782 * real-time, because the weight-raising period is constantly
4783 * restarted on idle-to-busy transitions for these queues, but
4784 * this is already done in bfq_bfqq_handle_idle_busy_switch if
4787 if (bfqd
->low_latency
&&
4788 (old_wr_coeff
== 1 || bfqq
->wr_coeff
== 1 || interactive
))
4789 bfqq
->last_wr_start_finish
= jiffies
;
4792 static struct request
*bfq_find_rq_fmerge(struct bfq_data
*bfqd
,
4794 struct request_queue
*q
)
4796 struct bfq_queue
*bfqq
= bfqd
->bio_bfqq
;
4800 return elv_rb_find(&bfqq
->sort_list
, bio_end_sector(bio
));
4805 static sector_t
get_sdist(sector_t last_pos
, struct request
*rq
)
4808 return abs(blk_rq_pos(rq
) - last_pos
);
4813 #if 0 /* Still not clear if we can do without next two functions */
4814 static void bfq_activate_request(struct request_queue
*q
, struct request
*rq
)
4816 struct bfq_data
*bfqd
= q
->elevator
->elevator_data
;
4818 bfqd
->rq_in_driver
++;
4821 static void bfq_deactivate_request(struct request_queue
*q
, struct request
*rq
)
4823 struct bfq_data
*bfqd
= q
->elevator
->elevator_data
;
4825 bfqd
->rq_in_driver
--;
4829 static void bfq_remove_request(struct request_queue
*q
,
4832 struct bfq_queue
*bfqq
= RQ_BFQQ(rq
);
4833 struct bfq_data
*bfqd
= bfqq
->bfqd
;
4834 const int sync
= rq_is_sync(rq
);
4836 if (bfqq
->next_rq
== rq
) {
4837 bfqq
->next_rq
= bfq_find_next_rq(bfqd
, bfqq
, rq
);
4838 bfq_updated_next_req(bfqd
, bfqq
);
4841 if (rq
->queuelist
.prev
!= &rq
->queuelist
)
4842 list_del_init(&rq
->queuelist
);
4843 bfqq
->queued
[sync
]--;
4845 elv_rb_del(&bfqq
->sort_list
, rq
);
4847 elv_rqhash_del(q
, rq
);
4848 if (q
->last_merge
== rq
)
4849 q
->last_merge
= NULL
;
4851 if (RB_EMPTY_ROOT(&bfqq
->sort_list
)) {
4852 bfqq
->next_rq
= NULL
;
4854 if (bfq_bfqq_busy(bfqq
) && bfqq
!= bfqd
->in_service_queue
) {
4855 bfq_del_bfqq_busy(bfqd
, bfqq
, false);
4857 * bfqq emptied. In normal operation, when
4858 * bfqq is empty, bfqq->entity.service and
4859 * bfqq->entity.budget must contain,
4860 * respectively, the service received and the
4861 * budget used last time bfqq emptied. These
4862 * facts do not hold in this case, as at least
4863 * this last removal occurred while bfqq is
4864 * not in service. To avoid inconsistencies,
4865 * reset both bfqq->entity.service and
4866 * bfqq->entity.budget, if bfqq has still a
4867 * process that may issue I/O requests to it.
4869 bfqq
->entity
.budget
= bfqq
->entity
.service
= 0;
4873 * Remove queue from request-position tree as it is empty.
4875 if (bfqq
->pos_root
) {
4876 rb_erase(&bfqq
->pos_node
, bfqq
->pos_root
);
4877 bfqq
->pos_root
= NULL
;
4881 if (rq
->cmd_flags
& REQ_META
)
4882 bfqq
->meta_pending
--;
4884 bfqg_stats_update_io_remove(bfqq_group(bfqq
), rq
->cmd_flags
);
4887 static bool bfq_bio_merge(struct blk_mq_hw_ctx
*hctx
, struct bio
*bio
)
4889 struct request_queue
*q
= hctx
->queue
;
4890 struct bfq_data
*bfqd
= q
->elevator
->elevator_data
;
4891 struct request
*free
= NULL
;
4893 * bfq_bic_lookup grabs the queue_lock: invoke it now and
4894 * store its return value for later use, to avoid nesting
4895 * queue_lock inside the bfqd->lock. We assume that the bic
4896 * returned by bfq_bic_lookup does not go away before
4897 * bfqd->lock is taken.
4899 struct bfq_io_cq
*bic
= bfq_bic_lookup(bfqd
, current
->io_context
, q
);
4902 spin_lock_irq(&bfqd
->lock
);
4905 bfqd
->bio_bfqq
= bic_to_bfqq(bic
, op_is_sync(bio
->bi_opf
));
4907 bfqd
->bio_bfqq
= NULL
;
4908 bfqd
->bio_bic
= bic
;
4910 ret
= blk_mq_sched_try_merge(q
, bio
, &free
);
4913 blk_mq_free_request(free
);
4914 spin_unlock_irq(&bfqd
->lock
);
4919 static int bfq_request_merge(struct request_queue
*q
, struct request
**req
,
4922 struct bfq_data
*bfqd
= q
->elevator
->elevator_data
;
4923 struct request
*__rq
;
4925 __rq
= bfq_find_rq_fmerge(bfqd
, bio
, q
);
4926 if (__rq
&& elv_bio_merge_ok(__rq
, bio
)) {
4928 return ELEVATOR_FRONT_MERGE
;
4931 return ELEVATOR_NO_MERGE
;
4934 static void bfq_request_merged(struct request_queue
*q
, struct request
*req
,
4935 enum elv_merge type
)
4937 if (type
== ELEVATOR_FRONT_MERGE
&&
4938 rb_prev(&req
->rb_node
) &&
4940 blk_rq_pos(container_of(rb_prev(&req
->rb_node
),
4941 struct request
, rb_node
))) {
4942 struct bfq_queue
*bfqq
= RQ_BFQQ(req
);
4943 struct bfq_data
*bfqd
= bfqq
->bfqd
;
4944 struct request
*prev
, *next_rq
;
4946 /* Reposition request in its sort_list */
4947 elv_rb_del(&bfqq
->sort_list
, req
);
4948 elv_rb_add(&bfqq
->sort_list
, req
);
4950 /* Choose next request to be served for bfqq */
4951 prev
= bfqq
->next_rq
;
4952 next_rq
= bfq_choose_req(bfqd
, bfqq
->next_rq
, req
,
4953 bfqd
->last_position
);
4954 bfqq
->next_rq
= next_rq
;
4956 * If next_rq changes, update both the queue's budget to
4957 * fit the new request and the queue's position in its
4960 if (prev
!= bfqq
->next_rq
) {
4961 bfq_updated_next_req(bfqd
, bfqq
);
4962 bfq_pos_tree_add_move(bfqd
, bfqq
);
4967 static void bfq_requests_merged(struct request_queue
*q
, struct request
*rq
,
4968 struct request
*next
)
4970 struct bfq_queue
*bfqq
= RQ_BFQQ(rq
), *next_bfqq
= RQ_BFQQ(next
);
4972 if (!RB_EMPTY_NODE(&rq
->rb_node
))
4974 spin_lock_irq(&bfqq
->bfqd
->lock
);
4977 * If next and rq belong to the same bfq_queue and next is older
4978 * than rq, then reposition rq in the fifo (by substituting next
4979 * with rq). Otherwise, if next and rq belong to different
4980 * bfq_queues, never reposition rq: in fact, we would have to
4981 * reposition it with respect to next's position in its own fifo,
4982 * which would most certainly be too expensive with respect to
4985 if (bfqq
== next_bfqq
&&
4986 !list_empty(&rq
->queuelist
) && !list_empty(&next
->queuelist
) &&
4987 next
->fifo_time
< rq
->fifo_time
) {
4988 list_del_init(&rq
->queuelist
);
4989 list_replace_init(&next
->queuelist
, &rq
->queuelist
);
4990 rq
->fifo_time
= next
->fifo_time
;
4993 if (bfqq
->next_rq
== next
)
4996 bfq_remove_request(q
, next
);
4998 spin_unlock_irq(&bfqq
->bfqd
->lock
);
5000 bfqg_stats_update_io_merged(bfqq_group(bfqq
), next
->cmd_flags
);
5003 /* Must be called with bfqq != NULL */
5004 static void bfq_bfqq_end_wr(struct bfq_queue
*bfqq
)
5006 if (bfq_bfqq_busy(bfqq
))
5007 bfqq
->bfqd
->wr_busy_queues
--;
5009 bfqq
->wr_cur_max_time
= 0;
5010 bfqq
->last_wr_start_finish
= jiffies
;
5012 * Trigger a weight change on the next invocation of
5013 * __bfq_entity_update_weight_prio.
5015 bfqq
->entity
.prio_changed
= 1;
5018 static void bfq_end_wr_async_queues(struct bfq_data
*bfqd
,
5019 struct bfq_group
*bfqg
)
5023 for (i
= 0; i
< 2; i
++)
5024 for (j
= 0; j
< IOPRIO_BE_NR
; j
++)
5025 if (bfqg
->async_bfqq
[i
][j
])
5026 bfq_bfqq_end_wr(bfqg
->async_bfqq
[i
][j
]);
5027 if (bfqg
->async_idle_bfqq
)
5028 bfq_bfqq_end_wr(bfqg
->async_idle_bfqq
);
5031 static void bfq_end_wr(struct bfq_data
*bfqd
)
5033 struct bfq_queue
*bfqq
;
5035 spin_lock_irq(&bfqd
->lock
);
5037 list_for_each_entry(bfqq
, &bfqd
->active_list
, bfqq_list
)
5038 bfq_bfqq_end_wr(bfqq
);
5039 list_for_each_entry(bfqq
, &bfqd
->idle_list
, bfqq_list
)
5040 bfq_bfqq_end_wr(bfqq
);
5041 bfq_end_wr_async(bfqd
);
5043 spin_unlock_irq(&bfqd
->lock
);
5046 static sector_t
bfq_io_struct_pos(void *io_struct
, bool request
)
5049 return blk_rq_pos(io_struct
);
5051 return ((struct bio
*)io_struct
)->bi_iter
.bi_sector
;
5054 static int bfq_rq_close_to_sector(void *io_struct
, bool request
,
5057 return abs(bfq_io_struct_pos(io_struct
, request
) - sector
) <=
5061 static struct bfq_queue
*bfqq_find_close(struct bfq_data
*bfqd
,
5062 struct bfq_queue
*bfqq
,
5065 struct rb_root
*root
= &bfq_bfqq_to_bfqg(bfqq
)->rq_pos_tree
;
5066 struct rb_node
*parent
, *node
;
5067 struct bfq_queue
*__bfqq
;
5069 if (RB_EMPTY_ROOT(root
))
5073 * First, if we find a request starting at the end of the last
5074 * request, choose it.
5076 __bfqq
= bfq_rq_pos_tree_lookup(bfqd
, root
, sector
, &parent
, NULL
);
5081 * If the exact sector wasn't found, the parent of the NULL leaf
5082 * will contain the closest sector (rq_pos_tree sorted by
5083 * next_request position).
5085 __bfqq
= rb_entry(parent
, struct bfq_queue
, pos_node
);
5086 if (bfq_rq_close_to_sector(__bfqq
->next_rq
, true, sector
))
5089 if (blk_rq_pos(__bfqq
->next_rq
) < sector
)
5090 node
= rb_next(&__bfqq
->pos_node
);
5092 node
= rb_prev(&__bfqq
->pos_node
);
5096 __bfqq
= rb_entry(node
, struct bfq_queue
, pos_node
);
5097 if (bfq_rq_close_to_sector(__bfqq
->next_rq
, true, sector
))
5103 static struct bfq_queue
*bfq_find_close_cooperator(struct bfq_data
*bfqd
,
5104 struct bfq_queue
*cur_bfqq
,
5107 struct bfq_queue
*bfqq
;
5110 * We shall notice if some of the queues are cooperating,
5111 * e.g., working closely on the same area of the device. In
5112 * that case, we can group them together and: 1) don't waste
5113 * time idling, and 2) serve the union of their requests in
5114 * the best possible order for throughput.
5116 bfqq
= bfqq_find_close(bfqd
, cur_bfqq
, sector
);
5117 if (!bfqq
|| bfqq
== cur_bfqq
)
5123 static struct bfq_queue
*
5124 bfq_setup_merge(struct bfq_queue
*bfqq
, struct bfq_queue
*new_bfqq
)
5126 int process_refs
, new_process_refs
;
5127 struct bfq_queue
*__bfqq
;
5130 * If there are no process references on the new_bfqq, then it is
5131 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
5132 * may have dropped their last reference (not just their last process
5135 if (!bfqq_process_refs(new_bfqq
))
5138 /* Avoid a circular list and skip interim queue merges. */
5139 while ((__bfqq
= new_bfqq
->new_bfqq
)) {
5145 process_refs
= bfqq_process_refs(bfqq
);
5146 new_process_refs
= bfqq_process_refs(new_bfqq
);
5148 * If the process for the bfqq has gone away, there is no
5149 * sense in merging the queues.
5151 if (process_refs
== 0 || new_process_refs
== 0)
5154 bfq_log_bfqq(bfqq
->bfqd
, bfqq
, "scheduling merge with queue %d",
5158 * Merging is just a redirection: the requests of the process
5159 * owning one of the two queues are redirected to the other queue.
5160 * The latter queue, in its turn, is set as shared if this is the
5161 * first time that the requests of some process are redirected to
5164 * We redirect bfqq to new_bfqq and not the opposite, because we
5165 * are in the context of the process owning bfqq, hence we have
5166 * the io_cq of this process. So we can immediately configure this
5167 * io_cq to redirect the requests of the process to new_bfqq.
5169 * NOTE, even if new_bfqq coincides with the in-service queue, the
5170 * io_cq of new_bfqq is not available, because, if the in-service
5171 * queue is shared, bfqd->in_service_bic may not point to the
5172 * io_cq of the in-service queue.
5173 * Redirecting the requests of the process owning bfqq to the
5174 * currently in-service queue is in any case the best option, as
5175 * we feed the in-service queue with new requests close to the
5176 * last request served and, by doing so, hopefully increase the
5179 bfqq
->new_bfqq
= new_bfqq
;
5180 new_bfqq
->ref
+= process_refs
;
5184 static bool bfq_may_be_close_cooperator(struct bfq_queue
*bfqq
,
5185 struct bfq_queue
*new_bfqq
)
5187 if (bfq_class_idle(bfqq
) || bfq_class_idle(new_bfqq
) ||
5188 (bfqq
->ioprio_class
!= new_bfqq
->ioprio_class
))
5192 * If either of the queues has already been detected as seeky,
5193 * then merging it with the other queue is unlikely to lead to
5196 if (BFQQ_SEEKY(bfqq
) || BFQQ_SEEKY(new_bfqq
))
5200 * Interleaved I/O is known to be done by (some) applications
5201 * only for reads, so it does not make sense to merge async
5204 if (!bfq_bfqq_sync(bfqq
) || !bfq_bfqq_sync(new_bfqq
))
5211 * If this function returns true, then bfqq cannot be merged. The idea
5212 * is that true cooperation happens very early after processes start
5213 * to do I/O. Usually, late cooperations are just accidental false
5214 * positives. In case bfqq is weight-raised, such false positives
5215 * would evidently degrade latency guarantees for bfqq.
5217 static bool wr_from_too_long(struct bfq_queue
*bfqq
)
5219 return bfqq
->wr_coeff
> 1 &&
5220 time_is_before_jiffies(bfqq
->last_wr_start_finish
+
5221 msecs_to_jiffies(100));
5225 * Attempt to schedule a merge of bfqq with the currently in-service
5226 * queue or with a close queue among the scheduled queues. Return
5227 * NULL if no merge was scheduled, a pointer to the shared bfq_queue
5228 * structure otherwise.
5230 * The OOM queue is not allowed to participate to cooperation: in fact, since
5231 * the requests temporarily redirected to the OOM queue could be redirected
5232 * again to dedicated queues at any time, the state needed to correctly
5233 * handle merging with the OOM queue would be quite complex and expensive
5234 * to maintain. Besides, in such a critical condition as an out of memory,
5235 * the benefits of queue merging may be little relevant, or even negligible.
5237 * Weight-raised queues can be merged only if their weight-raising
5238 * period has just started. In fact cooperating processes are usually
5239 * started together. Thus, with this filter we avoid false positives
5240 * that would jeopardize low-latency guarantees.
5242 * WARNING: queue merging may impair fairness among non-weight raised
5243 * queues, for at least two reasons: 1) the original weight of a
5244 * merged queue may change during the merged state, 2) even being the
5245 * weight the same, a merged queue may be bloated with many more
5246 * requests than the ones produced by its originally-associated
5249 static struct bfq_queue
*
5250 bfq_setup_cooperator(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
5251 void *io_struct
, bool request
)
5253 struct bfq_queue
*in_service_bfqq
, *new_bfqq
;
5256 return bfqq
->new_bfqq
;
5259 wr_from_too_long(bfqq
) ||
5260 unlikely(bfqq
== &bfqd
->oom_bfqq
))
5263 /* If there is only one backlogged queue, don't search. */
5264 if (bfqd
->busy_queues
== 1)
5267 in_service_bfqq
= bfqd
->in_service_queue
;
5269 if (!in_service_bfqq
|| in_service_bfqq
== bfqq
||
5270 !bfqd
->in_service_bic
|| wr_from_too_long(in_service_bfqq
) ||
5271 unlikely(in_service_bfqq
== &bfqd
->oom_bfqq
))
5272 goto check_scheduled
;
5274 if (bfq_rq_close_to_sector(io_struct
, request
, bfqd
->last_position
) &&
5275 bfqq
->entity
.parent
== in_service_bfqq
->entity
.parent
&&
5276 bfq_may_be_close_cooperator(bfqq
, in_service_bfqq
)) {
5277 new_bfqq
= bfq_setup_merge(bfqq
, in_service_bfqq
);
5282 * Check whether there is a cooperator among currently scheduled
5283 * queues. The only thing we need is that the bio/request is not
5284 * NULL, as we need it to establish whether a cooperator exists.
5287 new_bfqq
= bfq_find_close_cooperator(bfqd
, bfqq
,
5288 bfq_io_struct_pos(io_struct
, request
));
5290 if (new_bfqq
&& !wr_from_too_long(new_bfqq
) &&
5291 likely(new_bfqq
!= &bfqd
->oom_bfqq
) &&
5292 bfq_may_be_close_cooperator(bfqq
, new_bfqq
))
5293 return bfq_setup_merge(bfqq
, new_bfqq
);
5298 static void bfq_bfqq_save_state(struct bfq_queue
*bfqq
)
5300 struct bfq_io_cq
*bic
= bfqq
->bic
;
5303 * If !bfqq->bic, the queue is already shared or its requests
5304 * have already been redirected to a shared queue; both idle window
5305 * and weight raising state have already been saved. Do nothing.
5310 bic
->saved_ttime
= bfqq
->ttime
;
5311 bic
->saved_idle_window
= bfq_bfqq_idle_window(bfqq
);
5312 bic
->saved_IO_bound
= bfq_bfqq_IO_bound(bfqq
);
5313 bic
->saved_wr_coeff
= bfqq
->wr_coeff
;
5314 bic
->saved_wr_start_at_switch_to_srt
= bfqq
->wr_start_at_switch_to_srt
;
5315 bic
->saved_last_wr_start_finish
= bfqq
->last_wr_start_finish
;
5316 bic
->saved_wr_cur_max_time
= bfqq
->wr_cur_max_time
;
5319 static void bfq_get_bic_reference(struct bfq_queue
*bfqq
)
5322 * If bfqq->bic has a non-NULL value, the bic to which it belongs
5323 * is about to begin using a shared bfq_queue.
5326 atomic_long_inc(&bfqq
->bic
->icq
.ioc
->refcount
);
5330 bfq_merge_bfqqs(struct bfq_data
*bfqd
, struct bfq_io_cq
*bic
,
5331 struct bfq_queue
*bfqq
, struct bfq_queue
*new_bfqq
)
5333 bfq_log_bfqq(bfqd
, bfqq
, "merging with queue %lu",
5334 (unsigned long)new_bfqq
->pid
);
5335 /* Save weight raising and idle window of the merged queues */
5336 bfq_bfqq_save_state(bfqq
);
5337 bfq_bfqq_save_state(new_bfqq
);
5338 if (bfq_bfqq_IO_bound(bfqq
))
5339 bfq_mark_bfqq_IO_bound(new_bfqq
);
5340 bfq_clear_bfqq_IO_bound(bfqq
);
5343 * If bfqq is weight-raised, then let new_bfqq inherit
5344 * weight-raising. To reduce false positives, neglect the case
5345 * where bfqq has just been created, but has not yet made it
5346 * to be weight-raised (which may happen because EQM may merge
5347 * bfqq even before bfq_add_request is executed for the first
5350 if (new_bfqq
->wr_coeff
== 1 && bfqq
->wr_coeff
> 1) {
5351 new_bfqq
->wr_coeff
= bfqq
->wr_coeff
;
5352 new_bfqq
->wr_cur_max_time
= bfqq
->wr_cur_max_time
;
5353 new_bfqq
->last_wr_start_finish
= bfqq
->last_wr_start_finish
;
5354 new_bfqq
->wr_start_at_switch_to_srt
=
5355 bfqq
->wr_start_at_switch_to_srt
;
5356 if (bfq_bfqq_busy(new_bfqq
))
5357 bfqd
->wr_busy_queues
++;
5358 new_bfqq
->entity
.prio_changed
= 1;
5361 if (bfqq
->wr_coeff
> 1) { /* bfqq has given its wr to new_bfqq */
5363 bfqq
->entity
.prio_changed
= 1;
5364 if (bfq_bfqq_busy(bfqq
))
5365 bfqd
->wr_busy_queues
--;
5368 bfq_log_bfqq(bfqd
, new_bfqq
, "merge_bfqqs: wr_busy %d",
5369 bfqd
->wr_busy_queues
);
5372 * Grab a reference to the bic, to prevent it from being destroyed
5373 * before being possibly touched by a bfq_split_bfqq().
5375 bfq_get_bic_reference(bfqq
);
5376 bfq_get_bic_reference(new_bfqq
);
5378 * Merge queues (that is, let bic redirect its requests to new_bfqq)
5380 bic_set_bfqq(bic
, new_bfqq
, 1);
5381 bfq_mark_bfqq_coop(new_bfqq
);
5383 * new_bfqq now belongs to at least two bics (it is a shared queue):
5384 * set new_bfqq->bic to NULL. bfqq either:
5385 * - does not belong to any bic any more, and hence bfqq->bic must
5386 * be set to NULL, or
5387 * - is a queue whose owning bics have already been redirected to a
5388 * different queue, hence the queue is destined to not belong to
5389 * any bic soon and bfqq->bic is already NULL (therefore the next
5390 * assignment causes no harm).
5392 new_bfqq
->bic
= NULL
;
5394 /* release process reference to bfqq */
5395 bfq_put_queue(bfqq
);
5398 static bool bfq_allow_bio_merge(struct request_queue
*q
, struct request
*rq
,
5401 struct bfq_data
*bfqd
= q
->elevator
->elevator_data
;
5402 bool is_sync
= op_is_sync(bio
->bi_opf
);
5403 struct bfq_queue
*bfqq
= bfqd
->bio_bfqq
, *new_bfqq
;
5406 * Disallow merge of a sync bio into an async request.
5408 if (is_sync
&& !rq_is_sync(rq
))
5412 * Lookup the bfqq that this bio will be queued with. Allow
5413 * merge only if rq is queued there.
5419 * We take advantage of this function to perform an early merge
5420 * of the queues of possible cooperating processes.
5422 new_bfqq
= bfq_setup_cooperator(bfqd
, bfqq
, bio
, false);
5425 * bic still points to bfqq, then it has not yet been
5426 * redirected to some other bfq_queue, and a queue
5427 * merge beween bfqq and new_bfqq can be safely
5428 * fulfillled, i.e., bic can be redirected to new_bfqq
5429 * and bfqq can be put.
5431 bfq_merge_bfqqs(bfqd
, bfqd
->bio_bic
, bfqq
,
5434 * If we get here, bio will be queued into new_queue,
5435 * so use new_bfqq to decide whether bio and rq can be
5441 * Change also bqfd->bio_bfqq, as
5442 * bfqd->bio_bic now points to new_bfqq, and
5443 * this function may be invoked again (and then may
5444 * use again bqfd->bio_bfqq).
5446 bfqd
->bio_bfqq
= bfqq
;
5449 return bfqq
== RQ_BFQQ(rq
);
5453 * Set the maximum time for the in-service queue to consume its
5454 * budget. This prevents seeky processes from lowering the throughput.
5455 * In practice, a time-slice service scheme is used with seeky
5458 static void bfq_set_budget_timeout(struct bfq_data
*bfqd
,
5459 struct bfq_queue
*bfqq
)
5461 unsigned int timeout_coeff
;
5463 if (bfqq
->wr_cur_max_time
== bfqd
->bfq_wr_rt_max_time
)
5466 timeout_coeff
= bfqq
->entity
.weight
/ bfqq
->entity
.orig_weight
;
5468 bfqd
->last_budget_start
= ktime_get();
5470 bfqq
->budget_timeout
= jiffies
+
5471 bfqd
->bfq_timeout
* timeout_coeff
;
5474 static void __bfq_set_in_service_queue(struct bfq_data
*bfqd
,
5475 struct bfq_queue
*bfqq
)
5478 bfqg_stats_update_avg_queue_size(bfqq_group(bfqq
));
5479 bfq_clear_bfqq_fifo_expire(bfqq
);
5481 bfqd
->budgets_assigned
= (bfqd
->budgets_assigned
* 7 + 256) / 8;
5483 if (time_is_before_jiffies(bfqq
->last_wr_start_finish
) &&
5484 bfqq
->wr_coeff
> 1 &&
5485 bfqq
->wr_cur_max_time
== bfqd
->bfq_wr_rt_max_time
&&
5486 time_is_before_jiffies(bfqq
->budget_timeout
)) {
5488 * For soft real-time queues, move the start
5489 * of the weight-raising period forward by the
5490 * time the queue has not received any
5491 * service. Otherwise, a relatively long
5492 * service delay is likely to cause the
5493 * weight-raising period of the queue to end,
5494 * because of the short duration of the
5495 * weight-raising period of a soft real-time
5496 * queue. It is worth noting that this move
5497 * is not so dangerous for the other queues,
5498 * because soft real-time queues are not
5501 * To not add a further variable, we use the
5502 * overloaded field budget_timeout to
5503 * determine for how long the queue has not
5504 * received service, i.e., how much time has
5505 * elapsed since the queue expired. However,
5506 * this is a little imprecise, because
5507 * budget_timeout is set to jiffies if bfqq
5508 * not only expires, but also remains with no
5511 if (time_after(bfqq
->budget_timeout
,
5512 bfqq
->last_wr_start_finish
))
5513 bfqq
->last_wr_start_finish
+=
5514 jiffies
- bfqq
->budget_timeout
;
5516 bfqq
->last_wr_start_finish
= jiffies
;
5519 bfq_set_budget_timeout(bfqd
, bfqq
);
5520 bfq_log_bfqq(bfqd
, bfqq
,
5521 "set_in_service_queue, cur-budget = %d",
5522 bfqq
->entity
.budget
);
5525 bfqd
->in_service_queue
= bfqq
;
5529 * Get and set a new queue for service.
5531 static struct bfq_queue
*bfq_set_in_service_queue(struct bfq_data
*bfqd
)
5533 struct bfq_queue
*bfqq
= bfq_get_next_queue(bfqd
);
5535 __bfq_set_in_service_queue(bfqd
, bfqq
);
static void bfq_arm_slice_timer(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfqd->in_service_queue;
	struct bfq_io_cq *bic;
	u64 sl;

	/* Processes have exited, don't wait. */
	bic = bfqd->in_service_bic;
	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
		return;

	bfq_mark_bfqq_wait_request(bfqq);

	/*
	 * We don't want to idle for seeks, but we do want to allow
	 * fair distribution of slice time for a process doing back-to-back
	 * seeks. So allow a little bit of time for it to submit a new rq.
	 */
	sl = bfqd->bfq_slice_idle;
	/*
	 * Unless the queue is being weight-raised or the scenario is
	 * asymmetric, grant only minimum idle time if the queue
	 * is seeky. A long idling is preserved for a weight-raised
	 * queue, or, more in general, in an asymmetric scenario,
	 * because a long idling is needed for guaranteeing to a queue
	 * its reserved share of the throughput (in particular, it is
	 * needed if the queue has a higher weight than some other
	 * queue).
	 */
	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
	    bfq_symmetric_scenario(bfqd))
		sl = min_t(u64, sl, BFQ_MIN_TT);

	bfqd->last_idling_start = ktime_get();
	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
		      HRTIMER_MODE_REL);
	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
}
/*
 * In autotuning mode, max_budget is dynamically recomputed as the
 * amount of sectors transferred in timeout at the estimated peak
 * rate. This enables BFQ to utilize a full timeslice with a full
 * budget, even if the in-service queue is served at peak rate. And
 * this maximises throughput with sequential workloads.
 */
static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
{
	return (u64)bfqd->peak_rate * USEC_PER_MSEC *
		jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
}
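/*
 * Illustrative note (a sketch of the arithmetic above, not part of the
 * scheduler): peak_rate is stored in sectors/usec, left-shifted by
 * BFQ_RATE_SHIFT, so the expression above is simply
 *
 *	max_budget = peak_rate * timeout
 *
 * expressed in sectors. As a hypothetical example, assuming a device
 * sustaining about 100 MB/s (~204800 sectors/s, i.e. ~0.2 sectors/usec)
 * and a budget timeout of about 125 ms:
 *
 *	max_budget ~= 0.2 sectors/usec * 125000 usec ~= 25600 sectors
 *
 * that is, roughly 12.5 MB, exactly the amount of data the device can
 * transfer at peak rate during one timeout.
 */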
/*
 * Update parameters related to throughput and responsiveness, as a
 * function of the estimated peak rate. See comments on
 * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
 */
static void update_thr_responsiveness_params(struct bfq_data *bfqd)
{
	int dev_type = blk_queue_nonrot(bfqd->queue);

	if (bfqd->bfq_user_max_budget == 0)
		bfqd->bfq_max_budget =
			bfq_calc_max_budget(bfqd);

	if (bfqd->device_speed == BFQ_BFQD_FAST &&
	    bfqd->peak_rate < device_speed_thresh[dev_type]) {
		bfqd->device_speed = BFQ_BFQD_SLOW;
		bfqd->RT_prod = R_slow[dev_type] *
			T_slow[dev_type];
	} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
		   bfqd->peak_rate > device_speed_thresh[dev_type]) {
		bfqd->device_speed = BFQ_BFQD_FAST;
		bfqd->RT_prod = R_fast[dev_type] *
			T_fast[dev_type];
	}

	bfq_log(bfqd,
		"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu sects/sec",
		dev_type == 0 ? "ROT" : "NONROT",
		bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
		bfqd->device_speed == BFQ_BFQD_FAST ?
		(USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
		(USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
		(USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
		BFQ_RATE_SHIFT);
}
static void bfq_reset_rate_computation(struct bfq_data *bfqd,
				       struct request *rq)
{
	if (rq != NULL) { /* new rq dispatch now, reset accordingly */
		bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
		bfqd->peak_rate_samples = 1;
		bfqd->sequential_samples = 0;
		bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
			blk_rq_sectors(rq);
	} else /* no new rq dispatched, just reset the number of samples */
		bfqd->peak_rate_samples = 0; /* full re-init on next disp. */

	bfq_log(bfqd,
		"reset_rate_computation at end, sample %u/%u tot_sects %llu",
		bfqd->peak_rate_samples, bfqd->sequential_samples,
		bfqd->tot_sectors_dispatched);
}
static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
{
	u32 rate, weight, divisor;

	/*
	 * For the convergence property to hold (see comments on
	 * bfq_update_peak_rate()) and for the assessment to be
	 * reliable, a minimum number of samples must be present, and
	 * a minimum amount of time must have elapsed. If not so, do
	 * not compute new rate. Just reset parameters, to get ready
	 * for a new evaluation attempt.
	 */
	if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
	    bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
		goto reset_computation;

	/*
	 * If a new request completion has occurred after last
	 * dispatch, then, to approximate the rate at which requests
	 * have been served by the device, it is more precise to
	 * extend the observation interval to the last completion.
	 */
	bfqd->delta_from_first =
		max_t(u64, bfqd->delta_from_first,
		      bfqd->last_completion - bfqd->first_dispatch);

	/*
	 * Rate computed in sects/usec, and not sects/nsec, for
	 * precision issues.
	 */
	rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
			div_u64(bfqd->delta_from_first, NSEC_PER_USEC));

	/*
	 * Peak rate not updated if:
	 * - the percentage of sequential dispatches is below 3/4 of the
	 *   total, and rate is below the current estimated peak rate
	 * - rate is unreasonably high (> 20M sectors/sec)
	 */
	if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
	     rate <= bfqd->peak_rate) ||
	    rate > 20<<BFQ_RATE_SHIFT)
		goto reset_computation;

	/*
	 * We have to update the peak rate, at last! To this purpose,
	 * we use a low-pass filter. We compute the smoothing constant
	 * of the filter as a function of the 'weight' of the new
	 * measured rate.
	 *
	 * As can be seen in the next formulas, we define this weight as a
	 * quantity proportional to how sequential the workload is,
	 * and to how long the observation time interval is.
	 *
	 * The weight runs from 0 to 8. The maximum value of the
	 * weight, 8, yields the minimum value for the smoothing
	 * constant. At this minimum value for the smoothing constant,
	 * the measured rate contributes for half of the next value of
	 * the estimated peak rate.
	 *
	 * So, the first step is to compute the weight as a function
	 * of how sequential the workload is. Note that the weight
	 * cannot reach 9, because bfqd->sequential_samples cannot
	 * become equal to bfqd->peak_rate_samples, which, in its
	 * turn, holds true because bfqd->sequential_samples is not
	 * incremented for the first sample.
	 */
	weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;

	/*
	 * Second step: further refine the weight as a function of the
	 * duration of the observation interval.
	 */
	weight = min_t(u32, 8,
		       div_u64(weight * bfqd->delta_from_first,
			       BFQ_RATE_REF_INTERVAL));

	/*
	 * Divisor ranging from 10, for minimum weight, to 2, for
	 * maximum weight.
	 */
	divisor = 10 - weight;

	/*
	 * Finally, update peak rate:
	 *
	 * peak_rate = peak_rate * (divisor-1) / divisor  +  rate / divisor
	 */
	bfqd->peak_rate *= divisor-1;
	bfqd->peak_rate /= divisor;
	rate /= divisor; /* smoothing constant alpha = 1/divisor */

	bfqd->peak_rate += rate;
	update_thr_responsiveness_params(bfqd);

reset_computation:
	bfq_reset_rate_computation(bfqd, rq);
}
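/*
 * Illustrative note on the low-pass filter above (an example, not
 * additional logic). With a fully sequential workload observed over a
 * long enough interval, the weight reaches 8, so divisor = 2 and
 *
 *	new peak_rate = old peak_rate / 2 + measured rate / 2,
 *
 * i.e., the measured rate contributes half of the new estimate. With a
 * mostly random workload the weight stays low; e.g., weight = 1 gives
 * divisor = 9 and
 *
 *	new peak_rate = old peak_rate * 8/9 + measured rate / 9,
 *
 * so a single noisy observation can move the estimate only slightly.
 */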
/*
 * Update the read/write peak rate (the main quantity used for
 * auto-tuning, see update_thr_responsiveness_params()).
 *
 * It is not trivial to estimate the peak rate (correctly): because of
 * the presence of sw and hw queues between the scheduler and the
 * device components that finally serve I/O requests, it is hard to
 * say exactly when a given dispatched request is served inside the
 * device, and for how long. As a consequence, it is hard to know
 * precisely at what rate a given set of requests is actually served
 * by the device.
 *
 * On the opposite end, the dispatch time of any request is trivially
 * available, and, from this piece of information, the "dispatch rate"
 * of requests can be immediately computed. So, the idea in the next
 * function is to use what is known, namely request dispatch times
 * (plus, when useful, request completion times), to estimate what is
 * unknown, namely in-device request service rate.
 *
 * The main issue is that, because of the above facts, the rate at
 * which a certain set of requests is dispatched over a certain time
 * interval can vary greatly with respect to the rate at which the
 * same requests are then served. But, since the size of any
 * intermediate queue is limited, and the service scheme is lossless
 * (no request is silently dropped), the following obvious convergence
 * property holds: the number of requests dispatched MUST become
 * closer and closer to the number of requests completed as the
 * observation interval grows. This is the key property used in
 * the next function to estimate the peak service rate as a function
 * of the observed dispatch rate. The function assumes to be invoked
 * on every request dispatch.
 */
static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
{
	u64 now_ns = ktime_get_ns();

	if (bfqd->peak_rate_samples == 0) { /* first dispatch */
		bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
			bfqd->peak_rate_samples);
		bfq_reset_rate_computation(bfqd, rq);
		goto update_last_values; /* will add one sample */
	}

	/*
	 * Device idle for very long: the observation interval lasting
	 * up to this dispatch cannot be a valid observation interval
	 * for computing a new peak rate (similarly to the late-
	 * completion event in bfq_completed_request()). Go to
	 * update_rate_and_reset to have the following three steps
	 * taken:
	 * - close the observation interval at the last (previous)
	 *   request dispatch or completion
	 * - compute rate, if possible, for that observation interval
	 * - start a new observation interval with this dispatch
	 */
	if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
	    bfqd->rq_in_driver == 0)
		goto update_rate_and_reset;

	/* Update sampling information */
	bfqd->peak_rate_samples++;

	if ((bfqd->rq_in_driver > 0 ||
	     now_ns - bfqd->last_completion < BFQ_MIN_TT)
	     && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
		bfqd->sequential_samples++;

	bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);

	/* Reset max observed rq size every 32 dispatches */
	if (likely(bfqd->peak_rate_samples % 32))
		bfqd->last_rq_max_size =
			max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
	else
		bfqd->last_rq_max_size = blk_rq_sectors(rq);

	bfqd->delta_from_first = now_ns - bfqd->first_dispatch;

	/* Target observation interval not yet reached, go on sampling */
	if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
		goto update_last_values;

update_rate_and_reset:
	bfq_update_rate_reset(bfqd, rq);
update_last_values:
	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
	bfqd->last_dispatch = now_ns;
}
/*
 * Remove request from internal lists.
 */
static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);

	/*
	 * For consistency, the next instruction should have been
	 * executed after removing the request from the queue and
	 * dispatching it. We execute instead this instruction before
	 * bfq_remove_request() (and hence introduce a temporary
	 * inconsistency), for efficiency. In fact, should this
	 * dispatch occur for a non in-service bfqq, this anticipated
	 * increment prevents two counters related to bfqq->dispatched
	 * from risking to be, first, uselessly decremented, and then
	 * incremented again when the (new) value of bfqq->dispatched
	 * happens to be taken into account.
	 */
	bfqq->dispatched++;
	bfq_update_peak_rate(q->elevator->elevator_data, rq);

	bfq_remove_request(q, rq);
}
static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	/*
	 * If this bfqq is shared between multiple processes, check
	 * to make sure that those processes are still issuing I/Os
	 * within the mean seek distance. If not, it may be time to
	 * break the queues apart again.
	 */
	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
		bfq_mark_bfqq_split_coop(bfqq);

	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
		if (bfqq->dispatched == 0)
			/*
			 * Overloading budget_timeout field to store
			 * the time at which the queue remains with no
			 * backlog and no outstanding request; used by
			 * the weight-raising mechanism.
			 */
			bfqq->budget_timeout = jiffies;

		bfq_del_bfqq_busy(bfqd, bfqq, true);
	} else {
		bfq_requeue_bfqq(bfqd, bfqq);
		/*
		 * Resort priority tree of potential close cooperators.
		 */
		bfq_pos_tree_add_move(bfqd, bfqq);
	}

	/*
	 * All in-service entities must have been properly deactivated
	 * or requeued before executing the next function, which
	 * resets all in-service entities as no more in service.
	 */
	__bfq_bfqd_reset_in_service(bfqd);
}
/**
 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
 * @bfqd: device data.
 * @bfqq: queue to update.
 * @reason: reason for expiration.
 *
 * Handle the feedback on @bfqq budget at queue expiration.
 * See the body for detailed comments.
 */
static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
				     struct bfq_queue *bfqq,
				     enum bfqq_expiration reason)
{
	struct request *next_rq;
	int budget, min_budget;

	min_budget = bfq_min_budget(bfqd);

	if (bfqq->wr_coeff == 1)
		budget = bfqq->max_budget;
	else /*
	      * Use a constant, low budget for weight-raised queues,
	      * to help achieve a low latency. Keep it slightly higher
	      * than the minimum possible budget, to cause a little
	      * bit fewer expirations.
	      */
		budget = 2 * min_budget;

	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
		budget, bfq_min_budget(bfqd));
	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));

	if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
		switch (reason) {
		/*
		 * Caveat: in all the following cases we trade latency
		 * for throughput.
		 */
		case BFQQE_TOO_IDLE:
			/*
			 * This is the only case where we may reduce
			 * the budget: if there is no request of the
			 * process still waiting for completion, then
			 * we assume (tentatively) that the timer has
			 * expired because the batch of requests of
			 * the process could have been served with a
			 * smaller budget. Hence, betting that
			 * process will behave in the same way when it
			 * becomes backlogged again, we reduce its
			 * next budget. As long as we guess right,
			 * this budget cut reduces the latency
			 * experienced by the process.
			 *
			 * However, if there are still outstanding
			 * requests, then the process may have not yet
			 * issued its next request just because it is
			 * still waiting for the completion of some of
			 * the still outstanding ones. So in this
			 * subcase we do not reduce its budget, on the
			 * contrary we increase it to possibly boost
			 * the throughput, as discussed in the
			 * comments to the BUDGET_TIMEOUT case.
			 */
			if (bfqq->dispatched > 0) /* still outstanding reqs */
				budget = min(budget * 2, bfqd->bfq_max_budget);
			else {
				if (budget > 5 * min_budget)
					budget -= 4 * min_budget;
				else
					budget = min_budget;
			}
			break;
		case BFQQE_BUDGET_TIMEOUT:
			/*
			 * We double the budget here because it gives
			 * the chance to boost the throughput if this
			 * is not a seeky process (and has bumped into
			 * this timeout because of, e.g., ZBR).
			 */
			budget = min(budget * 2, bfqd->bfq_max_budget);
			break;
		case BFQQE_BUDGET_EXHAUSTED:
			/*
			 * The process still has backlog, and did not
			 * let either the budget timeout or the disk
			 * idling timeout expire. Hence it is not
			 * seeky, has a short thinktime and may be
			 * happy with a higher budget too. So
			 * definitely increase the budget of this good
			 * candidate to boost the disk throughput.
			 */
			budget = min(budget * 4, bfqd->bfq_max_budget);
			break;
		case BFQQE_NO_MORE_REQUESTS:
			/*
			 * For queues that expire for this reason, it
			 * is particularly important to keep the
			 * budget close to the actual service they
			 * need. Doing so reduces the timestamp
			 * misalignment problem described in the
			 * comments in the body of
			 * __bfq_activate_entity. In fact, suppose
			 * that a queue systematically expires for
			 * BFQQE_NO_MORE_REQUESTS and presents a
			 * new request in time to enjoy timestamp
			 * back-shifting. The larger the budget of the
			 * queue is with respect to the service the
			 * queue actually requests in each service
			 * slot, the more times the queue can be
			 * reactivated with the same virtual finish
			 * time. It follows that, even if this finish
			 * time is pushed to the system virtual time
			 * to reduce the consequent timestamp
			 * misalignment, the queue unjustly enjoys for
			 * many re-activations a lower finish time
			 * than all newly activated queues.
			 *
			 * The service needed by bfqq is measured
			 * quite precisely by bfqq->entity.service.
			 * Since bfqq does not enjoy device idling,
			 * bfqq->entity.service is equal to the number
			 * of sectors that the process associated with
			 * bfqq requested to read/write before waiting
			 * for request completions, or blocking for
			 * other reasons.
			 */
			budget = max_t(int, bfqq->entity.service, min_budget);
			break;
		default:
			return;
		}
	} else if (!bfq_bfqq_sync(bfqq)) {
		/*
		 * Async queues get always the maximum possible
		 * budget, as for them we do not care about latency
		 * (in addition, their ability to dispatch is limited
		 * by the charging factor).
		 */
		budget = bfqd->bfq_max_budget;
	}

	bfqq->max_budget = budget;

	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
	    !bfqd->bfq_user_max_budget)
		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);

	/*
	 * If there is still backlog, then assign a new budget, making
	 * sure that it is large enough for the next request. Since
	 * the finish time of bfqq must be kept in sync with the
	 * budget, be sure to call __bfq_bfqq_expire() *after* this
	 * update.
	 *
	 * If there is no backlog, then no need to update the budget;
	 * it will be updated on the arrival of a new request.
	 */
	next_rq = bfqq->next_rq;
	if (next_rq)
		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
					    bfq_serv_to_charge(next_rq, bfqq));

	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
			next_rq ? blk_rq_sectors(next_rq) : 0,
			bfqq->entity.budget);
}
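/*
 * Summary of the budget feedback above (a reading aid, not additional
 * logic), for sync, non-weight-raised queues:
 *
 *	TOO_IDLE, requests still outstanding: budget = min(2 * budget, max)
 *	TOO_IDLE, no outstanding requests:    budget reduced by 4 * min,
 *					      but never below min
 *	BUDGET_TIMEOUT:                       budget = min(2 * budget, max)
 *	BUDGET_EXHAUSTED:                     budget = min(4 * budget, max)
 *	NO_MORE_REQUESTS:                     budget = max(service, min)
 *
 * Async queues always get bfqd->bfq_max_budget, and weight-raised
 * queues a constant 2 * min_budget.
 */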
/*
 * Return true if the process associated with bfqq is "slow". The slow
 * flag is used, in addition to the budget timeout, to reduce the
 * amount of service provided to seeky processes, and thus reduce
 * their chances to lower the throughput. More details in the comments
 * on the function bfq_bfqq_expire().
 *
 * An important observation is in order: as discussed in the comments
 * on the function bfq_update_peak_rate(), with devices with internal
 * queues, it is hard if ever possible to know when and for how long
 * an I/O request is processed by the device (apart from the trivial
 * I/O pattern where a new request is dispatched only after the
 * previous one has been completed). This makes it hard to evaluate
 * the real rate at which the I/O requests of each bfq_queue are
 * served. In fact, for an I/O scheduler like BFQ, serving a
 * bfq_queue means just dispatching its requests during its service
 * slot (i.e., until the budget of the queue is exhausted, or the
 * queue remains idle, or, finally, a timeout fires). But, during the
 * service slot of a bfq_queue, around 100 ms at most, the device may
 * be even still processing requests of bfq_queues served in previous
 * service slots. On the opposite end, the requests of the in-service
 * bfq_queue may be completed after the service slot of the queue
 * finishes.
 *
 * Anyway, unless more sophisticated solutions are used
 * (where possible), the sum of the sizes of the requests dispatched
 * during the service slot of a bfq_queue is probably the only
 * approximation available for the service received by the bfq_queue
 * during its service slot. And this sum is the quantity used in this
 * function to evaluate the I/O speed of a process.
 */
static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			     bool compensate, enum bfqq_expiration reason,
			     unsigned long *delta_ms)
{
	ktime_t delta_ktime;
	u32 delta_usecs;
	bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekiness */

	if (!bfq_bfqq_sync(bfqq))
		return false;

	if (compensate)
		delta_ktime = bfqd->last_idling_start;
	else
		delta_ktime = ktime_get();
	delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
	delta_usecs = ktime_to_us(delta_ktime);

	/* don't use too short time intervals */
	if (delta_usecs < 1000) {
		if (blk_queue_nonrot(bfqd->queue))
			 /*
			  * give same worst-case guarantees as idling
			  * for seeky
			  */
			*delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
		else /* charge at least one seek */
			*delta_ms = bfq_slice_idle / NSEC_PER_MSEC;

		return slow;
	}

	*delta_ms = delta_usecs / USEC_PER_MSEC;

	/*
	 * Use only long (> 20ms) intervals to filter out excessive
	 * spikes in service rate estimation.
	 */
	if (delta_usecs > 20000) {
		/*
		 * Caveat for rotational devices: processes doing I/O
		 * in the slower disk zones tend to be slow(er) even
		 * if not seeky. In this respect, the estimated peak
		 * rate is likely to be an average over the disk
		 * surface. Accordingly, to not be too harsh with
		 * unlucky processes, a process is deemed slow only if
		 * its rate has been lower than half of the estimated
		 * peak rate.
		 */
		slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
	}

	bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);

	return slow;
}
/*
 * To be deemed as soft real-time, an application must meet two
 * requirements. First, the application must not require an average
 * bandwidth higher than the approximate bandwidth required to playback or
 * record a compressed high-definition video.
 * The next function is invoked on the completion of the last request of a
 * batch, to compute the next-start time instant, soft_rt_next_start, such
 * that, if the next request of the application does not arrive before
 * soft_rt_next_start, then the above requirement on the bandwidth is met.
 *
 * The second requirement is that the request pattern of the application is
 * isochronous, i.e., that, after issuing a request or a batch of requests,
 * the application stops issuing new requests until all its pending requests
 * have been completed. After that, the application may issue a new batch,
 * and so on.
 * For this reason the next function is invoked to compute
 * soft_rt_next_start only for applications that meet this requirement,
 * whereas soft_rt_next_start is set to infinity for applications that do
 * not.
 *
 * Unfortunately, even a greedy application may happen to behave in an
 * isochronous way if the CPU load is high. In fact, the application may
 * stop issuing requests while the CPUs are busy serving other processes,
 * then restart, then stop again for a while, and so on. In addition, if
 * the disk achieves a low enough throughput with the request pattern
 * issued by the application (e.g., because the request pattern is random
 * and/or the device is slow), then the application may meet the above
 * bandwidth requirement too. To prevent such a greedy application from
 * being deemed as soft real-time, a further rule is used in the computation
 * of soft_rt_next_start: soft_rt_next_start must be higher than the current
 * time plus the maximum time for which the arrival of a request is waited
 * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
 * This filters out greedy applications, as the latter issue instead their
 * next request as soon as possible after the last one has been completed
 * (in contrast, when a batch of requests is completed, a soft real-time
 * application spends some time processing data).
 *
 * Unfortunately, the last filter may easily generate false positives if
 * only bfqd->bfq_slice_idle is used as a reference time interval and one
 * or both the following cases occur:
 * 1) HZ is so low that the duration of a jiffy is comparable to or higher
 *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
 *    HZ=100;
 * 2) jiffies, instead of increasing at a constant rate, may stop increasing
 *    for a while, then suddenly 'jump' by several units to recover the lost
 *    increments. This seems to happen, e.g., inside virtual machines.
 * To address this issue, we do not use as a reference time interval just
 * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
 * particular we add the minimum number of jiffies for which the filter
 * seems to be quite precise also in embedded systems and KVM/QEMU virtual
 * machines.
 */
static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
						struct bfq_queue *bfqq)
{
	return max(bfqq->last_idle_bklogged +
		   HZ * bfqq->service_from_backlogged /
		   bfqd->bfq_wr_max_softrt_rate,
		   jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
}
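/*
 * Illustrative note on the formula above (an example with made-up
 * numbers, not additional logic): with bfq_wr_max_softrt_rate set to,
 * say, 7000 sectors/sec, an application that consumed 1750 sectors of
 * backlogged service may issue its next batch no earlier than
 *
 *	last_idle_bklogged + HZ * 1750 / 7000 = last_idle_bklogged + HZ/4
 *
 * i.e. 250 ms after the instant at which it last became idle with no
 * backlog, to stay within the soft real-time bandwidth. The second
 * argument of max() additionally keeps the next-start instant at least
 * bfq_slice_idle plus a few jiffies in the future, to filter out greedy
 * applications as explained above.
 */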
/*
 * Return the farthest future time instant according to jiffies
 * macro.
 */
static unsigned long bfq_greatest_from_now(void)
{
	return jiffies + MAX_JIFFY_OFFSET;
}

/*
 * Return the farthest past time instant according to jiffies
 * macro.
 */
static unsigned long bfq_smallest_from_now(void)
{
	return jiffies - MAX_JIFFY_OFFSET;
}
/**
 * bfq_bfqq_expire - expire a queue.
 * @bfqd: device owning the queue.
 * @bfqq: the queue to expire.
 * @compensate: if true, compensate for the time spent idling.
 * @reason: the reason causing the expiration.
 *
 * If the process associated with bfqq does slow I/O (e.g., because it
 * issues random requests), we charge bfqq with the time it has been
 * in service instead of the service it has received (see
 * bfq_bfqq_charge_time for details on how this goal is achieved). As
 * a consequence, bfqq will typically get higher timestamps upon
 * reactivation, and hence it will be rescheduled as if it had
 * received more service than what it has actually received. In the
 * end, bfqq receives less service in proportion to how slowly its
 * associated process consumes its budgets (and hence how seriously it
 * tends to lower the throughput). In addition, this time-charging
 * strategy guarantees time fairness among slow processes. In
 * contrast, if the process associated with bfqq is not slow, we
 * charge bfqq exactly with the service it has received.
 *
 * Charging time to the first type of queues and the exact service to
 * the other has the effect of using the WF2Q+ policy to schedule the
 * former on a timeslice basis, without violating service domain
 * guarantees among the latter.
 */
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason)
{
	bool slow;
	unsigned long delta = 0;
	struct bfq_entity *entity = &bfqq->entity;
	int ref;

	/*
	 * Check whether the process is slow (see bfq_bfqq_is_slow).
	 */
	slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);

	/*
	 * Increase service_from_backlogged before next statement,
	 * because the possible next invocation of
	 * bfq_bfqq_charge_time would likely inflate
	 * entity->service. In contrast, service_from_backlogged must
	 * contain real service, to enable the soft real-time
	 * heuristic to correctly compute the bandwidth consumed by
	 * bfqq.
	 */
	bfqq->service_from_backlogged += entity->service;

	/*
	 * As above explained, charge slow (typically seeky) and
	 * timed-out queues with the time and not the service
	 * received, to favor sequential workloads.
	 *
	 * Processes doing I/O in the slower disk zones will tend to
	 * be slow(er) even if not seeky. Therefore, since the
	 * estimated peak rate is actually an average over the disk
	 * surface, these processes may timeout just for bad luck. To
	 * avoid punishing them, do not charge time to processes that
	 * succeeded in consuming at least 2/3 of their budget. This
	 * allows BFQ to preserve enough elasticity to still perform
	 * bandwidth, and not time, distribution with little unlucky
	 * or quasi-sequential processes.
	 */
	if (bfqq->wr_coeff == 1 &&
	    (slow ||
	     (reason == BFQQE_BUDGET_TIMEOUT &&
	      bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
		bfq_bfqq_charge_time(bfqd, bfqq, delta);

	if (reason == BFQQE_TOO_IDLE &&
	    entity->service <= 2 * entity->budget / 10)
		bfq_clear_bfqq_IO_bound(bfqq);

	if (bfqd->low_latency && bfqq->wr_coeff == 1)
		bfqq->last_wr_start_finish = jiffies;

	if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
	    RB_EMPTY_ROOT(&bfqq->sort_list)) {
		/*
		 * If we get here, and there are no outstanding
		 * requests, then the request pattern is isochronous
		 * (see the comments on the function
		 * bfq_bfqq_softrt_next_start()). Thus we can compute
		 * soft_rt_next_start. If, instead, the queue still
		 * has outstanding requests, then we have to wait for
		 * the completion of all the outstanding requests to
		 * discover whether the request pattern is actually
		 * isochronous.
		 */
		if (bfqq->dispatched == 0)
			bfqq->soft_rt_next_start =
				bfq_bfqq_softrt_next_start(bfqd, bfqq);
		else {
			/*
			 * The application is still waiting for the
			 * completion of one or more requests:
			 * prevent it from possibly being incorrectly
			 * deemed as soft real-time by setting its
			 * soft_rt_next_start to infinity. In fact,
			 * without this assignment, the application
			 * would be incorrectly deemed as soft
			 * real-time if:
			 * 1) it issued a new request before the
			 *    completion of all its in-flight
			 *    requests, and
			 * 2) at that time, its soft_rt_next_start
			 *    happened to be in the past.
			 */
			bfqq->soft_rt_next_start =
				bfq_greatest_from_now();
			/*
			 * Schedule an update of soft_rt_next_start to when
			 * the task may be discovered to be isochronous.
			 */
			bfq_mark_bfqq_softrt_update(bfqq);
		}
	}

	bfq_log_bfqq(bfqd, bfqq,
		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));

	/*
	 * Increase, decrease or leave budget unchanged according to
	 * reason.
	 */
	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
	ref = bfqq->ref;
	__bfq_bfqq_expire(bfqd, bfqq);

	/* mark bfqq as waiting a request only if a bic still points to it */
	if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
	    reason != BFQQE_BUDGET_TIMEOUT &&
	    reason != BFQQE_BUDGET_EXHAUSTED)
		bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
}
/*
 * Budget timeout is not implemented through a dedicated timer, but
 * just checked on request arrivals and completions, as well as on
 * idle timer expirations.
 */
static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
{
	return time_is_before_eq_jiffies(bfqq->budget_timeout);
}
/*
 * If we expire a queue that is actively waiting (i.e., with the
 * device idled) for the arrival of a new request, then we may incur
 * the timestamp misalignment problem described in the body of the
 * function __bfq_activate_entity. Hence we return true only if this
 * condition does not hold, or if the queue is slow enough to deserve
 * only to be kicked off for preserving a high throughput.
 */
static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqq->bfqd, bfqq,
		"may_budget_timeout: wait_request %d left %d timeout %d",
			bfq_bfqq_wait_request(bfqq),
			bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
			bfq_bfqq_budget_timeout(bfqq));

	return (!bfq_bfqq_wait_request(bfqq) ||
		bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
		&&
		bfq_bfqq_budget_timeout(bfqq);
}
/*
 * For a queue that becomes empty, device idling is allowed only if
 * this function returns true for the queue. As a consequence, since
 * device idling plays a critical role in both throughput boosting and
 * service guarantees, the return value of this function plays a
 * critical role in both these aspects as well.
 *
 * In a nutshell, this function returns true only if idling is
 * beneficial for throughput or, even if detrimental for throughput,
 * idling is however necessary to preserve service guarantees (low
 * latency, desired throughput distribution, ...). In particular, on
 * NCQ-capable devices, this function tries to return false, so as to
 * help keep the drives' internal queues full, whenever this helps the
 * device boost the throughput without causing any service-guarantee
 * issue.
 *
 * In more detail, the return value of this function is obtained by,
 * first, computing a number of boolean variables that take into
 * account throughput and service-guarantee issues, and, then,
 * combining these variables in a logical expression. Most of the
 * issues taken into account are not trivial. We discuss these issues
 * individually while introducing the variables.
 */
static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	bool idling_boosts_thr, idling_boosts_thr_without_issues,
		asymmetric_scenario;

	if (bfqd->strict_guarantees)
		return true;

	/*
	 * The next variable takes into account the cases where idling
	 * boosts the throughput.
	 *
	 * The value of the variable is computed considering that
	 * idling is usually beneficial for the throughput if:
	 * (a) the device is not NCQ-capable, or
	 * (b) regardless of the presence of NCQ, the device is rotational
	 *     and the request pattern for bfqq is I/O-bound (possible
	 *     throughput losses caused by granting idling to seeky queues
	 *     are mitigated by the fact that, in all scenarios where
	 *     boosting throughput is the best thing to do, i.e., in all
	 *     symmetric scenarios, only a minimal idle time is allowed to
	 *     seeky queues).
	 *
	 * Secondly, and in contrast to the above item (b), idling an
	 * NCQ-capable flash-based device would not boost the
	 * throughput even with intense I/O; rather it would lower
	 * the throughput in proportion to how fast the device
	 * is. Accordingly, the next variable is true if any of the
	 * above conditions (a) and (b) is true, and, in particular,
	 * happens to be false if bfqd is an NCQ-capable flash-based
	 * device.
	 */
	idling_boosts_thr = !bfqd->hw_tag ||
		(!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq));

	/*
	 * The value of the next variable,
	 * idling_boosts_thr_without_issues, is equal to that of
	 * idling_boosts_thr, unless a special case holds. In this
	 * special case, described below, idling may cause problems to
	 * weight-raised queues.
	 *
	 * When the request pool is saturated (e.g., in the presence
	 * of write hogs), if the processes associated with
	 * non-weight-raised queues ask for requests at a lower rate,
	 * then processes associated with weight-raised queues have a
	 * higher probability to get a request from the pool
	 * immediately (or at least soon) when they need one. Thus
	 * they have a higher probability to actually get a fraction
	 * of the device throughput proportional to their high
	 * weight. This is especially true with NCQ-capable drives,
	 * which enqueue several requests in advance, and further
	 * reorder internally-queued requests.
	 *
	 * For this reason, we force to false the value of
	 * idling_boosts_thr_without_issues if there are weight-raised
	 * busy queues. In this case, and if bfqq is not weight-raised,
	 * this guarantees that the device is not idled for bfqq (if,
	 * instead, bfqq is weight-raised, then idling will be
	 * guaranteed by another variable, see below). Combined with
	 * the timestamping rules of BFQ (see [1] for details), this
	 * behavior causes bfqq, and hence any sync non-weight-raised
	 * queue, to get a lower number of requests served, and thus
	 * to ask for a lower number of requests from the request
	 * pool, before the busy weight-raised queues get served
	 * again. This often mitigates starvation problems in the
	 * presence of heavy write workloads and NCQ, thereby
	 * guaranteeing a higher application and system responsiveness
	 * in these hostile scenarios.
	 */
	idling_boosts_thr_without_issues = idling_boosts_thr &&
		bfqd->wr_busy_queues == 0;

	/*
	 * There is then a case where idling must be performed not
	 * for throughput concerns, but to preserve service
	 * guarantees.
	 *
	 * To introduce this case, we can note that allowing the drive
	 * to enqueue more than one request at a time, and hence
	 * delegating de facto final scheduling decisions to the
	 * drive's internal scheduler, entails loss of control on the
	 * actual request service order. In particular, the critical
	 * situation is when requests from different processes happen
	 * to be present, at the same time, in the internal queue(s)
	 * of the drive. In such a situation, the drive, by deciding
	 * the service order of the internally-queued requests, does
	 * determine also the actual throughput distribution among
	 * these processes. But the drive typically has no notion or
	 * concern about per-process throughput distribution, and
	 * makes its decisions only on a per-request basis. Therefore,
	 * the service distribution enforced by the drive's internal
	 * scheduler is likely to coincide with the desired
	 * device-throughput distribution only in a completely
	 * symmetric scenario where:
	 * (i)  each of these processes must get the same throughput as
	 *      the others;
	 * (ii) all these processes have the same I/O pattern
	 *      (either sequential or random).
	 * In fact, in such a scenario, the drive will tend to treat
	 * the requests of each of these processes in about the same
	 * way as the requests of the others, and thus to provide
	 * each of these processes with about the same throughput
	 * (which is exactly the desired throughput distribution). In
	 * contrast, in any asymmetric scenario, device idling is
	 * certainly needed to guarantee that bfqq receives its
	 * assigned fraction of the device throughput (see [1] for
	 * details).
	 *
	 * We address this issue by controlling, actually, only the
	 * symmetry sub-condition (i), i.e., provided that
	 * sub-condition (i) holds, idling is not performed,
	 * regardless of whether sub-condition (ii) holds. In other
	 * words, only if sub-condition (i) holds, then idling is
	 * allowed, and the device tends to be prevented from queueing
	 * many requests, possibly of several processes. The reason
	 * for not controlling also sub-condition (ii) is that we
	 * exploit preemption to preserve guarantees in case of
	 * symmetric scenarios, even if (ii) does not hold, as
	 * explained in the next two paragraphs.
	 *
	 * Even if a queue, say Q, is expired when it remains idle, Q
	 * can still preempt the new in-service queue if the next
	 * request of Q arrives soon (see the comments on
	 * bfq_bfqq_update_budg_for_activation). If all queues and
	 * groups have the same weight, this form of preemption,
	 * combined with the hole-recovery heuristic described in the
	 * comments on function bfq_bfqq_update_budg_for_activation,
	 * are enough to preserve a correct bandwidth distribution in
	 * the mid term, even without idling. In fact, even if not
	 * idling allows the internal queues of the device to contain
	 * many requests, and thus to reorder requests, we can rather
	 * safely assume that the internal scheduler still preserves a
	 * minimum of mid-term fairness. The motivation for using
	 * preemption instead of idling is that, by not idling,
	 * service guarantees are preserved without minimally
	 * sacrificing throughput. In other words, both a high
	 * throughput and its desired distribution are obtained.
	 *
	 * More precisely, this preemption-based, idleless approach
	 * provides fairness in terms of IOPS, and not sectors per
	 * second. This can be seen with a simple example. Suppose
	 * that there are two queues with the same weight, but that
	 * the first queue receives requests of 8 sectors, while the
	 * second queue receives requests of 1024 sectors. In
	 * addition, suppose that each of the two queues contains at
	 * most one request at a time, which implies that each queue
	 * always remains idle after it is served. Finally, after
	 * remaining idle, each queue receives very quickly a new
	 * request. It follows that the two queues are served
	 * alternatively, preempting each other if needed. This
	 * implies that, although both queues have the same weight,
	 * the queue with large requests receives a service that is
	 * 1024/8 times as high as the service received by the other
	 * queue.
	 *
	 * On the other hand, device idling is performed, and thus
	 * pure sector-domain guarantees are provided, for the
	 * following queues, which are likely to need stronger
	 * throughput guarantees: weight-raised queues, and queues
	 * with a higher weight than other queues. When such queues
	 * are active, sub-condition (i) is false, which triggers
	 * device idling.
	 *
	 * According to the above considerations, the next variable is
	 * true (only) if sub-condition (i) holds. To compute the
	 * value of this variable, we not only use the return value of
	 * the function bfq_symmetric_scenario(), but also check
	 * whether bfqq is being weight-raised, because
	 * bfq_symmetric_scenario() does not take into account also
	 * weight-raised queues (see comments on
	 * bfq_weights_tree_add()).
	 *
	 * As a side note, it is worth considering that the above
	 * device-idling countermeasures may however fail in the
	 * following unlucky scenario: if idling is (correctly)
	 * disabled in a time period during which all symmetry
	 * sub-conditions hold, and hence the device is allowed to
	 * enqueue many requests, but at some later point in time some
	 * sub-condition stops to hold, then it may become impossible
	 * to let requests be served in the desired order until all
	 * the requests already queued in the device have been served.
	 */
	asymmetric_scenario = bfqq->wr_coeff > 1 ||
		!bfq_symmetric_scenario(bfqd);

	/*
	 * We have now all the components we need to compute the return
	 * value of the function, which is true only if both the following
	 * conditions hold:
	 * 1) bfqq is sync, because idling makes sense only for sync queues;
	 * 2) idling either boosts the throughput (without issues), or
	 *    is necessary to preserve service guarantees.
	 */
	return bfq_bfqq_sync(bfqq) &&
		(idling_boosts_thr_without_issues || asymmetric_scenario);
}
/*
 * If the in-service queue is empty but the function bfq_bfqq_may_idle
 * returns true, then:
 * 1) the queue must remain in service and cannot be expired, and
 * 2) the device must be idled to wait for the possible arrival of a new
 *    request for the queue.
 * See the comments on the function bfq_bfqq_may_idle for the reasons
 * why performing device idling is the best choice to boost the throughput
 * and preserve service guarantees when bfq_bfqq_may_idle itself
 * returns true.
 */
static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;

	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
	       bfq_bfqq_may_idle(bfqq);
}
/*
 * Select a queue for service. If we have a current queue in service,
 * check whether to continue servicing it, or retrieve and set a new one.
 */
static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq;
	struct request *next_rq;
	enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;

	bfqq = bfqd->in_service_queue;
	if (!bfqq)
		goto new_queue;

	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

	if (bfq_may_expire_for_budg_timeout(bfqq) &&
	    !bfq_bfqq_wait_request(bfqq) &&
	    !bfq_bfqq_must_idle(bfqq))
		goto expire;

check_queue:
	/*
	 * This loop is rarely executed more than once. Even when it
	 * happens, it is much more convenient to re-execute this loop
	 * than to return NULL and trigger a new dispatch to get a
	 * request served.
	 */
	next_rq = bfqq->next_rq;
	/*
	 * If bfqq has requests queued and it has enough budget left to
	 * serve them, keep the queue, otherwise expire it.
	 */
	if (next_rq) {
		if (bfq_serv_to_charge(next_rq, bfqq) >
			bfq_bfqq_budget_left(bfqq)) {
			/*
			 * Expire the queue for budget exhaustion,
			 * which makes sure that the next budget is
			 * enough to serve the next request, even if
			 * it comes from the fifo expired path.
			 */
			reason = BFQQE_BUDGET_EXHAUSTED;
			goto expire;
		} else {
			/*
			 * The idle timer may be pending because we may
			 * not disable disk idling even when a new request
			 * arrives.
			 */
			if (bfq_bfqq_wait_request(bfqq)) {
				/*
				 * If we get here: 1) at least a new request
				 * has arrived but we have not disabled the
				 * timer because the request was too small,
				 * 2) then the block layer has unplugged
				 * the device, causing the dispatch to be
				 * invoked.
				 *
				 * Since the device is unplugged, now the
				 * requests are probably large enough to
				 * provide a reasonable throughput.
				 * So we disable idling.
				 */
				bfq_clear_bfqq_wait_request(bfqq);
				hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
				bfqg_stats_update_idle_time(bfqq_group(bfqq));
			}
			goto keep_queue;
		}
	}

	/*
	 * No requests pending. However, if the in-service queue is idling
	 * for a new request, or has requests waiting for a completion and
	 * may idle after their completion, then keep it anyway.
	 */
	if (bfq_bfqq_wait_request(bfqq) ||
	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
		bfqq = NULL;
		goto keep_queue;
	}

	reason = BFQQE_NO_MORE_REQUESTS;
expire:
	bfq_bfqq_expire(bfqd, bfqq, false, reason);
new_queue:
	bfqq = bfq_set_in_service_queue(bfqd);
	if (bfqq) {
		bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
		goto check_queue;
	}
keep_queue:
	if (bfqq)
		bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
	else
		bfq_log(bfqd, "select_queue: no queue returned");
	return bfqq;
}

static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
		bfq_log_bfqq(bfqd, bfqq,
			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
			jiffies_to_msecs(bfqq->wr_cur_max_time),
			bfqq->wr_coeff,
			bfqq->entity.weight, bfqq->entity.orig_weight);

		if (entity->prio_changed)
			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");

		/*
		 * If too much time has elapsed from the beginning of
		 * this weight-raising period, then end weight raising.
		 */
		if (time_is_before_jiffies(bfqq->last_wr_start_finish +
					   bfqq->wr_cur_max_time)) {
			if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
			time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
					       bfq_wr_duration(bfqd)))
				bfq_bfqq_end_wr(bfqq);
			else {
				/* switch back to interactive wr */
				bfqq->wr_coeff = bfqd->bfq_wr_coeff;
				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
				bfqq->last_wr_start_finish =
					bfqq->wr_start_at_switch_to_srt;
				bfqq->entity.prio_changed = 1;
			}
		}
	}
	/* Update weight both if it must be raised and if it must be lowered */
	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
		__bfq_entity_update_weight_prio(
			bfq_entity_service_tree(entity),
			entity);
}
/*
 * Dispatch next request from bfqq.
 */
static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
						 struct bfq_queue *bfqq)
{
	struct request *rq = bfqq->next_rq;
	unsigned long service_to_charge;

	service_to_charge = bfq_serv_to_charge(rq, bfqq);

	bfq_bfqq_served(bfqq, service_to_charge);

	bfq_dispatch_remove(bfqd->queue, rq);

	/*
	 * If weight raising has to terminate for bfqq, then next
	 * function causes an immediate update of bfqq's weight,
	 * without waiting for next activation. As a consequence, on
	 * expiration, bfqq will be timestamped as if it had never been
	 * weight-raised during this service slot, even if it has
	 * received part or even most of the service as a
	 * weight-raised queue. This inflates bfqq's timestamps, which
	 * is beneficial, as bfqq is then more willing to leave the
	 * device immediately to possible other weight-raised queues.
	 */
	bfq_update_wr_data(bfqd, bfqq);

	if (!bfqd->in_service_bic) {
		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
		bfqd->in_service_bic = RQ_BIC(rq);
	}

	/*
	 * Expire bfqq, pretending that its budget expired, if bfqq
	 * belongs to CLASS_IDLE and other queues are waiting for
	 * service.
	 */
	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
		goto expire;

	return rq;

expire:
	bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
	return rq;
}
static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;

	/*
	 * Avoiding lock: a race on bfqd->busy_queues should cause at
	 * most a call to dispatch for nothing
	 */
	return !list_empty_careful(&bfqd->dispatch) ||
		bfqd->busy_queues > 0;
}
static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq = NULL;
	struct bfq_queue *bfqq = NULL;

	if (!list_empty(&bfqd->dispatch)) {
		rq = list_first_entry(&bfqd->dispatch, struct request,
				      queuelist);
		list_del_init(&rq->queuelist);

		bfqq = RQ_BFQQ(rq);

		if (bfqq) {
			/*
			 * Increment counters here, because this
			 * dispatch does not follow the standard
			 * dispatch flow (where counters are
			 * incremented)
			 */
			bfqq->dispatched++;

			goto inc_in_driver_start_rq;
		}

		/*
		 * We exploit the put_rq_private hook to decrement
		 * rq_in_driver, but put_rq_private will not be
		 * invoked on this request. So, to avoid unbalance,
		 * just start this request, without incrementing
		 * rq_in_driver. As a negative consequence,
		 * rq_in_driver is deceptively lower than it should be
		 * while this request is in service. This may cause
		 * bfq_schedule_dispatch to be invoked uselessly.
		 *
		 * As for implementing an exact solution, the
		 * put_request hook, if defined, is probably invoked
		 * also on this request. So, by exploiting this hook,
		 * we could 1) increment rq_in_driver here, and 2)
		 * decrement it in put_request. Such a solution would
		 * let the value of the counter be always accurate,
		 * but it would entail using an extra interface
		 * function. This cost seems higher than the benefit,
		 * being the frequency of non-elevator-private
		 * requests very low.
		 */
		goto start_rq;
	}

	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);

	if (bfqd->busy_queues == 0)
		goto exit;

	/*
	 * Force device to serve one request at a time if
	 * strict_guarantees is true. Forcing this service scheme is
	 * currently the ONLY way to guarantee that the request
	 * service order enforced by the scheduler is respected by a
	 * queueing device. Otherwise the device is free even to make
	 * some unlucky request wait for as long as the device
	 * wishes.
	 *
	 * Of course, serving one request at a time may cause loss of
	 * throughput.
	 */
	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
		goto exit;

	bfqq = bfq_select_queue(bfqd);
	if (!bfqq)
		goto exit;

	rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);

	if (rq) {
inc_in_driver_start_rq:
		bfqd->rq_in_driver++;
start_rq:
		rq->rq_flags |= RQF_STARTED;
	}
exit:
	return rq;
}
static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq;

	spin_lock_irq(&bfqd->lock);

	rq = __bfq_dispatch_request(hctx);
	bfq_unlock_put_ioc(bfqd);

	return rq;
}
/*
 * Task holds one reference to the queue, dropped when task exits. Each rq
 * in-flight on this queue also holds a reference, dropped when rq is freed.
 *
 * Scheduler lock must be held here. Recall not to use bfqq after calling
 * this function on it.
 */
static void bfq_put_queue(struct bfq_queue *bfqq)
{
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_group *bfqg = bfqq_group(bfqq);
#endif

	if (bfqq->bfqd)
		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
			     bfqq, bfqq->ref);

	bfqq->ref--;
	if (bfqq->ref)
		return;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);

	kmem_cache_free(bfq_pool, bfqq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	bfqg_put(bfqg);
#endif
}
static void bfq_put_cooperator(struct bfq_queue *bfqq)
{
	struct bfq_queue *__bfqq, *next;

	/*
	 * If this queue was scheduled to merge with another queue, be
	 * sure to drop the reference taken on that queue (and others in
	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
	 */
	__bfqq = bfqq->new_bfqq;
	while (__bfqq) {
		if (__bfqq == bfqq)
			break;
		next = __bfqq->new_bfqq;
		bfq_put_queue(__bfqq);
		__bfqq = next;
	}
}
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	if (bfqq == bfqd->in_service_queue) {
		__bfq_bfqq_expire(bfqd, bfqq);
		bfq_schedule_dispatch(bfqd);
	}

	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);

	bfq_put_cooperator(bfqq);

	bfq_put_queue(bfqq); /* release process reference */
}
static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
	struct bfq_data *bfqd;

	if (bfqq)
		bfqd = bfqq->bfqd; /* NULL if scheduler already exited */

	if (bfqq && bfqd) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);
		/*
		 * If the bic is using a shared queue, put the
		 * reference taken on the io_context when the bic
		 * started using a shared bfq_queue. This put cannot
		 * make ioc->ref_count reach 0, then no ioc->lock
		 * risks to be taken (leading to possible deadlock
		 * scenarios).
		 */
		if (is_sync && bfq_bfqq_coop(bfqq))
			put_io_context(bic->icq.ioc);

		bfq_exit_bfqq(bfqd, bfqq);
		bic_set_bfqq(bic, NULL, is_sync);
		bfq_unlock_put_ioc_restore(bfqd, flags);
	}
}
static void bfq_exit_icq(struct io_cq *icq)
{
	struct bfq_io_cq *bic = icq_to_bic(icq);

	bfq_exit_icq_bfqq(bic, true);
	bfq_exit_icq_bfqq(bic, false);
}
/*
 * Update the entity prio values; note that the new values will not
 * be used until the next (re)activation.
 */
static void
bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
{
	struct task_struct *tsk = current;
	int ioprio_class;
	struct bfq_data *bfqd = bfqq->bfqd;

	if (!bfqd)
		return;

	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	switch (ioprio_class) {
	default:
		dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
			"bfq: bad prio class %d\n", ioprio_class);
	case IOPRIO_CLASS_NONE:
		/*
		 * No prio set, inherit CPU scheduling settings.
		 */
		bfqq->new_ioprio = task_nice_ioprio(tsk);
		bfqq->new_ioprio_class = task_nice_ioclass(tsk);
		break;
	case IOPRIO_CLASS_RT:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
		break;
	case IOPRIO_CLASS_BE:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
		break;
	case IOPRIO_CLASS_IDLE:
		bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
		bfqq->new_ioprio = 7;
		bfq_clear_bfqq_idle_window(bfqq);
		break;
	}

	if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
		pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
			bfqq->new_ioprio);
		bfqq->new_ioprio = IOPRIO_BE_NR;
	}

	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
	bfqq->entity.prio_changed = 1;
}
static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
{
	struct bfq_data *bfqd = bic_to_bfqd(bic);
	struct bfq_queue *bfqq;
	int ioprio = bic->icq.ioc->ioprio;

	/*
	 * This condition may trigger on a newly created bic, be sure to
	 * drop the lock before returning.
	 */
	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
		return;

	bic->ioprio = ioprio;

	bfqq = bic_to_bfqq(bic, false);
	if (bfqq) {
		/* release process reference on this queue */
		bfq_put_queue(bfqq);
		bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
		bic_set_bfqq(bic, bfqq, false);
	}

	bfqq = bic_to_bfqq(bic, true);
	if (bfqq)
		bfq_set_next_ioprio_data(bfqq, bic);
}
static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_io_cq *bic, pid_t pid, int is_sync)
{
	RB_CLEAR_NODE(&bfqq->entity.rb_node);
	INIT_LIST_HEAD(&bfqq->fifo);

	bfqq->ref = 0;
	bfqq->bfqd = bfqd;

	if (bic)
		bfq_set_next_ioprio_data(bfqq, bic);

	if (is_sync) {
		if (!bfq_class_idle(bfqq))
			bfq_mark_bfqq_idle_window(bfqq);
		bfq_mark_bfqq_sync(bfqq);
	} else
		bfq_clear_bfqq_sync(bfqq);

	/* set end request to minus infinity from now */
	bfqq->ttime.last_end_request = ktime_get_ns() + 1;

	bfq_mark_bfqq_IO_bound(bfqq);

	bfqq->pid = pid;

	/* Tentative initial value to trade off between thr and lat */
	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
	bfqq->budget_timeout = bfq_smallest_from_now();

	bfqq->wr_coeff = 1;
	bfqq->last_wr_start_finish = jiffies;
	bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
	bfqq->split_time = bfq_smallest_from_now();

	/*
	 * Set to the value for which bfqq will not be deemed as
	 * soft rt when it becomes backlogged.
	 */
	bfqq->soft_rt_next_start = bfq_greatest_from_now();

	/* first request is almost certainly seeky */
	bfqq->seek_history = 1;
}
static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
					       struct bfq_group *bfqg,
					       int ioprio_class, int ioprio)
{
	switch (ioprio_class) {
	case IOPRIO_CLASS_RT:
		return &bfqg->async_bfqq[0][ioprio];
	case IOPRIO_CLASS_NONE:
		ioprio = IOPRIO_NORM;
		/* fall through */
	case IOPRIO_CLASS_BE:
		return &bfqg->async_bfqq[1][ioprio];
	case IOPRIO_CLASS_IDLE:
		return &bfqg->async_idle_bfqq;
	default:
		return NULL;
	}
}
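/*
 * Note on the lookup above: async queues are cached per group in a
 * two-row matrix, with row 0 holding the RT priority levels and row 1
 * the BE levels (IOPRIO_CLASS_NONE is folded into BE at IOPRIO_NORM),
 * while the idle class gets a single dedicated queue, since all
 * idle-class I/O can share one queue.
 */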
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic)
{
	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	struct bfq_queue **async_bfqq = NULL;
	struct bfq_queue *bfqq;
	struct bfq_group *bfqg;

	rcu_read_lock();

	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
	if (!bfqg) {
		bfqq = &bfqd->oom_bfqq;
		goto out;
	}

	if (!is_sync) {
		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
						  ioprio);
		bfqq = *async_bfqq;
		if (bfqq)
			goto out;
	}

	bfqq = kmem_cache_alloc_node(bfq_pool,
				     GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
				     bfqd->queue->node);

	if (bfqq) {
		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
			      is_sync);
		bfq_init_entity(&bfqq->entity, bfqg);
		bfq_log_bfqq(bfqd, bfqq, "allocated");
	} else {
		bfqq = &bfqd->oom_bfqq;
		bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
		goto out;
	}

	/*
	 * Pin the queue now that it's allocated, scheduler exit will
	 * prune it.
	 */
	if (async_bfqq) {
		bfqq->ref++; /*
			      * Extra group reference, w.r.t. sync
			      * queue. This extra reference is removed
			      * only if bfqq->bfqg disappears, to
			      * guarantee that this queue is not freed
			      * until its group goes away.
			      */
		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
			     bfqq, bfqq->ref);
		*async_bfqq = bfqq;
	}

out:
	bfqq->ref++; /* get a process reference to this queue */
	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
	rcu_read_unlock();
	return bfqq;
}
static void bfq_update_io_thinktime(struct bfq_data *bfqd,
				    struct bfq_queue *bfqq)
{
	struct bfq_ttime *ttime = &bfqq->ttime;
	u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;

	elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);

	ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
	ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
				     ttime->ttime_samples);
}
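/*
 * A worked example of the decayed averages above: with decay ratio 7/8,
 * ttime_samples converges to the fixed point of s = (7*s + 256)/8,
 * i.e., 256, and ttime_total converges to 256 times the steady-state
 * think time. The mean, (ttime_total + 128)/ttime_samples, then tends
 * to the recent per-request think time itself, with the +128 term
 * providing rounding to nearest. For instance, a process that settles
 * on a 2 ms think time ends up with ttime_total close to 256 * 2 ms
 * and ttime_mean close to 2 ms.
 */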
static void
bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		       struct request *rq)
{
	bfqq->seek_history <<= 1;
	bfqq->seek_history |=
		get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
		(!blk_queue_nonrot(bfqd->queue) ||
		 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
}
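/*
 * The seek history updated above is a shift register of the outcomes of
 * the most recent seek-distance checks: each enqueue shifts in one bit,
 * set iff the new request lies far from the previous one (and, on
 * non-rotational devices, is small enough for the distance to matter).
 * The BFQQ_SEEKY() predicate then classifies the queue by the number of
 * bits set in this register, so a single isolated seek does not flip
 * the classification.
 */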
/*
 * Disable idle window if the process thinks too long or seeks so much that
 * it doesn't matter.
 */
static void bfq_update_idle_window(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq,
				   struct bfq_io_cq *bic)
{
	int enable_idle;

	/* Don't idle for async or idle io prio class. */
	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
		return;

	/* Idle window just restored, statistics are meaningless. */
	if (time_is_after_eq_jiffies(bfqq->split_time +
				     bfqd->bfq_wr_min_idle_time))
		return;

	enable_idle = bfq_bfqq_idle_window(bfqq);

	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
	    bfqd->bfq_slice_idle == 0 ||
	    (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
	     bfqq->wr_coeff == 1))
		enable_idle = 0;
	else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
		if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle &&
		    bfqq->wr_coeff == 1)
			enable_idle = 0;
		else
			enable_idle = 1;
	}
	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
		     enable_idle);

	if (enable_idle)
		bfq_mark_bfqq_idle_window(bfqq);
	else
		bfq_clear_bfqq_idle_window(bfqq);
}
/*
 * Called when a new fs request (rq) is added to bfqq.  Check if there's
 * something we should do about it.
 */
static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			    struct request *rq)
{
	struct bfq_io_cq *bic = RQ_BIC(rq);

	if (rq->cmd_flags & REQ_META)
		bfqq->meta_pending++;

	bfq_update_io_thinktime(bfqd, bfqq);
	bfq_update_io_seektime(bfqd, bfqq, rq);
	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
	    !BFQQ_SEEKY(bfqq))
		bfq_update_idle_window(bfqd, bfqq, bic);

	bfq_log_bfqq(bfqd, bfqq,
		     "rq_enqueued: idle_window=%d (seeky %d)",
		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));

	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
		bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
				 blk_rq_sectors(rq) < 32;
		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);

		/*
		 * There is just this request queued: if the request
		 * is small and the queue is not to be expired, then
		 * just exit.
		 *
		 * In this way, if the device is being idled to wait
		 * for a new request from the in-service queue, we
		 * avoid unplugging the device and committing the
		 * device to serve just a small request. On the
		 * contrary, we wait for the block layer to decide
		 * when to unplug the device: hopefully, new requests
		 * will be merged to this one quickly, then the device
		 * will be unplugged and larger requests will be
		 * dispatched.
		 */
		if (small_req && !budget_timeout)
			return;

		/*
		 * A large enough request arrived, or the queue is to
		 * be expired: in both cases disk idling is to be
		 * stopped, so clear wait_request flag and reset
		 * timer.
		 */
		bfq_clear_bfqq_wait_request(bfqq);
		hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
		bfqg_stats_update_idle_time(bfqq_group(bfqq));

		/*
		 * The queue is not empty, because a new request just
		 * arrived. Hence we can safely expire the queue, in
		 * case of budget timeout, without risking that the
		 * timestamps of the queue are not updated correctly.
		 * See [1] for more details.
		 */
		if (budget_timeout)
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_BUDGET_TIMEOUT);
	}
}
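/*
 * Note on the small_req threshold used above: blk_rq_sectors() counts
 * 512-byte sectors, so blk_rq_sectors(rq) < 32 means the request is
 * smaller than 16 KiB, a size for which committing the device is
 * usually not worthwhile if more I/O may be merged shortly.
 */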
static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq),
		*new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);

	if (new_bfqq) {
		if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
			new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
		/*
		 * Release the request's reference to the old bfqq
		 * and make sure one is taken to the shared queue.
		 */
		new_bfqq->allocated++;
		bfqq->allocated--;
		new_bfqq->ref++;
		/*
		 * If the bic associated with the process
		 * issuing this request still points to bfqq
		 * (and thus has not been already redirected
		 * to new_bfqq or even some other bfq_queue),
		 * then complete the merge and redirect it to
		 * new_bfqq.
		 */
		if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
			bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
					bfqq, new_bfqq);
		/*
		 * rq is about to be enqueued into new_bfqq,
		 * release rq reference on bfqq
		 */
		bfq_put_queue(bfqq);
		rq->elv.priv[1] = new_bfqq;
		bfqq = new_bfqq;
	}

	bfq_add_request(rq);

	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
	list_add_tail(&rq->queuelist, &bfqq->fifo);

	bfq_rq_enqueued(bfqd, bfqq, rq);
}
static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
			       bool at_head)
{
	struct request_queue *q = hctx->queue;
	struct bfq_data *bfqd = q->elevator->elevator_data;

	spin_lock_irq(&bfqd->lock);
	if (blk_mq_sched_try_insert_merge(q, rq)) {
		spin_unlock_irq(&bfqd->lock);
		return;
	}

	spin_unlock_irq(&bfqd->lock);

	blk_mq_sched_request_inserted(rq);

	spin_lock_irq(&bfqd->lock);
	if (at_head || blk_rq_is_passthrough(rq)) {
		if (at_head)
			list_add(&rq->queuelist, &bfqd->dispatch);
		else
			list_add_tail(&rq->queuelist, &bfqd->dispatch);
	} else {
		__bfq_insert_request(bfqd, rq);

		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}
	}

	bfq_unlock_put_ioc(bfqd);
}
static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
				struct list_head *list, bool at_head)
{
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		bfq_insert_request(hctx, rq, at_head);
	}
}
static void bfq_update_hw_tag(struct bfq_data *bfqd)
{
	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
				       bfqd->rq_in_driver);

	if (bfqd->hw_tag == 1)
		return;

	/*
	 * This sample is valid if the number of outstanding requests
	 * is large enough to allow a queueing behavior.  Note that the
	 * sum is not exact, as it's not taking into account deactivated
	 * requests.
	 */
	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
		return;

	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
		return;

	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
	bfqd->max_rq_in_driver = 0;
	bfqd->hw_tag_samples = 0;
}
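/*
 * In short, the detection above declares the drive queueing-capable
 * (hw_tag = 1) only after a window of BFQ_HW_QUEUE_SAMPLES valid
 * samples during which the driver queue depth exceeded
 * BFQ_HW_QUEUE_THRESHOLD at least once; a sample counts as valid only
 * when enough requests are outstanding overall, so an idle system
 * cannot bias the verdict.
 */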
static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
{
	u64 now_ns;
	u32 delta_us;

	bfq_update_hw_tag(bfqd);

	bfqd->rq_in_driver--;
	bfqq->dispatched--;

	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
		/*
		 * Set budget_timeout (which we overload to store the
		 * time at which the queue remains with no backlog and
		 * no outstanding request; used by the weight-raising
		 * mechanism).
		 */
		bfqq->budget_timeout = jiffies;

		bfq_weights_tree_remove(bfqd, &bfqq->entity,
					&bfqd->queue_weights_tree);
	}

	now_ns = ktime_get_ns();

	bfqq->ttime.last_end_request = now_ns;

	/*
	 * Using us instead of ns, to get a reasonable precision in
	 * computing rate in next check.
	 */
	delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);

	/*
	 * If the request took rather long to complete, and, according
	 * to the maximum request size recorded, this completion latency
	 * implies that the request was certainly served at a very low
	 * rate (less than 1M sectors/sec), then the whole observation
	 * interval that lasts up to this time instant cannot be a
	 * valid time interval for computing a new peak rate.  Invoke
	 * bfq_update_rate_reset to have the following three steps
	 * taken:
	 * - close the observation interval at the last (previous)
	 *   request dispatch or completion
	 * - compute rate, if possible, for that observation interval
	 * - reset to zero samples, which will trigger a proper
	 *   re-initialization of the observation interval on next
	 *   dispatch
	 */
	if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
	   (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
			1UL<<(BFQ_RATE_SHIFT - 10))
		bfq_update_rate_reset(bfqd, NULL);
	bfqd->last_completion = now_ns;

	/*
	 * If we are waiting to discover whether the request pattern
	 * of the task associated with the queue is actually
	 * isochronous, and both requisites for this condition to hold
	 * are now satisfied, then compute soft_rt_next_start (see the
	 * comments on the function bfq_bfqq_softrt_next_start()). We
	 * schedule this delayed check when bfqq expires, if it still
	 * has in-flight requests.
	 */
	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
	    RB_EMPTY_ROOT(&bfqq->sort_list))
		bfqq->soft_rt_next_start =
			bfq_bfqq_softrt_next_start(bfqd, bfqq);

	/*
	 * If this is the in-service queue, check if it needs to be expired,
	 * or if we want to idle in case it has no pending requests.
	 */
	if (bfqd->in_service_queue == bfqq) {
		if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
			bfq_arm_slice_timer(bfqd);
			return;
		} else if (bfq_may_expire_for_budg_timeout(bfqq))
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_BUDGET_TIMEOUT);
		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
			 (bfqq->dispatched == 0 ||
			  !bfq_bfqq_may_idle(bfqq)))
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_NO_MORE_REQUESTS);
	}
}
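/*
 * Arithmetic behind the "1M sectors/sec" check above: rates are kept
 * left-shifted by BFQ_RATE_SHIFT in sectors/usec, so the right-hand
 * side, 1UL<<(BFQ_RATE_SHIFT - 10), corresponds to 2^-10 sectors/usec,
 * i.e., about 10^6 sectors/sec, or roughly 500 MB/s with 512-byte
 * sectors. Completions slower than that, over a long enough interval,
 * cannot belong to a valid peak-rate sample.
 */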
static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
{
	bfqq->allocated--;

	bfq_put_queue(bfqq);
}
static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;

	if (rq->rq_flags & RQF_STARTED)
		bfqg_stats_update_completion(bfqq_group(bfqq),
					     rq_start_time_ns(rq),
					     rq_io_start_time_ns(rq),
					     rq->cmd_flags);

	if (likely(rq->rq_flags & RQF_STARTED)) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);

		bfq_completed_request(bfqq, bfqd);
		bfq_put_rq_priv_body(bfqq);

		bfq_unlock_put_ioc_restore(bfqd, flags);
	} else {
		/*
		 * Request rq may be still/already in the scheduler,
		 * in which case we need to remove it. And we cannot
		 * defer such a check and removal, to avoid
		 * inconsistencies in the time interval from the end
		 * of this function to the start of the deferred work.
		 * This situation seems to occur only in process
		 * context, as a consequence of a merge. In the
		 * current version of the code, this implies that the
		 * lock is held.
		 */

		if (!RB_EMPTY_NODE(&rq->rb_node))
			bfq_remove_request(q, rq);
		bfq_put_rq_priv_body(bfqq);
	}

	rq->elv.priv[0] = NULL;
	rq->elv.priv[1] = NULL;
}
/*
 * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
 * was the last process referring to that bfqq.
 */
static struct bfq_queue *
bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");

	if (bfqq_process_refs(bfqq) == 1) {
		bfqq->pid = current->pid;
		bfq_clear_bfqq_coop(bfqq);
		bfq_clear_bfqq_split_coop(bfqq);
		return bfqq;
	}

	bic_set_bfqq(bic, NULL, 1);

	bfq_put_cooperator(bfqq);

	bfq_put_queue(bfqq);
	return NULL;
}
static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
						   struct bfq_io_cq *bic,
						   struct bio *bio,
						   bool split, bool is_sync,
						   bool *new_queue)
{
	struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);

	if (likely(bfqq && bfqq != &bfqd->oom_bfqq))
		return bfqq;

	if (new_queue)
		*new_queue = true;

	if (bfqq)
		bfq_put_queue(bfqq);
	bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);

	bic_set_bfqq(bic, bfqq, is_sync);
	if (split && is_sync)
		bfqq->split_time = jiffies;

	return bfqq;
}
/*
 * Allocate bfq data structures associated with this request.
 */
static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
			      struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
	const int is_sync = rq_is_sync(rq);
	struct bfq_queue *bfqq;
	bool new_queue = false;

	spin_lock_irq(&bfqd->lock);

	bfq_check_ioprio_change(bic, bio);

	if (!bic)
		goto queue_fail;

	bfq_bic_update_cgroup(bic, bio);

	bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
					 &new_queue);

	if (likely(!new_queue)) {
		/* If the queue was seeky for too long, break it apart. */
		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
			bfqq = bfq_split_bfqq(bic, bfqq);
			/*
			 * A reference to bic->icq.ioc needs to be
			 * released after a queue split. Do not do it
			 * immediately, to not risk to possibly take
			 * an ioc->lock while holding the scheduler
			 * lock.
			 */
			bfqd->ioc_to_put = bic->icq.ioc;

			if (!bfqq)
				bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
								 true, is_sync,
								 NULL);
		}
	}

	bfqq->allocated++;
	bfqq->ref++;
	bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
		     rq, bfqq, bfqq->ref);

	rq->elv.priv[0] = bic;
	rq->elv.priv[1] = bfqq;

	/*
	 * If a bfq_queue has only one process reference, it is owned
	 * by only this bic: we can then set bfqq->bic = bic. in
	 * addition, if the queue has also just been split, we have to
	 * resume its state.
	 */
	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
		bfqq->bic = bic;
		if (bfqd->ioc_to_put) { /* if true, there has been a split */
			/*
			 * The queue has just been split from a shared
			 * queue: restore the idle window and the
			 * possible weight raising period.
			 */
			bfq_bfqq_resume_state(bfqq, bic);
		}
	}

	bfq_unlock_put_ioc(bfqd);

	return 0;

queue_fail:
	spin_unlock_irq(&bfqd->lock);

	return 1;
}
static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	enum bfqq_expiration reason;
	unsigned long flags;

	spin_lock_irqsave(&bfqd->lock, flags);
	bfq_clear_bfqq_wait_request(bfqq);

	if (bfqq != bfqd->in_service_queue) {
		spin_unlock_irqrestore(&bfqd->lock, flags);
		return;
	}

	if (bfq_bfqq_budget_timeout(bfqq))
		/*
		 * Also here the queue can be safely expired
		 * for budget timeout without wasting
		 * guarantees
		 */
		reason = BFQQE_BUDGET_TIMEOUT;
	else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
		/*
		 * The queue may not be empty upon timer expiration,
		 * because we may not disable the timer when the
		 * first request of the in-service queue arrives
		 * during disk idling.
		 */
		reason = BFQQE_TOO_IDLE;
	else
		goto schedule_dispatch;

	bfq_bfqq_expire(bfqd, bfqq, true, reason);

schedule_dispatch:
	bfq_unlock_put_ioc_restore(bfqd, flags);
	bfq_schedule_dispatch(bfqd);
}
/*
 * Handler of the expiration of the timer running if the in-service queue
 * is idling inside its time slice.
 */
static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
{
	struct bfq_data *bfqd = container_of(timer, struct bfq_data,
					     idle_slice_timer);
	struct bfq_queue *bfqq = bfqd->in_service_queue;

	/*
	 * Theoretical race here: the in-service queue can be NULL or
	 * different from the queue that was idling if a new request
	 * arrives for the current queue and there is a full dispatch
	 * cycle that changes the in-service queue.  This can hardly
	 * happen, but in the worst case we just expire a queue too
	 * early.
	 */
	if (bfqq)
		bfq_idle_slice_timer_body(bfqq);

	return HRTIMER_NORESTART;
}
static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
				 struct bfq_queue **bfqq_ptr)
{
	struct bfq_queue *bfqq = *bfqq_ptr;

	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
	if (bfqq) {
		bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);

		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
			     bfqq, bfqq->ref);
		bfq_put_queue(bfqq);
		*bfqq_ptr = NULL;
	}
}
/*
 * Release all the bfqg references to its async queues.  If we are
 * deallocating the group these queues may still contain requests, so
 * we reparent them to the root cgroup (i.e., the only one that will
 * exist for sure until all the requests on a device are gone).
 */
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
{
	int i, j;

	for (i = 0; i < 2; i++)
		for (j = 0; j < IOPRIO_BE_NR; j++)
			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);

	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
}
static void bfq_exit_queue(struct elevator_queue *e)
{
	struct bfq_data *bfqd = e->elevator_data;
	struct bfq_queue *bfqq, *n;

	hrtimer_cancel(&bfqd->idle_slice_timer);

	spin_lock_irq(&bfqd->lock);
	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
	spin_unlock_irq(&bfqd->lock);

	hrtimer_cancel(&bfqd->idle_slice_timer);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
#else
	spin_lock_irq(&bfqd->lock);
	bfq_put_async_queues(bfqd, bfqd->root_group);
	kfree(bfqd->root_group);
	spin_unlock_irq(&bfqd->lock);
#endif

	kfree(bfqd);
}
static void bfq_init_root_group(struct bfq_group *root_group,
				struct bfq_data *bfqd)
{
	int i;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	root_group->entity.parent = NULL;
	root_group->my_entity = NULL;
	root_group->bfqd = bfqd;
#endif
	root_group->rq_pos_tree = RB_ROOT;
	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
	root_group->sched_data.bfq_class_idle_last_service = jiffies;
}
static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
	struct bfq_data *bfqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
	if (!bfqd) {
		kobject_put(&eq->kobj);
		return -ENOMEM;
	}
	eq->elevator_data = bfqd;

	spin_lock_irq(q->queue_lock);
	q->elevator = eq;
	spin_unlock_irq(q->queue_lock);

	/*
	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
	 * Grab a permanent reference to it, so that the normal code flow
	 * will not attempt to free it.
	 */
	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
	bfqd->oom_bfqq.ref++;
	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
	bfqd->oom_bfqq.entity.new_weight =
		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);

	/*
	 * Trigger weight initialization, according to ioprio, at the
	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
	 * class won't be changed any more.
	 */
	bfqd->oom_bfqq.entity.prio_changed = 1;

	bfqd->queue = q;

	INIT_LIST_HEAD(&bfqd->dispatch);

	hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL);
	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;

	bfqd->queue_weights_tree = RB_ROOT;
	bfqd->group_weights_tree = RB_ROOT;

	INIT_LIST_HEAD(&bfqd->active_list);
	INIT_LIST_HEAD(&bfqd->idle_list);

	bfqd->hw_tag = -1;

	bfqd->bfq_max_budget = bfq_default_max_budget;

	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
	bfqd->bfq_back_max = bfq_back_max;
	bfqd->bfq_back_penalty = bfq_back_penalty;
	bfqd->bfq_slice_idle = bfq_slice_idle;
	bfqd->bfq_timeout = bfq_timeout;

	bfqd->bfq_requests_within_timer = 120;

	bfqd->low_latency = true;

	/*
	 * Trade-off between responsiveness and fairness.
	 */
	bfqd->bfq_wr_coeff = 30;
	bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
	bfqd->bfq_wr_max_time = 0;
	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
	bfqd->bfq_wr_max_softrt_rate = 7000; /*
					      * Approximate rate required
					      * to playback or record a
					      * high-definition compressed
					      * video.
					      */
	bfqd->wr_busy_queues = 0;

	/*
	 * Begin by assuming, optimistically, that the device is a
	 * high-speed one, and that its peak rate is equal to 2/3 of
	 * the highest reference rate.
	 */
	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
			T_fast[blk_queue_nonrot(bfqd->queue)];
	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
	bfqd->device_speed = BFQ_BFQD_FAST;
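	/*
	 * Concretely, with the reference parameters of this version,
	 * the initial peak-rate guess above is simply R_fast scaled by
	 * 2/3, so the first weight-raising durations derived from
	 * RT_prod stay on the conservative side until enough rate
	 * samples correct the estimate.
	 */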
	spin_lock_init(&bfqd->lock);

	/*
	 * The invocation of the next bfq_create_group_hierarchy
	 * function is the head of a chain of function calls
	 * (bfq_create_group_hierarchy->blkcg_activate_policy->
	 * blk_mq_freeze_queue) that may lead to the invocation of the
	 * has_work hook function. For this reason,
	 * bfq_create_group_hierarchy is invoked only after all
	 * scheduler data has been initialized, apart from the fields
	 * that can be initialized only after invoking
	 * bfq_create_group_hierarchy. This, in particular, enables
	 * has_work to correctly return false. Of course, to avoid
	 * other inconsistencies, the blk-mq stack must then refrain
	 * from invoking further scheduler hooks before this init
	 * function is finished.
	 */
	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
	if (!bfqd->root_group)
		goto out_free;
	bfq_init_root_group(bfqd->root_group, bfqd);
	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);

	return 0;

out_free:
	kfree(bfqd);
	kobject_put(&eq->kobj);
	return -ENOMEM;
}
static void bfq_slab_kill(void)
{
	kmem_cache_destroy(bfq_pool);
}

static int __init bfq_slab_setup(void)
{
	bfq_pool = KMEM_CACHE(bfq_queue, 0);
	if (!bfq_pool)
		return -ENOMEM;
	return 0;
}
static ssize_t bfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%u\n", var);
}

static ssize_t bfq_var_store(unsigned long *var, const char *page,
			     size_t count)
{
	unsigned long new_val;
	int ret = kstrtoul(page, 10, &new_val);

	if (ret == 0)
		*var = new_val;

	return count;
}
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	if (__CONV == 1)						\
		__data = jiffies_to_msecs(__data);			\
	else if (__CONV == 2)						\
		__data = div_u64(__data, NSEC_PER_MSEC);		\
	return bfq_var_show(__data, (page));				\
}
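/*
 * The __CONV argument selects the unit conversion applied on output:
 * 0 shows the stored value as is, 1 converts jiffies to milliseconds,
 * and 2 converts nanoseconds to milliseconds.
 */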
SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
#undef SHOW_FUNCTION
#define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	__data = div_u64(__data, NSEC_PER_USEC);			\
	return bfq_var_show(__data, (page));				\
}
USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
#undef USEC_SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
static ssize_t								\
__FUNC(struct elevator_queue *e, const char *page, size_t count)	\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned long uninitialized_var(__data);			\
	int ret = bfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	if (__CONV == 1)						\
		*(__PTR) = msecs_to_jiffies(__data);			\
	else if (__CONV == 2)						\
		*(__PTR) = (u64)__data * NSEC_PER_MSEC;			\
	else								\
		*(__PTR) = __data;					\
	return ret;							\
}
STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
		INT_MAX, 2);
STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
		INT_MAX, 2);
STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
		INT_MAX, 0);
STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
#undef STORE_FUNCTION
#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)			\
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned long uninitialized_var(__data);			\
	int ret = bfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	*(__PTR) = (u64)__data * NSEC_PER_USEC;				\
	return ret;							\
}
USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
		    UINT_MAX);
#undef USEC_STORE_FUNCTION
static ssize_t bfq_max_budget_store(struct elevator_queue *e,
				    const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data == 0)
		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
	else {
		if (__data > INT_MAX)
			__data = INT_MAX;
		bfqd->bfq_max_budget = __data;
	}

	bfqd->bfq_user_max_budget = __data;

	return ret;
}
/*
 * Leaving this name to preserve name compatibility with cfq
 * parameters, but this timeout is used for both sync and async.
 */
static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
				      const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data < 1)
		__data = 1;
	else if (__data > INT_MAX)
		__data = INT_MAX;

	bfqd->bfq_timeout = msecs_to_jiffies(__data);
	if (bfqd->bfq_user_max_budget == 0)
		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);

	return ret;
}
static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
					   const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data > 1)
		__data = 1;
	if (!bfqd->strict_guarantees && __data == 1
	    && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
		bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;

	bfqd->strict_guarantees = __data;

	return ret;
}
static ssize_t bfq_low_latency_store(struct elevator_queue *e,
				     const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data > 1)
		__data = 1;
	if (__data == 0 && bfqd->low_latency != 0)
		bfq_end_wr(bfqd);
	bfqd->low_latency = __data;

	return ret;
}
#define BFQ_ATTR(name) \
	__ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
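/*
 * For example, BFQ_ATTR(low_latency) expands to
 * __ATTR(low_latency, 0644, bfq_low_latency_show, bfq_low_latency_store),
 * wiring the sysfs file "low_latency" to the show/store pair above.
 */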
static struct elv_fs_entry bfq_attrs[] = {
	BFQ_ATTR(fifo_expire_sync),
	BFQ_ATTR(fifo_expire_async),
	BFQ_ATTR(back_seek_max),
	BFQ_ATTR(back_seek_penalty),
	BFQ_ATTR(slice_idle),
	BFQ_ATTR(slice_idle_us),
	BFQ_ATTR(max_budget),
	BFQ_ATTR(timeout_sync),
	BFQ_ATTR(strict_guarantees),
	BFQ_ATTR(low_latency),
	__ATTR_NULL
};
static struct elevator_type iosched_bfq_mq = {
	.ops.mq = {
		.get_rq_priv		= bfq_get_rq_private,
		.put_rq_priv		= bfq_put_rq_private,
		.exit_icq		= bfq_exit_icq,
		.insert_requests	= bfq_insert_requests,
		.dispatch_request	= bfq_dispatch_request,
		.next_request		= elv_rb_latter_request,
		.former_request		= elv_rb_former_request,
		.allow_merge		= bfq_allow_bio_merge,
		.bio_merge		= bfq_bio_merge,
		.request_merge		= bfq_request_merge,
		.requests_merged	= bfq_requests_merged,
		.request_merged		= bfq_request_merged,
		.has_work		= bfq_has_work,
		.init_sched		= bfq_init_queue,
		.exit_sched		= bfq_exit_queue,
	},

	.uses_mq		= true,
	.icq_size		= sizeof(struct bfq_io_cq),
	.icq_align		= __alignof__(struct bfq_io_cq),
	.elevator_attrs		= bfq_attrs,
	.elevator_name		= "bfq",
	.elevator_owner		= THIS_MODULE,
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct blkcg_policy blkcg_policy_bfq = {
	.dfl_cftypes		= bfq_blkg_files,
	.legacy_cftypes		= bfq_blkcg_legacy_files,

	.cpd_alloc_fn		= bfq_cpd_alloc,
	.cpd_init_fn		= bfq_cpd_init,
	.cpd_bind_fn		= bfq_cpd_init,
	.cpd_free_fn		= bfq_cpd_free,

	.pd_alloc_fn		= bfq_pd_alloc,
	.pd_init_fn		= bfq_pd_init,
	.pd_offline_fn		= bfq_pd_offline,
	.pd_free_fn		= bfq_pd_free,
	.pd_reset_stats_fn	= bfq_pd_reset_stats,
};
#endif
static int __init bfq_init(void)
{
	int ret;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	ret = blkcg_policy_register(&blkcg_policy_bfq);
	if (ret)
		return ret;
#endif

	ret = -ENOMEM;
	if (bfq_slab_setup())
		goto err_pol_unreg;

	/*
	 * Times to load large popular applications for the typical
	 * systems installed on the reference devices (see the
	 * comments before the definitions of the next two
	 * arrays). Actually, we use slightly slower values, as the
	 * estimated peak rate tends to be smaller than the actual
	 * peak rate. The reason for this last fact is that estimates
	 * are computed over much shorter time intervals than the long
	 * intervals typically used for benchmarking. Why? First, to
	 * adapt more quickly to variations. Second, because an I/O
	 * scheduler cannot rely on a peak-rate-evaluation workload to
	 * be run for a long time.
	 */
	T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
	T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
	T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
	T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */

	/*
	 * Thresholds that determine the switch between speed classes
	 * (see the comments before the definition of the array
	 * device_speed_thresh). These thresholds are biased towards
	 * transitions to the fast class. This is safer than the
	 * opposite bias. In fact, a wrong transition to the slow
	 * class results in short weight-raising periods, because the
	 * speed of the device then tends to be higher than the
	 * reference peak rate. On the opposite end, a wrong
	 * transition to the fast class tends to increase
	 * weight-raising periods, because of the opposite reason.
	 */
	device_speed_thresh[0] = (4 * R_slow[0]) / 3;
	device_speed_thresh[1] = (4 * R_slow[1]) / 3;
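	/*
	 * For example, taking R_slow as the reference rate measured on
	 * the slow device of a class, the 4/3 factor above places the
	 * class-switch threshold one third above that rate, so only a
	 * clearly faster device gets promoted to the fast class.
	 */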
	ret = elv_register(&iosched_bfq_mq);
	if (ret)
		goto err_pol_unreg;

	return 0;

err_pol_unreg:
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	return ret;
}
static void __exit bfq_exit(void)
{
	elv_unregister(&iosched_bfq_mq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	bfq_slab_kill();
}

module_init(bfq_init);
module_exit(bfq_exit);

MODULE_AUTHOR("Paolo Valente");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");