/*
 * Budget Fair Queueing (BFQ) I/O scheduler.
 *
 * Based on ideas and code from CFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini@google.com>
 *
 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BFQ is a proportional-share I/O scheduler, with some extra
 * low-latency capabilities. BFQ also supports full hierarchical
 * scheduling through cgroups. The next paragraphs provide an
 * introduction to BFQ's inner workings. Details on BFQ benefits,
 * usage and limitations can be found in
 * Documentation/block/bfq-iosched.txt.
 *
 * BFQ is a proportional-share storage-I/O scheduling algorithm based
 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
 * budgets, measured in number of sectors, to processes instead of
 * time slices. The device is not granted to the in-service process
 * for a given time slice, but until it has exhausted its assigned
 * budget. This change from the time to the service domain enables BFQ
 * to distribute the device throughput among processes as desired,
 * without any distortion due to throughput fluctuations, or to device
 * internal queueing. BFQ uses an ad hoc internal scheduler, called
 * B-WF2Q+, to schedule processes according to their budgets. More
 * precisely, BFQ schedules queues associated with processes. Each
 * process/queue is assigned a user-configurable weight, and B-WF2Q+
 * guarantees that each queue receives a fraction of the throughput
 * proportional to its weight. Thanks to the accurate policy of
 * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
 * processes issuing sequential requests (to boost the throughput),
 * and yet guarantee a low latency to interactive and soft real-time
 * applications.
 *
 * In particular, to provide these low-latency guarantees, BFQ
 * explicitly privileges the I/O of two classes of time-sensitive
 * applications: interactive and soft real-time. This feature enables
 * BFQ to provide applications in these classes with a very low
 * latency. Finally, BFQ also features additional heuristics for
 * preserving both a low latency and a high throughput on NCQ-capable,
 * rotational or flash-based devices, and for getting the job done
 * quickly for applications consisting of many I/O-bound processes.
 *
 * BFQ is described in [1], which also contains a reference to the
 * initial, more theoretical paper on BFQ. The interested reader can
 * find in the latter paper full details on the main algorithm, as
 * well as formulas of the guarantees and formal proofs of all the
 * properties. With respect to the version of BFQ presented in these
 * papers, this implementation adds a few more heuristics, such as the
 * one that guarantees a low latency to soft real-time applications,
 * and a hierarchical extension based on H-WF2Q+.
 *
 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
 * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
 * with O(log N) complexity derives from the one introduced with EEVDF
 * in [3].
 *
 * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
 *     Scheduler", Proceedings of the First Workshop on Mobile System
 *     Technologies (MST-2015), May 2015.
 *     http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
 *
 * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
 *     Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
 *     Oct 1997.
 *     http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
 *
 * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
 *     First: A Flexible and Accurate Mechanism for Proportional Share
 *     Resource Allocation", technical report.
 *     http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>
#include <linux/blk-cgroup.h>
#define BFQ_IOPRIO_CLASSES	3
#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

#define BFQ_MIN_WEIGHT			1
#define BFQ_MAX_WEIGHT			1000
#define BFQ_WEIGHT_CONVERSION_COEFF	10

#define BFQ_DEFAULT_QUEUE_IOPRIO	4

#define BFQ_WEIGHT_LEGACY_DFL	100
#define BFQ_DEFAULT_GRP_IOPRIO	0
#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

/*
 * Soft real-time applications are far more latency-sensitive than
 * interactive ones. Over-raise the weight of the former to privilege
 * them against the latter.
 */
#define BFQ_SOFTRT_WEIGHT_FACTOR	100
/**
 * struct bfq_service_tree - per ioprio_class service tree.
 *
 * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
 * ioprio_class has its own independent scheduler, and so its own
 * bfq_service_tree.  All the fields are protected by the queue lock
 * of the containing bfqd.
 */
struct bfq_service_tree {
	/* tree for active entities (i.e., those backlogged) */
	struct rb_root active;
	/* tree for idle entities (i.e., not backlogged, with V <= F_i) */
	struct rb_root idle;

	/* idle entity with minimum F_i */
	struct bfq_entity *first_idle;
	/* idle entity with maximum F_i */
	struct bfq_entity *last_idle;

	/* scheduler virtual time */
	u64 vtime;
	/* scheduler weight sum; active and idle entities contribute to it */
	unsigned long wsum;
};
/**
 * struct bfq_sched_data - multi-class scheduler.
 *
 * bfq_sched_data is the basic scheduler queue.  It supports three
 * ioprio_classes, and can be used either as a toplevel queue or as an
 * intermediate queue in a hierarchical setup.  @next_in_service
 * points to the active entity of the sched_data service trees that
 * will be scheduled next. It is used to reduce the number of steps
 * needed for each hierarchical-schedule update.
 *
 * The supported ioprio_classes are the same as in CFQ, in descending
 * priority order: IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
 * Requests from higher-priority classes are served before all the
 * requests from lower-priority classes; among requests of the same
 * class, requests are served according to B-WF2Q+.
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_sched_data {
	/* entity in service */
	struct bfq_entity *in_service_entity;
	/* head-of-line entity (see comments above) */
	struct bfq_entity *next_in_service;
	/* array of service trees, one per ioprio_class */
	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
	/* last time CLASS_IDLE was served */
	unsigned long bfq_class_idle_last_service;
};
/**
 * struct bfq_entity - schedulable entity.
 *
 * A bfq_entity is used to represent either a bfq_queue (leaf node in the
 * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each
 * entity belongs to the sched_data of the parent group in the cgroup
 * hierarchy.  Non-leaf entities have also their own sched_data, stored
 * in @my_sched_data.
 *
 * Each entity stores independently its priority values; this would
 * allow different weights on different devices, but this
 * functionality is not exported to userspace by now.  Priorities and
 * weights are updated lazily, first storing the new values into the
 * new_* fields, then setting the @prio_changed flag.  As soon as
 * there is a transition in the entity state that allows the priority
 * update to take place, the effective and the requested priority
 * values are synchronized.
 *
 * Unless cgroups are used, the weight value is calculated from the
 * ioprio to export the same interface as CFQ.  When dealing with
 * ``well-behaved'' queues (i.e., queues that do not spend too much
 * time to consume their budget and have true sequential behavior, and
 * when there are no external factors breaking anticipation), the
 * relative weights at each level of the cgroups hierarchy should be
 * guaranteed.  All the fields are protected by the queue lock of the
 * containing bfqd.
 */
struct bfq_entity {
	/* service_tree member */
	struct rb_node rb_node;

	/*
	 * Flag, true if the entity is on a tree (either the active or
	 * the idle one of its service_tree) or is in service.
	 */
	bool on_st;

	/* B-WF2Q+ start and finish timestamps [sectors/weight] */
	u64 start, finish;

	/* tree the entity is enqueued into; %NULL if not on a tree */
	struct rb_root *tree;

	/*
	 * minimum start time of the (active) subtree rooted at this
	 * entity; used for O(log N) lookups into active trees
	 */
	u64 min_start;

	/* amount of service received during the last service slot */
	int service;

	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
	int budget;

	/* weight of the queue */
	int weight;
	/* next weight if a change is in progress */
	int new_weight;

	/* original weight, used to implement weight boosting */
	int orig_weight;

	/* parent entity, for hierarchical scheduling */
	struct bfq_entity *parent;

	/*
	 * For non-leaf nodes in the hierarchy, the associated
	 * scheduler queue, %NULL on leaf nodes.
	 */
	struct bfq_sched_data *my_sched_data;
	/* the scheduler queue this entity belongs to */
	struct bfq_sched_data *sched_data;

	/* flag, set to request a weight, ioprio or ioprio_class change */
	int prio_changed;
};
/**
 * struct bfq_ttime - per process thinktime stats.
 */
struct bfq_ttime {
	/* completion time of the last request */
	u64 last_end_request;

	/* total process thinktime */
	u64 ttime_total;
	/* number of thinktime samples */
	unsigned long ttime_samples;
	/* average process thinktime */
	u64 ttime_mean;
};
/**
 * struct bfq_queue - leaf schedulable entity.
 *
 * A bfq_queue is a leaf request queue; it can be associated with an
 * io_context or more, if it is async. @cgroup holds a reference to
 * the cgroup, to be sure that it does not disappear while a bfqq
 * still references it (mostly to avoid races between request issuing
 * and task migration followed by cgroup destruction). All the fields
 * are protected by the queue lock of the containing bfqd.
 */
struct bfq_queue {
	/* reference counter */
	int ref;
	/* parent bfq_data */
	struct bfq_data *bfqd;

	/* current ioprio and ioprio class */
	unsigned short ioprio, ioprio_class;
	/* next ioprio and ioprio class if a change is in progress */
	unsigned short new_ioprio, new_ioprio_class;

	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* number of sync and async requests queued */
	int queued[2];
	/* number of requests currently allocated */
	int allocated;
	/* number of pending metadata requests */
	int meta_pending;
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* entity representing this queue in the scheduler */
	struct bfq_entity entity;

	/* maximum budget allowed from the feedback mechanism */
	int max_budget;
	/* budget expiration (in jiffies) */
	unsigned long budget_timeout;

	/* number of requests on the dispatch list or inside driver */
	int dispatched;

	/* status flags */
	unsigned long flags;

	/* node for active/idle bfqq list inside parent bfqd */
	struct list_head bfqq_list;

	/* associated @bfq_ttime struct */
	struct bfq_ttime ttime;

	/* bit vector: a 1 for each seeky request in history */
	u32 seek_history;
	/* position of the last request enqueued */
	sector_t last_request_pos;

	/*
	 * Number of consecutive pairs of request completion and
	 * arrival, such that the queue becomes idle after the
	 * completion, but the next request arrives within an idle
	 * time slice; used only if the queue's IO_bound flag has been
	 * cleared.
	 */
	unsigned int requests_within_timer;

	/* pid of the process owning the queue, used for logging purposes */
	pid_t pid;

	/* current maximum weight-raising time for this queue */
	unsigned long wr_cur_max_time;
	/*
	 * Minimum time instant such that, only if a new request is
	 * enqueued after this time instant in an idle @bfq_queue with
	 * no outstanding requests, then the task associated with the
	 * queue is deemed as soft real-time (see the comments on
	 * the function bfq_bfqq_softrt_next_start())
	 */
	unsigned long soft_rt_next_start;
	/*
	 * Start time of the current weight-raising period if
	 * the @bfq-queue is being weight-raised, otherwise
	 * finish time of the last weight-raising period.
	 */
	unsigned long last_wr_start_finish;
	/* factor by which the weight of this queue is multiplied */
	unsigned int wr_coeff;
	/*
	 * Time of the last transition of the @bfq_queue from idle to
	 * backlogged.
	 */
	unsigned long last_idle_bklogged;
	/*
	 * Cumulative service received from the @bfq_queue since the
	 * last transition from idle to backlogged.
	 */
	unsigned long service_from_backlogged;

	/*
	 * Value of wr start time when switching to soft rt
	 */
	unsigned long wr_start_at_switch_to_srt;
};
/**
 * struct bfq_io_cq - per (request_queue, io_context) structure.
 */
struct bfq_io_cq {
	/* associated io_cq structure */
	struct io_cq icq; /* must be the first member */
	/* array of two process queues, the sync and the async */
	struct bfq_queue *bfqq[2];
	/* per (request_queue, blkcg) ioprio */
	int ioprio;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	uint64_t blkcg_serial_nr; /* the current blkcg serial */
#endif
};
enum bfq_device_speed {
	BFQ_BFQD_FAST,
	BFQ_BFQD_SLOW,
};
/**
 * struct bfq_data - per-device data structure.
 *
 * All the fields are protected by @lock.
 */
struct bfq_data {
	/* device request queue */
	struct request_queue *queue;
	/* dispatch queue */
	struct list_head dispatch;

	/* root bfq_group for the device */
	struct bfq_group *root_group;

	/*
	 * Number of bfq_queues containing requests (including the
	 * queue in service, even if it is idling).
	 */
	int busy_queues;
	/* number of queued requests */
	int queued;
	/* number of requests dispatched and waiting for completion */
	int rq_in_driver;

	/*
	 * Maximum number of requests in driver in the last
	 * @hw_tag_samples completed requests.
	 */
	int max_rq_in_driver;
	/* number of samples used to calculate hw_tag */
	int hw_tag_samples;
	/* flag set to one if the driver is showing a queueing behavior */
	int hw_tag;

	/* number of budgets assigned */
	int budgets_assigned;

	/*
	 * Timer set when idling (waiting) for the next request from
	 * the queue in service.
	 */
	struct hrtimer idle_slice_timer;

	/* bfq_queue in service */
	struct bfq_queue *in_service_queue;
	/* bfq_io_cq (bic) associated with the @in_service_queue */
	struct bfq_io_cq *in_service_bic;

	/* on-disk position of the last served request */
	sector_t last_position;

	/* time of last request completion (ns) */
	u64 last_completion;

	/* time of first rq dispatch in current observation interval (ns) */
	u64 first_dispatch;
	/* time of last rq dispatch in current observation interval (ns) */
	u64 last_dispatch;

	/* beginning of the last budget */
	ktime_t last_budget_start;
	/* beginning of the last idle slice */
	ktime_t last_idling_start;

	/* number of samples in current observation interval */
	int peak_rate_samples;
	/* num of samples of seq dispatches in current observation interval */
	u32 sequential_samples;
	/* total num of sectors transferred in current observation interval */
	u64 tot_sectors_dispatched;
	/* max rq size seen during current observation interval (sectors) */
	u32 last_rq_max_size;
	/* time elapsed from first dispatch in current observ. interval (us) */
	u64 delta_from_first;
	/*
	 * Current estimate of the device peak rate, measured in
	 * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
	 * BFQ_RATE_SHIFT is performed to increase precision in
	 * fixed-point calculations.
	 */
	u32 peak_rate;
	/* maximum budget allotted to a bfq_queue before rescheduling */
	int bfq_max_budget;

	/* list of all the bfq_queues active on the device */
	struct list_head active_list;
	/* list of all the bfq_queues idle on the device */
	struct list_head idle_list;

	/*
	 * Timeout for async/sync requests; when it fires, requests
	 * are served in fifo order.
	 */
	u64 bfq_fifo_expire[2];
	/* weight of backward seeks wrt forward ones */
	unsigned int bfq_back_penalty;
	/* maximum allowed backward seek */
	unsigned int bfq_back_max;
	/* maximum idling time */
	u32 bfq_slice_idle;

	/* user-configured max budget value (0 for auto-tuning) */
	int bfq_user_max_budget;
	/*
	 * Timeout for bfq_queues to consume their budget; used to
	 * prevent seeky queues from imposing long latencies to
	 * sequential or quasi-sequential ones (this also implies that
	 * seeky queues cannot receive guarantees in the service
	 * domain; after a timeout they are charged for the time they
	 * have been in service, to preserve fairness among them, but
	 * without service-domain guarantees).
	 */
	unsigned int bfq_timeout;

	/*
	 * Number of consecutive requests that must be issued within
	 * the idle time slice to set again idling to a queue which
	 * was marked as non-I/O-bound (see the definition of the
	 * IO_bound flag for further details).
	 */
	unsigned int bfq_requests_within_timer;

	/*
	 * Force device idling whenever needed to provide accurate
	 * service guarantees, without caring about throughput
	 * issues. CAVEAT: this may even increase latencies, in case
	 * of useless idling for processes that did stop doing I/O.
	 */
	bool strict_guarantees;

	/* if set to true, low-latency heuristics are enabled */
	bool low_latency;
	/*
	 * Maximum factor by which the weight of a weight-raised queue
	 * is multiplied.
	 */
	unsigned int bfq_wr_coeff;
	/* maximum duration of a weight-raising period (jiffies) */
	unsigned int bfq_wr_max_time;

	/* Maximum weight-raising duration for soft real-time processes */
	unsigned int bfq_wr_rt_max_time;
	/*
	 * Minimum idle period after which weight-raising may be
	 * reactivated for a queue (in jiffies).
	 */
	unsigned int bfq_wr_min_idle_time;
	/*
	 * Minimum period between request arrivals after which
	 * weight-raising may be reactivated for an already busy async
	 * queue (in jiffies).
	 */
	unsigned long bfq_wr_min_inter_arr_async;

	/* Max service-rate for a soft real-time queue, in sectors/sec */
	unsigned int bfq_wr_max_softrt_rate;
	/*
	 * Cached value of the product R*T, used for computing the
	 * maximum duration of weight raising automatically.
	 */
	u64 RT_prod;
	/* device-speed class for the low-latency heuristic */
	enum bfq_device_speed device_speed;

	/* fallback dummy bfqq for extreme OOM conditions */
	struct bfq_queue oom_bfqq;

	spinlock_t lock;

	/*
	 * bic associated with the task issuing current bio for
	 * merging. This and the next field are used as a support to
	 * be able to perform the bic lookup, needed by bio-merge
	 * functions, before the scheduler lock is taken, and thus
	 * avoid taking the request-queue lock while the scheduler
	 * lock is being held.
	 */
	struct bfq_io_cq *bio_bic;
	/* bfqq associated with the task issuing current bio for merging */
	struct bfq_queue *bio_bfqq;
};
enum bfqq_state_flags {
	BFQQF_busy = 0,		/* has requests or is in service */
	BFQQF_wait_request,	/* waiting for a request */
	BFQQF_non_blocking_wait_rq, /*
				     * waiting for a request
				     * without idling the device
				     */
	BFQQF_fifo_expire,	/* FIFO checked in this slice */
	BFQQF_idle_window,	/* slice idling enabled */
	BFQQF_sync,		/* synchronous queue */
	BFQQF_IO_bound,		/*
				 * bfqq has timed-out at least once
				 * having consumed at most 2/10 of
				 * its budget
				 */
	BFQQF_softrt_update,	/*
				 * may need softrt-next-start
				 * update
				 */
};
#define BFQ_BFQQ_FNS(name)						\
static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__set_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__clear_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
{									\
	return test_bit(BFQQF_##name, &(bfqq)->flags);			\
}

BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(non_blocking_wait_rq);
BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(idle_window);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(IO_bound);
BFQ_BFQQ_FNS(softrt_update);
#undef BFQ_BFQQ_FNS
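
/*
 * For illustration only: each BFQ_BFQQ_FNS(name) invocation above expands
 * into three trivial helpers that operate on the BFQQF_##name bit of
 * bfqq->flags. For instance, BFQ_BFQQ_FNS(busy) generates, modulo
 * whitespace:
 *
 *	static void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
 *	{
 *		__set_bit(BFQQF_busy, &(bfqq)->flags);
 *	}
 *	static void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
 *	{
 *		__clear_bit(BFQQF_busy, &(bfqq)->flags);
 *	}
 *	static int bfq_bfqq_busy(const struct bfq_queue *bfqq)
 *	{
 *		return test_bit(BFQQF_busy, &(bfqq)->flags);
 *	}
 */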
/* Logging facilities. */
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
			  __pbuf, ##args);				\
} while (0)

#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
} while (0)

#else /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\
			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
			  ##args)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)

#endif /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log(bfqd, fmt, args...) \
	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
/* Expiration reasons. */
enum bfqq_expiration {
	BFQQE_TOO_IDLE = 0,	/*
				 * queue has been idling for
				 * too long
				 */
	BFQQE_BUDGET_TIMEOUT,	/* budget took too long to be used */
	BFQQE_BUDGET_EXHAUSTED,	/* budget consumed */
	BFQQE_NO_MORE_REQUESTS,	/* the queue has no more requests */
	BFQQE_PREEMPTED		/* preemption in progress */
};
struct bfqg_stats {
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	/* number of ios merged */
	struct blkg_rwstat merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat queued;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	uint64_t start_group_wait_time;
	uint64_t start_idle_time;
	uint64_t start_empty_time;
#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED

/*
 * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
 *
 * @ps: @blkcg_policy_storage that this structure inherits
 * @weight: weight of the bfq_group
 */
struct bfq_group_data {
	/* must be the first member */
	struct blkcg_policy_data pd;

	unsigned int weight;
};

/**
 * struct bfq_group - per (device, cgroup) data structure.
 * @entity: schedulable entity to insert into the parent group sched_data.
 * @sched_data: own sched_data, to contain child entities (they may be
 *              both bfq_queues and bfq_groups).
 * @bfqd: the bfq_data for the device this group acts upon.
 * @async_bfqq: array of async queues for all the tasks belonging to
 *              the group, one queue per ioprio value per ioprio_class,
 *              except for the idle class that has only one queue.
 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
 *             to avoid too many special cases during group creation/
 *             migration.
 * @stats: stats for this bfqg.
 *
 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
 * there is a set of bfq_groups, each one collecting the lower-level
 * entities belonging to the group that are acting on the same device.
 *
 * Locking works as follows:
 *    o @bfqd is protected by the queue lock, RCU is used to access it
 *      from the readers.
 *    o All the other fields are protected by the @bfqd queue lock.
 */
struct bfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	struct bfq_entity entity;
	struct bfq_sched_data sched_data;

	void *bfqd;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct bfq_entity *my_entity;

	struct bfqg_stats stats;
};

#else /* CONFIG_BFQ_GROUP_IOSCHED */

struct bfq_group {
	struct bfq_sched_data sched_data;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct rb_root rq_pos_tree;
};
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);

static unsigned int bfq_class_idx(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	return bfqq ? bfqq->ioprio_class - 1 :
		BFQ_DEFAULT_GRP_CLASS - 1;
}
static struct bfq_service_tree *
bfq_entity_service_tree(struct bfq_entity *entity)
{
	struct bfq_sched_data *sched_data = entity->sched_data;
	unsigned int idx = bfq_class_idx(entity);

	return sched_data->service_tree + idx;
}
static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	return bic->bfqq[is_sync];
}

static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
			 bool is_sync)
{
	bic->bfqq[is_sync] = bfqq;
}

static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
{
	return bic->icq.q->elevator->elevator_data;
}
static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
static void bfq_put_queue(struct bfq_queue *bfqq);
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic);
static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
				    struct bfq_group *bfqg);
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
/* Expiration time of sync (0) and async (1) requests, in ns. */
static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };

/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
static const int bfq_back_max = 16 * 1024;

/* Penalty of a backwards seek, in number of sectors. */
static const int bfq_back_penalty = 2;

/* Idling period duration, in ns. */
static u64 bfq_slice_idle = NSEC_PER_SEC / 125;

/* Minimum number of assigned budgets for which stats are safe to compute. */
static const int bfq_stats_min_budgets = 194;

/* Default maximum budget values, in sectors and number of requests. */
static const int bfq_default_max_budget = 16 * 1024;

/*
 * Async to sync throughput distribution is controlled as follows:
 * when an async request is served, the entity is charged the number
 * of sectors of the request, multiplied by the factor below
 */
static const int bfq_async_charge_factor = 10;

/* Default timeout values, in jiffies, approximating CFQ defaults. */
static const int bfq_timeout = HZ / 8;

static struct kmem_cache *bfq_pool;

/* Below this threshold (in ns), we consider thinktime immediate. */
#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)

/* hw_tag detection: parallel requests threshold and min samples needed. */
#define BFQ_HW_QUEUE_THRESHOLD	4
#define BFQ_HW_QUEUE_SAMPLES	32

#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)

/* Min number of samples required to perform peak-rate update */
#define BFQ_RATE_MIN_SAMPLES	32
/* Min observation time interval required to perform a peak-rate update (ns) */
#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
/* Target observation time interval for a peak-rate update (ns) */
#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC

/* Shift used for peak rate fixed precision calculations. */
#define BFQ_RATE_SHIFT		16
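
/*
 * Illustrative sketch (hypothetical helper, not used by the scheduler):
 * with the fixed-point encoding above, a rate of r sectors/usec is stored
 * as r << BFQ_RATE_SHIFT. For example, a device sustaining ~100 MiB/s moves
 * about 204800 sectors/s, i.e., ~0.2048 sectors/usec, which is stored as
 * 0.2048 * 2^16 ~= 13422. The helper below only shows the conversion from
 * a plain sectors-per-second figure; BFQ itself derives the rate from its
 * dispatch samples.
 */
static inline u64 bfq_example_rate_to_fixed(u64 sectors_per_sec)
{
	u64 rate = sectors_per_sec << BFQ_RATE_SHIFT;

	do_div(rate, USEC_PER_SEC); /* sectors/sec -> (sectors/usec) << 16 */
	return rate;
}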
/*
 * By default, BFQ computes the duration of the weight raising for
 * interactive applications automatically, using the following formula:
 * duration = (R / r) * T, where r is the peak rate of the device, and
 * R and T are two reference parameters.
 * In particular, R is the peak rate of the reference device (see below),
 * and T is a reference time: given the systems that are likely to be
 * installed on the reference device according to its speed class, T is
 * about the maximum time needed, under BFQ and while reading two files in
 * parallel, to load typical large applications on these systems.
 * In practice, the slower/faster the device at hand is, the more/less it
 * takes to load applications with respect to the reference device.
 * Accordingly, the longer/shorter BFQ grants weight raising to interactive
 * applications.
 *
 * BFQ uses four different reference pairs (R, T), depending on:
 * . whether the device is rotational or non-rotational;
 * . whether the device is slow, such as old or portable HDDs, as well as
 *   SD cards, or fast, such as newer HDDs and SSDs.
 *
 * The device's speed class is dynamically (re)detected in
 * bfq_update_peak_rate() every time the estimated peak rate is updated.
 *
 * In the following definitions, R_slow[0]/R_fast[0] and
 * T_slow[0]/T_fast[0] are the reference values for a slow/fast
 * rotational device, whereas R_slow[1]/R_fast[1] and
 * T_slow[1]/T_fast[1] are the reference values for a slow/fast
 * non-rotational device. Finally, device_speed_thresh are the
 * thresholds used to switch between speed classes. The reference
 * rates are not the actual peak rates of the devices used as a
 * reference, but slightly lower values. The reason for using these
 * slightly lower values is that the peak-rate estimator tends to
 * yield slightly lower values than the actual peak rate (it can yield
 * the actual peak rate only if there is only one process doing I/O,
 * and the process does sequential I/O).
 *
 * Both the reference peak rates and the thresholds are measured in
 * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
 */
static int R_slow[2] = {1000, 10700};
static int R_fast[2] = {14000, 33000};

/*
 * To improve readability, a conversion function is used to initialize the
 * following arrays, which entails that they can be initialized only in a
 * function.
 */
static int T_slow[2];
static int T_fast[2];
static int device_speed_thresh[2];
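
/*
 * Illustrative sketch (hypothetical helper, not used by the scheduler) of
 * how the formula duration = (R / r) * T above can be evaluated: once the
 * product R * T for the current speed class is cached in bfqd->RT_prod,
 * the duration only requires one division by the estimated peak rate r
 * (assumed non-zero here). BFQ performs the actual computation elsewhere
 * in this file.
 */
static inline unsigned long bfq_example_wr_duration(struct bfq_data *bfqd)
{
	u64 dur = bfqd->RT_prod;	/* cached R * T for the speed class */

	do_div(dur, bfqd->peak_rate);	/* (R / r) * T, r = peak-rate estimate */
	return dur;			/* same unit as T_slow/T_fast */
}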
#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
/**
 * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
 * @icq: the iocontext queue.
 */
static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
{
	/* bic->icq is the first member, %NULL will convert to %NULL */
	return container_of(icq, struct bfq_io_cq, icq);
}
/**
 * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
 * @bfqd: the lookup key.
 * @ioc: the io_context of the process doing I/O.
 * @q: the request queue.
 */
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
					struct io_context *ioc,
					struct request_queue *q)
{
	if (ioc) {
		unsigned long flags;
		struct bfq_io_cq *icq;

		spin_lock_irqsave(q->queue_lock, flags);
		icq = icq_to_bic(ioc_lookup_icq(ioc, q));
		spin_unlock_irqrestore(q->queue_lock, flags);

		return icq;
	}

	return NULL;
}
/*
 * Scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing.
 */
static void bfq_schedule_dispatch(struct bfq_data *bfqd)
{
	if (bfqd->queued != 0) {
		bfq_log(bfqd, "schedule dispatch");
		blk_mq_run_hw_queues(bfqd->queue, true);
	}
}
/**
 * bfq_gt - compare two timestamps.
 * @a: first ts.
 * @b: second ts.
 *
 * Return @a > @b, dealing with wrapping correctly.
 */
static int bfq_gt(u64 a, u64 b)
{
	return (s64)(a - b) > 0;
}
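
/*
 * Worked example of the wraparound handling above: after the virtual time
 * has wrapped, bfq_gt(1, ULLONG_MAX) computes (s64)(1 - ULLONG_MAX) = 2 > 0,
 * so 1 is correctly considered more recent than ULLONG_MAX, whereas a plain
 * unsigned comparison would give the opposite (wrong) answer.
 */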
static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
{
	struct rb_node *node = tree->rb_node;

	return rb_entry(node, struct bfq_entity, rb_node);
}
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
/*
 * bfq_update_next_in_service - update sd->next_in_service
 * @sd: sched_data for which to perform the update.
 * @new_entity: if not NULL, pointer to the entity whose activation,
 *		requeueing or repositioning triggered the invocation of
 *		this function.
 *
 * This function is called to update sd->next_in_service, which, in
 * its turn, may change as a consequence of the insertion or
 * extraction of an entity into/from one of the active trees of
 * sd. These insertions/extractions occur as a consequence of
 * activations/deactivations of entities, with some activations being
 * 'true' activations, and other activations being requeueings (i.e.,
 * implementing the second, requeueing phase of the mechanism used to
 * reposition an entity in its active tree; see comments on
 * __bfq_activate_entity and __bfq_requeue_entity for details). In
 * both the last two activation sub-cases, new_entity points to the
 * just activated or requeued entity.
 *
 * Returns true if sd->next_in_service changes in such a way that
 * entity->parent may become the next_in_service for its parent
 * entity.
 */
static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
				       struct bfq_entity *new_entity)
{
	struct bfq_entity *next_in_service = sd->next_in_service;
	bool parent_sched_may_change = false;

	/*
	 * If this update is triggered by the activation, requeueing
	 * or repositioning of an entity that does not coincide with
	 * sd->next_in_service, then a full lookup in the active tree
	 * can be avoided. In fact, it is enough to check whether the
	 * just-modified entity has a higher priority than
	 * sd->next_in_service, or, even if it has the same priority
	 * as sd->next_in_service, is eligible and has a lower virtual
	 * finish time than sd->next_in_service. If this compound
	 * condition holds, then the new entity becomes the new
	 * next_in_service. Otherwise no change is needed.
	 */
	if (new_entity && new_entity != sd->next_in_service) {
		/*
		 * Flag used to decide whether to replace
		 * sd->next_in_service with new_entity. Tentatively
		 * set to true, and left as true if
		 * sd->next_in_service is NULL.
		 */
		bool replace_next = true;

		/*
		 * If there is already a next_in_service candidate
		 * entity, then compare class priorities or timestamps
		 * to decide whether to replace sd->service_tree with
		 * new_entity.
		 */
		if (next_in_service) {
			unsigned int new_entity_class_idx =
				bfq_class_idx(new_entity);
			struct bfq_service_tree *st =
				sd->service_tree + new_entity_class_idx;

			/*
			 * For efficiency, evaluate the most likely
			 * sub-condition first.
			 */
			replace_next =
				(new_entity_class_idx ==
				 bfq_class_idx(next_in_service)
				 &&
				 !bfq_gt(new_entity->start, st->vtime)
				 &&
				 bfq_gt(next_in_service->finish,
					new_entity->finish))
				||
				new_entity_class_idx <
				bfq_class_idx(next_in_service);
		}

		if (replace_next)
			next_in_service = new_entity;
	} else /* invoked because of a deactivation: lookup needed */
		next_in_service = bfq_lookup_next_entity(sd);

	if (next_in_service) {
		parent_sched_may_change = !sd->next_in_service ||
			bfq_update_parent_budget(next_in_service);
	}

	sd->next_in_service = next_in_service;

	if (!next_in_service)
		return parent_sched_may_change;

	return parent_sched_may_change;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED
/* both next loops stop at one of the child entities of the root group */
#define for_each_entity(entity)	\
	for (; entity ; entity = entity->parent)

/*
 * For each iteration, compute parent in advance, so as to be safe if
 * entity is deallocated during the iteration. Such a deallocation may
 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
 * containing entity.
 */
#define for_each_entity_safe(entity, parent) \
	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
/*
 * Returns true if this budget change may let next_in_service->parent
 * become the next_in_service entity for its parent entity.
 */
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	struct bfq_entity *bfqg_entity;
	struct bfq_group *bfqg;
	struct bfq_sched_data *group_sd;
	bool ret = false;

	group_sd = next_in_service->sched_data;

	bfqg = container_of(group_sd, struct bfq_group, sched_data);
	/*
	 * bfq_group's my_entity field is not NULL only if the group
	 * is not the root group. We must not touch the root entity
	 * as it must never become an in-service entity.
	 */
	bfqg_entity = bfqg->my_entity;
	if (bfqg_entity) {
		if (bfqg_entity->budget > next_in_service->budget)
			ret = true;
		bfqg_entity->budget = next_in_service->budget;
	}

	return ret;
}
/*
 * This function tells whether entity stops being a candidate for next
 * service, according to the following logic.
 *
 * This function is invoked for an entity that is about to be set in
 * service. If such an entity is a queue, then the entity is no longer
 * a candidate for next service (i.e., a candidate entity to serve
 * after the in-service entity is expired). The function then returns
 * true.
 */
static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	if (bfq_entity_to_bfqq(entity))
		return true;

	return false;
}
#else /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Next two macros are fake loops when cgroups support is not
 * enabled. In fact, in such a case, there is only one level to go up
 * (to reach the root group).
 */
#define for_each_entity(entity)	\
	for (; entity ; entity = NULL)

#define for_each_entity_safe(entity, parent) \
	for (parent = NULL; entity ; entity = parent)

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	return false;
}

static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	return true;
}

#endif /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Shift for timestamp calculations.  This actually limits the maximum
 * service allowed in one timestamp delta (small shift values increase it),
 * the maximum total weight that can be used for the queues in the system
 * (big shift values increase it), and the period of virtual time
 * wraparounds.
 */
#define WFQ_SERVICE_SHIFT	22
static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = NULL;

	if (!entity->my_sched_data)
		bfqq = container_of(entity, struct bfq_queue, entity);

	return bfqq;
}
/**
 * bfq_delta - map service into the virtual time domain.
 * @service: amount of service.
 * @weight: scale factor (weight of an entity or weight sum).
 */
static u64 bfq_delta(unsigned long service, unsigned long weight)
{
	u64 d = (u64)service << WFQ_SERVICE_SHIFT;

	do_div(d, weight);
	return d;
}
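
/*
 * Worked example for bfq_delta(): charging the same service to entities
 * with different weights advances their timestamps in inverse proportion
 * to those weights. With WFQ_SERVICE_SHIFT = 22,
 * bfq_delta(1000, 10) = (1000 << 22) / 10 = 419430400, exactly twice
 * bfq_delta(1000, 20) = 209715200: the entity with twice the weight
 * consumes virtual time half as fast, and thus gets twice the share of
 * the device.
 */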
/**
 * bfq_calc_finish - assign the finish time to an entity.
 * @entity: the entity to act upon.
 * @service: the service to be charged to the entity.
 */
static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->finish = entity->start +
		bfq_delta(service, entity->weight);

	if (bfqq) {
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: serv %lu, w %d",
			service, entity->weight);
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: start %llu, finish %llu, delta %llu",
			entity->start, entity->finish,
			bfq_delta(service, entity->weight));
	}
}
/**
 * bfq_entity_of - get an entity from a node.
 * @node: the node field of the entity.
 *
 * Convert a node pointer to the relative entity.  This is used only
 * to simplify the logic of some functions and not as the generic
 * conversion mechanism because, e.g., in the tree walking functions,
 * the check for a %NULL value would be redundant.
 */
static struct bfq_entity *bfq_entity_of(struct rb_node *node)
{
	struct bfq_entity *entity = NULL;

	if (node)
		entity = rb_entry(node, struct bfq_entity, rb_node);

	return entity;
}
/**
 * bfq_extract - remove an entity from a tree.
 * @root: the tree root.
 * @entity: the entity to remove.
 */
static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
{
	entity->tree = NULL;
	rb_erase(&entity->rb_node, root);
}
/**
 * bfq_idle_extract - extract an entity from the idle tree.
 * @st: the service tree of the owning @entity.
 * @entity: the entity being removed.
 */
static void bfq_idle_extract(struct bfq_service_tree *st,
			     struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *next;

	if (entity == st->first_idle) {
		next = rb_next(&entity->rb_node);
		st->first_idle = bfq_entity_of(next);
	}

	if (entity == st->last_idle) {
		next = rb_prev(&entity->rb_node);
		st->last_idle = bfq_entity_of(next);
	}

	bfq_extract(&st->idle, entity);

	if (bfqq)
		list_del(&bfqq->bfqq_list);
}
/**
 * bfq_insert - generic tree insertion.
 * @root: the tree root.
 * @entity: entity to insert.
 *
 * This is used for the idle and the active tree, since they are both
 * ordered by finish time.
 */
static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
{
	struct bfq_entity *entry;
	struct rb_node **node = &root->rb_node;
	struct rb_node *parent = NULL;

	while (*node) {
		parent = *node;
		entry = rb_entry(parent, struct bfq_entity, rb_node);

		if (bfq_gt(entry->finish, entity->finish))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}

	rb_link_node(&entity->rb_node, parent, node);
	rb_insert_color(&entity->rb_node, root);

	entity->tree = root;
}
/**
 * bfq_update_min - update the min_start field of an entity.
 * @entity: the entity to update.
 * @node: one of its children.
 *
 * This function is called when @entity may store an invalid value for
 * min_start due to updates to the active tree.  The function assumes
 * that the subtree rooted at @node (which may be its left or its right
 * child) has a valid min_start value.
 */
static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
{
	struct bfq_entity *child;

	if (node) {
		child = rb_entry(node, struct bfq_entity, rb_node);
		if (bfq_gt(entity->min_start, child->min_start))
			entity->min_start = child->min_start;
	}
}
/**
 * bfq_update_active_node - recalculate min_start.
 * @node: the node to update.
 *
 * @node may have changed position or one of its children may have moved,
 * this function updates its min_start value.  The left and right subtrees
 * are assumed to hold a correct min_start value.
 */
static void bfq_update_active_node(struct rb_node *node)
{
	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);

	entity->min_start = entity->start;
	bfq_update_min(entity, node->rb_right);
	bfq_update_min(entity, node->rb_left);
}
/**
 * bfq_update_active_tree - update min_start for the whole active tree.
 * @node: the starting node.
 *
 * @node must be the deepest modified node after an update.  This function
 * updates its min_start using the values held by its children, assuming
 * that they did not change, and then updates all the nodes that may have
 * changed in the path to the root.  The only nodes that may have changed
 * are the ones in the path or their siblings.
 */
static void bfq_update_active_tree(struct rb_node *node)
{
	struct rb_node *parent;

up:
	bfq_update_active_node(node);

	parent = rb_parent(node);
	if (!parent)
		return;

	if (node == parent->rb_left && parent->rb_right)
		bfq_update_active_node(parent->rb_right);
	else if (parent->rb_left)
		bfq_update_active_node(parent->rb_left);

	node = parent;
	goto up;
}
/**
 * bfq_active_insert - insert an entity in the active tree of its
 *                     group/device.
 * @st: the service tree of the entity.
 * @entity: the entity being inserted.
 *
 * The active tree is ordered by finish time, but an extra key is kept
 * per each node, containing the minimum value for the start times of
 * its children (and the node itself), so it's possible to search for
 * the eligible node with the lowest finish time in logarithmic time.
 */
static void bfq_active_insert(struct bfq_service_tree *st,
			      struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node = &entity->rb_node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	bfq_insert(&st->active, entity);

	if (node->rb_left)
		node = node->rb_left;
	else if (node->rb_right)
		node = node->rb_right;

	bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
}
/**
 * bfq_ioprio_to_weight - calc a weight from an ioprio.
 * @ioprio: the ioprio value to convert.
 */
static unsigned short bfq_ioprio_to_weight(int ioprio)
{
	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
}

/**
 * bfq_weight_to_ioprio - calc an ioprio from a weight.
 * @weight: the weight value to convert.
 *
 * To preserve as much as possible the old only-ioprio user interface,
 * 0 is used as an escape ioprio value for weights (numerically) equal or
 * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
 */
static unsigned short bfq_weight_to_ioprio(int weight)
{
	return max_t(int, 0,
		     IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
}
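
/*
 * Worked examples for the two conversions above (IOPRIO_BE_NR = 8,
 * BFQ_WEIGHT_CONVERSION_COEFF = 10): the default ioprio
 * BFQ_DEFAULT_QUEUE_IOPRIO = 4 maps to weight (8 - 4) * 10 = 40, and the
 * highest best-effort priority 0 maps to weight 80. Conversely, any weight
 * of at least 80, e.g., the legacy default BFQ_WEIGHT_LEGACY_DFL = 100,
 * makes bfq_weight_to_ioprio() return the escape value 0 described above.
 */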
static void bfq_get_entity(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	if (bfqq) {
		bfqq->ref++;
		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
			     bfqq, bfqq->ref);
	}
}
/**
 * bfq_find_deepest - find the deepest node that an extraction can modify.
 * @node: the node being removed.
 *
 * Do the first step of an extraction in an rb tree, looking for the
 * node that will replace @node, and returning the deepest node that
 * the following modifications to the tree can touch.  If @node is the
 * last node in the tree return %NULL.
 */
static struct rb_node *bfq_find_deepest(struct rb_node *node)
{
	struct rb_node *deepest;

	if (!node->rb_right && !node->rb_left)
		deepest = rb_parent(node);
	else if (!node->rb_right)
		deepest = node->rb_left;
	else if (!node->rb_left)
		deepest = node->rb_right;
	else {
		deepest = rb_next(node);
		if (deepest->rb_right)
			deepest = deepest->rb_right;
		else if (rb_parent(deepest) != node)
			deepest = rb_parent(deepest);
	}

	return deepest;
}
/**
 * bfq_active_extract - remove an entity from the active tree.
 * @st: the service_tree containing the tree.
 * @entity: the entity being removed.
 */
static void bfq_active_extract(struct bfq_service_tree *st,
			       struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	node = bfq_find_deepest(&entity->rb_node);
	bfq_extract(&st->active, entity);

	if (node)
		bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_del(&bfqq->bfqq_list);
}
/**
 * bfq_idle_insert - insert an entity into the idle tree.
 * @st: the service tree containing the tree.
 * @entity: the entity to insert.
 */
static void bfq_idle_insert(struct bfq_service_tree *st,
			    struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
		st->first_idle = entity;
	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
		st->last_idle = entity;

	bfq_insert(&st->idle, entity);

	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
}
/**
 * bfq_forget_entity - do not consider entity any longer for scheduling
 * @st: the service tree.
 * @entity: the entity being removed.
 * @is_in_service: true if entity is currently the in-service entity.
 *
 * Forget everything about @entity. In addition, if entity represents
 * a queue, and the latter is not in service, then release the service
 * reference to the queue (the one taken through bfq_get_entity). In
 * fact, in this case, there is really no more service reference to
 * the queue, as the latter is also outside any service tree. If,
 * instead, the queue is in service, then __bfq_bfqd_reset_in_service
 * will take care of putting the reference when the queue finally
 * stops being served.
 */
static void bfq_forget_entity(struct bfq_service_tree *st,
			      struct bfq_entity *entity,
			      bool is_in_service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->on_st = false;
	st->wsum -= entity->weight;
	if (bfqq && !is_in_service)
		bfq_put_queue(bfqq);
}
/**
 * bfq_put_idle_entity - release the idle tree ref of an entity.
 * @st: service tree for the entity.
 * @entity: the entity being released.
 */
static void bfq_put_idle_entity(struct bfq_service_tree *st,
				struct bfq_entity *entity)
{
	bfq_idle_extract(st, entity);
	bfq_forget_entity(st, entity,
			  entity == entity->sched_data->in_service_entity);
}
/**
 * bfq_forget_idle - update the idle tree if necessary.
 * @st: the service tree to act upon.
 *
 * To preserve the global O(log N) complexity we only remove one entry here;
 * as the idle tree will not grow indefinitely this can be done safely.
 */
static void bfq_forget_idle(struct bfq_service_tree *st)
{
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
	    !bfq_gt(last_idle->finish, st->vtime)) {
		/*
		 * Forget the whole idle tree, increasing the vtime past
		 * the last finish time of idle entities.
		 */
		st->vtime = last_idle->finish;
	}

	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
		bfq_put_idle_entity(st, first_idle);
}
static struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
				struct bfq_entity *entity)
{
	struct bfq_service_tree *new_st = old_st;

	if (entity->prio_changed) {
		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
		unsigned int prev_weight, new_weight;
		struct bfq_data *bfqd = NULL;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		struct bfq_sched_data *sd;
		struct bfq_group *bfqg;
#endif

		if (bfqq)
			bfqd = bfqq->bfqd;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		else {
			sd = entity->my_sched_data;
			bfqg = container_of(sd, struct bfq_group, sched_data);
			bfqd = (struct bfq_data *)bfqg->bfqd;
		}
#endif

		old_st->wsum -= entity->weight;

		if (entity->new_weight != entity->orig_weight) {
			if (entity->new_weight < BFQ_MIN_WEIGHT ||
			    entity->new_weight > BFQ_MAX_WEIGHT) {
				pr_crit("update_weight_prio: new_weight %d\n",
					entity->new_weight);
				if (entity->new_weight < BFQ_MIN_WEIGHT)
					entity->new_weight = BFQ_MIN_WEIGHT;
				else
					entity->new_weight = BFQ_MAX_WEIGHT;
			}
			entity->orig_weight = entity->new_weight;
			if (bfqq)
				bfqq->ioprio =
				  bfq_weight_to_ioprio(entity->orig_weight);
		}

		if (bfqq)
			bfqq->ioprio_class = bfqq->new_ioprio_class;
		entity->prio_changed = 0;

		/*
		 * NOTE: here we may be changing the weight too early,
		 * this will cause unfairness.  The correct approach
		 * would have required additional complexity to defer
		 * weight changes to the proper time instants (i.e.,
		 * when entity->finish <= old_st->vtime).
		 */
		new_st = bfq_entity_service_tree(entity);

		prev_weight = entity->weight;
		new_weight = entity->orig_weight *
			     (bfqq ? bfqq->wr_coeff : 1);
		entity->weight = new_weight;

		new_st->wsum += entity->weight;

		if (new_st != old_st)
			entity->start = new_st->vtime;
	}

	return new_st;
}
static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
/**
 * bfq_bfqq_served - update the scheduler status after selection for
 *                   service.
 * @bfqq: the queue being served.
 * @served: bytes to transfer.
 *
 * NOTE: this can be optimized, as the timestamps of upper level entities
 * are synchronized every time a new bfqq is selected for service.  By now,
 * we keep it to better check consistency.
 */
static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
{
	struct bfq_entity *entity = &bfqq->entity;
	struct bfq_service_tree *st;

	for_each_entity(entity) {
		st = bfq_entity_service_tree(entity);

		entity->service += served;

		st->vtime += bfq_delta(served, st->wsum);
		bfq_forget_idle(st);
	}
	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
}
/**
 * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
 *			  of the time interval during which bfqq has been in
 *			  service.
 * @bfqd: the device
 * @bfqq: the queue that needs a service update.
 * @time_ms: the amount of time during which the queue has received service
 *
 * If a queue does not consume its budget fast enough, then providing
 * the queue with service fairness may impair throughput, more or less
 * severely. For this reason, queues that consume their budget slowly
 * are provided with time fairness instead of service fairness. This
 * goal is achieved through the BFQ scheduling engine, even if such an
 * engine works in the service, and not in the time domain. The trick
 * is charging these queues with an inflated amount of service, equal
 * to the amount of service that they would have received during their
 * service slot if they had been fast, i.e., if their requests had
 * been dispatched at a rate equal to the estimated peak rate.
 *
 * It is worth noting that time fairness can cause important
 * distortions in terms of bandwidth distribution, on devices with
 * internal queueing. The reason is that I/O requests dispatched
 * during the service slot of a queue may be served after that service
 * slot is finished, and may have a total processing time loosely
 * correlated with the duration of the service slot. This is
 * especially true for short service slots.
 */
static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
				 unsigned long time_ms)
{
	struct bfq_entity *entity = &bfqq->entity;
	int tot_serv_to_charge = entity->service;
	unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);

	if (time_ms > 0 && time_ms < timeout_ms)
		tot_serv_to_charge =
			(bfqd->bfq_max_budget * time_ms) / timeout_ms;

	if (tot_serv_to_charge < entity->service)
		tot_serv_to_charge = entity->service;

	/* Increase budget to avoid inconsistencies */
	if (tot_serv_to_charge > entity->budget)
		entity->budget = tot_serv_to_charge;

	bfq_bfqq_served(bfqq,
			max_t(int, 0, tot_serv_to_charge - entity->service));
}
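
/*
 * Worked example for the charge above: with the default bfq_timeout = HZ / 8
 * (i.e., timeout_ms = 125) and a maximum budget of 16384 sectors, a queue
 * that held the device for time_ms = 10 while using little of its budget is
 * charged 16384 * 10 / 125 = 1310 sectors (minus the service it has already
 * received), as if it had been dispatching at the estimated peak rate for
 * those 10 ms.
 */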
static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
					struct bfq_service_tree *st,
					bool backshifted)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	st = __bfq_entity_update_weight_prio(st, entity);
	bfq_calc_finish(entity, entity->budget);

	/*
	 * If some queues enjoy backshifting for a while, then their
	 * (virtual) finish timestamps may happen to become lower and
	 * lower than the system virtual time.	In particular, if
	 * these queues often happen to be idle for short time
	 * periods, and during such time periods other queues with
	 * higher timestamps happen to be busy, then the backshifted
	 * timestamps of the former queues can become much lower than
	 * the system virtual time. In fact, to serve the queues with
	 * higher timestamps while the ones with lower timestamps are
	 * idle, the system virtual time may be pushed-up to much
	 * higher values than the finish timestamps of the idle
	 * queues. As a consequence, the finish timestamps of all new
	 * or newly activated queues may end up being much larger than
	 * those of lucky queues with backshifted timestamps. The
	 * latter queues may then monopolize the device for a lot of
	 * time. This would simply break service guarantees.
	 *
	 * To reduce this problem, push up a little bit the
	 * backshifted timestamps of the queue associated with this
	 * entity (only a queue can happen to have the backshifted
	 * flag set): just enough to let the finish timestamp of the
	 * queue be equal to the current value of the system virtual
	 * time. This may introduce a little unfairness among queues
	 * with backshifted timestamps, but it does not break
	 * worst-case fairness guarantees.
	 *
	 * As a special case, if bfqq is weight-raised, push up
	 * timestamps much less, to keep very low the probability that
	 * this push up causes the backshifted finish timestamps of
	 * weight-raised queues to become higher than the backshifted
	 * finish timestamps of non weight-raised queues.
	 */
	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
		unsigned long delta = st->vtime - entity->finish;

		if (bfqq)
			delta /= bfqq->wr_coeff;

		entity->start += delta;
		entity->finish += delta;
	}

	bfq_active_insert(st, entity);
}
/**
 * __bfq_activate_entity - handle activation of entity.
 * @entity: the entity being activated.
 * @non_blocking_wait_rq: true if entity was waiting for a request
 *
 * Called for a 'true' activation, i.e., if entity is not active and
 * one of its children receives a new request.
 *
 * Basically, this function updates the timestamps of entity and
 * inserts entity into its active tree, after possibly extracting it
 * from its idle tree.
 */
static void __bfq_activate_entity(struct bfq_entity *entity,
				  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	bool backshifted = false;
	unsigned long long min_vstart;

	/* See comments on bfq_bfqq_update_budg_for_activation */
	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
		backshifted = true;
		min_vstart = entity->finish;
	} else
		min_vstart = st->vtime;

	if (entity->tree == &st->idle) {
		/*
		 * Must be on the idle tree, bfq_idle_extract() will
		 * check for that.
		 */
		bfq_idle_extract(st, entity);
		entity->start = bfq_gt(min_vstart, entity->finish) ?
			min_vstart : entity->finish;
	} else {
		/*
		 * The finish time of the entity may be invalid, and
		 * it is in the past for sure, otherwise the queue
		 * would have been on the idle tree.
		 */
		entity->start = min_vstart;
		st->wsum += entity->weight;
		/*
		 * entity is about to be inserted into a service tree,
		 * and then set in service: get a reference to make
		 * sure entity does not disappear until it is no
		 * longer in service or scheduled for service.
		 */
		bfq_get_entity(entity);

		entity->on_st = true;
	}

	bfq_update_fin_time_enqueue(entity, st, backshifted);
}
/**
 * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
 * @entity: the entity being requeued or repositioned.
 *
 * Requeueing is needed if this entity stops being served, which
 * happens if a leaf descendant entity has expired. On the other hand,
 * repositioning is needed if the next_in_service entity for the child
 * entity has changed. See the comments inside the function for
 * details.
 *
 * Basically, this function: 1) removes entity from its active tree if
 * present there, 2) updates the timestamps of entity and 3) inserts
 * entity back into its active tree (in the new, right position for
 * the new values of the timestamps).
 */
static void __bfq_requeue_entity(struct bfq_entity *entity)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (entity == sd->in_service_entity) {
		/*
		 * We are requeueing the current in-service entity,
		 * which may have to be done for one of the following
		 * reasons:
		 * - entity represents the in-service queue, and the
		 *   in-service queue is being requeued after an
		 *   expiration;
		 * - entity represents a group, and its budget has
		 *   changed because one of its child entities has
		 *   just been either activated or requeued for some
		 *   reason; the timestamps of the entity need then to
		 *   be updated, and the entity needs to be enqueued
		 *   or repositioned accordingly.
		 *
		 * In particular, before requeueing, the start time of
		 * the entity must be moved forward to account for the
		 * service that the entity has received while in
		 * service. This is done by the next instructions. The
		 * finish time will then be updated according to this
		 * new value of the start time, and to the budget of
		 * the entity.
		 */
		bfq_calc_finish(entity, entity->service);
		entity->start = entity->finish;
		/*
		 * In addition, if the entity had more than one child
		 * when set in service, then was not extracted from
		 * the active tree. This implies that the position of
		 * the entity in the active tree may need to be
		 * changed now, because we have just updated the start
		 * time of the entity, and we will update its finish
		 * time in a moment (the requeueing is then, more
		 * precisely, a repositioning in this case). To
		 * implement this repositioning, we: 1) dequeue the
		 * entity here, 2) update the finish time and
		 * requeue the entity according to the new
		 * timestamps below.
		 */
		if (entity->tree)
			bfq_active_extract(st, entity);
	} else { /* The entity is already active, and not in service */
		/*
		 * In this case, this function gets called only if the
		 * next_in_service entity below this entity has
		 * changed, and this change has caused the budget of
		 * this entity to change, which, finally implies that
		 * the finish time of this entity must be
		 * updated. Such an update may cause the scheduling,
		 * i.e., the position in the active tree, of this
		 * entity to change. We handle this change by: 1)
		 * dequeueing the entity here, 2) updating the finish
		 * time and requeueing the entity according to the new
		 * timestamps below. This is the same approach as the
		 * non-extracted-entity sub-case above.
		 */
		bfq_active_extract(st, entity);
	}

	bfq_update_fin_time_enqueue(entity, st, false);
}
static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
					  struct bfq_sched_data *sd,
					  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (sd->in_service_entity == entity || entity->tree == &st->active)
		/*
		 * in service or already queued on the active tree,
		 * requeue or reposition
		 */
		__bfq_requeue_entity(entity);
	else
		/*
		 * Not in service and not queued on its active tree:
		 * the activity is idle and this is a true activation.
		 */
		__bfq_activate_entity(entity, non_blocking_wait_rq);
}
/**
 * bfq_activate_requeue_entity - activate or requeue an entity representing a
 *				 bfq_queue, and activate, requeue or reposition
 *				 all ancestors for which such an update becomes
 *				 necessary.
 * @entity: the entity to activate.
 * @non_blocking_wait_rq: true if this entity was waiting for a request
 * @requeue: true if this is a requeue, which implies that bfqq is
 *	     being expired; thus ALL its ancestors stop being served and must
 *	     therefore be requeued
 */
static void bfq_activate_requeue_entity(struct bfq_entity *entity,
					bool non_blocking_wait_rq,
					bool requeue)
{
	struct bfq_sched_data *sd;

	for_each_entity(entity) {
		sd = entity->sched_data;
		__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);

		if (!bfq_update_next_in_service(sd, entity) && !requeue)
			break;
	}
}
/**
 * __bfq_deactivate_entity - deactivate an entity from its service tree.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: if false, the entity will not be put into the
 *			idle tree.
 *
 * Deactivates an entity, independently from its previous state. Must
 * be invoked only if entity is on a service tree. Extracts the entity
 * from that tree, and if necessary and allowed, puts it on the idle
 * tree.
 */
static bool __bfq_deactivate_entity(struct bfq_entity *entity,
				    bool ins_into_idle_tree)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	int is_in_service = entity == sd->in_service_entity;

	if (!entity->on_st) /* entity never activated, or already inactive */
		return false;

	if (is_in_service)
		bfq_calc_finish(entity, entity->service);

	if (entity->tree == &st->active)
		bfq_active_extract(st, entity);
	else if (!is_in_service && entity->tree == &st->idle)
		bfq_idle_extract(st, entity);

	if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
		bfq_forget_entity(st, entity, is_in_service);
	else
		bfq_idle_insert(st, entity);

	return true;
}
/**
 * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: true if the entity can be put on the idle tree
 */
static void bfq_deactivate_entity(struct bfq_entity *entity,
				  bool ins_into_idle_tree,
				  bool expiration)
{
	struct bfq_sched_data *sd;
	struct bfq_entity *parent = NULL;

	for_each_entity_safe(entity, parent) {
		sd = entity->sched_data;

		if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
			/*
			 * entity is not in any tree any more, so
			 * this deactivation is a no-op, and there is
			 * nothing to change for upper-level entities
			 * (in case of expiration, this can never
			 * happen).
			 */
			return;
		}

		if (sd->next_in_service == entity)
			/*
			 * entity was the next_in_service entity,
			 * then, since entity has just been
			 * deactivated, a new one must be found.
			 */
			bfq_update_next_in_service(sd, NULL);

		if (sd->next_in_service)
			/*
			 * The parent entity is still backlogged,
			 * because next_in_service is not NULL. So, no
			 * further upwards deactivation must be
			 * performed. Yet, next_in_service has
			 * changed. Then the schedule does need to be
			 * updated upwards.
			 */
			break;

		/*
		 * If we get here, then the parent is no more
		 * backlogged and we need to propagate the
		 * deactivation upwards. Thus let the loop go on.
		 */

		/*
		 * Also let parent be queued into the idle tree on
		 * deactivation, to preserve service guarantees, and
		 * assuming that who invoked this function does not
		 * need parent entities too to be removed completely.
		 */
		ins_into_idle_tree = true;
	}

	/*
	 * If the deactivation loop is fully executed, then there are
	 * no more entities to touch and next loop is not executed at
	 * all. Otherwise, requeue remaining entities if they are
	 * about to stop receiving service, or reposition them if this
	 * is not the case.
	 */
	entity = parent;
	for_each_entity(entity) {
		/*
		 * Invoke __bfq_requeue_entity on entity, even if
		 * already active, to requeue/reposition it in the
		 * active tree (because sd->next_in_service has
		 * changed)
		 */
		__bfq_requeue_entity(entity);

		sd = entity->sched_data;
		if (!bfq_update_next_in_service(sd, entity) &&
		    !expiration)
			/*
			 * next_in_service unchanged or not causing
			 * any change in entity->parent->sd, and no
			 * requeueing needed for expiration: stop
			 * here.
			 */
			break;
	}
}
/**
 * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
 *                       if needed, to have at least one entity eligible.
 * @st: the service tree to act upon.
 *
 * Assumes that st is not empty.
 */
static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
{
	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);

	if (bfq_gt(root_entity->min_start, st->vtime))
		return root_entity->min_start;

	return st->vtime;
}

static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
{
	if (new_value > st->vtime) {
		st->vtime = new_value;
		bfq_forget_idle(st);
	}
}
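
/*
 * Illustration with made-up values: if st->vtime is 1000 while the
 * smallest min_start among the active entities is 1300, then
 * bfq_calc_vtime_jump() returns 1300 and bfq_update_vtime() jumps the
 * tree's virtual time to 1300, so that at least one entity becomes
 * eligible. A new_value lower than the current vtime is simply
 * ignored: the virtual time never moves backwards.
 */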
/**
 * bfq_first_active_entity - find the eligible entity with
 *                           the smallest finish time
 * @st: the service tree to select from.
 * @vtime: the system virtual time to use as a reference for eligibility
 *
 * This function searches the first schedulable entity, starting from the
 * root of the tree and going on the left every time on this side there is
 * a subtree with at least one eligible (start >= vtime) entity. The path on
 * the right is followed only if a) the left subtree contains no eligible
 * entities and b) no eligible entity has been found yet.
 */
static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
						  u64 vtime)
{
	struct bfq_entity *entry, *first = NULL;
	struct rb_node *node = st->active.rb_node;

	while (node) {
		entry = rb_entry(node, struct bfq_entity, rb_node);
left:
		if (!bfq_gt(entry->start, vtime))
			first = entry;

		if (node->rb_left) {
			entry = rb_entry(node->rb_left,
					 struct bfq_entity, rb_node);
			if (!bfq_gt(entry->min_start, vtime)) {
				node = node->rb_left;
				goto left;
			}
		}
		if (first)
			break;
		node = node->rb_right;
	}

	return first;
}
/**
 * __bfq_lookup_next_entity - return the first eligible entity in @st.
 * @st: the service tree.
 *
 * If there is no in-service entity for the sched_data st belongs to,
 * then return the entity that will be set in service if:
 * 1) the parent entity this st belongs to is set in service;
 * 2) no entity belonging to such parent entity undergoes a state change
 * that would influence the timestamps of the entity (e.g., becomes idle,
 * becomes backlogged, changes its budget, ...).
 *
 * In this first case, update the virtual time in @st too (see the
 * comments on this update inside the function).
 *
 * In contrast, if there is an in-service entity, then return the
 * entity that would be set in service if not only the above
 * conditions, but also the next one held true: the currently
 * in-service entity, on expiration,
 * 1) gets a finish time equal to the current one, or
 * 2) is not eligible any more, or
 * 3) is idle.
 */
static struct bfq_entity *
__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
{
	struct bfq_entity *entity;
	u64 new_vtime;

	if (RB_EMPTY_ROOT(&st->active))
		return NULL;

	/*
	 * Get the value of the system virtual time for which at
	 * least one entity is eligible.
	 */
	new_vtime = bfq_calc_vtime_jump(st);

	/*
	 * If there is no in-service entity for the sched_data this
	 * active tree belongs to, then push the system virtual time
	 * up to the value that guarantees that at least one entity is
	 * eligible. If, instead, there is an in-service entity, then
	 * do not make any such update, because there is already an
	 * eligible entity, namely the in-service one (even if the
	 * entity is not on st, because it was extracted when set in
	 * service).
	 */
	if (!in_service)
		bfq_update_vtime(st, new_vtime);

	entity = bfq_first_active_entity(st, new_vtime);

	return entity;
}
/**
 * bfq_lookup_next_entity - return the first eligible entity in @sd.
 * @sd: the sched_data.
 *
 * This function is invoked when there has been a change in the trees
 * for sd, and we need to know what is the new next entity after this
 * change.
 */
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
{
	struct bfq_service_tree *st = sd->service_tree;
	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
	struct bfq_entity *entity = NULL;
	int class_idx = 0;

	/*
	 * Choose from idle class, if needed to guarantee a minimum
	 * bandwidth to this class (and if there is some active entity
	 * in idle class). This should also mitigate
	 * priority-inversion problems in case a low priority task is
	 * holding file system resources.
	 */
	if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
				   BFQ_CL_IDLE_TIMEOUT)) {
		if (!RB_EMPTY_ROOT(&idle_class_st->active))
			class_idx = BFQ_IOPRIO_CLASSES - 1;
		/* About to be served if backlogged, or not yet backlogged */
		sd->bfq_class_idle_last_service = jiffies;
	}

	/*
	 * Find the next entity to serve for the highest-priority
	 * class, unless the idle class needs to be served.
	 */
	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
		entity = __bfq_lookup_next_entity(st + class_idx,
						  sd->in_service_entity);

		if (entity)
			break;
	}

	return entity;
}
static bool next_queue_may_preempt(struct bfq_data *bfqd)
{
	struct bfq_sched_data *sd = &bfqd->root_group->sched_data;

	return sd->next_in_service != sd->in_service_entity;
}
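
/*
 * The check above is cheap: it only compares two cached pointers at the
 * root level. If, for example, a newly activated queue has obtained a
 * smaller finish timestamp than the queue currently in service,
 * bfq_update_next_in_service() has already changed sd->next_in_service,
 * and the comparison then signals that expiring the in-service queue
 * for a possible preemption is worth considering.
 */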
/*
 * Get next queue for service.
 */
static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
{
	struct bfq_entity *entity = NULL;
	struct bfq_sched_data *sd;
	struct bfq_queue *bfqq;

	if (bfqd->busy_queues == 0)
		return NULL;

	/*
	 * Traverse the path from the root to the leaf entity to
	 * serve. Set in service all the entities visited along the
	 * way.
	 */
	sd = &bfqd->root_group->sched_data;
	for (; sd ; sd = entity->my_sched_data) {
		/*
		 * WARNING. We are about to set the in-service entity
		 * to sd->next_in_service, i.e., to the (cached) value
		 * returned by bfq_lookup_next_entity(sd) the last
		 * time it was invoked, i.e., the last time when the
		 * service order in sd changed as a consequence of the
		 * activation or deactivation of an entity. In this
		 * respect, if we execute bfq_lookup_next_entity(sd)
		 * in this very moment, it may, although with low
		 * probability, yield a different entity than that
		 * pointed to by sd->next_in_service. This rare event
		 * happens in case there was no CLASS_IDLE entity to
		 * serve for sd when bfq_lookup_next_entity(sd) was
		 * invoked for the last time, while there is now one
		 * such entity.
		 *
		 * If the above event happens, then the scheduling of
		 * such entity in CLASS_IDLE is postponed until the
		 * service of the sd->next_in_service entity
		 * finishes. In fact, when the latter is expired,
		 * bfq_lookup_next_entity(sd) gets called again,
		 * exactly to update sd->next_in_service.
		 */

		/* Make next_in_service entity become in_service_entity */
		entity = sd->next_in_service;
		sd->in_service_entity = entity;

		/*
		 * Reset the accumulator of the amount of service that
		 * the entity is about to receive.
		 */
		entity->service = 0;

		/*
		 * If entity is no longer a candidate for next
		 * service, then we extract it from its active tree,
		 * for the following reason. To further boost the
		 * throughput in some special case, BFQ needs to know
		 * which is the next candidate entity to serve, while
		 * there is already an entity in service. In this
		 * respect, to make it easy to compute/update the next
		 * candidate entity to serve after the current
		 * candidate has been set in service, there is a case
		 * where it is necessary to extract the current
		 * candidate from its service tree. Such a case is
		 * when the entity just set in service cannot be also
		 * a candidate for next service. Details about when
		 * this condition holds are reported in the comments
		 * on the function bfq_no_longer_next_in_service()
		 * invoked below.
		 */
		if (bfq_no_longer_next_in_service(entity))
			bfq_active_extract(bfq_entity_service_tree(entity),
					   entity);

		/*
		 * For the same reason why we may have just extracted
		 * entity from its active tree, we may need to update
		 * next_in_service for the sched_data of entity too,
		 * regardless of whether entity has been extracted.
		 * In fact, even if entity has not been extracted, a
		 * descendant entity may get extracted. Such an event
		 * would cause a change in next_in_service for the
		 * level of the descendant entity, and thus possibly
		 * back to upper levels.
		 *
		 * We cannot perform the resulting needed update
		 * before the end of this loop, because, to know which
		 * is the correct next-to-serve candidate entity for
		 * each level, we need first to find the leaf entity
		 * to set in service. In fact, only after we know
		 * which is the next-to-serve leaf entity, we can
		 * discover whether the parent entity of the leaf
		 * entity becomes the next-to-serve, and so on.
		 */
	}

	bfqq = bfq_entity_to_bfqq(entity);

	/*
	 * We can finally update all next-to-serve entities along the
	 * path from the leaf entity just set in service to the root.
	 */
	for_each_entity(entity) {
		struct bfq_sched_data *sd = entity->sched_data;

		if (!bfq_update_next_in_service(sd, NULL))
			break;
	}

	return bfqq;
}
static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
{
	struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
	struct bfq_entity *entity = in_serv_entity;

	if (bfqd->in_service_bic) {
		put_io_context(bfqd->in_service_bic->icq.ioc);
		bfqd->in_service_bic = NULL;
	}

	bfq_clear_bfqq_wait_request(in_serv_bfqq);
	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
	bfqd->in_service_queue = NULL;

	/*
	 * When this function is called, all in-service entities have
	 * been properly deactivated or requeued, so we can safely
	 * execute the final step: reset in_service_entity along the
	 * path from entity to the root.
	 */
	for_each_entity(entity)
		entity->sched_data->in_service_entity = NULL;

	/*
	 * in_serv_entity is no longer in service, so, if it is in no
	 * service tree either, then release the service reference to
	 * the queue it represents (taken with bfq_get_entity).
	 */
	if (!in_serv_entity->on_st)
		bfq_put_queue(in_serv_bfqq);
}
static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
				bool ins_into_idle_tree, bool expiration)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
}

static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
				    false);
	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
}

static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_activate_requeue_entity(entity, false,
				    bfqq == bfqd->in_service_queue);
}
static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);

/*
 * Called when the bfqq no longer has requests pending, remove it from
 * the service tree. As a special case, it can be invoked during an
 * expiration.
 */
static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			      bool expiration)
{
	bfq_log_bfqq(bfqd, bfqq, "del from busy");

	bfq_clear_bfqq_busy(bfqq);

	bfqd->busy_queues--;

	bfqg_stats_update_dequeue(bfqq_group(bfqq));

	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
}

/*
 * Called when an inactive queue receives a new request.
 */
static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqd, bfqq, "add to busy");

	bfq_activate_bfqq(bfqd, bfqq);

	bfq_mark_bfqq_busy(bfqq);
	bfqd->busy_queues++;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED

/* bfqg stats flags */
enum bfqg_stats_flags {
	BFQG_stats_waiting = 0,
	BFQG_stats_idling,
	BFQG_stats_empty,
};

#define BFQG_FLAG_FNS(name)						\
static void bfqg_stats_mark_##name(struct bfqg_stats *stats)		\
{									\
	stats->flags |= (1 << BFQG_stats_##name);			\
}									\
static void bfqg_stats_clear_##name(struct bfqg_stats *stats)		\
{									\
	stats->flags &= ~(1 << BFQG_stats_##name);			\
}									\
static int bfqg_stats_##name(struct bfqg_stats *stats)			\
{									\
	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\
}									\

BFQG_FLAG_FNS(waiting)
BFQG_FLAG_FNS(idling)
BFQG_FLAG_FNS(empty)
#undef BFQG_FLAG_FNS
/* This should be called with the queue_lock held. */
static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
{
	unsigned long long now;

	if (!bfqg_stats_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		blkg_stat_add(&stats->group_wait_time,
			      now - stats->start_group_wait_time);
	bfqg_stats_clear_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
						 struct bfq_group *curr_bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (bfqg_stats_waiting(stats))
		return;
	if (bfqg == curr_bfqg)
		return;
	stats->start_group_wait_time = sched_clock();
	bfqg_stats_mark_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
{
	unsigned long long now;

	if (!bfqg_stats_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		blkg_stat_add(&stats->empty_time,
			      now - stats->start_empty_time);
	bfqg_stats_clear_empty(stats);
}

static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
{
	blkg_stat_add(&bfqg->stats.dequeue, 1);
}

static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (blkg_rwstat_total(&stats->queued))
		return;

	/*
	 * group is already marked empty. This can happen if bfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (bfqg_stats_empty(stats))
		return;

	stats->start_empty_time = sched_clock();
	bfqg_stats_mark_empty(stats);
}

static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (bfqg_stats_idling(stats)) {
		unsigned long long now = sched_clock();

		if (time_after64(now, stats->start_idle_time))
			blkg_stat_add(&stats->idle_time,
				      now - stats->start_idle_time);
		bfqg_stats_clear_idling(stats);
	}
}

static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	stats->start_idle_time = sched_clock();
	bfqg_stats_mark_idling(stats);
}

static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	blkg_stat_add(&stats->avg_queue_size_sum,
		      blkg_rwstat_total(&stats->queued));
	blkg_stat_add(&stats->avg_queue_size_samples, 1);
	bfqg_stats_update_group_wait_time(stats);
}
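
/*
 * The user-visible average queue size is derived from the two counters
 * updated above: a reported value of, say, 3.5 simply means that
 * avg_queue_size_sum / avg_queue_size_samples evaluated to 3.5 at read
 * time (see bfqg_prfill_avg_queue_size() further below).
 */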
/*
 * blk-cgroup policy-related handlers
 * The following functions help in converting between blk-cgroup
 * internal structures and BFQ-specific structures.
 */

static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
}

static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
{
	return pd_to_blkg(&bfqg->pd);
}

static struct blkcg_policy blkcg_policy_bfq;

static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
{
	return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
}

/*
 * bfq_group handlers
 * The following functions help in navigating the bfq_group hierarchy
 * by allowing to find the parent of a bfq_group or the bfq_group
 * associated to a bfq_queue.
 */

static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
{
	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;

	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
}

static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
{
	struct bfq_entity *group_entity = bfqq->entity.parent;

	return group_entity ? container_of(group_entity, struct bfq_group,
					   entity) :
			      bfqq->bfqd->root_group;
}

/*
 * The following two functions handle get and put of a bfq_group by
 * wrapping the related blk-cgroup hooks.
 */

static void bfqg_get(struct bfq_group *bfqg)
{
	return blkg_get(bfqg_to_blkg(bfqg));
}

static void bfqg_put(struct bfq_group *bfqg)
{
	return blkg_put(bfqg_to_blkg(bfqg));
}
static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
				     struct bfq_queue *bfqq,
				     unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.queued, op, 1);
	bfqg_stats_end_empty_time(&bfqg->stats);
	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
}

static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.queued, op, -1);
}

static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.merged, op, 1);
}

static void bfqg_stats_update_completion(struct bfq_group *bfqg,
					 uint64_t start_time,
					 uint64_t io_start_time,
					 unsigned int op)
{
	struct bfqg_stats *stats = &bfqg->stats;
	unsigned long long now = sched_clock();

	if (time_after64(now, io_start_time))
		blkg_rwstat_add(&stats->service_time, op,
				now - io_start_time);
	if (time_after64(io_start_time, start_time))
		blkg_rwstat_add(&stats->wait_time, op,
				io_start_time - start_time);
}
static void bfqg_stats_reset(struct bfqg_stats *stats)
{
	/* queued stats shouldn't be cleared */
	blkg_rwstat_reset(&stats->merged);
	blkg_rwstat_reset(&stats->service_time);
	blkg_rwstat_reset(&stats->wait_time);
	blkg_stat_reset(&stats->time);
	blkg_stat_reset(&stats->avg_queue_size_sum);
	blkg_stat_reset(&stats->avg_queue_size_samples);
	blkg_stat_reset(&stats->dequeue);
	blkg_stat_reset(&stats->group_wait_time);
	blkg_stat_reset(&stats->idle_time);
	blkg_stat_reset(&stats->empty_time);
}

/* @to += @from */
static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
{
	if (!to || !from)
		return;

	/* queued stats shouldn't be cleared */
	blkg_rwstat_add_aux(&to->merged, &from->merged);
	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
	blkg_stat_add_aux(&to->time, &from->time);
	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
	blkg_stat_add_aux(&to->avg_queue_size_samples,
			  &from->avg_queue_size_samples);
	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
}
/*
 * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
 * recursive stats can still account for the amount used by this bfqg after
 * it's gone.
 */
static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
{
	struct bfq_group *parent;

	if (!bfqg) /* root_group */
		return;

	parent = bfqg_parent(bfqg);

	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);

	if (unlikely(!parent))
		return;

	bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
	bfqg_stats_reset(&bfqg->stats);
}
static void bfq_init_entity(struct bfq_entity *entity,
			    struct bfq_group *bfqg)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->weight = entity->new_weight;
	entity->orig_weight = entity->new_weight;
	if (bfqq) {
		bfqq->ioprio = bfqq->new_ioprio;
		bfqq->ioprio_class = bfqq->new_ioprio_class;
		bfqg_get(bfqg);
	}
	entity->parent = bfqg->my_entity; /* NULL for root group */
	entity->sched_data = &bfqg->sched_data;
}
static void bfqg_stats_exit(struct bfqg_stats *stats)
{
	blkg_rwstat_exit(&stats->merged);
	blkg_rwstat_exit(&stats->service_time);
	blkg_rwstat_exit(&stats->wait_time);
	blkg_rwstat_exit(&stats->queued);
	blkg_stat_exit(&stats->time);
	blkg_stat_exit(&stats->avg_queue_size_sum);
	blkg_stat_exit(&stats->avg_queue_size_samples);
	blkg_stat_exit(&stats->dequeue);
	blkg_stat_exit(&stats->group_wait_time);
	blkg_stat_exit(&stats->idle_time);
	blkg_stat_exit(&stats->empty_time);
}

static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
	if (blkg_rwstat_init(&stats->merged, gfp) ||
	    blkg_rwstat_init(&stats->service_time, gfp) ||
	    blkg_rwstat_init(&stats->wait_time, gfp) ||
	    blkg_rwstat_init(&stats->queued, gfp) ||
	    blkg_stat_init(&stats->time, gfp) ||
	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
	    blkg_stat_init(&stats->dequeue, gfp) ||
	    blkg_stat_init(&stats->group_wait_time, gfp) ||
	    blkg_stat_init(&stats->idle_time, gfp) ||
	    blkg_stat_init(&stats->empty_time, gfp)) {
		bfqg_stats_exit(stats);
		return -ENOMEM;
	}

	return 0;
}
static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
{
	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
}

static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
{
	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
}

static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
{
	struct bfq_group_data *bgd;

	bgd = kzalloc(sizeof(*bgd), gfp);
	if (!bgd)
		return NULL;
	return &bgd->pd;
}

static void bfq_cpd_init(struct blkcg_policy_data *cpd)
{
	struct bfq_group_data *d = cpd_to_bfqgd(cpd);

	d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
		CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
}

static void bfq_cpd_free(struct blkcg_policy_data *cpd)
{
	kfree(cpd_to_bfqgd(cpd));
}

static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
{
	struct bfq_group *bfqg;

	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
	if (!bfqg)
		return NULL;

	if (bfqg_stats_init(&bfqg->stats, gfp)) {
		kfree(bfqg);
		return NULL;
	}

	return &bfqg->pd;
}
static void bfq_pd_init(struct blkg_policy_data *pd)
{
	struct blkcg_gq *blkg = pd_to_blkg(pd);
	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
	struct bfq_entity *entity = &bfqg->entity;
	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);

	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
	entity->my_sched_data = &bfqg->sched_data;
	bfqg->my_entity = entity; /*
				   * the root_group's will be set to NULL
				   * in bfq_init_queue()
				   */
	bfqg->bfqd = bfqd;
}

static void bfq_pd_free(struct blkg_policy_data *pd)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);

	bfqg_stats_exit(&bfqg->stats);
	return kfree(bfqg);
}

static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);

	bfqg_stats_reset(&bfqg->stats);
}
static void bfq_group_set_parent(struct bfq_group *bfqg,
				 struct bfq_group *parent)
{
	struct bfq_entity *entity;

	entity = &bfqg->entity;
	entity->parent = parent->my_entity;
	entity->sched_data = &parent->sched_data;
}

static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
					 struct blkcg *blkcg)
{
	struct blkcg_gq *blkg;

	blkg = blkg_lookup(blkcg, bfqd->queue);
	if (likely(blkg))
		return blkg_to_bfqg(blkg);
	return NULL;
}

static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
					    struct blkcg *blkcg)
{
	struct bfq_group *bfqg, *parent;
	struct bfq_entity *entity;

	bfqg = bfq_lookup_bfqg(bfqd, blkcg);

	if (unlikely(!bfqg))
		return NULL;

	/*
	 * Update chain of bfq_groups as we might be handling a leaf group
	 * which, along with some of its relatives, has not been hooked yet
	 * to the private hierarchy of BFQ.
	 */
	entity = &bfqg->entity;
	for_each_entity(entity) {
		bfqg = container_of(entity, struct bfq_group, entity);
		if (bfqg != bfqd->root_group) {
			parent = bfqg_parent(bfqg);
			if (!parent)
				parent = bfqd->root_group;
			bfq_group_set_parent(bfqg, parent);
		}
	}

	return bfqg;
}
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason);

/**
 * bfq_bfqq_move - migrate @bfqq to @bfqg.
 * @bfqd: queue descriptor.
 * @bfqq: the queue to move.
 * @bfqg: the group to move to.
 *
 * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
 * it on the new one. Avoid putting the entity on the old group idle tree.
 *
 * Must be called under the queue lock; the cgroup owning @bfqg must
 * not disappear (by now this just means that we are called under
 * rcu_read_lock()).
 */
static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_group *bfqg)
{
	struct bfq_entity *entity = &bfqq->entity;

	/* If bfqq is empty, then bfq_bfqq_expire also invokes
	 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
	 * from data structures related to current group. Otherwise we
	 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
	 * we do below.
	 */
	if (bfqq == bfqd->in_service_queue)
		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
				false, BFQQE_PREEMPTED);

	if (bfq_bfqq_busy(bfqq))
		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
	else if (entity->on_st)
		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
	bfqg_put(bfqq_group(bfqq));

	/*
	 * Here we use a reference to bfqg. We don't need a refcounter
	 * as the cgroup reference will not be dropped, so that its
	 * destroy() callback will not be invoked.
	 */
	entity->parent = bfqg->my_entity;
	entity->sched_data = &bfqg->sched_data;
	bfqg_get(bfqg);

	if (bfq_bfqq_busy(bfqq))
		bfq_activate_bfqq(bfqd, bfqq);

	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
		bfq_schedule_dispatch(bfqd);
}
/**
 * __bfq_bic_change_cgroup - move @bic to @cgroup.
 * @bfqd: the queue descriptor.
 * @bic: the bic to move.
 * @blkcg: the blk-cgroup to move to.
 *
 * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
 * has to make sure that the reference to cgroup is valid across the call.
 *
 * NOTE: an alternative approach might have been to store the current
 * cgroup in bfqq and getting a reference to it, reducing the lookup
 * time here, at the price of slightly more complex code.
 */
static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
						 struct bfq_io_cq *bic,
						 struct blkcg *blkcg)
{
	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
	struct bfq_group *bfqg;
	struct bfq_entity *entity;

	bfqg = bfq_find_set_group(bfqd, blkcg);

	if (unlikely(!bfqg))
		bfqg = bfqd->root_group;

	if (async_bfqq) {
		entity = &async_bfqq->entity;

		if (entity->sched_data != &bfqg->sched_data) {
			bic_set_bfqq(bic, NULL, 0);
			bfq_log_bfqq(bfqd, async_bfqq,
				     "bic_change_group: %p %d",
				     async_bfqq, async_bfqq->ref);
			bfq_put_queue(async_bfqq);
		}
	}

	if (sync_bfqq) {
		entity = &sync_bfqq->entity;
		if (entity->sched_data != &bfqg->sched_data)
			bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
	}

	return bfqg;
}
static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
{
	struct bfq_data *bfqd = bic_to_bfqd(bic);
	struct bfq_group *bfqg = NULL;
	uint64_t serial_nr;

	rcu_read_lock();
	serial_nr = bio_blkcg(bio)->css.serial_nr;

	/*
	 * Check whether blkcg has changed. The condition may trigger
	 * spuriously on a newly created cic but there's no harm.
	 */
	if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
		goto out;

	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
	bic->blkcg_serial_nr = serial_nr;
out:
	rcu_read_unlock();
}
/**
 * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
 * @st: the service tree being flushed.
 */
static void bfq_flush_idle_tree(struct bfq_service_tree *st)
{
	struct bfq_entity *entity = st->first_idle;

	for (; entity ; entity = st->first_idle)
		__bfq_deactivate_entity(entity, false);
}

/**
 * bfq_reparent_leaf_entity - move leaf entity to the root_group.
 * @bfqd: the device data structure with the root group.
 * @entity: the entity to move.
 */
static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
				     struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
}

/**
 * bfq_reparent_active_entities - move to the root group all active
 *                                entities.
 * @bfqd: the device data structure with the root group.
 * @bfqg: the group to move from.
 * @st: the service tree with the entities.
 *
 * Needs queue_lock to be taken and reference to be valid over the call.
 */
static void bfq_reparent_active_entities(struct bfq_data *bfqd,
					 struct bfq_group *bfqg,
					 struct bfq_service_tree *st)
{
	struct rb_root *active = &st->active;
	struct bfq_entity *entity = NULL;

	if (!RB_EMPTY_ROOT(&st->active))
		entity = bfq_entity_of(rb_first(active));

	for (; entity ; entity = bfq_entity_of(rb_first(active)))
		bfq_reparent_leaf_entity(bfqd, entity);

	if (bfqg->sched_data.in_service_entity)
		bfq_reparent_leaf_entity(bfqd,
					 bfqg->sched_data.in_service_entity);
}
/**
 * bfq_pd_offline - deactivate the entity associated with @pd,
 *		    and reparent its children entities.
 * @pd: descriptor of the policy going offline.
 *
 * blkio already grabs the queue_lock for us, so no need to use
 * RCU-based magic
 */
static void bfq_pd_offline(struct blkg_policy_data *pd)
{
	struct bfq_service_tree *st;
	struct bfq_group *bfqg = pd_to_bfqg(pd);
	struct bfq_data *bfqd = bfqg->bfqd;
	struct bfq_entity *entity = bfqg->my_entity;
	unsigned long flags;
	int i;

	if (!entity) /* root group */
		return;

	spin_lock_irqsave(&bfqd->lock, flags);
	/*
	 * Empty all service_trees belonging to this group before
	 * deactivating the group itself.
	 */
	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
		st = bfqg->sched_data.service_tree + i;

		/*
		 * The idle tree may still contain bfq_queues belonging
		 * to exited tasks because they never migrated to a
		 * different cgroup from the one being destroyed now.
		 * No one else can access them so it's safe to act
		 * without any lock.
		 */
		bfq_flush_idle_tree(st);

		/*
		 * It may happen that some queues are still active
		 * (busy) upon group destruction (if the corresponding
		 * processes have been forced to terminate). We move
		 * all the leaf entities corresponding to these queues
		 * to the root_group.
		 * Also, it may happen that the group has an entity
		 * in service, which is disconnected from the active
		 * tree: it must be moved, too.
		 * There is no need to put the sync queues, as the
		 * scheduler has taken no reference.
		 */
		bfq_reparent_active_entities(bfqd, bfqg, st);
	}

	__bfq_deactivate_entity(entity, false);
	bfq_put_async_queues(bfqd, bfqg);

	spin_unlock_irqrestore(&bfqd->lock, flags);
	/*
	 * @blkg is going offline and will be ignored by
	 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
	 * that they don't get lost. If IOs complete after this point, the
	 * stats for them will be lost. Oh well...
	 */
	bfqg_stats_xfer_dead(bfqg);
}
static void bfq_end_wr_async(struct bfq_data *bfqd)
{
	struct blkcg_gq *blkg;

	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
		struct bfq_group *bfqg = blkg_to_bfqg(blkg);

		bfq_end_wr_async_queues(bfqd, bfqg);
	}
	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
}
static int bfq_io_show_weight(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
	unsigned int val = 0;

	if (bfqgd)
		val = bfqgd->weight;

	seq_printf(sf, "%u\n", val);

	return 0;
}
static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
				    struct cftype *cftype,
				    u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
	struct blkcg_gq *blkg;
	int ret = -ERANGE;

	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
		return ret;

	ret = 0;
	spin_lock_irq(&blkcg->lock);
	bfqgd->weight = (unsigned short)val;
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		struct bfq_group *bfqg = blkg_to_bfqg(blkg);

		if (!bfqg)
			continue;
		/*
		 * Setting the prio_changed flag of the entity
		 * to 1 with new_weight == weight would re-set
		 * the value of the weight to its ioprio mapping.
		 * Set the flag only if necessary.
		 */
		if ((unsigned short)val != bfqg->entity.new_weight) {
			bfqg->entity.new_weight = (unsigned short)val;
			/*
			 * Make sure that the above new value has been
			 * stored in bfqg->entity.new_weight before
			 * setting the prio_changed flag. In fact,
			 * this flag may be read asynchronously (in
			 * critical sections protected by a different
			 * lock than that held here), and finding this
			 * flag set may cause the execution of the code
			 * for updating parameters whose value may
			 * depend also on bfqg->entity.new_weight (in
			 * __bfq_entity_update_weight_prio).
			 * This barrier makes sure that the new value
			 * of bfqg->entity.new_weight is correctly
			 * seen in that code.
			 */
			smp_wmb();
			bfqg->entity.prio_changed = 1;
		}
	}
	spin_unlock_irq(&blkcg->lock);

	return ret;
}
static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
				 char *buf, size_t nbytes,
				 loff_t off)
{
	u64 weight;
	/* First unsigned long found in the file is used */
	int ret = kstrtoull(strim(buf), 0, &weight);

	if (ret)
		return ret;

	return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
}
static int bfqg_print_stat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
			  &blkcg_policy_bfq, seq_cft(sf)->private, false);
	return 0;
}

static int bfqg_print_rwstat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
			  &blkcg_policy_bfq, seq_cft(sf)->private, true);
	return 0;
}

static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
					  &blkcg_policy_bfq, off);
	return __blkg_prfill_u64(sf, pd, sum);
}

static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
					struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
							   &blkcg_policy_bfq,
							   off);
	return __blkg_prfill_rwstat(sf, pd, &sum);
}

static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
			  seq_cft(sf)->private, false);
	return 0;
}

static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
			  seq_cft(sf)->private, true);
	return 0;
}
static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
			       int off)
{
	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);

	return __blkg_prfill_u64(sf, pd, sum >> 9);
}

static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
	return 0;
}

static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
					 struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
					offsetof(struct blkcg_gq, stat_bytes));
	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);

	return __blkg_prfill_u64(sf, pd, sum >> 9);
}

static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
			  false);
	return 0;
}
static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);
	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
	u64 v = 0;

	if (samples) {
		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
		v = div64_u64(v, samples);
	}
	__blkg_prfill_u64(sf, pd, v);
	return 0;
}

/* print avg_queue_size */
static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
			  0, false);
	return 0;
}
static struct bfq_group *
bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
{
	int ret;

	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
	if (ret)
		return NULL;

	return blkg_to_bfqg(bfqd->queue->root_blkg);
}
static struct cftype bfq_blkcg_legacy_files[] = {
	{
		.name = "bfq.weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = bfq_io_show_weight,
		.write_u64 = bfq_io_set_weight_legacy,
	},

	/* statistics, covers only the tasks in the bfqg */
	{
		.name = "bfq.time",
		.private = offsetof(struct bfq_group, stats.time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.sectors",
		.seq_show = bfqg_print_stat_sectors,
	},
	{
		.name = "bfq.io_service_bytes",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_bytes,
	},
	{
		.name = "bfq.io_serviced",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_ios,
	},
	{
		.name = "bfq.io_service_time",
		.private = offsetof(struct bfq_group, stats.service_time),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_wait_time",
		.private = offsetof(struct bfq_group, stats.wait_time),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_merged",
		.private = offsetof(struct bfq_group, stats.merged),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_queued",
		.private = offsetof(struct bfq_group, stats.queued),
		.seq_show = bfqg_print_rwstat,
	},

	/* the same statistics which cover the bfqg and its descendants */
	{
		.name = "bfq.time_recursive",
		.private = offsetof(struct bfq_group, stats.time),
		.seq_show = bfqg_print_stat_recursive,
	},
	{
		.name = "bfq.sectors_recursive",
		.seq_show = bfqg_print_stat_sectors_recursive,
	},
	{
		.name = "bfq.io_service_bytes_recursive",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_bytes_recursive,
	},
	{
		.name = "bfq.io_serviced_recursive",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_ios_recursive,
	},
	{
		.name = "bfq.io_service_time_recursive",
		.private = offsetof(struct bfq_group, stats.service_time),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_wait_time_recursive",
		.private = offsetof(struct bfq_group, stats.wait_time),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_merged_recursive",
		.private = offsetof(struct bfq_group, stats.merged),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_queued_recursive",
		.private = offsetof(struct bfq_group, stats.queued),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.avg_queue_size",
		.seq_show = bfqg_print_avg_queue_size,
	},
	{
		.name = "bfq.group_wait_time",
		.private = offsetof(struct bfq_group, stats.group_wait_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.idle_time",
		.private = offsetof(struct bfq_group, stats.idle_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.empty_time",
		.private = offsetof(struct bfq_group, stats.empty_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.dequeue",
		.private = offsetof(struct bfq_group, stats.dequeue),
		.seq_show = bfqg_print_stat,
	},
	{ } /* terminate */
};
static struct cftype bfq_blkg_files[] = {
	{
		.name = "bfq.weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = bfq_io_show_weight,
		.write = bfq_io_set_weight,
	},
	{ } /* terminate */
};
#else	/* CONFIG_BFQ_GROUP_IOSCHED */

static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
			struct bfq_queue *bfqq, unsigned int op) { }
static inline void
bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
static inline void
bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
			uint64_t start_time, uint64_t io_start_time,
			unsigned int op) { }
static inline void
bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
				     struct bfq_group *curr_bfqg) { }
static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_group *bfqg) {}

static void bfq_init_entity(struct bfq_entity *entity,
			    struct bfq_group *bfqg)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->weight = entity->new_weight;
	entity->orig_weight = entity->new_weight;
	if (bfqq) {
		bfqq->ioprio = bfqq->new_ioprio;
		bfqq->ioprio_class = bfqq->new_ioprio_class;
	}
	entity->sched_data = &bfqg->sched_data;
}

static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}

static void bfq_end_wr_async(struct bfq_data *bfqd)
{
	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
}
static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
					    struct blkcg *blkcg)
{
	return bfqd->root_group;
}

static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
{
	return bfqq->bfqd->root_group;
}

static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd,
						    int node)
{
	struct bfq_group *bfqg;
	int i;

	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
	if (!bfqg)
		return NULL;

	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

	return bfqg;
}
#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define bfq_sample_valid(samples)	((samples) > 80)
/*
 * Lifted from AS - choose which of rq1 and rq2 is best served now.
 * We choose the request that is closest to the head right now. Distance
 * behind the head is penalized and only allowed to a certain extent.
 */
static struct request *bfq_choose_req(struct bfq_data *bfqd,
				      struct request *rq1,
				      struct request *rq2,
				      sector_t last)
{
	sector_t s1, s2, d1 = 0, d2 = 0;
	unsigned long back_max;
#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */
#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */
	unsigned int wrap = 0; /* bit mask: requests behind the disk head? */

	if (!rq1 || rq1 == rq2)
		return rq2;
	if (!rq2)
		return rq1;

	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
		return rq1;
	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
		return rq2;
	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
		return rq1;
	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
		return rq2;

	s1 = blk_rq_pos(rq1);
	s2 = blk_rq_pos(rq2);

	/*
	 * By definition, 1KiB is 2 sectors.
	 */
	back_max = bfqd->bfq_back_max * 2;

	/*
	 * Strict one way elevator _except_ in the case where we allow
	 * short backward seeks which are biased as twice the cost of a
	 * similar forward seek.
	 */
	if (s1 >= last)
		d1 = s1 - last;
	else if (s1 + back_max >= last)
		d1 = (last - s1) * bfqd->bfq_back_penalty;
	else
		wrap |= BFQ_RQ1_WRAP;

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2 + back_max >= last)
		d2 = (last - s2) * bfqd->bfq_back_penalty;
	else
		wrap |= BFQ_RQ2_WRAP;

	/* Found required data */

	/*
	 * By doing switch() on the bit mask "wrap" we avoid having to
	 * check two variables for all permutations: --> faster!
	 */
	switch (wrap) {
	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
		if (d1 < d2)
			return rq1;
		else if (d2 < d1)
			return rq2;

		if (s1 >= s2)
			return rq1;
		else
			return rq2;

	case BFQ_RQ2_WRAP:
		return rq1;
	case BFQ_RQ1_WRAP:
		return rq2;
	case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */
	default:
		/*
		 * Since both rqs are wrapped,
		 * start with the one that's further behind head
		 * (--> only *one* back seek required),
		 * since back seek takes more time than forward.
		 */
		if (s1 <= s2)
			return rq1;
		else
			return rq2;
	}
}
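
/*
 * Numeric illustration, assuming, e.g., bfq_back_max = 16384 KiB and
 * bfq_back_penalty = 2: a request lying 1000 sectors behind the head
 * gets an effective distance of 2000 sectors, so it still wins against
 * a forward request 3000 sectors ahead, but loses to any forward
 * request closer than 2000 sectors.
 */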
/*
 * Return expired entry, or NULL to just start from scratch in rbtree.
 */
static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
				      struct request *last)
{
	struct request *rq;

	if (bfq_bfqq_fifo_expire(bfqq))
		return NULL;

	bfq_mark_bfqq_fifo_expire(bfqq);

	rq = rq_entry_fifo(bfqq->fifo.next);

	if (rq == last || ktime_get_ns() < rq->fifo_time)
		return NULL;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
	return rq;
}
static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
					struct bfq_queue *bfqq,
					struct request *last)
{
	struct rb_node *rbnext = rb_next(&last->rb_node);
	struct rb_node *rbprev = rb_prev(&last->rb_node);
	struct request *next, *prev = NULL;

	/* Follow expired path, else get first next available. */
	next = bfq_check_fifo(bfqq, last);
	if (next)
		return next;

	if (rbprev)
		prev = rb_entry_rq(rbprev);

	if (rbnext)
		next = rb_entry_rq(rbnext);
	else {
		rbnext = rb_first(&bfqq->sort_list);
		if (rbnext && rbnext != &last->rb_node)
			next = rb_entry_rq(rbnext);
	}

	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
}
3775 /* see the definition of bfq_async_charge_factor for details */
3776 static unsigned long bfq_serv_to_charge(struct request
*rq
,
3777 struct bfq_queue
*bfqq
)
3779 if (bfq_bfqq_sync(bfqq
) || bfqq
->wr_coeff
> 1)
3780 return blk_rq_sectors(rq
);
3782 return blk_rq_sectors(rq
) * bfq_async_charge_factor
;
3786 * bfq_updated_next_req - update the queue after a new next_rq selection.
3787 * @bfqd: the device data the queue belongs to.
3788 * @bfqq: the queue to update.
3790 * If the first request of a queue changes we make sure that the queue
3791 * has enough budget to serve at least its first request (if the
3792 * request has grown). We do this because if the queue has not enough
3793 * budget for its first request, it has to go through two dispatch
3794 * rounds to actually get it dispatched.
3796 static void bfq_updated_next_req(struct bfq_data
*bfqd
,
3797 struct bfq_queue
*bfqq
)
3799 struct bfq_entity
*entity
= &bfqq
->entity
;
3800 struct request
*next_rq
= bfqq
->next_rq
;
3801 unsigned long new_budget
;
3806 if (bfqq
== bfqd
->in_service_queue
)
3808 * In order not to break guarantees, budgets cannot be
3809 * changed after an entity has been selected.
3813 new_budget
= max_t(unsigned long, bfqq
->max_budget
,
3814 bfq_serv_to_charge(next_rq
, bfqq
));
3815 if (entity
->budget
!= new_budget
) {
3816 entity
->budget
= new_budget
;
3817 bfq_log_bfqq(bfqd
, bfqq
, "updated next rq: new budget %lu",
3819 bfq_requeue_bfqq(bfqd
, bfqq
);
3823 static int bfq_bfqq_budget_left(struct bfq_queue
*bfqq
)
3825 struct bfq_entity
*entity
= &bfqq
->entity
;
3827 return entity
->budget
- entity
->service
;
3831 * If enough samples have been computed, return the current max budget
3832 * stored in bfqd, which is dynamically updated according to the
3833 * estimated disk peak rate; otherwise return the default max budget
3835 static int bfq_max_budget(struct bfq_data
*bfqd
)
3837 if (bfqd
->budgets_assigned
< bfq_stats_min_budgets
)
3838 return bfq_default_max_budget
;
3840 return bfqd
->bfq_max_budget
;
3844 * Return min budget, which is a fraction of the current or default
3845 * max budget (trying with 1/32)
3847 static int bfq_min_budget(struct bfq_data
*bfqd
)
3849 if (bfqd
->budgets_assigned
< bfq_stats_min_budgets
)
3850 return bfq_default_max_budget
/ 32;
3852 return bfqd
->bfq_max_budget
/ 32;
3855 static void bfq_bfqq_expire(struct bfq_data
*bfqd
,
3856 struct bfq_queue
*bfqq
,
3858 enum bfqq_expiration reason
);
3861 * The next function, invoked after the input queue bfqq switches from
3862 * idle to busy, updates the budget of bfqq. The function also tells
3863 * whether the in-service queue should be expired, by returning
3864 * true. The purpose of expiring the in-service queue is to give bfqq
3865 * the chance to possibly preempt the in-service queue, and the reason
3866 * for preempting the in-service queue is to achieve one of the two
3869 * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
3870 * expired because it has remained idle. In particular, bfqq may have
3871 * expired for one of the following two reasons:
3873 * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
3874 * and did not make it to issue a new request before its last
3875 * request was served;
3877 * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
3878 * a new request before the expiration of the idling-time.
3880 * Even if bfqq has expired for one of the above reasons, the process
3881 * associated with the queue may be however issuing requests greedily,
3882 * and thus be sensitive to the bandwidth it receives (bfqq may have
3883 * remained idle for other reasons: CPU high load, bfqq not enjoying
3884 * idling, I/O throttling somewhere in the path from the process to
3885 * the I/O scheduler, ...). But if, after every expiration for one of
3886 * the above two reasons, bfqq has to wait for the service of at least
3887 * one full budget of another queue before being served again, then
3888 * bfqq is likely to get a much lower bandwidth or resource time than
3889 * its reserved ones. To address this issue, two countermeasures need
3892 * First, the budget and the timestamps of bfqq need to be updated in
3893 * a special way on bfqq reactivation: they need to be updated as if
3894 * bfqq did not remain idle and did not expire. In fact, if they are
3895 * computed as if bfqq expired and remained idle until reactivation,
3896 * then the process associated with bfqq is treated as if, instead of
3897 * being greedy, it stopped issuing requests when bfqq remained idle,
3898 * and restarts issuing requests only on this reactivation. In other
3899 * words, the scheduler does not help the process recover the "service
3900 * hole" between bfqq expiration and reactivation. As a consequence,
3901 * the process receives a lower bandwidth than its reserved one. In
3902 * contrast, to recover this hole, the budget must be updated as if
3903 * bfqq was not expired at all before this reactivation, i.e., it must
3904 * be set to the value of the remaining budget when bfqq was
3905 * expired. Along the same line, timestamps need to be assigned the
3906 * value they had the last time bfqq was selected for service, i.e.,
3907 * before last expiration. Thus timestamps need to be back-shifted
3908 * with respect to their normal computation (see [1] for more details
3909 * on this tricky aspect).
3911 * Secondly, to allow the process to recover the hole, the in-service
3912 * queue must be expired too, to give bfqq the chance to preempt it
3913 * immediately. In fact, if bfqq has to wait for a full budget of the
3914 * in-service queue to be completed, then it may become impossible to
3915 * let the process recover the hole, even if the back-shifted
3916 * timestamps of bfqq are lower than those of the in-service queue. If
3917 * this happens for most or all of the holes, then the process may not
3918 * receive its reserved bandwidth. In this respect, it is worth noting
3919 * that, being the service of outstanding requests unpreemptible, a
3920 * little fraction of the holes may however be unrecoverable, thereby
3921 * causing a little loss of bandwidth.
3923 * The last important point is detecting whether bfqq does need this
3924 * bandwidth recovery. In this respect, the next function deems the
3925 * process associated with bfqq greedy, and thus allows it to recover
3926 * the hole, if: 1) the process is waiting for the arrival of a new
3927 * request (which implies that bfqq expired for one of the above two
3928 * reasons), and 2) such a request has arrived soon. The first
3929 * condition is controlled through the flag non_blocking_wait_rq,
3930 * while the second through the flag arrived_in_time. If both
3931 * conditions hold, then the function computes the budget in the
3932 * above-described special way, and signals that the in-service queue
3933 * should be expired. Timestamp back-shifting is done later in
3934 * __bfq_activate_entity.
3936 * 2. Reduce latency. Even if timestamps are not backshifted to let
3937 * the process associated with bfqq recover a service hole, bfqq may
3938 * however happen to have, after being (re)activated, a lower finish
3939 * timestamp than the in-service queue. That is, the next budget of
3940 * bfqq may have to be completed before the one of the in-service
3941 * queue. If this is the case, then preempting the in-service queue
3942 * allows this goal to be achieved, apart from the unpreemptible,
3943 * outstanding requests mentioned above.
3945 * Unfortunately, regardless of which of the above two goals one wants
3946 * to achieve, service trees need first to be updated to know whether
3947 * the in-service queue must be preempted. To have service trees
3948 * correctly updated, the in-service queue must be expired and
3949 * rescheduled, and bfqq must be scheduled too. This is one of the
3950 * most costly operations (in future versions, the scheduling
3951 * mechanism may be re-designed in such a way to make it possible to
3952 * know whether preemption is needed without needing to update service
3953 * trees). In addition, queue preemptions almost always cause random
3954 * I/O, and thus loss of throughput. Because of these facts, the next
3955 * function adopts the following simple scheme to avoid both costly
3956 * operations and too frequent preemptions: it requests the expiration
3957 * of the in-service queue (unconditionally) only for queues that need
3958 * to recover a hole, or that either are weight-raised or deserve to
3961 static bool bfq_bfqq_update_budg_for_activation(struct bfq_data
*bfqd
,
3962 struct bfq_queue
*bfqq
,
3963 bool arrived_in_time
,
3964 bool wr_or_deserves_wr
)
3966 struct bfq_entity
*entity
= &bfqq
->entity
;
3968 if (bfq_bfqq_non_blocking_wait_rq(bfqq
) && arrived_in_time
) {
3970 * We do not clear the flag non_blocking_wait_rq here, as
3971 * the latter is used in bfq_activate_bfqq to signal
3972 * that timestamps need to be back-shifted (and is
3973 * cleared right after).
3977 * In next assignment we rely on that either
3978 * entity->service or entity->budget are not updated
3979 * on expiration if bfqq is empty (see
3980 * __bfq_bfqq_recalc_budget). Thus both quantities
3981 * remain unchanged after such an expiration, and the
3982 * following statement therefore assigns to
3983 * entity->budget the remaining budget on such an
3984 * expiration. For clarity, entity->service is not
3985 * updated on expiration in any case, and, in normal
3986 * operation, is reset only when bfqq is selected for
3987 * service (see bfq_get_next_queue).
3989 entity
->budget
= min_t(unsigned long,
3990 bfq_bfqq_budget_left(bfqq
),
3996 entity
->budget
= max_t(unsigned long, bfqq
->max_budget
,
3997 bfq_serv_to_charge(bfqq
->next_rq
, bfqq
));
3998 bfq_clear_bfqq_non_blocking_wait_rq(bfqq
);
3999 return wr_or_deserves_wr
;
4002 static unsigned int bfq_wr_duration(struct bfq_data
*bfqd
)
4006 if (bfqd
->bfq_wr_max_time
> 0)
4007 return bfqd
->bfq_wr_max_time
;
4009 dur
= bfqd
->RT_prod
;
4010 do_div(dur
, bfqd
->peak_rate
);
4013 * Limit duration between 3 and 13 seconds. Tests show that
4014 * higher values than 13 seconds often yield the opposite of
4015 * the desired result, i.e., worsen responsiveness by letting
4016 * non-interactive and non-soft-real-time applications
4017 * preserve weight raising for a too long time interval.
4019 * On the other end, lower values than 3 seconds make it
4020 * difficult for most interactive tasks to complete their jobs
4021 * before weight-raising finishes.
4023 if (dur
> msecs_to_jiffies(13000))
4024 dur
= msecs_to_jiffies(13000);
4025 else if (dur
< msecs_to_jiffies(3000))
4026 dur
= msecs_to_jiffies(3000);
4031 static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data
*bfqd
,
4032 struct bfq_queue
*bfqq
,
4033 unsigned int old_wr_coeff
,
4034 bool wr_or_deserves_wr
,
4038 if (old_wr_coeff
== 1 && wr_or_deserves_wr
) {
4039 /* start a weight-raising period */
4041 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
;
4042 bfqq
->wr_cur_max_time
= bfq_wr_duration(bfqd
);
4044 bfqq
->wr_start_at_switch_to_srt
= jiffies
;
4045 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
*
4046 BFQ_SOFTRT_WEIGHT_FACTOR
;
4047 bfqq
->wr_cur_max_time
=
4048 bfqd
->bfq_wr_rt_max_time
;
4052 * If needed, further reduce budget to make sure it is
4053 * close to bfqq's backlog, so as to reduce the
4054 * scheduling-error component due to a too large
4055 * budget. Do not care about throughput consequences,
4056 * but only about latency. Finally, do not assign a
4057 * too small budget either, to avoid increasing
4058 * latency by causing too frequent expirations.
4060 bfqq
->entity
.budget
= min_t(unsigned long,
4061 bfqq
->entity
.budget
,
4062 2 * bfq_min_budget(bfqd
));
4063 } else if (old_wr_coeff
> 1) {
4064 if (interactive
) { /* update wr coeff and duration */
4065 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
;
4066 bfqq
->wr_cur_max_time
= bfq_wr_duration(bfqd
);
4067 } else if (soft_rt
) {
4069 * The application is now or still meeting the
4070 * requirements for being deemed soft rt. We
4071 * can then correctly and safely (re)charge
4072 * the weight-raising duration for the
4073 * application with the weight-raising
4074 * duration for soft rt applications.
4076 * In particular, doing this recharge now, i.e.,
4077 * before the weight-raising period for the
4078 * application finishes, reduces the probability
4079 * of the following negative scenario:
4080 * 1) the weight of a soft rt application is
4081 * raised at startup (as for any newly
4082 * created application),
4083 * 2) since the application is not interactive,
4084 * at a certain time weight-raising is
4085 * stopped for the application,
4086 * 3) at that time the application happens to
4087 * still have pending requests, and hence
4088 * is destined to not have a chance to be
4089 * deemed soft rt before these requests are
4090 * completed (see the comments to the
4091 * function bfq_bfqq_softrt_next_start()
4092 * for details on soft rt detection),
4093 * 4) these pending requests experience a high
4094 * latency because the application is not
4095 * weight-raised while they are pending.
4097 if (bfqq
->wr_cur_max_time
!=
4098 bfqd
->bfq_wr_rt_max_time
) {
4099 bfqq
->wr_start_at_switch_to_srt
=
4100 bfqq
->last_wr_start_finish
;
4102 bfqq
->wr_cur_max_time
=
4103 bfqd
->bfq_wr_rt_max_time
;
4104 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
*
4105 BFQ_SOFTRT_WEIGHT_FACTOR
;
4107 bfqq
->last_wr_start_finish
= jiffies
;
4112 static bool bfq_bfqq_idle_for_long_time(struct bfq_data
*bfqd
,
4113 struct bfq_queue
*bfqq
)
4115 return bfqq
->dispatched
== 0 &&
4116 time_is_before_jiffies(
4117 bfqq
->budget_timeout
+
4118 bfqd
->bfq_wr_min_idle_time
);
4121 static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data
*bfqd
,
4122 struct bfq_queue
*bfqq
,
4127 bool soft_rt
, wr_or_deserves_wr
, bfqq_wants_to_preempt
,
4128 idle_for_long_time
= bfq_bfqq_idle_for_long_time(bfqd
, bfqq
),
4130 * See the comments on
4131 * bfq_bfqq_update_budg_for_activation for
4132 * details on the usage of the next variable.
4134 arrived_in_time
= ktime_get_ns() <=
4135 bfqq
->ttime
.last_end_request
+
4136 bfqd
->bfq_slice_idle
* 3;
4138 bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq
)), bfqq
, rq
->cmd_flags
);
4141 * bfqq deserves to be weight-raised if:
4143 * - it has been idle for enough time or is soft real-time.
4145 soft_rt
= bfqd
->bfq_wr_max_softrt_rate
> 0 &&
4146 time_is_before_jiffies(bfqq
->soft_rt_next_start
);
4147 *interactive
= idle_for_long_time
;
4148 wr_or_deserves_wr
= bfqd
->low_latency
&&
4149 (bfqq
->wr_coeff
> 1 ||
4150 (bfq_bfqq_sync(bfqq
) && (*interactive
|| soft_rt
)));
4153 * Using the last flag, update budget and check whether bfqq
4154 * may want to preempt the in-service queue.
4156 bfqq_wants_to_preempt
=
4157 bfq_bfqq_update_budg_for_activation(bfqd
, bfqq
,
4161 if (!bfq_bfqq_IO_bound(bfqq
)) {
4162 if (arrived_in_time
) {
4163 bfqq
->requests_within_timer
++;
4164 if (bfqq
->requests_within_timer
>=
4165 bfqd
->bfq_requests_within_timer
)
4166 bfq_mark_bfqq_IO_bound(bfqq
);
4168 bfqq
->requests_within_timer
= 0;
4171 if (bfqd
->low_latency
) {
4172 bfq_update_bfqq_wr_on_rq_arrival(bfqd
, bfqq
,
4178 if (old_wr_coeff
!= bfqq
->wr_coeff
)
4179 bfqq
->entity
.prio_changed
= 1;
4182 bfqq
->last_idle_bklogged
= jiffies
;
4183 bfqq
->service_from_backlogged
= 0;
4184 bfq_clear_bfqq_softrt_update(bfqq
);
4186 bfq_add_bfqq_busy(bfqd
, bfqq
);
4189 * Expire in-service queue only if preemption may be needed
4190 * for guarantees. In this respect, the function
4191 * next_queue_may_preempt just checks a simple, necessary
4192 * condition, and not a sufficient condition based on
4193 * timestamps. In fact, for the latter condition to be
4194 * evaluated, timestamps would need first to be updated, and
4195 * this operation is quite costly (see the comments on the
4196 * function bfq_bfqq_update_budg_for_activation).
4198 if (bfqd
->in_service_queue
&& bfqq_wants_to_preempt
&&
4199 bfqd
->in_service_queue
->wr_coeff
< bfqq
->wr_coeff
&&
4200 next_queue_may_preempt(bfqd
))
4201 bfq_bfqq_expire(bfqd
, bfqd
->in_service_queue
,
4202 false, BFQQE_PREEMPTED
);
4205 static void bfq_add_request(struct request
*rq
)
4207 struct bfq_queue
*bfqq
= RQ_BFQQ(rq
);
4208 struct bfq_data
*bfqd
= bfqq
->bfqd
;
4209 struct request
*next_rq
, *prev
;
4210 unsigned int old_wr_coeff
= bfqq
->wr_coeff
;
4211 bool interactive
= false;
4213 bfq_log_bfqq(bfqd
, bfqq
, "add_request %d", rq_is_sync(rq
));
4214 bfqq
->queued
[rq_is_sync(rq
)]++;
4217 elv_rb_add(&bfqq
->sort_list
, rq
);
4220 * Check if this request is a better next-serve candidate.
4222 prev
= bfqq
->next_rq
;
4223 next_rq
= bfq_choose_req(bfqd
, bfqq
->next_rq
, rq
, bfqd
->last_position
);
4224 bfqq
->next_rq
= next_rq
;
4226 if (!bfq_bfqq_busy(bfqq
)) /* switching to busy ... */
4227 bfq_bfqq_handle_idle_busy_switch(bfqd
, bfqq
, old_wr_coeff
,
4230 if (bfqd
->low_latency
&& old_wr_coeff
== 1 && !rq_is_sync(rq
) &&
4231 time_is_before_jiffies(
4232 bfqq
->last_wr_start_finish
+
4233 bfqd
->bfq_wr_min_inter_arr_async
)) {
4234 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
;
4235 bfqq
->wr_cur_max_time
= bfq_wr_duration(bfqd
);
4237 bfqq
->entity
.prio_changed
= 1;
4239 if (prev
!= bfqq
->next_rq
)
4240 bfq_updated_next_req(bfqd
, bfqq
);
4244 * Assign jiffies to last_wr_start_finish in the following
4247 * . if bfqq is not going to be weight-raised, because, for
4248 * non weight-raised queues, last_wr_start_finish stores the
4249 * arrival time of the last request; as of now, this piece
4250 * of information is used only for deciding whether to
4251 * weight-raise async queues
4253 * . if bfqq is not weight-raised, because, if bfqq is now
4254 * switching to weight-raised, then last_wr_start_finish
4255 * stores the time when weight-raising starts
4257 * . if bfqq is interactive, because, regardless of whether
4258 * bfqq is currently weight-raised, the weight-raising
4259 * period must start or restart (this case is considered
4260 * separately because it is not detected by the above
4261 * conditions, if bfqq is already weight-raised)
4263 * last_wr_start_finish has to be updated also if bfqq is soft
4264 * real-time, because the weight-raising period is constantly
4265 * restarted on idle-to-busy transitions for these queues, but
4266 * this is already done in bfq_bfqq_handle_idle_busy_switch if
4269 if (bfqd
->low_latency
&&
4270 (old_wr_coeff
== 1 || bfqq
->wr_coeff
== 1 || interactive
))
4271 bfqq
->last_wr_start_finish
= jiffies
;
4274 static struct request
*bfq_find_rq_fmerge(struct bfq_data
*bfqd
,
4276 struct request_queue
*q
)
4278 struct bfq_queue
*bfqq
= bfqd
->bio_bfqq
;
4282 return elv_rb_find(&bfqq
->sort_list
, bio_end_sector(bio
));
4287 static sector_t
get_sdist(sector_t last_pos
, struct request
*rq
)
4290 return abs(blk_rq_pos(rq
) - last_pos
);
4295 #if 0 /* Still not clear if we can do without next two functions */
4296 static void bfq_activate_request(struct request_queue
*q
, struct request
*rq
)
4298 struct bfq_data
*bfqd
= q
->elevator
->elevator_data
;
4300 bfqd
->rq_in_driver
++;
4303 static void bfq_deactivate_request(struct request_queue
*q
, struct request
*rq
)
4305 struct bfq_data
*bfqd
= q
->elevator
->elevator_data
;
4307 bfqd
->rq_in_driver
--;
4311 static void bfq_remove_request(struct request_queue
*q
,
4314 struct bfq_queue
*bfqq
= RQ_BFQQ(rq
);
4315 struct bfq_data
*bfqd
= bfqq
->bfqd
;
4316 const int sync
= rq_is_sync(rq
);
4318 if (bfqq
->next_rq
== rq
) {
4319 bfqq
->next_rq
= bfq_find_next_rq(bfqd
, bfqq
, rq
);
4320 bfq_updated_next_req(bfqd
, bfqq
);
4323 if (rq
->queuelist
.prev
!= &rq
->queuelist
)
4324 list_del_init(&rq
->queuelist
);
4325 bfqq
->queued
[sync
]--;
4327 elv_rb_del(&bfqq
->sort_list
, rq
);
4329 elv_rqhash_del(q
, rq
);
4330 if (q
->last_merge
== rq
)
4331 q
->last_merge
= NULL
;
4333 if (RB_EMPTY_ROOT(&bfqq
->sort_list
)) {
4334 bfqq
->next_rq
= NULL
;
4336 if (bfq_bfqq_busy(bfqq
) && bfqq
!= bfqd
->in_service_queue
) {
4337 bfq_del_bfqq_busy(bfqd
, bfqq
, false);
4339 * bfqq emptied. In normal operation, when
4340 * bfqq is empty, bfqq->entity.service and
4341 * bfqq->entity.budget must contain,
4342 * respectively, the service received and the
4343 * budget used last time bfqq emptied. These
4344 * facts do not hold in this case, as at least
4345 * this last removal occurred while bfqq is
4346 * not in service. To avoid inconsistencies,
4347 * reset both bfqq->entity.service and
4348 * bfqq->entity.budget, if bfqq has still a
4349 * process that may issue I/O requests to it.
4351 bfqq
->entity
.budget
= bfqq
->entity
.service
= 0;
4355 if (rq
->cmd_flags
& REQ_META
)
4356 bfqq
->meta_pending
--;
4358 bfqg_stats_update_io_remove(bfqq_group(bfqq
), rq
->cmd_flags
);
4361 static bool bfq_bio_merge(struct blk_mq_hw_ctx
*hctx
, struct bio
*bio
)
4363 struct request_queue
*q
= hctx
->queue
;
4364 struct bfq_data
*bfqd
= q
->elevator
->elevator_data
;
4365 struct request
*free
= NULL
;
4367 * bfq_bic_lookup grabs the queue_lock: invoke it now and
4368 * store its return value for later use, to avoid nesting
4369 * queue_lock inside the bfqd->lock. We assume that the bic
4370 * returned by bfq_bic_lookup does not go away before
4371 * bfqd->lock is taken.
4373 struct bfq_io_cq
*bic
= bfq_bic_lookup(bfqd
, current
->io_context
, q
);
4376 spin_lock_irq(&bfqd
->lock
);
4379 bfqd
->bio_bfqq
= bic_to_bfqq(bic
, op_is_sync(bio
->bi_opf
));
4381 bfqd
->bio_bfqq
= NULL
;
4382 bfqd
->bio_bic
= bic
;
4384 ret
= blk_mq_sched_try_merge(q
, bio
, &free
);
4387 blk_mq_free_request(free
);
4388 spin_unlock_irq(&bfqd
->lock
);
4393 static int bfq_request_merge(struct request_queue
*q
, struct request
**req
,
4396 struct bfq_data
*bfqd
= q
->elevator
->elevator_data
;
4397 struct request
*__rq
;
4399 __rq
= bfq_find_rq_fmerge(bfqd
, bio
, q
);
4400 if (__rq
&& elv_bio_merge_ok(__rq
, bio
)) {
4402 return ELEVATOR_FRONT_MERGE
;
4405 return ELEVATOR_NO_MERGE
;
4408 static void bfq_request_merged(struct request_queue
*q
, struct request
*req
,
4409 enum elv_merge type
)
4411 if (type
== ELEVATOR_FRONT_MERGE
&&
4412 rb_prev(&req
->rb_node
) &&
4414 blk_rq_pos(container_of(rb_prev(&req
->rb_node
),
4415 struct request
, rb_node
))) {
4416 struct bfq_queue
*bfqq
= RQ_BFQQ(req
);
4417 struct bfq_data
*bfqd
= bfqq
->bfqd
;
4418 struct request
*prev
, *next_rq
;
4420 /* Reposition request in its sort_list */
4421 elv_rb_del(&bfqq
->sort_list
, req
);
4422 elv_rb_add(&bfqq
->sort_list
, req
);
4424 /* Choose next request to be served for bfqq */
4425 prev
= bfqq
->next_rq
;
4426 next_rq
= bfq_choose_req(bfqd
, bfqq
->next_rq
, req
,
4427 bfqd
->last_position
);
4428 bfqq
->next_rq
= next_rq
;
4430 * If next_rq changes, update the queue's budget to fit
4433 if (prev
!= bfqq
->next_rq
)
4434 bfq_updated_next_req(bfqd
, bfqq
);
4438 static void bfq_requests_merged(struct request_queue
*q
, struct request
*rq
,
4439 struct request
*next
)
4441 struct bfq_queue
*bfqq
= RQ_BFQQ(rq
), *next_bfqq
= RQ_BFQQ(next
);
4443 if (!RB_EMPTY_NODE(&rq
->rb_node
))
4445 spin_lock_irq(&bfqq
->bfqd
->lock
);
4448 * If next and rq belong to the same bfq_queue and next is older
4449 * than rq, then reposition rq in the fifo (by substituting next
4450 * with rq). Otherwise, if next and rq belong to different
4451 * bfq_queues, never reposition rq: in fact, we would have to
4452 * reposition it with respect to next's position in its own fifo,
4453 * which would most certainly be too expensive with respect to
4456 if (bfqq
== next_bfqq
&&
4457 !list_empty(&rq
->queuelist
) && !list_empty(&next
->queuelist
) &&
4458 next
->fifo_time
< rq
->fifo_time
) {
4459 list_del_init(&rq
->queuelist
);
4460 list_replace_init(&next
->queuelist
, &rq
->queuelist
);
4461 rq
->fifo_time
= next
->fifo_time
;
4464 if (bfqq
->next_rq
== next
)
4467 bfq_remove_request(q
, next
);
4469 spin_unlock_irq(&bfqq
->bfqd
->lock
);
4471 bfqg_stats_update_io_merged(bfqq_group(bfqq
), next
->cmd_flags
);
4474 /* Must be called with bfqq != NULL */
4475 static void bfq_bfqq_end_wr(struct bfq_queue
*bfqq
)
4478 bfqq
->wr_cur_max_time
= 0;
4479 bfqq
->last_wr_start_finish
= jiffies
;
4481 * Trigger a weight change on the next invocation of
4482 * __bfq_entity_update_weight_prio.
4484 bfqq
->entity
.prio_changed
= 1;
4487 static void bfq_end_wr_async_queues(struct bfq_data
*bfqd
,
4488 struct bfq_group
*bfqg
)
4492 for (i
= 0; i
< 2; i
++)
4493 for (j
= 0; j
< IOPRIO_BE_NR
; j
++)
4494 if (bfqg
->async_bfqq
[i
][j
])
4495 bfq_bfqq_end_wr(bfqg
->async_bfqq
[i
][j
]);
4496 if (bfqg
->async_idle_bfqq
)
4497 bfq_bfqq_end_wr(bfqg
->async_idle_bfqq
);
4500 static void bfq_end_wr(struct bfq_data
*bfqd
)
4502 struct bfq_queue
*bfqq
;
4504 spin_lock_irq(&bfqd
->lock
);
4506 list_for_each_entry(bfqq
, &bfqd
->active_list
, bfqq_list
)
4507 bfq_bfqq_end_wr(bfqq
);
4508 list_for_each_entry(bfqq
, &bfqd
->idle_list
, bfqq_list
)
4509 bfq_bfqq_end_wr(bfqq
);
4510 bfq_end_wr_async(bfqd
);
4512 spin_unlock_irq(&bfqd
->lock
);
4515 static bool bfq_allow_bio_merge(struct request_queue
*q
, struct request
*rq
,
4518 struct bfq_data
*bfqd
= q
->elevator
->elevator_data
;
4519 bool is_sync
= op_is_sync(bio
->bi_opf
);
4520 struct bfq_queue
*bfqq
= bfqd
->bio_bfqq
;
4523 * Disallow merge of a sync bio into an async request.
4525 if (is_sync
&& !rq_is_sync(rq
))
4529 * Lookup the bfqq that this bio will be queued with. Allow
4530 * merge only if rq is queued there.
4535 return bfqq
== RQ_BFQQ(rq
);
4539 * Set the maximum time for the in-service queue to consume its
4540 * budget. This prevents seeky processes from lowering the throughput.
4541 * In practice, a time-slice service scheme is used with seeky
4544 static void bfq_set_budget_timeout(struct bfq_data
*bfqd
,
4545 struct bfq_queue
*bfqq
)
4547 unsigned int timeout_coeff
;
4549 if (bfqq
->wr_cur_max_time
== bfqd
->bfq_wr_rt_max_time
)
4552 timeout_coeff
= bfqq
->entity
.weight
/ bfqq
->entity
.orig_weight
;
4554 bfqd
->last_budget_start
= ktime_get();
4556 bfqq
->budget_timeout
= jiffies
+
4557 bfqd
->bfq_timeout
* timeout_coeff
;
4560 static void __bfq_set_in_service_queue(struct bfq_data
*bfqd
,
4561 struct bfq_queue
*bfqq
)
4564 bfqg_stats_update_avg_queue_size(bfqq_group(bfqq
));
4565 bfq_clear_bfqq_fifo_expire(bfqq
);
4567 bfqd
->budgets_assigned
= (bfqd
->budgets_assigned
* 7 + 256) / 8;
4569 if (time_is_before_jiffies(bfqq
->last_wr_start_finish
) &&
4570 bfqq
->wr_coeff
> 1 &&
4571 bfqq
->wr_cur_max_time
== bfqd
->bfq_wr_rt_max_time
&&
4572 time_is_before_jiffies(bfqq
->budget_timeout
)) {
4574 * For soft real-time queues, move the start
4575 * of the weight-raising period forward by the
4576 * time the queue has not received any
4577 * service. Otherwise, a relatively long
4578 * service delay is likely to cause the
4579 * weight-raising period of the queue to end,
4580 * because of the short duration of the
4581 * weight-raising period of a soft real-time
4582 * queue. It is worth noting that this move
4583 * is not so dangerous for the other queues,
4584 * because soft real-time queues are not
4587 * To not add a further variable, we use the
4588 * overloaded field budget_timeout to
4589 * determine for how long the queue has not
4590 * received service, i.e., how much time has
4591 * elapsed since the queue expired. However,
4592 * this is a little imprecise, because
4593 * budget_timeout is set to jiffies if bfqq
4594 * not only expires, but also remains with no
4597 if (time_after(bfqq
->budget_timeout
,
4598 bfqq
->last_wr_start_finish
))
4599 bfqq
->last_wr_start_finish
+=
4600 jiffies
- bfqq
->budget_timeout
;
4602 bfqq
->last_wr_start_finish
= jiffies
;
4605 bfq_set_budget_timeout(bfqd
, bfqq
);
4606 bfq_log_bfqq(bfqd
, bfqq
,
4607 "set_in_service_queue, cur-budget = %d",
4608 bfqq
->entity
.budget
);
4611 bfqd
->in_service_queue
= bfqq
;
4615 * Get and set a new queue for service.
4617 static struct bfq_queue
*bfq_set_in_service_queue(struct bfq_data
*bfqd
)
4619 struct bfq_queue
*bfqq
= bfq_get_next_queue(bfqd
);
4621 __bfq_set_in_service_queue(bfqd
, bfqq
);
4625 static void bfq_arm_slice_timer(struct bfq_data
*bfqd
)
4627 struct bfq_queue
*bfqq
= bfqd
->in_service_queue
;
4628 struct bfq_io_cq
*bic
;
4631 /* Processes have exited, don't wait. */
4632 bic
= bfqd
->in_service_bic
;
4633 if (!bic
|| atomic_read(&bic
->icq
.ioc
->active_ref
) == 0)
4636 bfq_mark_bfqq_wait_request(bfqq
);
4639 * We don't want to idle for seeks, but we do want to allow
4640 * fair distribution of slice time for a process doing back-to-back
4641 * seeks. So allow a little bit of time for him to submit a new rq.
4643 sl
= bfqd
->bfq_slice_idle
;
4645 * Unless the queue is being weight-raised, grant only minimum
4646 * idle time if the queue is seeky. A long idling is preserved
4647 * for a weight-raised queue, because it is needed for
4648 * guaranteeing to the queue its reserved share of the
4651 if (BFQQ_SEEKY(bfqq
) && bfqq
->wr_coeff
== 1)
4652 sl
= min_t(u64
, sl
, BFQ_MIN_TT
);
4654 bfqd
->last_idling_start
= ktime_get();
4655 hrtimer_start(&bfqd
->idle_slice_timer
, ns_to_ktime(sl
),
4657 bfqg_stats_set_start_idle_time(bfqq_group(bfqq
));
4661 * In autotuning mode, max_budget is dynamically recomputed as the
4662 * amount of sectors transferred in timeout at the estimated peak
4663 * rate. This enables BFQ to utilize a full timeslice with a full
4664 * budget, even if the in-service queue is served at peak rate. And
4665 * this maximises throughput with sequential workloads.
4667 static unsigned long bfq_calc_max_budget(struct bfq_data
*bfqd
)
4669 return (u64
)bfqd
->peak_rate
* USEC_PER_MSEC
*
4670 jiffies_to_msecs(bfqd
->bfq_timeout
)>>BFQ_RATE_SHIFT
;
4674 * Update parameters related to throughput and responsiveness, as a
4675 * function of the estimated peak rate. See comments on
4676 * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
4678 static void update_thr_responsiveness_params(struct bfq_data
*bfqd
)
4680 int dev_type
= blk_queue_nonrot(bfqd
->queue
);
4682 if (bfqd
->bfq_user_max_budget
== 0)
4683 bfqd
->bfq_max_budget
=
4684 bfq_calc_max_budget(bfqd
);
4686 if (bfqd
->device_speed
== BFQ_BFQD_FAST
&&
4687 bfqd
->peak_rate
< device_speed_thresh
[dev_type
]) {
4688 bfqd
->device_speed
= BFQ_BFQD_SLOW
;
4689 bfqd
->RT_prod
= R_slow
[dev_type
] *
4691 } else if (bfqd
->device_speed
== BFQ_BFQD_SLOW
&&
4692 bfqd
->peak_rate
> device_speed_thresh
[dev_type
]) {
4693 bfqd
->device_speed
= BFQ_BFQD_FAST
;
4694 bfqd
->RT_prod
= R_fast
[dev_type
] *
4699 "dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec",
4700 dev_type
== 0 ? "ROT" : "NONROT",
4701 bfqd
->device_speed
== BFQ_BFQD_FAST
? "FAST" : "SLOW",
4702 bfqd
->device_speed
== BFQ_BFQD_FAST
?
4703 (USEC_PER_SEC
*(u64
)R_fast
[dev_type
])>>BFQ_RATE_SHIFT
:
4704 (USEC_PER_SEC
*(u64
)R_slow
[dev_type
])>>BFQ_RATE_SHIFT
,
4705 (USEC_PER_SEC
*(u64
)device_speed_thresh
[dev_type
])>>
4709 static void bfq_reset_rate_computation(struct bfq_data
*bfqd
,
4712 if (rq
!= NULL
) { /* new rq dispatch now, reset accordingly */
4713 bfqd
->last_dispatch
= bfqd
->first_dispatch
= ktime_get_ns();
4714 bfqd
->peak_rate_samples
= 1;
4715 bfqd
->sequential_samples
= 0;
4716 bfqd
->tot_sectors_dispatched
= bfqd
->last_rq_max_size
=
4718 } else /* no new rq dispatched, just reset the number of samples */
4719 bfqd
->peak_rate_samples
= 0; /* full re-init on next disp. */
4722 "reset_rate_computation at end, sample %u/%u tot_sects %llu",
4723 bfqd
->peak_rate_samples
, bfqd
->sequential_samples
,
4724 bfqd
->tot_sectors_dispatched
);
4727 static void bfq_update_rate_reset(struct bfq_data
*bfqd
, struct request
*rq
)
4729 u32 rate
, weight
, divisor
;
4732 * For the convergence property to hold (see comments on
4733 * bfq_update_peak_rate()) and for the assessment to be
4734 * reliable, a minimum number of samples must be present, and
4735 * a minimum amount of time must have elapsed. If not so, do
4736 * not compute new rate. Just reset parameters, to get ready
4737 * for a new evaluation attempt.
4739 if (bfqd
->peak_rate_samples
< BFQ_RATE_MIN_SAMPLES
||
4740 bfqd
->delta_from_first
< BFQ_RATE_MIN_INTERVAL
)
4741 goto reset_computation
;
4744 * If a new request completion has occurred after last
4745 * dispatch, then, to approximate the rate at which requests
4746 * have been served by the device, it is more precise to
4747 * extend the observation interval to the last completion.
4749 bfqd
->delta_from_first
=
4750 max_t(u64
, bfqd
->delta_from_first
,
4751 bfqd
->last_completion
- bfqd
->first_dispatch
);
4754 * Rate computed in sects/usec, and not sects/nsec, for
4757 rate
= div64_ul(bfqd
->tot_sectors_dispatched
<<BFQ_RATE_SHIFT
,
4758 div_u64(bfqd
->delta_from_first
, NSEC_PER_USEC
));
4761 * Peak rate not updated if:
4762 * - the percentage of sequential dispatches is below 3/4 of the
4763 * total, and rate is below the current estimated peak rate
4764 * - rate is unreasonably high (> 20M sectors/sec)
4766 if ((bfqd
->sequential_samples
< (3 * bfqd
->peak_rate_samples
)>>2 &&
4767 rate
<= bfqd
->peak_rate
) ||
4768 rate
> 20<<BFQ_RATE_SHIFT
)
4769 goto reset_computation
;
4772 * We have to update the peak rate, at last! To this purpose,
4773 * we use a low-pass filter. We compute the smoothing constant
4774 * of the filter as a function of the 'weight' of the new
4777 * As can be seen in next formulas, we define this weight as a
4778 * quantity proportional to how sequential the workload is,
4779 * and to how long the observation time interval is.
4781 * The weight runs from 0 to 8. The maximum value of the
4782 * weight, 8, yields the minimum value for the smoothing
4783 * constant. At this minimum value for the smoothing constant,
4784 * the measured rate contributes for half of the next value of
4785 * the estimated peak rate.
4787 * So, the first step is to compute the weight as a function
4788 * of how sequential the workload is. Note that the weight
4789 * cannot reach 9, because bfqd->sequential_samples cannot
4790 * become equal to bfqd->peak_rate_samples, which, in its
4791 * turn, holds true because bfqd->sequential_samples is not
4792 * incremented for the first sample.
4794 weight
= (9 * bfqd
->sequential_samples
) / bfqd
->peak_rate_samples
;
4797 * Second step: further refine the weight as a function of the
4798 * duration of the observation interval.
4800 weight
= min_t(u32
, 8,
4801 div_u64(weight
* bfqd
->delta_from_first
,
4802 BFQ_RATE_REF_INTERVAL
));
4805 * Divisor ranging from 10, for minimum weight, to 2, for
4808 divisor
= 10 - weight
;
4811 * Finally, update peak rate:
4813 * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
4815 bfqd
->peak_rate
*= divisor
-1;
4816 bfqd
->peak_rate
/= divisor
;
4817 rate
/= divisor
; /* smoothing constant alpha = 1/divisor */
4819 bfqd
->peak_rate
+= rate
;
4820 update_thr_responsiveness_params(bfqd
);
4823 bfq_reset_rate_computation(bfqd
, rq
);
4827 * Update the read/write peak rate (the main quantity used for
4828 * auto-tuning, see update_thr_responsiveness_params()).
4830 * It is not trivial to estimate the peak rate (correctly): because of
4831 * the presence of sw and hw queues between the scheduler and the
4832 * device components that finally serve I/O requests, it is hard to
4833 * say exactly when a given dispatched request is served inside the
4834 * device, and for how long. As a consequence, it is hard to know
4835 * precisely at what rate a given set of requests is actually served
4838 * On the opposite end, the dispatch time of any request is trivially
4839 * available, and, from this piece of information, the "dispatch rate"
4840 * of requests can be immediately computed. So, the idea in the next
4841 * function is to use what is known, namely request dispatch times
4842 * (plus, when useful, request completion times), to estimate what is
4843 * unknown, namely in-device request service rate.
4845 * The main issue is that, because of the above facts, the rate at
4846 * which a certain set of requests is dispatched over a certain time
4847 * interval can vary greatly with respect to the rate at which the
4848 * same requests are then served. But, since the size of any
4849 * intermediate queue is limited, and the service scheme is lossless
4850 * (no request is silently dropped), the following obvious convergence
4851 * property holds: the number of requests dispatched MUST become
4852 * closer and closer to the number of requests completed as the
4853 * observation interval grows. This is the key property used in
4854 * the next function to estimate the peak service rate as a function
4855 * of the observed dispatch rate. The function assumes to be invoked
4856 * on every request dispatch.
4858 static void bfq_update_peak_rate(struct bfq_data
*bfqd
, struct request
*rq
)
4860 u64 now_ns
= ktime_get_ns();
4862 if (bfqd
->peak_rate_samples
== 0) { /* first dispatch */
4863 bfq_log(bfqd
, "update_peak_rate: goto reset, samples %d",
4864 bfqd
->peak_rate_samples
);
4865 bfq_reset_rate_computation(bfqd
, rq
);
4866 goto update_last_values
; /* will add one sample */
4870 * Device idle for very long: the observation interval lasting
4871 * up to this dispatch cannot be a valid observation interval
4872 * for computing a new peak rate (similarly to the late-
4873 * completion event in bfq_completed_request()). Go to
4874 * update_rate_and_reset to have the following three steps
4876 * - close the observation interval at the last (previous)
4877 * request dispatch or completion
4878 * - compute rate, if possible, for that observation interval
4879 * - start a new observation interval with this dispatch
4881 if (now_ns
- bfqd
->last_dispatch
> 100*NSEC_PER_MSEC
&&
4882 bfqd
->rq_in_driver
== 0)
4883 goto update_rate_and_reset
;
4885 /* Update sampling information */
4886 bfqd
->peak_rate_samples
++;
4888 if ((bfqd
->rq_in_driver
> 0 ||
4889 now_ns
- bfqd
->last_completion
< BFQ_MIN_TT
)
4890 && get_sdist(bfqd
->last_position
, rq
) < BFQQ_SEEK_THR
)
4891 bfqd
->sequential_samples
++;
4893 bfqd
->tot_sectors_dispatched
+= blk_rq_sectors(rq
);
4895 /* Reset max observed rq size every 32 dispatches */
4896 if (likely(bfqd
->peak_rate_samples
% 32))
4897 bfqd
->last_rq_max_size
=
4898 max_t(u32
, blk_rq_sectors(rq
), bfqd
->last_rq_max_size
);
4900 bfqd
->last_rq_max_size
= blk_rq_sectors(rq
);
4902 bfqd
->delta_from_first
= now_ns
- bfqd
->first_dispatch
;
4904 /* Target observation interval not yet reached, go on sampling */
4905 if (bfqd
->delta_from_first
< BFQ_RATE_REF_INTERVAL
)
4906 goto update_last_values
;
4908 update_rate_and_reset
:
4909 bfq_update_rate_reset(bfqd
, rq
);
4911 bfqd
->last_position
= blk_rq_pos(rq
) + blk_rq_sectors(rq
);
4912 bfqd
->last_dispatch
= now_ns
;
4916 * Remove request from internal lists.
4918 static void bfq_dispatch_remove(struct request_queue
*q
, struct request
*rq
)
4920 struct bfq_queue
*bfqq
= RQ_BFQQ(rq
);
4923 * For consistency, the next instruction should have been
4924 * executed after removing the request from the queue and
4925 * dispatching it. We execute instead this instruction before
4926 * bfq_remove_request() (and hence introduce a temporary
4927 * inconsistency), for efficiency. In fact, should this
4928 * dispatch occur for a non in-service bfqq, this anticipated
4929 * increment prevents two counters related to bfqq->dispatched
4930 * from risking to be, first, uselessly decremented, and then
4931 * incremented again when the (new) value of bfqq->dispatched
4932 * happens to be taken into account.
4935 bfq_update_peak_rate(q
->elevator
->elevator_data
, rq
);
4937 bfq_remove_request(q
, rq
);
4940 static void __bfq_bfqq_expire(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
)
4942 if (RB_EMPTY_ROOT(&bfqq
->sort_list
)) {
4943 if (bfqq
->dispatched
== 0)
4945 * Overloading budget_timeout field to store
4946 * the time at which the queue remains with no
4947 * backlog and no outstanding request; used by
4948 * the weight-raising mechanism.
4950 bfqq
->budget_timeout
= jiffies
;
4952 bfq_del_bfqq_busy(bfqd
, bfqq
, true);
4954 bfq_requeue_bfqq(bfqd
, bfqq
);
4957 * All in-service entities must have been properly deactivated
4958 * or requeued before executing the next function, which
4959 * resets all in-service entites as no more in service.
4961 __bfq_bfqd_reset_in_service(bfqd
);
4965 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
4966 * @bfqd: device data.
4967 * @bfqq: queue to update.
4968 * @reason: reason for expiration.
4970 * Handle the feedback on @bfqq budget at queue expiration.
4971 * See the body for detailed comments.
4973 static void __bfq_bfqq_recalc_budget(struct bfq_data
*bfqd
,
4974 struct bfq_queue
*bfqq
,
4975 enum bfqq_expiration reason
)
4977 struct request
*next_rq
;
4978 int budget
, min_budget
;
4980 min_budget
= bfq_min_budget(bfqd
);
4982 if (bfqq
->wr_coeff
== 1)
4983 budget
= bfqq
->max_budget
;
4985 * Use a constant, low budget for weight-raised queues,
4986 * to help achieve a low latency. Keep it slightly higher
4987 * than the minimum possible budget, to cause a little
4988 * bit fewer expirations.
4990 budget
= 2 * min_budget
;
4992 bfq_log_bfqq(bfqd
, bfqq
, "recalc_budg: last budg %d, budg left %d",
4993 bfqq
->entity
.budget
, bfq_bfqq_budget_left(bfqq
));
4994 bfq_log_bfqq(bfqd
, bfqq
, "recalc_budg: last max_budg %d, min budg %d",
4995 budget
, bfq_min_budget(bfqd
));
4996 bfq_log_bfqq(bfqd
, bfqq
, "recalc_budg: sync %d, seeky %d",
4997 bfq_bfqq_sync(bfqq
), BFQQ_SEEKY(bfqd
->in_service_queue
));
4999 if (bfq_bfqq_sync(bfqq
) && bfqq
->wr_coeff
== 1) {
5002 * Caveat: in all the following cases we trade latency
5005 case BFQQE_TOO_IDLE
:
5007 * This is the only case where we may reduce
5008 * the budget: if there is no request of the
5009 * process still waiting for completion, then
5010 * we assume (tentatively) that the timer has
5011 * expired because the batch of requests of
5012 * the process could have been served with a
5013 * smaller budget. Hence, betting that
5014 * process will behave in the same way when it
5015 * becomes backlogged again, we reduce its
5016 * next budget. As long as we guess right,
5017 * this budget cut reduces the latency
5018 * experienced by the process.
5020 * However, if there are still outstanding
5021 * requests, then the process may have not yet
5022 * issued its next request just because it is
5023 * still waiting for the completion of some of
5024 * the still outstanding ones. So in this
5025 * subcase we do not reduce its budget, on the
5026 * contrary we increase it to possibly boost
5027 * the throughput, as discussed in the
5028 * comments to the BUDGET_TIMEOUT case.
5030 if (bfqq
->dispatched
> 0) /* still outstanding reqs */
5031 budget
= min(budget
* 2, bfqd
->bfq_max_budget
);
5033 if (budget
> 5 * min_budget
)
5034 budget
-= 4 * min_budget
;
5036 budget
= min_budget
;
5039 case BFQQE_BUDGET_TIMEOUT
:
5041 * We double the budget here because it gives
5042 * the chance to boost the throughput if this
5043 * is not a seeky process (and has bumped into
5044 * this timeout because of, e.g., ZBR).
5046 budget
= min(budget
* 2, bfqd
->bfq_max_budget
);
5048 case BFQQE_BUDGET_EXHAUSTED
:
5050 * The process still has backlog, and did not
5051 * let either the budget timeout or the disk
5052 * idling timeout expire. Hence it is not
5053 * seeky, has a short thinktime and may be
5054 * happy with a higher budget too. So
5055 * definitely increase the budget of this good
5056 * candidate to boost the disk throughput.
5058 budget
= min(budget
* 4, bfqd
->bfq_max_budget
);
5060 case BFQQE_NO_MORE_REQUESTS
:
5062 * For queues that expire for this reason, it
5063 * is particularly important to keep the
5064 * budget close to the actual service they
5065 * need. Doing so reduces the timestamp
5066 * misalignment problem described in the
5067 * comments in the body of
5068 * __bfq_activate_entity. In fact, suppose
5069 * that a queue systematically expires for
5070 * BFQQE_NO_MORE_REQUESTS and presents a
5071 * new request in time to enjoy timestamp
5072 * back-shifting. The larger the budget of the
5073 * queue is with respect to the service the
5074 * queue actually requests in each service
5075 * slot, the more times the queue can be
5076 * reactivated with the same virtual finish
5077 * time. It follows that, even if this finish
5078 * time is pushed to the system virtual time
5079 * to reduce the consequent timestamp
5080 * misalignment, the queue unjustly enjoys for
5081 * many re-activations a lower finish time
5082 * than all newly activated queues.
5084 * The service needed by bfqq is measured
5085 * quite precisely by bfqq->entity.service.
5086 * Since bfqq does not enjoy device idling,
5087 * bfqq->entity.service is equal to the number
5088 * of sectors that the process associated with
5089 * bfqq requested to read/write before waiting
5090 * for request completions, or blocking for
5093 budget
= max_t(int, bfqq
->entity
.service
, min_budget
);
5098 } else if (!bfq_bfqq_sync(bfqq
)) {
5100 * Async queues get always the maximum possible
5101 * budget, as for them we do not care about latency
5102 * (in addition, their ability to dispatch is limited
5103 * by the charging factor).
5105 budget
= bfqd
->bfq_max_budget
;
5108 bfqq
->max_budget
= budget
;
5110 if (bfqd
->budgets_assigned
>= bfq_stats_min_budgets
&&
5111 !bfqd
->bfq_user_max_budget
)
5112 bfqq
->max_budget
= min(bfqq
->max_budget
, bfqd
->bfq_max_budget
);
5115 * If there is still backlog, then assign a new budget, making
5116 * sure that it is large enough for the next request. Since
5117 * the finish time of bfqq must be kept in sync with the
5118 * budget, be sure to call __bfq_bfqq_expire() *after* this
5121 * If there is no backlog, then no need to update the budget;
5122 * it will be updated on the arrival of a new request.
5124 next_rq
= bfqq
->next_rq
;
5126 bfqq
->entity
.budget
= max_t(unsigned long, bfqq
->max_budget
,
5127 bfq_serv_to_charge(next_rq
, bfqq
));
5129 bfq_log_bfqq(bfqd
, bfqq
, "head sect: %u, new budget %d",
5130 next_rq
? blk_rq_sectors(next_rq
) : 0,
5131 bfqq
->entity
.budget
);
5135 * Return true if the process associated with bfqq is "slow". The slow
5136 * flag is used, in addition to the budget timeout, to reduce the
5137 * amount of service provided to seeky processes, and thus reduce
5138 * their chances to lower the throughput. More details in the comments
5139 * on the function bfq_bfqq_expire().
5141 * An important observation is in order: as discussed in the comments
5142 * on the function bfq_update_peak_rate(), with devices with internal
5143 * queues, it is hard if ever possible to know when and for how long
5144 * an I/O request is processed by the device (apart from the trivial
5145 * I/O pattern where a new request is dispatched only after the
5146 * previous one has been completed). This makes it hard to evaluate
5147 * the real rate at which the I/O requests of each bfq_queue are
5148 * served. In fact, for an I/O scheduler like BFQ, serving a
5149 * bfq_queue means just dispatching its requests during its service
5150 * slot (i.e., until the budget of the queue is exhausted, or the
5151 * queue remains idle, or, finally, a timeout fires). But, during the
5152 * service slot of a bfq_queue, around 100 ms at most, the device may
5153 * be even still processing requests of bfq_queues served in previous
5154 * service slots. On the opposite end, the requests of the in-service
5155 * bfq_queue may be completed after the service slot of the queue
5158 * Anyway, unless more sophisticated solutions are used
5159 * (where possible), the sum of the sizes of the requests dispatched
5160 * during the service slot of a bfq_queue is probably the only
5161 * approximation available for the service received by the bfq_queue
5162 * during its service slot. And this sum is the quantity used in this
5163 * function to evaluate the I/O speed of a process.
5165 static bool bfq_bfqq_is_slow(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
5166 bool compensate
, enum bfqq_expiration reason
,
5167 unsigned long *delta_ms
)
5169 ktime_t delta_ktime
;
5171 bool slow
= BFQQ_SEEKY(bfqq
); /* if delta too short, use seekyness */
5173 if (!bfq_bfqq_sync(bfqq
))
5177 delta_ktime
= bfqd
->last_idling_start
;
5179 delta_ktime
= ktime_get();
5180 delta_ktime
= ktime_sub(delta_ktime
, bfqd
->last_budget_start
);
5181 delta_usecs
= ktime_to_us(delta_ktime
);
5183 /* don't use too short time intervals */
5184 if (delta_usecs
< 1000) {
5185 if (blk_queue_nonrot(bfqd
->queue
))
5187 * give same worst-case guarantees as idling
5190 *delta_ms
= BFQ_MIN_TT
/ NSEC_PER_MSEC
;
5191 else /* charge at least one seek */
5192 *delta_ms
= bfq_slice_idle
/ NSEC_PER_MSEC
;
5197 *delta_ms
= delta_usecs
/ USEC_PER_MSEC
;
5200 * Use only long (> 20ms) intervals to filter out excessive
5201 * spikes in service rate estimation.
5203 if (delta_usecs
> 20000) {
5205 * Caveat for rotational devices: processes doing I/O
5206 * in the slower disk zones tend to be slow(er) even
5207 * if not seeky. In this respect, the estimated peak
5208 * rate is likely to be an average over the disk
5209 * surface. Accordingly, to not be too harsh with
5210 * unlucky processes, a process is deemed slow only if
5211 * its rate has been lower than half of the estimated
5214 slow
= bfqq
->entity
.service
< bfqd
->bfq_max_budget
/ 2;
5217 bfq_log_bfqq(bfqd
, bfqq
, "bfq_bfqq_is_slow: slow %d", slow
);
5223 * To be deemed as soft real-time, an application must meet two
5224 * requirements. First, the application must not require an average
5225 * bandwidth higher than the approximate bandwidth required to playback or
5226 * record a compressed high-definition video.
5227 * The next function is invoked on the completion of the last request of a
5228 * batch, to compute the next-start time instant, soft_rt_next_start, such
5229 * that, if the next request of the application does not arrive before
5230 * soft_rt_next_start, then the above requirement on the bandwidth is met.
5232 * The second requirement is that the request pattern of the application is
5233 * isochronous, i.e., that, after issuing a request or a batch of requests,
5234 * the application stops issuing new requests until all its pending requests
5235 * have been completed. After that, the application may issue a new batch,
5237 * For this reason the next function is invoked to compute
5238 * soft_rt_next_start only for applications that meet this requirement,
5239 * whereas soft_rt_next_start is set to infinity for applications that do
5242 * Unfortunately, even a greedy application may happen to behave in an
5243 * isochronous way if the CPU load is high. In fact, the application may
5244 * stop issuing requests while the CPUs are busy serving other processes,
5245 * then restart, then stop again for a while, and so on. In addition, if
5246 * the disk achieves a low enough throughput with the request pattern
5247 * issued by the application (e.g., because the request pattern is random
5248 * and/or the device is slow), then the application may meet the above
5249 * bandwidth requirement too. To prevent such a greedy application to be
5250 * deemed as soft real-time, a further rule is used in the computation of
5251 * soft_rt_next_start: soft_rt_next_start must be higher than the current
5252 * time plus the maximum time for which the arrival of a request is waited
5253 * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
5254 * This filters out greedy applications, as the latter issue instead their
5255 * next request as soon as possible after the last one has been completed
5256 * (in contrast, when a batch of requests is completed, a soft real-time
5257 * application spends some time processing data).
5259 * Unfortunately, the last filter may easily generate false positives if
5260 * only bfqd->bfq_slice_idle is used as a reference time interval and one
5261 * or both the following cases occur:
5262 * 1) HZ is so low that the duration of a jiffy is comparable to or higher
5263 * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
5265 * 2) jiffies, instead of increasing at a constant rate, may stop increasing
5266 * for a while, then suddenly 'jump' by several units to recover the lost
5267 * increments. This seems to happen, e.g., inside virtual machines.
5268 * To address this issue, we do not use as a reference time interval just
5269 * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
5270 * particular we add the minimum number of jiffies for which the filter
5271 * seems to be quite precise also in embedded systems and KVM/QEMU virtual
5274 static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data
*bfqd
,
5275 struct bfq_queue
*bfqq
)
5277 return max(bfqq
->last_idle_bklogged
+
5278 HZ
* bfqq
->service_from_backlogged
/
5279 bfqd
->bfq_wr_max_softrt_rate
,
5280 jiffies
+ nsecs_to_jiffies(bfqq
->bfqd
->bfq_slice_idle
) + 4);
5284 * Return the farthest future time instant according to jiffies
5287 static unsigned long bfq_greatest_from_now(void)
5289 return jiffies
+ MAX_JIFFY_OFFSET
;
5293 * Return the farthest past time instant according to jiffies
5296 static unsigned long bfq_smallest_from_now(void)
5298 return jiffies
- MAX_JIFFY_OFFSET
;
5302 * bfq_bfqq_expire - expire a queue.
5303 * @bfqd: device owning the queue.
5304 * @bfqq: the queue to expire.
5305 * @compensate: if true, compensate for the time spent idling.
5306 * @reason: the reason causing the expiration.
5308 * If the process associated with bfqq does slow I/O (e.g., because it
5309 * issues random requests), we charge bfqq with the time it has been
5310 * in service instead of the service it has received (see
5311 * bfq_bfqq_charge_time for details on how this goal is achieved). As
5312 * a consequence, bfqq will typically get higher timestamps upon
5313 * reactivation, and hence it will be rescheduled as if it had
5314 * received more service than what it has actually received. In the
5315 * end, bfqq receives less service in proportion to how slowly its
5316 * associated process consumes its budgets (and hence how seriously it
5317 * tends to lower the throughput). In addition, this time-charging
5318 * strategy guarantees time fairness among slow processes. In
5319 * contrast, if the process associated with bfqq is not slow, we
5320 * charge bfqq exactly with the service it has received.
5322 * Charging time to the first type of queues and the exact service to
5323 * the other has the effect of using the WF2Q+ policy to schedule the
5324 * former on a timeslice basis, without violating service domain
5325 * guarantees among the latter.
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason)
{
	bool slow;
	unsigned long delta = 0;
	struct bfq_entity *entity = &bfqq->entity;
	int ref;

	/*
	 * Check whether the process is slow (see bfq_bfqq_is_slow).
	 */
	slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);

	/*
	 * Increase service_from_backlogged before next statement,
	 * because the possible next invocation of
	 * bfq_bfqq_charge_time would likely inflate
	 * entity->service. In contrast, service_from_backlogged must
	 * contain real service, to enable the soft real-time
	 * heuristic to correctly compute the bandwidth consumed by
	 * bfqq.
	 */
	bfqq->service_from_backlogged += entity->service;

	/*
	 * As above explained, charge slow (typically seeky) and
	 * timed-out queues with the time and not the service
	 * received, to favor sequential workloads.
	 *
	 * Processes doing I/O in the slower disk zones will tend to
	 * be slow(er) even if not seeky. Therefore, since the
	 * estimated peak rate is actually an average over the disk
	 * surface, these processes may timeout just for bad luck. To
	 * avoid punishing them, do not charge time to processes that
	 * succeeded in consuming at least 2/3 of their budget. This
	 * allows BFQ to preserve enough elasticity to still perform
	 * bandwidth, and not time, distribution with little unlucky
	 * or quasi-sequential processes.
	 */
	if (bfqq->wr_coeff == 1 &&
	    (slow ||
	     (reason == BFQQE_BUDGET_TIMEOUT &&
	      bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
		bfq_bfqq_charge_time(bfqd, bfqq, delta);

	if (reason == BFQQE_TOO_IDLE &&
	    entity->service <= 2 * entity->budget / 10)
		bfq_clear_bfqq_IO_bound(bfqq);

	if (bfqd->low_latency && bfqq->wr_coeff == 1)
		bfqq->last_wr_start_finish = jiffies;

	if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
	    RB_EMPTY_ROOT(&bfqq->sort_list)) {
		/*
		 * If we get here, and there are no outstanding
		 * requests, then the request pattern is isochronous
		 * (see the comments on the function
		 * bfq_bfqq_softrt_next_start()). Thus we can compute
		 * soft_rt_next_start. If, instead, the queue still
		 * has outstanding requests, then we have to wait for
		 * the completion of all the outstanding requests to
		 * discover whether the request pattern is actually
		 * isochronous.
		 */
		if (bfqq->dispatched == 0)
			bfqq->soft_rt_next_start =
				bfq_bfqq_softrt_next_start(bfqd, bfqq);
		else {
			/*
			 * The application is still waiting for the
			 * completion of one or more requests:
			 * prevent it from possibly being incorrectly
			 * deemed as soft real-time by setting its
			 * soft_rt_next_start to infinity. In fact,
			 * without this assignment, the application
			 * would be incorrectly deemed as soft
			 * real-time if:
			 * 1) it issued a new request before the
			 *    completion of all its in-flight
			 *    requests, and
			 * 2) at that time, its soft_rt_next_start
			 *    happened to be in the past.
			 */
			bfqq->soft_rt_next_start =
				bfq_greatest_from_now();
			/*
			 * Schedule an update of soft_rt_next_start to when
			 * the task may be discovered to be isochronous.
			 */
			bfq_mark_bfqq_softrt_update(bfqq);
		}
	}

	bfq_log_bfqq(bfqd, bfqq,
		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));

	/*
	 * Increase, decrease or leave budget unchanged according to
	 * reason.
	 */
	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
	ref = bfqq->ref;
	__bfq_bfqq_expire(bfqd, bfqq);

	/* mark bfqq as waiting a request only if a bic still points to it */
	if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
	    reason != BFQQE_BUDGET_TIMEOUT &&
	    reason != BFQQE_BUDGET_EXHAUSTED)
		bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
}
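/*
 * Editorial note, not in the original source: as a concrete example of
 * the time-charging policy above, a seeky queue expired after holding
 * the device for 8 ms while transferring only a handful of sectors is
 * charged, via bfq_bfqq_charge_time(), roughly the service the device
 * could have delivered in those 8 ms at the estimated peak rate. Its
 * next timestamps then advance as if it were a sequential queue that
 * had actually consumed that service, which is what yields time
 * fairness among slow queues.
 */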
/*
 * Budget timeout is not implemented through a dedicated timer, but
 * just checked on request arrivals and completions, as well as on
 * idle timer expirations.
 */
static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
{
	return time_is_before_eq_jiffies(bfqq->budget_timeout);
}
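/*
 * Editorial note, not in the original source: budget_timeout holds an
 * absolute jiffies deadline, set when the queue is granted the device,
 * so time_is_before_eq_jiffies() simply asks whether that deadline has
 * passed. Since there is no dedicated timer, a queue past its budget
 * timeout is expired only the next time the arrival, completion or
 * idle-timer paths perform this check.
 */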
/*
 * If we expire a queue that is actively waiting (i.e., with the
 * device idled) for the arrival of a new request, then we may incur
 * the timestamp misalignment problem described in the body of the
 * function __bfq_activate_entity. Hence we return true only if this
 * condition does not hold, or if the queue is slow enough to deserve
 * only to be kicked off for preserving a high throughput.
 */
static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqq->bfqd, bfqq,
		"may_budget_timeout: wait_request %d left %d timeout %d",
		bfq_bfqq_wait_request(bfqq),
		bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
		bfq_bfqq_budget_timeout(bfqq));

	return (!bfq_bfqq_wait_request(bfqq) ||
		bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
		&&
		bfq_bfqq_budget_timeout(bfqq);
}
/*
 * For a queue that becomes empty, device idling is allowed only if
 * this function returns true for the queue. As a consequence, since
 * device idling plays a critical role in both throughput boosting and
 * service guarantees, the return value of this function plays a
 * critical role in both these aspects as well.
 *
 * In a nutshell, this function returns true only if idling is
 * beneficial for throughput or, even if detrimental for throughput,
 * idling is however necessary to preserve service guarantees (low
 * latency, desired throughput distribution, ...). In particular, on
 * NCQ-capable devices, this function tries to return false, so as to
 * help keep the drives' internal queues full, whenever this helps the
 * device boost the throughput without causing any service-guarantee
 * issue.
 *
 * In more detail, the return value of this function is obtained by,
 * first, computing a number of boolean variables that take into
 * account throughput and service-guarantee issues, and, then,
 * combining these variables in a logical expression. Most of the
 * issues taken into account are not trivial. We discuss these issues
 * individually while introducing the variables.
 */
static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	bool idling_boosts_thr, asymmetric_scenario;

	if (bfqd->strict_guarantees)
		return true;

	/*
	 * The next variable takes into account the cases where idling
	 * boosts the throughput.
	 *
	 * The value of the variable is computed considering that
	 * idling is usually beneficial for the throughput if:
	 * (a) the device is not NCQ-capable, or
	 * (b) regardless of the presence of NCQ, the request pattern
	 *     for bfqq is I/O-bound (possible throughput losses
	 *     caused by granting idling to seeky queues are mitigated
	 *     by the fact that, in all scenarios where boosting
	 *     throughput is the best thing to do, i.e., in all
	 *     symmetric scenarios, only a minimal idle time is
	 *     allowed to seeky queues).
	 */
	idling_boosts_thr = !bfqd->hw_tag || bfq_bfqq_IO_bound(bfqq);

	/*
	 * There is then a case where idling must be performed not for
	 * throughput concerns, but to preserve service guarantees. To
	 * introduce it, we can note that allowing the drive to
	 * enqueue more than one request at a time, and hence
	 * delegating de facto final scheduling decisions to the
	 * drive's internal scheduler, causes loss of control on the
	 * actual request service order. In particular, the critical
	 * situation is when requests from different processes happen
	 * to be present, at the same time, in the internal queue(s)
	 * of the drive. In such a situation, the drive, by deciding
	 * the service order of the internally-queued requests, does
	 * determine also the actual throughput distribution among
	 * these processes. But the drive typically has no notion or
	 * concern about per-process throughput distribution, and
	 * makes its decisions only on a per-request basis. Therefore,
	 * the service distribution enforced by the drive's internal
	 * scheduler is likely to coincide with the desired
	 * device-throughput distribution only in a completely
	 * symmetric scenario where: (i) each of these processes must
	 * get the same throughput as the others; (ii) all these
	 * processes have the same I/O pattern (either sequential or
	 * random). In fact, in such a scenario, the drive will tend
	 * to treat the requests of each of these processes in about
	 * the same way as the requests of the others, and thus to
	 * provide each of these processes with about the same
	 * throughput (which is exactly the desired throughput
	 * distribution). In contrast, in any asymmetric scenario,
	 * device idling is certainly needed to guarantee that bfqq
	 * receives its assigned fraction of the device throughput
	 * (see [1] for details).
	 *
	 * As for sub-condition (i), actually we check only whether
	 * bfqq is being weight-raised. In fact, if bfqq is not being
	 * weight-raised, we have that:
	 * - if the process associated with bfqq is not I/O-bound, then
	 *   it is not either latency- or throughput-critical; therefore
	 *   idling is not needed for bfqq;
	 * - if the process associated with bfqq is I/O-bound, then
	 *   idling is already granted with bfqq (see the comments on
	 *   idling_boosts_thr).
	 *
	 * We do not check sub-condition (ii) at all, i.e., the next
	 * variable is true if and only if bfqq is being
	 * weight-raised. We do not need to control sub-condition (ii)
	 * for the following reason:
	 * - if bfqq is being weight-raised, then idling is already
	 *   guaranteed to bfqq by sub-condition (i);
	 * - if bfqq is not being weight-raised, then idling is
	 *   already guaranteed to bfqq (only) if it matters, i.e., if
	 *   bfqq is associated to a currently I/O-bound process (see
	 *   the above comment on sub-condition (i)).
	 *
	 * As a side note, it is worth considering that the above
	 * device-idling countermeasures may however fail in the
	 * following unlucky scenario: if idling is (correctly)
	 * disabled in a time period during which the symmetry
	 * sub-condition holds, and hence the device is allowed to
	 * enqueue many requests, but at some later point in time some
	 * sub-condition stops to hold, then it may become impossible
	 * to let requests be served in the desired order until all
	 * the requests already queued in the device have been served.
	 */
	asymmetric_scenario = bfqq->wr_coeff > 1;

	/*
	 * We have now all the components we need to compute the return
	 * value of the function, which is true only if both the following
	 * conditions hold:
	 * 1) bfqq is sync, because idling makes sense only for sync queues;
	 * 2) idling either boosts the throughput (without issues), or
	 *    is necessary to preserve service guarantees.
	 */
	return bfq_bfqq_sync(bfqq) &&
		(idling_boosts_thr || asymmetric_scenario);
}
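/*
 * Editorial note, not in the original source: the two booleans above
 * combine as "idle only if the queue is sync AND (idling boosts
 * throughput OR the scenario is asymmetric)". For example, on an
 * NCQ-capable drive (hw_tag set), a sync queue that is neither
 * I/O-bound nor weight-raised gets no idling, so the drive's internal
 * queue can stay full; the same queue, once weight-raised, does get
 * device idling, so that its privileged share is not re-decided by the
 * drive's internal scheduler.
 */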
/*
 * If the in-service queue is empty but the function bfq_bfqq_may_idle
 * returns true, then:
 * 1) the queue must remain in service and cannot be expired, and
 * 2) the device must be idled to wait for the possible arrival of a new
 *    request for the queue.
 * See the comments on the function bfq_bfqq_may_idle for the reasons
 * why performing device idling is the best choice to boost the throughput
 * and preserve service guarantees when bfq_bfqq_may_idle itself
 * returns true.
 */
static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;

	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
	       bfq_bfqq_may_idle(bfqq);
}
/*
 * Select a queue for service.  If we have a current queue in service,
 * check whether to continue servicing it, or retrieve and set a new one.
 */
static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq;
	struct request *next_rq;
	enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;

	bfqq = bfqd->in_service_queue;
	if (!bfqq)
		goto new_queue;

	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

	if (bfq_may_expire_for_budg_timeout(bfqq) &&
	    !bfq_bfqq_wait_request(bfqq) &&
	    !bfq_bfqq_must_idle(bfqq))
		goto expire;

check_queue:
	/*
	 * This loop is rarely executed more than once. Even when it
	 * happens, it is much more convenient to re-execute this loop
	 * than to return NULL and trigger a new dispatch to get a
	 * request served.
	 */
	next_rq = bfqq->next_rq;
	/*
	 * If bfqq has requests queued and it has enough budget left to
	 * serve them, keep the queue, otherwise expire it.
	 */
	if (next_rq) {
		if (bfq_serv_to_charge(next_rq, bfqq) >
			bfq_bfqq_budget_left(bfqq)) {
			/*
			 * Expire the queue for budget exhaustion,
			 * which makes sure that the next budget is
			 * enough to serve the next request, even if
			 * it comes from the fifo expired path.
			 */
			reason = BFQQE_BUDGET_EXHAUSTED;
			goto expire;
		} else {
			/*
			 * The idle timer may be pending because we may
			 * not disable disk idling even when a new request
			 * arrives.
			 */
			if (bfq_bfqq_wait_request(bfqq)) {
				/*
				 * If we get here: 1) at least a new request
				 * has arrived but we have not disabled the
				 * timer because the request was too small,
				 * 2) then the block layer has unplugged
				 * the device, causing the dispatch to be
				 * invoked.
				 *
				 * Since the device is unplugged, now the
				 * requests are probably large enough to
				 * provide a reasonable throughput.
				 * So we disable idling.
				 */
				bfq_clear_bfqq_wait_request(bfqq);
				hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
				bfqg_stats_update_idle_time(bfqq_group(bfqq));
			}
			goto keep_queue;
		}
	}

	/*
	 * No requests pending. However, if the in-service queue is idling
	 * for a new request, or has requests waiting for a completion and
	 * may idle after their completion, then keep it anyway.
	 */
	if (bfq_bfqq_wait_request(bfqq) ||
	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
		bfqq = NULL;
		goto keep_queue;
	}

	reason = BFQQE_NO_MORE_REQUESTS;
expire:
	bfq_bfqq_expire(bfqd, bfqq, false, reason);
new_queue:
	bfqq = bfq_set_in_service_queue(bfqd);
	if (bfqq) {
		bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
		goto check_queue;
	}
keep_queue:
	if (bfqq)
		bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
	else
		bfq_log(bfqd, "select_queue: no queue returned");

	return bfqq;
}
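/*
 * Editorial note, not in the original source: the goto-based loop above
 * normally runs at most twice. A typical pass is: the in-service queue
 * has a next request whose charge exceeds its residual budget, so it is
 * expired with BFQQE_BUDGET_EXHAUSTED, a new queue is put in service by
 * bfq_set_in_service_queue(), and that queue is then checked and
 * returned on the next pass of check_queue.
 */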
static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
		bfq_log_bfqq(bfqd, bfqq,
			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
			jiffies_to_msecs(bfqq->wr_cur_max_time),
			bfqq->wr_coeff,
			bfqq->entity.weight, bfqq->entity.orig_weight);

		if (entity->prio_changed)
			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");

		/*
		 * If too much time has elapsed from the beginning of
		 * this weight-raising period, then end weight
		 * raising.
		 */
		if (time_is_before_jiffies(bfqq->last_wr_start_finish +
					   bfqq->wr_cur_max_time)) {
			if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
			time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
					       bfq_wr_duration(bfqd)))
				bfq_bfqq_end_wr(bfqq);
			else {
				/* switch back to interactive wr */
				bfqq->wr_coeff = bfqd->bfq_wr_coeff;
				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
				bfqq->last_wr_start_finish =
					bfqq->wr_start_at_switch_to_srt;
				bfqq->entity.prio_changed = 1;
			}
		}
	}
	/* Update weight both if it must be raised and if it must be lowered */
	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
		__bfq_entity_update_weight_prio(
			bfq_entity_service_tree(entity),
			entity);
}
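/*
 * Editorial note, not in the original source: with the defaults set in
 * bfq_init_queue() (bfq_wr_coeff = 30, bfq_wr_rt_max_time = 300 ms), an
 * expired interactive weight-raising period simply ends, whereas a soft
 * real-time period that expires while the underlying interactive period
 * would still be running falls back to the interactive coefficient
 * instead of dropping straight to the base weight.
 */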
/*
 * Dispatch next request from bfqq.
 */
static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
						 struct bfq_queue *bfqq)
{
	struct request *rq = bfqq->next_rq;
	unsigned long service_to_charge;

	service_to_charge = bfq_serv_to_charge(rq, bfqq);

	bfq_bfqq_served(bfqq, service_to_charge);

	bfq_dispatch_remove(bfqd->queue, rq);

	/*
	 * If weight raising has to terminate for bfqq, then next
	 * function causes an immediate update of bfqq's weight,
	 * without waiting for next activation. As a consequence, on
	 * expiration, bfqq will be timestamped as if it had never been
	 * weight-raised during this service slot, even if it has
	 * received part or even most of the service as a
	 * weight-raised queue. This inflates bfqq's timestamps, which
	 * is beneficial, as bfqq is then more willing to leave the
	 * device immediately to possible other weight-raised queues.
	 */
	bfq_update_wr_data(bfqd, bfqq);

	if (!bfqd->in_service_bic) {
		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
		bfqd->in_service_bic = RQ_BIC(rq);
	}

	/*
	 * Expire bfqq, pretending that its budget expired, if bfqq
	 * belongs to CLASS_IDLE and other queues are waiting for
	 * service.
	 */
	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
		goto expire;

	return rq;

expire:
	bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
	return rq;
}
static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;

	/*
	 * Avoiding lock: a race on bfqd->busy_queues should cause at
	 * most a call to dispatch for nothing
	 */
	return !list_empty_careful(&bfqd->dispatch) ||
		bfqd->busy_queues > 0;
}
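/*
 * Editorial note, not in the original source: bfq_has_work() is a
 * lockless hint. Reading busy_queues without bfqd->lock can race with
 * queue activations and deactivations, but the worst outcome is a
 * spurious call into the dispatch path, which then finds nothing to do
 * under the lock; a request is never missed, because inserting new work
 * triggers a new dispatch anyway.
 */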
static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq = NULL;
	struct bfq_queue *bfqq = NULL;

	if (!list_empty(&bfqd->dispatch)) {
		rq = list_first_entry(&bfqd->dispatch, struct request,
				      queuelist);
		list_del_init(&rq->queuelist);

		bfqq = RQ_BFQQ(rq);

		if (bfqq) {
			/*
			 * Increment counters here, because this
			 * dispatch does not follow the standard
			 * dispatch flow (where counters are
			 * incremented)
			 */
			bfqq->dispatched++;

			goto inc_in_driver_start_rq;
		}

		/*
		 * We exploit the put_rq_private hook to decrement
		 * rq_in_driver, but put_rq_private will not be
		 * invoked on this request. So, to avoid unbalance,
		 * just start this request, without incrementing
		 * rq_in_driver. As a negative consequence,
		 * rq_in_driver is deceptively lower than it should be
		 * while this request is in service. This may cause
		 * bfq_schedule_dispatch to be invoked uselessly.
		 *
		 * As for implementing an exact solution, the
		 * put_request hook, if defined, is probably invoked
		 * also on this request. So, by exploiting this hook,
		 * we could 1) increment rq_in_driver here, and 2)
		 * decrement it in put_request. Such a solution would
		 * let the value of the counter be always accurate,
		 * but it would entail using an extra interface
		 * function. This cost seems higher than the benefit,
		 * being the frequency of non-elevator-private
		 * requests very low.
		 */
		goto start_rq;
	}

	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);

	if (bfqd->busy_queues == 0)
		goto exit;

	/*
	 * Force device to serve one request at a time if
	 * strict_guarantees is true. Forcing this service scheme is
	 * currently the ONLY way to guarantee that the request
	 * service order enforced by the scheduler is respected by a
	 * queueing device. Otherwise the device is free even to make
	 * some unlucky request wait for as long as the device
	 * wishes.
	 *
	 * Of course, serving one request at a time may cause loss of
	 * throughput.
	 */
	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
		goto exit;

	bfqq = bfq_select_queue(bfqd);
	if (!bfqq)
		goto exit;

	rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);

	if (rq) {
inc_in_driver_start_rq:
		bfqd->rq_in_driver++;
start_rq:
		rq->rq_flags |= RQF_STARTED;
	}
exit:
	return rq;
}
static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq;

	spin_lock_irq(&bfqd->lock);
	rq = __bfq_dispatch_request(hctx);
	spin_unlock_irq(&bfqd->lock);

	return rq;
}
/*
 * Task holds one reference to the queue, dropped when task exits. Each rq
 * in-flight on this queue also holds a reference, dropped when rq is freed.
 *
 * Scheduler lock must be held here. Recall not to use bfqq after calling
 * this function on it.
 */
static void bfq_put_queue(struct bfq_queue *bfqq)
{
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_group *bfqg = bfqq_group(bfqq);
#endif

	if (bfqq->bfqd)
		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
			     bfqq, bfqq->ref);

	bfqq->ref--;
	if (bfqq->ref)
		return;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);

	kmem_cache_free(bfq_pool, bfqq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	bfqg_put(bfqg);
#endif
}
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	if (bfqq == bfqd->in_service_queue) {
		__bfq_bfqq_expire(bfqd, bfqq);
		bfq_schedule_dispatch(bfqd);
	}

	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);

	bfq_put_queue(bfqq); /* release process reference */
}
static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
	struct bfq_data *bfqd;

	if (bfqq)
		bfqd = bfqq->bfqd; /* NULL if scheduler already exited */

	if (bfqq && bfqd) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);
		bfq_exit_bfqq(bfqd, bfqq);
		bic_set_bfqq(bic, NULL, is_sync);
		/* restore the flags saved by spin_lock_irqsave() above */
		spin_unlock_irqrestore(&bfqd->lock, flags);
	}
}

static void bfq_exit_icq(struct io_cq *icq)
{
	struct bfq_io_cq *bic = icq_to_bic(icq);

	bfq_exit_icq_bfqq(bic, true);
	bfq_exit_icq_bfqq(bic, false);
}
5987 * Update the entity prio values; note that the new values will not
5988 * be used until the next (re)activation.
5991 bfq_set_next_ioprio_data(struct bfq_queue
*bfqq
, struct bfq_io_cq
*bic
)
5993 struct task_struct
*tsk
= current
;
5995 struct bfq_data
*bfqd
= bfqq
->bfqd
;
6000 ioprio_class
= IOPRIO_PRIO_CLASS(bic
->ioprio
);
6001 switch (ioprio_class
) {
6003 dev_err(bfqq
->bfqd
->queue
->backing_dev_info
->dev
,
6004 "bfq: bad prio class %d\n", ioprio_class
);
6005 case IOPRIO_CLASS_NONE
:
6007 * No prio set, inherit CPU scheduling settings.
6009 bfqq
->new_ioprio
= task_nice_ioprio(tsk
);
6010 bfqq
->new_ioprio_class
= task_nice_ioclass(tsk
);
6012 case IOPRIO_CLASS_RT
:
6013 bfqq
->new_ioprio
= IOPRIO_PRIO_DATA(bic
->ioprio
);
6014 bfqq
->new_ioprio_class
= IOPRIO_CLASS_RT
;
6016 case IOPRIO_CLASS_BE
:
6017 bfqq
->new_ioprio
= IOPRIO_PRIO_DATA(bic
->ioprio
);
6018 bfqq
->new_ioprio_class
= IOPRIO_CLASS_BE
;
6020 case IOPRIO_CLASS_IDLE
:
6021 bfqq
->new_ioprio_class
= IOPRIO_CLASS_IDLE
;
6022 bfqq
->new_ioprio
= 7;
6023 bfq_clear_bfqq_idle_window(bfqq
);
6027 if (bfqq
->new_ioprio
>= IOPRIO_BE_NR
) {
6028 pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
6030 bfqq
->new_ioprio
= IOPRIO_BE_NR
;
6033 bfqq
->entity
.new_weight
= bfq_ioprio_to_weight(bfqq
->new_ioprio
);
6034 bfqq
->entity
.prio_changed
= 1;
6037 static void bfq_check_ioprio_change(struct bfq_io_cq
*bic
, struct bio
*bio
)
6039 struct bfq_data
*bfqd
= bic_to_bfqd(bic
);
6040 struct bfq_queue
*bfqq
;
6041 int ioprio
= bic
->icq
.ioc
->ioprio
;
6044 * This condition may trigger on a newly created bic, be sure to
6045 * drop the lock before returning.
6047 if (unlikely(!bfqd
) || likely(bic
->ioprio
== ioprio
))
6050 bic
->ioprio
= ioprio
;
6052 bfqq
= bic_to_bfqq(bic
, false);
6054 /* release process reference on this queue */
6055 bfq_put_queue(bfqq
);
6056 bfqq
= bfq_get_queue(bfqd
, bio
, BLK_RW_ASYNC
, bic
);
6057 bic_set_bfqq(bic
, bfqq
, false);
6060 bfqq
= bic_to_bfqq(bic
, true);
6062 bfq_set_next_ioprio_data(bfqq
, bic
);
6065 static void bfq_init_bfqq(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
6066 struct bfq_io_cq
*bic
, pid_t pid
, int is_sync
)
6068 RB_CLEAR_NODE(&bfqq
->entity
.rb_node
);
6069 INIT_LIST_HEAD(&bfqq
->fifo
);
6075 bfq_set_next_ioprio_data(bfqq
, bic
);
6078 if (!bfq_class_idle(bfqq
))
6079 bfq_mark_bfqq_idle_window(bfqq
);
6080 bfq_mark_bfqq_sync(bfqq
);
6082 bfq_clear_bfqq_sync(bfqq
);
6084 /* set end request to minus infinity from now */
6085 bfqq
->ttime
.last_end_request
= ktime_get_ns() + 1;
6087 bfq_mark_bfqq_IO_bound(bfqq
);
6091 /* Tentative initial value to trade off between thr and lat */
6092 bfqq
->max_budget
= (2 * bfq_max_budget(bfqd
)) / 3;
6093 bfqq
->budget_timeout
= bfq_smallest_from_now();
6096 bfqq
->last_wr_start_finish
= bfq_smallest_from_now();
6097 bfqq
->wr_start_at_switch_to_srt
= bfq_smallest_from_now();
6100 * Set to the value for which bfqq will not be deemed as
6101 * soft rt when it becomes backlogged.
6103 bfqq
->soft_rt_next_start
= bfq_greatest_from_now();
6105 /* first request is almost certainly seeky */
6106 bfqq
->seek_history
= 1;
static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
					       struct bfq_group *bfqg,
					       int ioprio_class, int ioprio)
{
	switch (ioprio_class) {
	case IOPRIO_CLASS_RT:
		return &bfqg->async_bfqq[0][ioprio];
	case IOPRIO_CLASS_NONE:
		ioprio = IOPRIO_NORM;
		/* fall through */
	case IOPRIO_CLASS_BE:
		return &bfqg->async_bfqq[1][ioprio];
	case IOPRIO_CLASS_IDLE:
		return &bfqg->async_idle_bfqq;
	default:
		return NULL;
	}
}
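/*
 * Editorial note, not in the original source: async_bfqq is indexed as
 * [class][ioprio], with row 0 for IOPRIO_CLASS_RT and row 1 for
 * IOPRIO_CLASS_BE, each holding IOPRIO_BE_NR priority levels. For
 * example, a best-effort, priority-4 task shares &bfqg->async_bfqq[1][4]
 * with every other task in the same group at that class and priority,
 * since async I/O is aggregated per group rather than per process.
 */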
6128 static struct bfq_queue
*bfq_get_queue(struct bfq_data
*bfqd
,
6129 struct bio
*bio
, bool is_sync
,
6130 struct bfq_io_cq
*bic
)
6132 const int ioprio
= IOPRIO_PRIO_DATA(bic
->ioprio
);
6133 const int ioprio_class
= IOPRIO_PRIO_CLASS(bic
->ioprio
);
6134 struct bfq_queue
**async_bfqq
= NULL
;
6135 struct bfq_queue
*bfqq
;
6136 struct bfq_group
*bfqg
;
6140 bfqg
= bfq_find_set_group(bfqd
, bio_blkcg(bio
));
6142 bfqq
= &bfqd
->oom_bfqq
;
6147 async_bfqq
= bfq_async_queue_prio(bfqd
, bfqg
, ioprio_class
,
6154 bfqq
= kmem_cache_alloc_node(bfq_pool
,
6155 GFP_NOWAIT
| __GFP_ZERO
| __GFP_NOWARN
,
6159 bfq_init_bfqq(bfqd
, bfqq
, bic
, current
->pid
,
6161 bfq_init_entity(&bfqq
->entity
, bfqg
);
6162 bfq_log_bfqq(bfqd
, bfqq
, "allocated");
6164 bfqq
= &bfqd
->oom_bfqq
;
6165 bfq_log_bfqq(bfqd
, bfqq
, "using oom bfqq");
6170 * Pin the queue now that it's allocated, scheduler exit will
6175 * Extra group reference, w.r.t. sync
6176 * queue. This extra reference is removed
6177 * only if bfqq->bfqg disappears, to
6178 * guarantee that this queue is not freed
6179 * until its group goes away.
6181 bfq_log_bfqq(bfqd
, bfqq
, "get_queue, bfqq not in async: %p, %d",
6187 bfqq
->ref
++; /* get a process reference to this queue */
6188 bfq_log_bfqq(bfqd
, bfqq
, "get_queue, at end: %p, %d", bfqq
, bfqq
->ref
);
static void bfq_update_io_thinktime(struct bfq_data *bfqd,
				    struct bfq_queue *bfqq)
{
	struct bfq_ttime *ttime = &bfqq->ttime;
	u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;

	elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);

	ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
	ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
				     ttime->ttime_samples);
}
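/*
 * Editorial note, not in the original source: the three updates above
 * form an exponentially weighted moving average with weight 7/8 for
 * history and 1/8 for the new sample, with each sample counting as 256
 * fixed-point units. Feeding a constant elapsed time E repeatedly makes
 * ttime_samples converge towards 256 and ttime_total towards 256*E, so
 * ttime_mean converges towards E (the "+ 128" only rounds the division).
 */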
static void
bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		       struct request *rq)
{
	bfqq->seek_history <<= 1;
	bfqq->seek_history |=
		get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
		(!blk_queue_nonrot(bfqd->queue) ||
		 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
}
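/*
 * Editorial note, not in the original source: seek_history acts as a
 * shift register of per-request "seeky" bits, and BFQQ_SEEKY() looks at
 * how many of the most recent bits are set. On non-rotational devices a
 * sufficiently large request is not marked seeky even when it lands far
 * from the previous one, since the positioning cost is negligible there
 * and the transfer size dominates the service time.
 */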
6219 * Disable idle window if the process thinks too long or seeks so much that
6220 * it doesn't matter.
6222 static void bfq_update_idle_window(struct bfq_data
*bfqd
,
6223 struct bfq_queue
*bfqq
,
6224 struct bfq_io_cq
*bic
)
6228 /* Don't idle for async or idle io prio class. */
6229 if (!bfq_bfqq_sync(bfqq
) || bfq_class_idle(bfqq
))
6232 enable_idle
= bfq_bfqq_idle_window(bfqq
);
6234 if (atomic_read(&bic
->icq
.ioc
->active_ref
) == 0 ||
6235 bfqd
->bfq_slice_idle
== 0 ||
6236 (bfqd
->hw_tag
&& BFQQ_SEEKY(bfqq
) &&
6237 bfqq
->wr_coeff
== 1))
6239 else if (bfq_sample_valid(bfqq
->ttime
.ttime_samples
)) {
6240 if (bfqq
->ttime
.ttime_mean
> bfqd
->bfq_slice_idle
&&
6241 bfqq
->wr_coeff
== 1)
6246 bfq_log_bfqq(bfqd
, bfqq
, "update_idle_window: enable_idle %d",
6250 bfq_mark_bfqq_idle_window(bfqq
);
6252 bfq_clear_bfqq_idle_window(bfqq
);
6256 * Called when a new fs request (rq) is added to bfqq. Check if there's
6257 * something we should do about it.
6259 static void bfq_rq_enqueued(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
6262 struct bfq_io_cq
*bic
= RQ_BIC(rq
);
6264 if (rq
->cmd_flags
& REQ_META
)
6265 bfqq
->meta_pending
++;
6267 bfq_update_io_thinktime(bfqd
, bfqq
);
6268 bfq_update_io_seektime(bfqd
, bfqq
, rq
);
6269 if (bfqq
->entity
.service
> bfq_max_budget(bfqd
) / 8 ||
6271 bfq_update_idle_window(bfqd
, bfqq
, bic
);
6273 bfq_log_bfqq(bfqd
, bfqq
,
6274 "rq_enqueued: idle_window=%d (seeky %d)",
6275 bfq_bfqq_idle_window(bfqq
), BFQQ_SEEKY(bfqq
));
6277 bfqq
->last_request_pos
= blk_rq_pos(rq
) + blk_rq_sectors(rq
);
6279 if (bfqq
== bfqd
->in_service_queue
&& bfq_bfqq_wait_request(bfqq
)) {
6280 bool small_req
= bfqq
->queued
[rq_is_sync(rq
)] == 1 &&
6281 blk_rq_sectors(rq
) < 32;
6282 bool budget_timeout
= bfq_bfqq_budget_timeout(bfqq
);
6285 * There is just this request queued: if the request
6286 * is small and the queue is not to be expired, then
6289 * In this way, if the device is being idled to wait
6290 * for a new request from the in-service queue, we
6291 * avoid unplugging the device and committing the
6292 * device to serve just a small request. On the
6293 * contrary, we wait for the block layer to decide
6294 * when to unplug the device: hopefully, new requests
6295 * will be merged to this one quickly, then the device
6296 * will be unplugged and larger requests will be
6299 if (small_req
&& !budget_timeout
)
6303 * A large enough request arrived, or the queue is to
6304 * be expired: in both cases disk idling is to be
6305 * stopped, so clear wait_request flag and reset
6308 bfq_clear_bfqq_wait_request(bfqq
);
6309 hrtimer_try_to_cancel(&bfqd
->idle_slice_timer
);
6310 bfqg_stats_update_idle_time(bfqq_group(bfqq
));
6313 * The queue is not empty, because a new request just
6314 * arrived. Hence we can safely expire the queue, in
6315 * case of budget timeout, without risking that the
6316 * timestamps of the queue are not updated correctly.
6317 * See [1] for more details.
6320 bfq_bfqq_expire(bfqd
, bfqq
, false,
6321 BFQQE_BUDGET_TIMEOUT
);
static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);

	bfq_add_request(rq);

	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
	list_add_tail(&rq->queuelist, &bfqq->fifo);

	bfq_rq_enqueued(bfqd, bfqq, rq);
}
6337 static void bfq_insert_request(struct blk_mq_hw_ctx
*hctx
, struct request
*rq
,
6340 struct request_queue
*q
= hctx
->queue
;
6341 struct bfq_data
*bfqd
= q
->elevator
->elevator_data
;
6343 spin_lock_irq(&bfqd
->lock
);
6344 if (blk_mq_sched_try_insert_merge(q
, rq
)) {
6345 spin_unlock_irq(&bfqd
->lock
);
6349 spin_unlock_irq(&bfqd
->lock
);
6351 blk_mq_sched_request_inserted(rq
);
6353 spin_lock_irq(&bfqd
->lock
);
6354 if (at_head
|| blk_rq_is_passthrough(rq
)) {
6356 list_add(&rq
->queuelist
, &bfqd
->dispatch
);
6358 list_add_tail(&rq
->queuelist
, &bfqd
->dispatch
);
6360 __bfq_insert_request(bfqd
, rq
);
6362 if (rq_mergeable(rq
)) {
6363 elv_rqhash_add(q
, rq
);
6369 spin_unlock_irq(&bfqd
->lock
);
6372 static void bfq_insert_requests(struct blk_mq_hw_ctx
*hctx
,
6373 struct list_head
*list
, bool at_head
)
6375 while (!list_empty(list
)) {
6378 rq
= list_first_entry(list
, struct request
, queuelist
);
6379 list_del_init(&rq
->queuelist
);
6380 bfq_insert_request(hctx
, rq
, at_head
);
static void bfq_update_hw_tag(struct bfq_data *bfqd)
{
	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
				       bfqd->rq_in_driver);

	if (bfqd->hw_tag == 1)
		return;

	/*
	 * This sample is valid if the number of outstanding requests
	 * is large enough to allow a queueing behavior. Note that the
	 * sum is not exact, as it's not taking into account deactivated
	 * requests.
	 */
	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
		return;

	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
		return;

	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
	bfqd->max_rq_in_driver = 0;
	bfqd->hw_tag_samples = 0;
}
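/*
 * Editorial note, not in the original source: hw_tag is detected rather
 * than configured. Once BFQ_HW_QUEUE_SAMPLES valid samples have been
 * collected, the device is deemed NCQ-capable if the peak number of
 * requests observed in the driver exceeded BFQ_HW_QUEUE_THRESHOLD; the
 * resulting flag then feeds the idling decisions in bfq_bfqq_may_idle().
 */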
6409 static void bfq_completed_request(struct bfq_queue
*bfqq
, struct bfq_data
*bfqd
)
6414 bfq_update_hw_tag(bfqd
);
6416 bfqd
->rq_in_driver
--;
6419 if (!bfqq
->dispatched
&& !bfq_bfqq_busy(bfqq
)) {
6421 * Set budget_timeout (which we overload to store the
6422 * time at which the queue remains with no backlog and
6423 * no outstanding request; used by the weight-raising
6426 bfqq
->budget_timeout
= jiffies
;
6429 now_ns
= ktime_get_ns();
6431 bfqq
->ttime
.last_end_request
= now_ns
;
6434 * Using us instead of ns, to get a reasonable precision in
6435 * computing rate in next check.
6437 delta_us
= div_u64(now_ns
- bfqd
->last_completion
, NSEC_PER_USEC
);
6440 * If the request took rather long to complete, and, according
6441 * to the maximum request size recorded, this completion latency
6442 * implies that the request was certainly served at a very low
6443 * rate (less than 1M sectors/sec), then the whole observation
6444 * interval that lasts up to this time instant cannot be a
6445 * valid time interval for computing a new peak rate. Invoke
6446 * bfq_update_rate_reset to have the following three steps
6448 * - close the observation interval at the last (previous)
6449 * request dispatch or completion
6450 * - compute rate, if possible, for that observation interval
6451 * - reset to zero samples, which will trigger a proper
6452 * re-initialization of the observation interval on next
6455 if (delta_us
> BFQ_MIN_TT
/NSEC_PER_USEC
&&
6456 (bfqd
->last_rq_max_size
<<BFQ_RATE_SHIFT
)/delta_us
<
6457 1UL<<(BFQ_RATE_SHIFT
- 10))
6458 bfq_update_rate_reset(bfqd
, NULL
);
6459 bfqd
->last_completion
= now_ns
;
6462 * If we are waiting to discover whether the request pattern
6463 * of the task associated with the queue is actually
6464 * isochronous, and both requisites for this condition to hold
6465 * are now satisfied, then compute soft_rt_next_start (see the
6466 * comments on the function bfq_bfqq_softrt_next_start()). We
6467 * schedule this delayed check when bfqq expires, if it still
6468 * has in-flight requests.
6470 if (bfq_bfqq_softrt_update(bfqq
) && bfqq
->dispatched
== 0 &&
6471 RB_EMPTY_ROOT(&bfqq
->sort_list
))
6472 bfqq
->soft_rt_next_start
=
6473 bfq_bfqq_softrt_next_start(bfqd
, bfqq
);
6476 * If this is the in-service queue, check if it needs to be expired,
6477 * or if we want to idle in case it has no pending requests.
6479 if (bfqd
->in_service_queue
== bfqq
) {
6480 if (bfqq
->dispatched
== 0 && bfq_bfqq_must_idle(bfqq
)) {
6481 bfq_arm_slice_timer(bfqd
);
6483 } else if (bfq_may_expire_for_budg_timeout(bfqq
))
6484 bfq_bfqq_expire(bfqd
, bfqq
, false,
6485 BFQQE_BUDGET_TIMEOUT
);
6486 else if (RB_EMPTY_ROOT(&bfqq
->sort_list
) &&
6487 (bfqq
->dispatched
== 0 ||
6488 !bfq_bfqq_may_idle(bfqq
)))
6489 bfq_bfqq_expire(bfqd
, bfqq
, false,
6490 BFQQE_NO_MORE_REQUESTS
);
6494 static void bfq_put_rq_priv_body(struct bfq_queue
*bfqq
)
6498 bfq_put_queue(bfqq
);
6501 static void bfq_put_rq_private(struct request_queue
*q
, struct request
*rq
)
6503 struct bfq_queue
*bfqq
= RQ_BFQQ(rq
);
6504 struct bfq_data
*bfqd
= bfqq
->bfqd
;
6506 if (rq
->rq_flags
& RQF_STARTED
)
6507 bfqg_stats_update_completion(bfqq_group(bfqq
),
6508 rq_start_time_ns(rq
),
6509 rq_io_start_time_ns(rq
),
6512 if (likely(rq
->rq_flags
& RQF_STARTED
)) {
6513 unsigned long flags
;
6515 spin_lock_irqsave(&bfqd
->lock
, flags
);
6517 bfq_completed_request(bfqq
, bfqd
);
6518 bfq_put_rq_priv_body(bfqq
);
6520 spin_unlock_irqrestore(&bfqd
->lock
, flags
);
6523 * Request rq may be still/already in the scheduler,
6524 * in which case we need to remove it. And we cannot
6525 * defer such a check and removal, to avoid
6526 * inconsistencies in the time interval from the end
6527 * of this function to the start of the deferred work.
6528 * This situation seems to occur only in process
6529 * context, as a consequence of a merge. In the
6530 * current version of the code, this implies that the
6534 if (!RB_EMPTY_NODE(&rq
->rb_node
))
6535 bfq_remove_request(q
, rq
);
6536 bfq_put_rq_priv_body(bfqq
);
6539 rq
->elv
.priv
[0] = NULL
;
6540 rq
->elv
.priv
[1] = NULL
;
6544 * Allocate bfq data structures associated with this request.
6546 static int bfq_get_rq_private(struct request_queue
*q
, struct request
*rq
,
6549 struct bfq_data
*bfqd
= q
->elevator
->elevator_data
;
6550 struct bfq_io_cq
*bic
= icq_to_bic(rq
->elv
.icq
);
6551 const int is_sync
= rq_is_sync(rq
);
6552 struct bfq_queue
*bfqq
;
6554 spin_lock_irq(&bfqd
->lock
);
6556 bfq_check_ioprio_change(bic
, bio
);
6561 bfq_bic_update_cgroup(bic
, bio
);
6563 bfqq
= bic_to_bfqq(bic
, is_sync
);
6564 if (!bfqq
|| bfqq
== &bfqd
->oom_bfqq
) {
6566 bfq_put_queue(bfqq
);
6567 bfqq
= bfq_get_queue(bfqd
, bio
, is_sync
, bic
);
6568 bic_set_bfqq(bic
, bfqq
, is_sync
);
6573 bfq_log_bfqq(bfqd
, bfqq
, "get_request %p: bfqq %p, %d",
6574 rq
, bfqq
, bfqq
->ref
);
6576 rq
->elv
.priv
[0] = bic
;
6577 rq
->elv
.priv
[1] = bfqq
;
6579 spin_unlock_irq(&bfqd
->lock
);
6584 spin_unlock_irq(&bfqd
->lock
);
6589 static void bfq_idle_slice_timer_body(struct bfq_queue
*bfqq
)
6591 struct bfq_data
*bfqd
= bfqq
->bfqd
;
6592 enum bfqq_expiration reason
;
6593 unsigned long flags
;
6595 spin_lock_irqsave(&bfqd
->lock
, flags
);
6596 bfq_clear_bfqq_wait_request(bfqq
);
6598 if (bfqq
!= bfqd
->in_service_queue
) {
6599 spin_unlock_irqrestore(&bfqd
->lock
, flags
);
6603 if (bfq_bfqq_budget_timeout(bfqq
))
6605 * Also here the queue can be safely expired
6606 * for budget timeout without wasting
6609 reason
= BFQQE_BUDGET_TIMEOUT
;
6610 else if (bfqq
->queued
[0] == 0 && bfqq
->queued
[1] == 0)
6612 * The queue may not be empty upon timer expiration,
6613 * because we may not disable the timer when the
6614 * first request of the in-service queue arrives
6615 * during disk idling.
6617 reason
= BFQQE_TOO_IDLE
;
6619 goto schedule_dispatch
;
6621 bfq_bfqq_expire(bfqd
, bfqq
, true, reason
);
6624 spin_unlock_irqrestore(&bfqd
->lock
, flags
);
6625 bfq_schedule_dispatch(bfqd
);
6629 * Handler of the expiration of the timer running if the in-service queue
6630 * is idling inside its time slice.
6632 static enum hrtimer_restart
bfq_idle_slice_timer(struct hrtimer
*timer
)
6634 struct bfq_data
*bfqd
= container_of(timer
, struct bfq_data
,
6636 struct bfq_queue
*bfqq
= bfqd
->in_service_queue
;
6639 * Theoretical race here: the in-service queue can be NULL or
6640 * different from the queue that was idling if a new request
6641 * arrives for the current queue and there is a full dispatch
6642 * cycle that changes the in-service queue. This can hardly
6643 * happen, but in the worst case we just expire a queue too
6647 bfq_idle_slice_timer_body(bfqq
);
6649 return HRTIMER_NORESTART
;
static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
				 struct bfq_queue **bfqq_ptr)
{
	struct bfq_queue *bfqq = *bfqq_ptr;

	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
	if (bfqq) {
		bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);

		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
			     bfqq, bfqq->ref);
		bfq_put_queue(bfqq);
		*bfqq_ptr = NULL;
	}
}

/*
 * Release all the bfqg references to its async queues.  If we are
 * deallocating the group these queues may still contain requests, so
 * we reparent them to the root cgroup (i.e., the only one that will
 * exist for sure until all the requests on a device are gone).
 */
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
{
	int i, j;

	for (i = 0; i < 2; i++)
		for (j = 0; j < IOPRIO_BE_NR; j++)
			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);

	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
}
6685 static void bfq_exit_queue(struct elevator_queue
*e
)
6687 struct bfq_data
*bfqd
= e
->elevator_data
;
6688 struct bfq_queue
*bfqq
, *n
;
6690 hrtimer_cancel(&bfqd
->idle_slice_timer
);
6692 spin_lock_irq(&bfqd
->lock
);
6693 list_for_each_entry_safe(bfqq
, n
, &bfqd
->idle_list
, bfqq_list
)
6694 bfq_deactivate_bfqq(bfqd
, bfqq
, false, false);
6695 spin_unlock_irq(&bfqd
->lock
);
6697 hrtimer_cancel(&bfqd
->idle_slice_timer
);
6699 #ifdef CONFIG_BFQ_GROUP_IOSCHED
6700 blkcg_deactivate_policy(bfqd
->queue
, &blkcg_policy_bfq
);
6702 spin_lock_irq(&bfqd
->lock
);
6703 bfq_put_async_queues(bfqd
, bfqd
->root_group
);
6704 kfree(bfqd
->root_group
);
6705 spin_unlock_irq(&bfqd
->lock
);
6711 static void bfq_init_root_group(struct bfq_group
*root_group
,
6712 struct bfq_data
*bfqd
)
6716 #ifdef CONFIG_BFQ_GROUP_IOSCHED
6717 root_group
->entity
.parent
= NULL
;
6718 root_group
->my_entity
= NULL
;
6719 root_group
->bfqd
= bfqd
;
6721 for (i
= 0; i
< BFQ_IOPRIO_CLASSES
; i
++)
6722 root_group
->sched_data
.service_tree
[i
] = BFQ_SERVICE_TREE_INIT
;
6723 root_group
->sched_data
.bfq_class_idle_last_service
= jiffies
;
6726 static int bfq_init_queue(struct request_queue
*q
, struct elevator_type
*e
)
6728 struct bfq_data
*bfqd
;
6729 struct elevator_queue
*eq
;
6731 eq
= elevator_alloc(q
, e
);
6735 bfqd
= kzalloc_node(sizeof(*bfqd
), GFP_KERNEL
, q
->node
);
6737 kobject_put(&eq
->kobj
);
6740 eq
->elevator_data
= bfqd
;
6742 spin_lock_irq(q
->queue_lock
);
6744 spin_unlock_irq(q
->queue_lock
);
6747 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
6748 * Grab a permanent reference to it, so that the normal code flow
6749 * will not attempt to free it.
6751 bfq_init_bfqq(bfqd
, &bfqd
->oom_bfqq
, NULL
, 1, 0);
6752 bfqd
->oom_bfqq
.ref
++;
6753 bfqd
->oom_bfqq
.new_ioprio
= BFQ_DEFAULT_QUEUE_IOPRIO
;
6754 bfqd
->oom_bfqq
.new_ioprio_class
= IOPRIO_CLASS_BE
;
6755 bfqd
->oom_bfqq
.entity
.new_weight
=
6756 bfq_ioprio_to_weight(bfqd
->oom_bfqq
.new_ioprio
);
6758 * Trigger weight initialization, according to ioprio, at the
6759 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
6760 * class won't be changed any more.
6762 bfqd
->oom_bfqq
.entity
.prio_changed
= 1;
6766 INIT_LIST_HEAD(&bfqd
->dispatch
);
6768 hrtimer_init(&bfqd
->idle_slice_timer
, CLOCK_MONOTONIC
,
6770 bfqd
->idle_slice_timer
.function
= bfq_idle_slice_timer
;
6772 INIT_LIST_HEAD(&bfqd
->active_list
);
6773 INIT_LIST_HEAD(&bfqd
->idle_list
);
6777 bfqd
->bfq_max_budget
= bfq_default_max_budget
;
6779 bfqd
->bfq_fifo_expire
[0] = bfq_fifo_expire
[0];
6780 bfqd
->bfq_fifo_expire
[1] = bfq_fifo_expire
[1];
6781 bfqd
->bfq_back_max
= bfq_back_max
;
6782 bfqd
->bfq_back_penalty
= bfq_back_penalty
;
6783 bfqd
->bfq_slice_idle
= bfq_slice_idle
;
6784 bfqd
->bfq_timeout
= bfq_timeout
;
6786 bfqd
->bfq_requests_within_timer
= 120;
6788 bfqd
->low_latency
= true;
6791 * Trade-off between responsiveness and fairness.
6793 bfqd
->bfq_wr_coeff
= 30;
6794 bfqd
->bfq_wr_rt_max_time
= msecs_to_jiffies(300);
6795 bfqd
->bfq_wr_max_time
= 0;
6796 bfqd
->bfq_wr_min_idle_time
= msecs_to_jiffies(2000);
6797 bfqd
->bfq_wr_min_inter_arr_async
= msecs_to_jiffies(500);
6798 bfqd
->bfq_wr_max_softrt_rate
= 7000; /*
6799 * Approximate rate required
6800 * to playback or record a
6801 * high-definition compressed
6806 * Begin by assuming, optimistically, that the device is a
6807 * high-speed one, and that its peak rate is equal to 2/3 of
6808 * the highest reference rate.
6810 bfqd
->RT_prod
= R_fast
[blk_queue_nonrot(bfqd
->queue
)] *
6811 T_fast
[blk_queue_nonrot(bfqd
->queue
)];
6812 bfqd
->peak_rate
= R_fast
[blk_queue_nonrot(bfqd
->queue
)] * 2 / 3;
6813 bfqd
->device_speed
= BFQ_BFQD_FAST
;
6815 spin_lock_init(&bfqd
->lock
);
6818 * The invocation of the next bfq_create_group_hierarchy
6819 * function is the head of a chain of function calls
6820 * (bfq_create_group_hierarchy->blkcg_activate_policy->
6821 * blk_mq_freeze_queue) that may lead to the invocation of the
6822 * has_work hook function. For this reason,
6823 * bfq_create_group_hierarchy is invoked only after all
6824 * scheduler data has been initialized, apart from the fields
6825 * that can be initialized only after invoking
6826 * bfq_create_group_hierarchy. This, in particular, enables
6827 * has_work to correctly return false. Of course, to avoid
6828 * other inconsistencies, the blk-mq stack must then refrain
6829 * from invoking further scheduler hooks before this init
6830 * function is finished.
6832 bfqd
->root_group
= bfq_create_group_hierarchy(bfqd
, q
->node
);
6833 if (!bfqd
->root_group
)
6835 bfq_init_root_group(bfqd
->root_group
, bfqd
);
6836 bfq_init_entity(&bfqd
->oom_bfqq
.entity
, bfqd
->root_group
);
6843 kobject_put(&eq
->kobj
);
6847 static void bfq_slab_kill(void)
6849 kmem_cache_destroy(bfq_pool
);
6852 static int __init
bfq_slab_setup(void)
6854 bfq_pool
= KMEM_CACHE(bfq_queue
, 0);
6860 static ssize_t
bfq_var_show(unsigned int var
, char *page
)
6862 return sprintf(page
, "%u\n", var
);
6865 static ssize_t
bfq_var_store(unsigned long *var
, const char *page
,
6868 unsigned long new_val
;
6869 int ret
= kstrtoul(page
, 10, &new_val
);
6877 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
6878 static ssize_t __FUNC(struct elevator_queue *e, char *page) \
6880 struct bfq_data *bfqd = e->elevator_data; \
6881 u64 __data = __VAR; \
6883 __data = jiffies_to_msecs(__data); \
6884 else if (__CONV == 2) \
6885 __data = div_u64(__data, NSEC_PER_MSEC); \
6886 return bfq_var_show(__data, (page)); \
6888 SHOW_FUNCTION(bfq_fifo_expire_sync_show
, bfqd
->bfq_fifo_expire
[1], 2);
6889 SHOW_FUNCTION(bfq_fifo_expire_async_show
, bfqd
->bfq_fifo_expire
[0], 2);
6890 SHOW_FUNCTION(bfq_back_seek_max_show
, bfqd
->bfq_back_max
, 0);
6891 SHOW_FUNCTION(bfq_back_seek_penalty_show
, bfqd
->bfq_back_penalty
, 0);
6892 SHOW_FUNCTION(bfq_slice_idle_show
, bfqd
->bfq_slice_idle
, 2);
6893 SHOW_FUNCTION(bfq_max_budget_show
, bfqd
->bfq_user_max_budget
, 0);
6894 SHOW_FUNCTION(bfq_timeout_sync_show
, bfqd
->bfq_timeout
, 1);
6895 SHOW_FUNCTION(bfq_strict_guarantees_show
, bfqd
->strict_guarantees
, 0);
6896 SHOW_FUNCTION(bfq_low_latency_show
, bfqd
->low_latency
, 0);
6897 #undef SHOW_FUNCTION
6899 #define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
6900 static ssize_t __FUNC(struct elevator_queue *e, char *page) \
6902 struct bfq_data *bfqd = e->elevator_data; \
6903 u64 __data = __VAR; \
6904 __data = div_u64(__data, NSEC_PER_USEC); \
6905 return bfq_var_show(__data, (page)); \
6907 USEC_SHOW_FUNCTION(bfq_slice_idle_us_show
, bfqd
->bfq_slice_idle
);
6908 #undef USEC_SHOW_FUNCTION
6910 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
6912 __FUNC(struct elevator_queue *e, const char *page, size_t count) \
6914 struct bfq_data *bfqd = e->elevator_data; \
6915 unsigned long uninitialized_var(__data); \
6916 int ret = bfq_var_store(&__data, (page), count); \
6917 if (__data < (MIN)) \
6919 else if (__data > (MAX)) \
6922 *(__PTR) = msecs_to_jiffies(__data); \
6923 else if (__CONV == 2) \
6924 *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
6926 *(__PTR) = __data; \
6929 STORE_FUNCTION(bfq_fifo_expire_sync_store
, &bfqd
->bfq_fifo_expire
[1], 1,
6931 STORE_FUNCTION(bfq_fifo_expire_async_store
, &bfqd
->bfq_fifo_expire
[0], 1,
6933 STORE_FUNCTION(bfq_back_seek_max_store
, &bfqd
->bfq_back_max
, 0, INT_MAX
, 0);
6934 STORE_FUNCTION(bfq_back_seek_penalty_store
, &bfqd
->bfq_back_penalty
, 1,
6936 STORE_FUNCTION(bfq_slice_idle_store
, &bfqd
->bfq_slice_idle
, 0, INT_MAX
, 2);
6937 #undef STORE_FUNCTION
6939 #define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
6940 static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
6942 struct bfq_data *bfqd = e->elevator_data; \
6943 unsigned long uninitialized_var(__data); \
6944 int ret = bfq_var_store(&__data, (page), count); \
6945 if (__data < (MIN)) \
6947 else if (__data > (MAX)) \
6949 *(__PTR) = (u64)__data * NSEC_PER_USEC; \
6952 USEC_STORE_FUNCTION(bfq_slice_idle_us_store
, &bfqd
->bfq_slice_idle
, 0,
6954 #undef USEC_STORE_FUNCTION
6956 static ssize_t
bfq_max_budget_store(struct elevator_queue
*e
,
6957 const char *page
, size_t count
)
6959 struct bfq_data
*bfqd
= e
->elevator_data
;
6960 unsigned long uninitialized_var(__data
);
6961 int ret
= bfq_var_store(&__data
, (page
), count
);
6964 bfqd
->bfq_max_budget
= bfq_calc_max_budget(bfqd
);
6966 if (__data
> INT_MAX
)
6968 bfqd
->bfq_max_budget
= __data
;
6971 bfqd
->bfq_user_max_budget
= __data
;
6977 * Leaving this name to preserve name compatibility with cfq
6978 * parameters, but this timeout is used for both sync and async.
6980 static ssize_t
bfq_timeout_sync_store(struct elevator_queue
*e
,
6981 const char *page
, size_t count
)
6983 struct bfq_data
*bfqd
= e
->elevator_data
;
6984 unsigned long uninitialized_var(__data
);
6985 int ret
= bfq_var_store(&__data
, (page
), count
);
6989 else if (__data
> INT_MAX
)
6992 bfqd
->bfq_timeout
= msecs_to_jiffies(__data
);
6993 if (bfqd
->bfq_user_max_budget
== 0)
6994 bfqd
->bfq_max_budget
= bfq_calc_max_budget(bfqd
);
6999 static ssize_t
bfq_strict_guarantees_store(struct elevator_queue
*e
,
7000 const char *page
, size_t count
)
7002 struct bfq_data
*bfqd
= e
->elevator_data
;
7003 unsigned long uninitialized_var(__data
);
7004 int ret
= bfq_var_store(&__data
, (page
), count
);
7008 if (!bfqd
->strict_guarantees
&& __data
== 1
7009 && bfqd
->bfq_slice_idle
< 8 * NSEC_PER_MSEC
)
7010 bfqd
->bfq_slice_idle
= 8 * NSEC_PER_MSEC
;
7012 bfqd
->strict_guarantees
= __data
;
7017 static ssize_t
bfq_low_latency_store(struct elevator_queue
*e
,
7018 const char *page
, size_t count
)
7020 struct bfq_data
*bfqd
= e
->elevator_data
;
7021 unsigned long uninitialized_var(__data
);
7022 int ret
= bfq_var_store(&__data
, (page
), count
);
7026 if (__data
== 0 && bfqd
->low_latency
!= 0)
7028 bfqd
->low_latency
= __data
;
7033 #define BFQ_ATTR(name) \
7034 __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
7036 static struct elv_fs_entry bfq_attrs
[] = {
7037 BFQ_ATTR(fifo_expire_sync
),
7038 BFQ_ATTR(fifo_expire_async
),
7039 BFQ_ATTR(back_seek_max
),
7040 BFQ_ATTR(back_seek_penalty
),
7041 BFQ_ATTR(slice_idle
),
7042 BFQ_ATTR(slice_idle_us
),
7043 BFQ_ATTR(max_budget
),
7044 BFQ_ATTR(timeout_sync
),
7045 BFQ_ATTR(strict_guarantees
),
7046 BFQ_ATTR(low_latency
),
7050 static struct elevator_type iosched_bfq_mq
= {
7052 .get_rq_priv
= bfq_get_rq_private
,
7053 .put_rq_priv
= bfq_put_rq_private
,
7054 .exit_icq
= bfq_exit_icq
,
7055 .insert_requests
= bfq_insert_requests
,
7056 .dispatch_request
= bfq_dispatch_request
,
7057 .next_request
= elv_rb_latter_request
,
7058 .former_request
= elv_rb_former_request
,
7059 .allow_merge
= bfq_allow_bio_merge
,
7060 .bio_merge
= bfq_bio_merge
,
7061 .request_merge
= bfq_request_merge
,
7062 .requests_merged
= bfq_requests_merged
,
7063 .request_merged
= bfq_request_merged
,
7064 .has_work
= bfq_has_work
,
7065 .init_sched
= bfq_init_queue
,
7066 .exit_sched
= bfq_exit_queue
,
7070 .icq_size
= sizeof(struct bfq_io_cq
),
7071 .icq_align
= __alignof__(struct bfq_io_cq
),
7072 .elevator_attrs
= bfq_attrs
,
7073 .elevator_name
= "bfq",
7074 .elevator_owner
= THIS_MODULE
,
7077 #ifdef CONFIG_BFQ_GROUP_IOSCHED
7078 static struct blkcg_policy blkcg_policy_bfq
= {
7079 .dfl_cftypes
= bfq_blkg_files
,
7080 .legacy_cftypes
= bfq_blkcg_legacy_files
,
7082 .cpd_alloc_fn
= bfq_cpd_alloc
,
7083 .cpd_init_fn
= bfq_cpd_init
,
7084 .cpd_bind_fn
= bfq_cpd_init
,
7085 .cpd_free_fn
= bfq_cpd_free
,
7087 .pd_alloc_fn
= bfq_pd_alloc
,
7088 .pd_init_fn
= bfq_pd_init
,
7089 .pd_offline_fn
= bfq_pd_offline
,
7090 .pd_free_fn
= bfq_pd_free
,
7091 .pd_reset_stats_fn
= bfq_pd_reset_stats
,
static int __init bfq_init(void)
{
	int ret;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	ret = blkcg_policy_register(&blkcg_policy_bfq);
	if (ret)
		return ret;
#endif

	ret = -ENOMEM;
	if (bfq_slab_setup())
		goto err_pol_unreg;

	/*
	 * Times to load large popular applications for the typical
	 * systems installed on the reference devices (see the
	 * comments before the definitions of the next two
	 * arrays). Actually, we use slightly slower values, as the
	 * estimated peak rate tends to be smaller than the actual
	 * peak rate.  The reason for this last fact is that estimates
	 * are computed over much shorter time intervals than the long
	 * intervals typically used for benchmarking. Why? First, to
	 * adapt more quickly to variations. Second, because an I/O
	 * scheduler cannot rely on a peak-rate-evaluation workload to
	 * be run for a long time.
	 */
	T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
	T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
	T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
	T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */

	/*
	 * Thresholds that determine the switch between speed classes
	 * (see the comments before the definition of the array
	 * device_speed_thresh). These thresholds are biased towards
	 * transitions to the fast class. This is safer than the
	 * opposite bias. In fact, a wrong transition to the slow
	 * class results in short weight-raising periods, because the
	 * speed of the device then tends to be higher than the
	 * reference peak rate. On the opposite end, a wrong
	 * transition to the fast class tends to increase
	 * weight-raising periods, because of the opposite reason.
	 */
	device_speed_thresh[0] = (4 * R_slow[0]) / 3;
	device_speed_thresh[1] = (4 * R_slow[1]) / 3;

	ret = elv_register(&iosched_bfq_mq);
	if (ret)
		goto err_pol_unreg;

	return 0;

err_pol_unreg:
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	return ret;
}

static void __exit bfq_exit(void)
{
	elv_unregister(&iosched_bfq_mq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	bfq_slab_kill();
}

module_init(bfq_init);
module_exit(bfq_exit);

MODULE_AUTHOR("Paolo Valente");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");