/*
 * Budget Fair Queueing (BFQ) I/O scheduler.
 *
 * Based on ideas and code from CFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini@google.com>
 *
 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BFQ is a proportional-share I/O scheduler, with some extra
 * low-latency capabilities. BFQ also supports full hierarchical
 * scheduling through cgroups. The next paragraphs provide an
 * introduction to BFQ inner workings. Details on BFQ benefits, usage
 * and limitations can be found in Documentation/block/bfq-iosched.txt.
 *
 * BFQ is a proportional-share storage-I/O scheduling algorithm based
 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
 * budgets, measured in number of sectors, to processes instead of
 * time slices. The device is not granted to the in-service process
 * for a given time slice, but until it has exhausted its assigned
 * budget. This change from the time to the service domain enables BFQ
 * to distribute the device throughput among processes as desired,
 * without any distortion due to throughput fluctuations, or to device
 * internal queueing. BFQ uses an ad hoc internal scheduler, called
 * B-WF2Q+, to schedule processes according to their budgets. More
 * precisely, BFQ schedules queues associated with processes. Each
 * process/queue is assigned a user-configurable weight, and B-WF2Q+
 * guarantees that each queue receives a fraction of the throughput
 * proportional to its weight. Thanks to the accurate policy of
 * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
 * processes issuing sequential requests (to boost the throughput),
 * and yet guarantee a low latency to interactive and soft real-time
 * applications.
 *
 * In particular, to provide these low-latency guarantees, BFQ
 * explicitly privileges the I/O of two classes of time-sensitive
 * applications: interactive and soft real-time. This feature enables
 * BFQ to provide applications in these classes with a very low
 * latency. Finally, BFQ also features additional heuristics for
 * preserving both a low latency and a high throughput on NCQ-capable,
 * rotational or flash-based devices, and to get the job done quickly
 * for applications consisting of many I/O-bound processes.
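 *
 * As a simple illustration of the service scheme sketched above: if
 * two continuously backlogged queues are assigned weights 100 and
 * 300, B-WF2Q+ lets them consume their budgets so that the first
 * receives about 25% and the second about 75% of the sectors
 * transferred by the device, independently of request sizes or of
 * fluctuations of the device throughput (figures are illustrative,
 * not taken from the papers cited below).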
 *
 * BFQ is described in [1], which also contains a reference to the
 * initial, more theoretical paper on BFQ. The interested reader can
 * find in that paper full details of the main algorithm, as well as
 * formulas for the guarantees and formal proofs of all the
 * properties. With respect to the version of BFQ presented in these
 * papers, this implementation adds a few more heuristics, such as the
 * one that guarantees a low latency to soft real-time applications,
 * and a hierarchical extension based on H-WF2Q+.
 *
 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
 * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
 * with O(log N) complexity derives from the one introduced with EEVDF
 * in [3].
 *
 * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
 *     Scheduler", Proceedings of the First Workshop on Mobile System
 *     Technologies (MST-2015), May 2015.
 *     http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
 *
 * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
 *     Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689.
 *     http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
 *
 * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
 *     First: A Flexible and Accurate Mechanism for Proportional Share
 *     Resource Allocation", technical report.
 *     http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>
#include <linux/blk-cgroup.h>
#define BFQ_IOPRIO_CLASSES	3
#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

#define BFQ_MIN_WEIGHT			1
#define BFQ_MAX_WEIGHT			1000
#define BFQ_WEIGHT_CONVERSION_COEFF	10

#define BFQ_DEFAULT_QUEUE_IOPRIO	4

#define BFQ_WEIGHT_LEGACY_DFL	100
#define BFQ_DEFAULT_GRP_IOPRIO	0
#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

/*
 * Soft real-time applications are much more latency sensitive than
 * interactive ones. Over-raise the weight of the former to privilege
 * them against the latter.
 */
#define BFQ_SOFTRT_WEIGHT_FACTOR	100
/**
 * struct bfq_service_tree - per ioprio_class service tree.
 *
 * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
 * ioprio_class has its own independent scheduler, and so its own
 * bfq_service_tree.  All the fields are protected by the queue lock
 * of the containing bfqd.
 */
struct bfq_service_tree {
	/* tree for active entities (i.e., those backlogged) */
	struct rb_root active;
	/* tree for idle entities (i.e., not backlogged, with V <= F_i) */
	struct rb_root idle;

	/* idle entity with minimum F_i */
	struct bfq_entity *first_idle;
	/* idle entity with maximum F_i */
	struct bfq_entity *last_idle;

	/* scheduler virtual time */
	u64 vtime;
	/* scheduler weight sum; active and idle entities contribute to it */
	unsigned long wsum;
};
/**
 * struct bfq_sched_data - multi-class scheduler.
 *
 * bfq_sched_data is the basic scheduler queue.  It supports three
 * ioprio_classes, and can be used either as a toplevel queue or as an
 * intermediate queue on a hierarchical setup.  @next_in_service
 * points to the active entity of the sched_data service trees that
 * will be scheduled next. It is used to reduce the number of steps
 * needed for each hierarchical-schedule update.
 *
 * The supported ioprio_classes are the same as in CFQ, in descending
 * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
 * Requests from higher priority queues are served before all the
 * requests from lower priority queues; among requests of the same
 * queue, requests are served according to B-WF2Q+.
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_sched_data {
	/* entity in service */
	struct bfq_entity *in_service_entity;
	/* head-of-line entity (see comments above) */
	struct bfq_entity *next_in_service;
	/* array of service trees, one per ioprio_class */
	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
	/* last time CLASS_IDLE was served */
	unsigned long bfq_class_idle_last_service;
};
/**
 * struct bfq_entity - schedulable entity.
 *
 * A bfq_entity is used to represent either a bfq_queue (leaf node in the
 * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
 * entity belongs to the sched_data of the parent group in the cgroup
 * hierarchy. Non-leaf entities have also their own sched_data, stored
 * in @my_sched_data.
 *
 * Each entity stores independently its priority values; this would
 * allow different weights on different devices, but this
 * functionality is not exported to userspace by now. Priorities and
 * weights are updated lazily, first storing the new values into the
 * new_* fields, then setting the @prio_changed flag. As soon as
 * there is a transition in the entity state that allows the priority
 * update to take place, the effective and the requested priority
 * values are synchronized.
 *
 * Unless cgroups are used, the weight value is calculated from the
 * ioprio to export the same interface as CFQ. When dealing with
 * "well-behaved" queues (i.e., queues that do not spend too much
 * time to consume their budget and have true sequential behavior, and
 * when there are no external factors breaking anticipation), the
 * relative weights at each level of the cgroups hierarchy should be
 * guaranteed. All the fields are protected by the queue lock of the
 * containing bfqd.
 */
struct bfq_entity {
	/* service_tree member */
	struct rb_node rb_node;

	/*
	 * Flag, true if the entity is on a tree (either the active or
	 * the idle one of its service_tree) or is in service.
	 */
	bool on_st;

	/* B-WF2Q+ start and finish timestamps [sectors/weight] */
	u64 start, finish;

	/* tree the entity is enqueued into; %NULL if not on a tree */
	struct rb_root *tree;

	/*
	 * minimum start time of the (active) subtree rooted at this
	 * entity; used for O(log N) lookups into active trees
	 */
	u64 min_start;

	/* amount of service received during the last service slot */
	int service;

	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
	int budget;

	/* weight of the queue */
	int weight;
	/* next weight if a change is in progress */
	int new_weight;

	/* original weight, used to implement weight boosting */
	int orig_weight;

	/* parent entity, for hierarchical scheduling */
	struct bfq_entity *parent;

	/*
	 * For non-leaf nodes in the hierarchy, the associated
	 * scheduler queue, %NULL on leaf nodes.
	 */
	struct bfq_sched_data *my_sched_data;
	/* the scheduler queue this entity belongs to */
	struct bfq_sched_data *sched_data;

	/* flag, set to request a weight, ioprio or ioprio_class change */
	int prio_changed;
};
/**
 * struct bfq_ttime - per process thinktime stats.
 */
struct bfq_ttime {
	/* completion time of the last request */
	u64 last_end_request;

	/* total process thinktime */
	u64 ttime_total;
	/* number of thinktime samples */
	unsigned long ttime_samples;
	/* average process thinktime */
	u64 ttime_mean;
};
/**
 * struct bfq_queue - leaf schedulable entity.
 *
 * A bfq_queue is a leaf request queue; it can be associated with one
 * io_context or more, if it is async. @cgroup holds a reference to
 * the cgroup, to be sure that it does not disappear while a bfqq
 * still references it (mostly to avoid races between request issuing
 * and task migration followed by cgroup destruction). All the fields
 * are protected by the queue lock of the containing bfqd.
 */
struct bfq_queue {
	/* reference counter */
	int ref;
	/* parent bfq_data */
	struct bfq_data *bfqd;

	/* current ioprio and ioprio class */
	unsigned short ioprio, ioprio_class;
	/* next ioprio and ioprio class if a change is in progress */
	unsigned short new_ioprio, new_ioprio_class;

	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* number of sync and async requests queued */
	int queued[2];
	/* number of requests currently allocated */
	int allocated;
	/* number of pending metadata requests */
	int meta_pending;
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* entity representing this queue in the scheduler */
	struct bfq_entity entity;

	/* maximum budget allowed from the feedback mechanism */
	int max_budget;
	/* budget expiration (in jiffies) */
	unsigned long budget_timeout;

	/* number of requests on the dispatch list or inside driver */
	int dispatched;

	/* status flags */
	unsigned long flags;

	/* node for active/idle bfqq list inside parent bfqd */
	struct list_head bfqq_list;

	/* associated @bfq_ttime struct */
	struct bfq_ttime ttime;

	/* bit vector: a 1 for each seeky request in history */
	u32 seek_history;
	/* position of the last request enqueued */
	sector_t last_request_pos;

	/* Number of consecutive pairs of request completion and
	 * arrival, such that the queue becomes idle after the
	 * completion, but the next request arrives within an idle
	 * time slice; used only if the queue's IO_bound flag has been
	 * cleared.
	 */
	unsigned int requests_within_timer;

	/* pid of the process owning the queue, used for logging purposes */
	pid_t pid;

	/* current maximum weight-raising time for this queue */
	unsigned long wr_cur_max_time;
	/*
	 * Minimum time instant such that, only if a new request is
	 * enqueued after this time instant in an idle @bfq_queue with
	 * no outstanding requests, then the task associated with the
	 * queue is deemed as soft real-time (see the comments on
	 * the function bfq_bfqq_softrt_next_start())
	 */
	unsigned long soft_rt_next_start;
	/*
	 * Start time of the current weight-raising period if
	 * the @bfq-queue is being weight-raised, otherwise
	 * finish time of the last weight-raising period.
	 */
	unsigned long last_wr_start_finish;
	/* factor by which the weight of this queue is multiplied */
	unsigned int wr_coeff;
	/*
	 * Time of the last transition of the @bfq_queue from idle to
	 * backlogged.
	 */
	unsigned long last_idle_bklogged;
	/*
	 * Cumulative service received from the @bfq_queue since the
	 * last transition from idle to backlogged.
	 */
	unsigned long service_from_backlogged;

	/*
	 * Value of wr start time when switching to soft rt
	 */
	unsigned long wr_start_at_switch_to_srt;
};
/**
 * struct bfq_io_cq - per (request_queue, io_context) structure.
 */
struct bfq_io_cq {
	/* associated io_cq structure */
	struct io_cq icq; /* must be the first member */
	/* array of two process queues, the sync and the async */
	struct bfq_queue *bfqq[2];
	/* per (request_queue, blkcg) ioprio */
	int ioprio;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	uint64_t blkcg_serial_nr; /* the current blkcg serial */
#endif
};
enum bfq_device_speed {
	BFQ_BFQD_FAST,
	BFQ_BFQD_SLOW,
};

/**
 * struct bfq_data - per-device data structure.
 *
 * All the fields are protected by @lock.
 */
struct bfq_data {
	/* device request queue */
	struct request_queue *queue;
	/* dispatch queue */
	struct list_head dispatch;

	/* root bfq_group for the device */
	struct bfq_group *root_group;

	/*
	 * Number of bfq_queues containing requests (including the
	 * queue in service, even if it is idling).
	 */
	int busy_queues;
	/* number of weight-raised busy @bfq_queues */
	int wr_busy_queues;
	/* number of queued requests */
	int queued;
	/* number of requests dispatched and waiting for completion */
	int rq_in_driver;

	/*
	 * Maximum number of requests in driver in the last
	 * @hw_tag_samples completed requests.
	 */
	int max_rq_in_driver;
	/* number of samples used to calculate hw_tag */
	int hw_tag_samples;
	/* flag set to one if the driver is showing a queueing behavior */
	int hw_tag;

	/* number of budgets assigned */
	int budgets_assigned;

	/*
	 * Timer set when idling (waiting) for the next request from
	 * the queue in service.
	 */
	struct hrtimer idle_slice_timer;

	/* bfq_queue in service */
	struct bfq_queue *in_service_queue;
	/* bfq_io_cq (bic) associated with the @in_service_queue */
	struct bfq_io_cq *in_service_bic;

	/* on-disk position of the last served request */
	sector_t last_position;

	/* time of last request completion (ns) */
	u64 last_completion;

	/* time of first rq dispatch in current observation interval (ns) */
	u64 first_dispatch;
	/* time of last rq dispatch in current observation interval (ns) */
	u64 last_dispatch;

	/* beginning of the last budget */
	ktime_t last_budget_start;
	/* beginning of the last idle slice */
	ktime_t last_idling_start;

	/* number of samples in current observation interval */
	int peak_rate_samples;
	/* num of samples of seq dispatches in current observation interval */
	u32 sequential_samples;
	/* total num of sectors transferred in current observation interval */
	u64 tot_sectors_dispatched;
	/* max rq size seen during current observation interval (sectors) */
	u32 last_rq_max_size;
	/* time elapsed from first dispatch in current observ. interval (us) */
	u64 delta_from_first;
	/*
	 * Current estimate of the device peak rate, measured in
	 * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
	 * BFQ_RATE_SHIFT is performed to increase precision in
	 * fixed-point calculations.
	 */
	u32 peak_rate;

	/* maximum budget allotted to a bfq_queue before rescheduling */
	int bfq_max_budget;

	/* list of all the bfq_queues active on the device */
	struct list_head active_list;
	/* list of all the bfq_queues idle on the device */
	struct list_head idle_list;

	/*
	 * Timeout for async/sync requests; when it fires, requests
	 * are served in fifo order.
	 */
	u64 bfq_fifo_expire[2];
	/* weight of backward seeks wrt forward ones */
	unsigned int bfq_back_penalty;
	/* maximum allowed backward seek */
	unsigned int bfq_back_max;
	/* maximum idling time */
	u64 bfq_slice_idle;

	/* user-configured max budget value (0 for auto-tuning) */
	int bfq_user_max_budget;
	/*
	 * Timeout for bfq_queues to consume their budget; used to
	 * prevent seeky queues from imposing long latencies to
	 * sequential or quasi-sequential ones (this also implies that
	 * seeky queues cannot receive guarantees in the service
	 * domain; after a timeout they are charged for the time they
	 * have been in service, to preserve fairness among them, but
	 * without service-domain guarantees).
	 */
	unsigned int bfq_timeout;

	/*
	 * Number of consecutive requests that must be issued within
	 * the idle time slice to set again idling to a queue which
	 * was marked as non-I/O-bound (see the definition of the
	 * IO_bound flag for further details).
	 */
	unsigned int bfq_requests_within_timer;

	/*
	 * Force device idling whenever needed to provide accurate
	 * service guarantees, without caring about throughput
	 * issues. CAVEAT: this may even increase latencies, in case
	 * of useless idling for processes that did stop doing I/O.
	 */
	bool strict_guarantees;

	/* if set to true, low-latency heuristics are enabled */
	bool low_latency;
	/*
	 * Maximum factor by which the weight of a weight-raised queue
	 * is multiplied.
	 */
	unsigned int bfq_wr_coeff;
	/* maximum duration of a weight-raising period (jiffies) */
	unsigned int bfq_wr_max_time;

	/* Maximum weight-raising duration for soft real-time processes */
	unsigned int bfq_wr_rt_max_time;
	/*
	 * Minimum idle period after which weight-raising may be
	 * reactivated for a queue (in jiffies).
	 */
	unsigned int bfq_wr_min_idle_time;
	/*
	 * Minimum period between request arrivals after which
	 * weight-raising may be reactivated for an already busy async
	 * queue (in jiffies).
	 */
	unsigned long bfq_wr_min_inter_arr_async;

	/* Max service-rate for a soft real-time queue, in sectors/sec */
	unsigned int bfq_wr_max_softrt_rate;
	/*
	 * Cached value of the product R*T, used for computing the
	 * maximum duration of weight raising automatically.
	 */
	u64 RT_prod;
	/* device-speed class for the low-latency heuristic */
	enum bfq_device_speed device_speed;

	/* fallback dummy bfqq for extreme OOM conditions */
	struct bfq_queue oom_bfqq;

	spinlock_t lock;

	/*
	 * bic associated with the task issuing current bio for
	 * merging. This and the next field are used as a support to
	 * be able to perform the bic lookup, needed by bio-merge
	 * functions, before the scheduler lock is taken, and thus
	 * avoid taking the request-queue lock while the scheduler
	 * lock is being held.
	 */
	struct bfq_io_cq *bio_bic;
	/* bfqq associated with the task issuing current bio for merging */
	struct bfq_queue *bio_bfqq;
};
enum bfqq_state_flags {
	BFQQF_busy = 0,		/* has requests or is in service */
	BFQQF_wait_request,	/* waiting for a request */
	BFQQF_non_blocking_wait_rq, /*
				     * waiting for a request
				     * without idling the device
				     */
	BFQQF_fifo_expire,	/* FIFO checked in this slice */
	BFQQF_idle_window,	/* slice idling enabled */
	BFQQF_sync,		/* synchronous queue */
	BFQQF_IO_bound,		/*
				 * bfqq has timed-out at least once
				 * having consumed at most 2/10 of
				 * its budget
				 */
	BFQQF_softrt_update,	/*
				 * may need softrt-next-start
				 * update
				 */
};

#define BFQ_BFQQ_FNS(name)						\
static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__set_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__clear_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
{									\
	return test_bit(BFQQF_##name, &(bfqq)->flags);			\
}

BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(non_blocking_wait_rq);
BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(idle_window);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(IO_bound);
BFQ_BFQQ_FNS(softrt_update);
#undef BFQ_BFQQ_FNS
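
/*
 * As an illustration of the helpers generated above: BFQ_BFQQ_FNS(busy)
 * expands to bfq_mark_bfqq_busy(), bfq_clear_bfqq_busy() and
 * bfq_bfqq_busy(), which respectively set, clear and test the
 * BFQQF_busy bit in bfqq->flags.
 */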
/* Logging facilities. */
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
			  __pbuf, ##args);				\
} while (0)

#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
} while (0)

#else /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\
			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
			  ##args)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)

#endif /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log(bfqd, fmt, args...) \
	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
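
/*
 * For instance, the call bfq_log(bfqd, "schedule dispatch") used
 * further below ends up as a "bfq schedule dispatch" message in the
 * blktrace stream of the device, while bfq_log_bfqq() additionally
 * prefixes the message with the pid of the owning process and an
 * 'S'/'A' marker for sync/async queues.
 */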
/* Expiration reasons. */
enum bfqq_expiration {
	BFQQE_TOO_IDLE = 0,		/*
					 * queue has been idling for
					 * too long
					 */
	BFQQE_BUDGET_TIMEOUT,	/* budget took too long to be used */
	BFQQE_BUDGET_EXHAUSTED,	/* budget consumed */
	BFQQE_NO_MORE_REQUESTS,	/* the queue has no more requests */
	BFQQE_PREEMPTED		/* preemption in progress */
};
struct bfqg_stats {
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	/* number of ios merged */
	struct blkg_rwstat merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat queued;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	uint64_t start_group_wait_time;
	uint64_t start_idle_time;
	uint64_t start_empty_time;
#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED

/*
 * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
 *
 * @ps: @blkcg_policy_storage that this structure inherits
 * @weight: weight of the bfq_group
 */
struct bfq_group_data {
	/* must be the first member */
	struct blkcg_policy_data pd;

	unsigned int weight;
};

/**
 * struct bfq_group - per (device, cgroup) data structure.
 * @entity: schedulable entity to insert into the parent group sched_data.
 * @sched_data: own sched_data, to contain child entities (they may be
 *              both bfq_queues and bfq_groups).
 * @bfqd: the bfq_data for the device this group acts upon.
 * @async_bfqq: array of async queues for all the tasks belonging to
 *              the group, one queue per ioprio value per ioprio_class,
 *              except for the idle class that has only one queue.
 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
 *             to avoid too many special cases during group creation/
 *             migration.
 * @stats: stats for this bfqg.
 *
 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
 * there is a set of bfq_groups, each one collecting the lower-level
 * entities belonging to the group that are acting on the same device.
 *
 * Locking works as follows:
 *    o @bfqd is protected by the queue lock, RCU is used to access it
 *      from the readers.
 *    o All the other fields are protected by the @bfqd queue lock.
 */
struct bfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	struct bfq_entity entity;
	struct bfq_sched_data sched_data;

	void *bfqd;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct bfq_entity *my_entity;

	struct bfqg_stats stats;
};

#else /* CONFIG_BFQ_GROUP_IOSCHED */
struct bfq_group {
	struct bfq_sched_data sched_data;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct rb_root rq_pos_tree;
};
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);

static unsigned int bfq_class_idx(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	return bfqq ? bfqq->ioprio_class - 1 :
		BFQ_DEFAULT_GRP_CLASS - 1;
}

static struct bfq_service_tree *
bfq_entity_service_tree(struct bfq_entity *entity)
{
	struct bfq_sched_data *sched_data = entity->sched_data;
	unsigned int idx = bfq_class_idx(entity);

	return sched_data->service_tree + idx;
}

static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	return bic->bfqq[is_sync];
}

static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
			 bool is_sync)
{
	bic->bfqq[is_sync] = bfqq;
}

static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
{
	return bic->icq.q->elevator->elevator_data;
}

static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
static void bfq_put_queue(struct bfq_queue *bfqq);
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic);
static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
				    struct bfq_group *bfqg);
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
/* Expiration time of sync (0) and async (1) requests, in ns. */
static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };

/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
static const int bfq_back_max = 16 * 1024;

/* Penalty of a backwards seek, in number of sectors. */
static const int bfq_back_penalty = 2;

/* Idling period duration, in ns. */
static u64 bfq_slice_idle = NSEC_PER_SEC / 125;

/* Minimum number of assigned budgets for which stats are safe to compute. */
static const int bfq_stats_min_budgets = 194;

/* Default maximum budget values, in sectors and number of requests. */
static const int bfq_default_max_budget = 16 * 1024;

/*
 * Async to sync throughput distribution is controlled as follows:
 * when an async request is served, the entity is charged the number
 * of sectors of the request, multiplied by the factor below
 */
static const int bfq_async_charge_factor = 10;

/* Default timeout values, in jiffies, approximating CFQ defaults. */
static const int bfq_timeout = HZ / 8;

static struct kmem_cache *bfq_pool;

/* Below this threshold (in ns), we consider thinktime immediate. */
#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)

/* hw_tag detection: parallel requests threshold and min samples needed. */
#define BFQ_HW_QUEUE_THRESHOLD	4
#define BFQ_HW_QUEUE_SAMPLES	32

#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)

/* Min number of samples required to perform peak-rate update */
#define BFQ_RATE_MIN_SAMPLES	32
/* Min observation time interval required to perform a peak-rate update (ns) */
#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
/* Target observation time interval for a peak-rate update (ns) */
#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC

/* Shift used for peak rate fixed precision calculations. */
#define BFQ_RATE_SHIFT		16
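
/*
 * Purely illustrative figures: with BFQ_RATE_SHIFT equal to 16, a
 * measured peak rate of 1 sector/usec is stored as 1 << 16 = 65536,
 * so rates well below one sector per microsecond still keep several
 * significant digits in integer arithmetic.
 */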
/*
 * By default, BFQ computes the duration of the weight raising for
 * interactive applications automatically, using the following formula:
 * duration = (R / r) * T, where r is the peak rate of the device, and
 * R and T are two reference parameters.
 * In particular, R is the peak rate of the reference device (see below),
 * and T is a reference time: given the systems that are likely to be
 * installed on the reference device according to its speed class, T is
 * about the maximum time needed, under BFQ and while reading two files in
 * parallel, to load typical large applications on these systems.
 * In practice, the slower/faster the device at hand is, the more/less it
 * takes to load applications with respect to the reference device.
 * Accordingly, the longer/shorter BFQ grants weight raising to interactive
 * applications.
 *
 * BFQ uses four different reference pairs (R, T), depending on:
 * . whether the device is rotational or non-rotational;
 * . whether the device is slow, such as old or portable HDDs, as well as
 *   SD cards, or fast, such as newer HDDs and SSDs.
 *
 * The device's speed class is dynamically (re)detected in
 * bfq_update_peak_rate() every time the estimated peak rate is updated.
 *
 * In the following definitions, R_slow[0]/R_fast[0] and
 * T_slow[0]/T_fast[0] are the reference values for a slow/fast
 * rotational device, whereas R_slow[1]/R_fast[1] and
 * T_slow[1]/T_fast[1] are the reference values for a slow/fast
 * non-rotational device. Finally, device_speed_thresh are the
 * thresholds used to switch between speed classes. The reference
 * rates are not the actual peak rates of the devices used as a
 * reference, but slightly lower values. The reason for using these
 * slightly lower values is that the peak-rate estimator tends to
 * yield slightly lower values than the actual peak rate (it can yield
 * the actual peak rate only if there is only one process doing I/O,
 * and the process does sequential I/O).
 *
 * Both the reference peak rates and the thresholds are measured in
 * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
 */
static int R_slow[2] = {1000, 10700};
static int R_fast[2] = {14000, 33000};

/*
 * To improve readability, a conversion function is used to initialize the
 * following arrays, which entails that they can be initialized only in a
 * function.
 */
static int T_slow[2];
static int T_fast[2];
static int device_speed_thresh[2];
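
/*
 * Rough illustration of the duration formula above (the numbers are
 * not taken from any specific device): if the estimated peak rate r
 * of the device at hand is half the reference rate R of its speed
 * class, then duration = (R / r) * T = 2 * T, i.e., interactive
 * weight raising lasts twice the reference time T of that class.
 */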
#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
#define RQ_BFQQ(rq)		((rq)->elv.priv[1])

/**
 * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
 * @icq: the iocontext queue.
 */
static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
{
	/* bic->icq is the first member, %NULL will convert to %NULL */
	return container_of(icq, struct bfq_io_cq, icq);
}
/**
 * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
 * @bfqd: the lookup key.
 * @ioc: the io_context of the process doing I/O.
 * @q: the request queue.
 */
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
					struct io_context *ioc,
					struct request_queue *q)
{
	if (ioc) {
		unsigned long flags;
		struct bfq_io_cq *icq;

		spin_lock_irqsave(q->queue_lock, flags);
		icq = icq_to_bic(ioc_lookup_icq(ioc, q));
		spin_unlock_irqrestore(q->queue_lock, flags);

		return icq;
	}

	return NULL;
}
/*
 * Scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing.
 */
static void bfq_schedule_dispatch(struct bfq_data *bfqd)
{
	if (bfqd->queued != 0) {
		bfq_log(bfqd, "schedule dispatch");
		blk_mq_run_hw_queues(bfqd->queue, true);
	}
}
/**
 * bfq_gt - compare two timestamps.
 * @a: first ts.
 * @b: second ts.
 *
 * Return @a > @b, dealing with wrapping correctly.
 */
static int bfq_gt(u64 a, u64 b)
{
	return (s64)(a - b) > 0;
}
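
/*
 * Illustrative values (not from real traces): with a = 2 and
 * b = ULLONG_MAX, the difference a - b wraps around to 3, whose
 * signed interpretation is positive, so bfq_gt() correctly reports
 * that a is logically ahead of b despite the numeric wraparound.
 */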
static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
{
	struct rb_node *node = tree->rb_node;

	return rb_entry(node, struct bfq_entity, rb_node);
}

static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
/*
 * bfq_update_next_in_service - update sd->next_in_service
 * @sd: sched_data for which to perform the update.
 * @new_entity: if not NULL, pointer to the entity whose activation,
 *		requeueing or repositioning triggered the invocation of
 *		this function.
 *
 * This function is called to update sd->next_in_service, which, in
 * its turn, may change as a consequence of the insertion or
 * extraction of an entity into/from one of the active trees of
 * sd. These insertions/extractions occur as a consequence of
 * activations/deactivations of entities, with some activations being
 * 'true' activations, and other activations being requeueings (i.e.,
 * implementing the second, requeueing phase of the mechanism used to
 * reposition an entity in its active tree; see comments on
 * __bfq_activate_entity and __bfq_requeue_entity for details). In
 * both the last two activation sub-cases, new_entity points to the
 * just activated or requeued entity.
 *
 * Returns true if sd->next_in_service changes in such a way that
 * entity->parent may become the next_in_service for its parent
 * entity.
 */
static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
				       struct bfq_entity *new_entity)
{
	struct bfq_entity *next_in_service = sd->next_in_service;
	bool parent_sched_may_change = false;

	/*
	 * If this update is triggered by the activation, requeueing
	 * or repositioning of an entity that does not coincide with
	 * sd->next_in_service, then a full lookup in the active tree
	 * can be avoided. In fact, it is enough to check whether the
	 * just-modified entity has a higher priority than
	 * sd->next_in_service, or, even if it has the same priority
	 * as sd->next_in_service, is eligible and has a lower virtual
	 * finish time than sd->next_in_service. If this compound
	 * condition holds, then the new entity becomes the new
	 * next_in_service. Otherwise no change is needed.
	 */
	if (new_entity && new_entity != sd->next_in_service) {
		/*
		 * Flag used to decide whether to replace
		 * sd->next_in_service with new_entity. Tentatively
		 * set to true, and left as true if
		 * sd->next_in_service is NULL.
		 */
		bool replace_next = true;

		/*
		 * If there is already a next_in_service candidate
		 * entity, then compare class priorities or timestamps
		 * to decide whether to replace sd->service_tree with
		 * new_entity.
		 */
		if (next_in_service) {
			unsigned int new_entity_class_idx =
				bfq_class_idx(new_entity);
			struct bfq_service_tree *st =
				sd->service_tree + new_entity_class_idx;

			/*
			 * For efficiency, evaluate the most likely
			 * sub-condition first.
			 */
			replace_next =
				(new_entity_class_idx ==
				 bfq_class_idx(next_in_service)
				 &&
				 !bfq_gt(new_entity->start, st->vtime)
				 &&
				 bfq_gt(next_in_service->finish,
					new_entity->finish))
				||
				new_entity_class_idx <
				bfq_class_idx(next_in_service);
		}

		if (replace_next)
			next_in_service = new_entity;
	} else /* invoked because of a deactivation: lookup needed */
		next_in_service = bfq_lookup_next_entity(sd);

	if (next_in_service) {
		parent_sched_may_change = !sd->next_in_service ||
			bfq_update_parent_budget(next_in_service);
	}

	sd->next_in_service = next_in_service;

	if (!next_in_service)
		return parent_sched_may_change;

	return parent_sched_may_change;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED

/* both next loops stop at one of the child entities of the root group */
#define for_each_entity(entity)	\
	for (; entity ; entity = entity->parent)

/*
 * For each iteration, compute parent in advance, so as to be safe if
 * entity is deallocated during the iteration. Such a deallocation may
 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
 * containing entity.
 */
#define for_each_entity_safe(entity, parent) \
	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)

/*
 * Returns true if this budget changes may let next_in_service->parent
 * become the next_in_service entity for its parent entity.
 */
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	struct bfq_entity *bfqg_entity;
	struct bfq_group *bfqg;
	struct bfq_sched_data *group_sd;
	bool ret = false;

	group_sd = next_in_service->sched_data;

	bfqg = container_of(group_sd, struct bfq_group, sched_data);
	/*
	 * bfq_group's my_entity field is not NULL only if the group
	 * is not the root group. We must not touch the root entity
	 * as it must never become an in-service entity.
	 */
	bfqg_entity = bfqg->my_entity;
	if (bfqg_entity) {
		if (bfqg_entity->budget > next_in_service->budget)
			ret = true;
		bfqg_entity->budget = next_in_service->budget;
	}

	return ret;
}

/*
 * This function tells whether entity stops being a candidate for next
 * service, according to the following logic.
 *
 * This function is invoked for an entity that is about to be set in
 * service. If such an entity is a queue, then the entity is no longer
 * a candidate for next service (i.e, a candidate entity to serve
 * after the in-service entity is expired). The function then returns
 * true.
 */
static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	if (bfq_entity_to_bfqq(entity))
		return true;

	return false;
}

#else /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Next two macros are fake loops when cgroups support is not
 * enabled. In fact, in such a case, there is only one level to go up
 * (to reach the root group).
 */
#define for_each_entity(entity)	\
	for (; entity ; entity = NULL)

#define for_each_entity_safe(entity, parent) \
	for (parent = NULL; entity ; entity = parent)

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	return false;
}

static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	return true;
}

#endif /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Shift for timestamp calculations.  This actually limits the maximum
 * service allowed in one timestamp delta (small shift values increase it),
 * the maximum total weight that can be used for the queues in the system
 * (big shift values increase it), and the period of virtual time
 * wraparounds.
 */
#define WFQ_SERVICE_SHIFT	22

static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = NULL;

	if (!entity->my_sched_data)
		bfqq = container_of(entity, struct bfq_queue, entity);

	return bfqq;
}
/**
 * bfq_delta - map service into the virtual time domain.
 * @service: amount of service.
 * @weight: scale factor (weight of an entity or weight sum).
 */
static u64 bfq_delta(unsigned long service, unsigned long weight)
{
	u64 d = (u64)service << WFQ_SERVICE_SHIFT;

	do_div(d, weight);
	return d;
}
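
/*
 * Illustrative numbers (not from a real workload): charging 8 sectors
 * of service to an entity of weight 100 advances its timestamps by
 * (8 << WFQ_SERVICE_SHIFT) / 100 units of virtual time, while the
 * same service charged to a weight-200 entity advances them by half
 * as much; this is what makes throughput shares proportional to
 * weights.
 */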
/**
 * bfq_calc_finish - assign the finish time to an entity.
 * @entity: the entity to act upon.
 * @service: the service to be charged to the entity.
 */
static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->finish = entity->start +
		bfq_delta(service, entity->weight);

	if (bfqq) {
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: serv %lu, w %d",
			service, entity->weight);
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: start %llu, finish %llu, delta %llu",
			entity->start, entity->finish,
			bfq_delta(service, entity->weight));
	}
}
/**
 * bfq_entity_of - get an entity from a node.
 * @node: the node field of the entity.
 *
 * Convert a node pointer to the relative entity.  This is used only
 * to simplify the logic of some functions and not as the generic
 * conversion mechanism because, e.g., in the tree walking functions,
 * the check for a %NULL value would be redundant.
 */
static struct bfq_entity *bfq_entity_of(struct rb_node *node)
{
	struct bfq_entity *entity = NULL;

	if (node)
		entity = rb_entry(node, struct bfq_entity, rb_node);

	return entity;
}
/**
 * bfq_extract - remove an entity from a tree.
 * @root: the tree root.
 * @entity: the entity to remove.
 */
static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
{
	entity->tree = NULL;
	rb_erase(&entity->rb_node, root);
}
/**
 * bfq_idle_extract - extract an entity from the idle tree.
 * @st: the service tree of the owning @entity.
 * @entity: the entity being removed.
 */
static void bfq_idle_extract(struct bfq_service_tree *st,
			     struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *next;

	if (entity == st->first_idle) {
		next = rb_next(&entity->rb_node);
		st->first_idle = bfq_entity_of(next);
	}

	if (entity == st->last_idle) {
		next = rb_prev(&entity->rb_node);
		st->last_idle = bfq_entity_of(next);
	}

	bfq_extract(&st->idle, entity);

	if (bfqq)
		list_del(&bfqq->bfqq_list);
}
/**
 * bfq_insert - generic tree insertion.
 * @root: tree root.
 * @entity: entity to insert.
 *
 * This is used for the idle and the active tree, since they are both
 * ordered by finish time.
 */
static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
{
	struct bfq_entity *entry;
	struct rb_node **node = &root->rb_node;
	struct rb_node *parent = NULL;

	while (*node) {
		parent = *node;
		entry = rb_entry(parent, struct bfq_entity, rb_node);

		if (bfq_gt(entry->finish, entity->finish))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}

	rb_link_node(&entity->rb_node, parent, node);
	rb_insert_color(&entity->rb_node, root);

	entity->tree = root;
}
/**
 * bfq_update_min - update the min_start field of an entity.
 * @entity: the entity to update.
 * @node: one of its children.
 *
 * This function is called when @entity may store an invalid value for
 * min_start due to updates to the active tree.  The function assumes
 * that the subtree rooted at @node (which may be its left or its right
 * child) has a valid min_start value.
 */
static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
{
	struct bfq_entity *child;

	if (node) {
		child = rb_entry(node, struct bfq_entity, rb_node);
		if (bfq_gt(entity->min_start, child->min_start))
			entity->min_start = child->min_start;
	}
}
/**
 * bfq_update_active_node - recalculate min_start.
 * @node: the node to update.
 *
 * @node may have changed position or one of its children may have moved,
 * this function updates its min_start value.  The left and right subtrees
 * are assumed to hold a correct min_start value.
 */
static void bfq_update_active_node(struct rb_node *node)
{
	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);

	entity->min_start = entity->start;
	bfq_update_min(entity, node->rb_right);
	bfq_update_min(entity, node->rb_left);
}
/**
 * bfq_update_active_tree - update min_start for the whole active tree.
 * @node: the starting node.
 *
 * @node must be the deepest modified node after an update.  This function
 * updates its min_start using the values held by its children, assuming
 * that they did not change, and then updates all the nodes that may have
 * changed in the path to the root.  The only nodes that may have changed
 * are the ones in the path or their siblings.
 */
static void bfq_update_active_tree(struct rb_node *node)
{
	struct rb_node *parent;

up:
	bfq_update_active_node(node);

	parent = rb_parent(node);
	if (!parent)
		return;

	if (node == parent->rb_left && parent->rb_right)
		bfq_update_active_node(parent->rb_right);
	else if (parent->rb_left)
		bfq_update_active_node(parent->rb_left);

	node = parent;
	goto up;
}
/**
 * bfq_active_insert - insert an entity in the active tree of its
 *                     group/device.
 * @st: the service tree of the entity.
 * @entity: the entity being inserted.
 *
 * The active tree is ordered by finish time, but an extra key is kept
 * per each node, containing the minimum value for the start times of
 * its children (and the node itself), so it's possible to search for
 * the eligible node with the lowest finish time in logarithmic time.
 */
static void bfq_active_insert(struct bfq_service_tree *st,
			      struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node = &entity->rb_node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	bfq_insert(&st->active, entity);

	if (node->rb_left)
		node = node->rb_left;
	else if (node->rb_right)
		node = node->rb_right;

	bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
}
/**
 * bfq_ioprio_to_weight - calc a weight from an ioprio.
 * @ioprio: the ioprio value to convert.
 */
static unsigned short bfq_ioprio_to_weight(int ioprio)
{
	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
}
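
/*
 * For example, with IOPRIO_BE_NR equal to 8, the default ioprio 4
 * maps to weight (8 - 4) * BFQ_WEIGHT_CONVERSION_COEFF = 40, while
 * the highest best-effort priority, ioprio 0, maps to weight 80.
 */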
/**
 * bfq_weight_to_ioprio - calc an ioprio from a weight.
 * @weight: the weight value to convert.
 *
 * To preserve as much as possible the old only-ioprio user interface,
 * 0 is used as an escape ioprio value for weights (numerically) equal or
 * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
 */
static unsigned short bfq_weight_to_ioprio(int weight)
{
	return max_t(int, 0,
		     IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
}
static void bfq_get_entity(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	if (bfqq) {
		bfqq->ref++;
		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
			     bfqq, bfqq->ref);
	}
}
/**
 * bfq_find_deepest - find the deepest node that an extraction can modify.
 * @node: the node being removed.
 *
 * Do the first step of an extraction in an rb tree, looking for the
 * node that will replace @node, and returning the deepest node that
 * the following modifications to the tree can touch.  If @node is the
 * last node in the tree return %NULL.
 */
static struct rb_node *bfq_find_deepest(struct rb_node *node)
{
	struct rb_node *deepest;

	if (!node->rb_right && !node->rb_left)
		deepest = rb_parent(node);
	else if (!node->rb_right)
		deepest = node->rb_left;
	else if (!node->rb_left)
		deepest = node->rb_right;
	else {
		deepest = rb_next(node);
		if (deepest->rb_right)
			deepest = deepest->rb_right;
		else if (rb_parent(deepest) != node)
			deepest = rb_parent(deepest);
	}

	return deepest;
}
/**
 * bfq_active_extract - remove an entity from the active tree.
 * @st: the service_tree containing the tree.
 * @entity: the entity being removed.
 */
static void bfq_active_extract(struct bfq_service_tree *st,
			       struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	node = bfq_find_deepest(&entity->rb_node);
	bfq_extract(&st->active, entity);

	if (node)
		bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_del(&bfqq->bfqq_list);
}
/**
 * bfq_idle_insert - insert an entity into the idle tree.
 * @st: the service tree containing the tree.
 * @entity: the entity to insert.
 */
static void bfq_idle_insert(struct bfq_service_tree *st,
			    struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
		st->first_idle = entity;
	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
		st->last_idle = entity;

	bfq_insert(&st->idle, entity);

	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
}
/**
 * bfq_forget_entity - do not consider entity any longer for scheduling
 * @st: the service tree.
 * @entity: the entity being removed.
 * @is_in_service: true if entity is currently the in-service entity.
 *
 * Forget everything about @entity. In addition, if entity represents
 * a queue, and the latter is not in service, then release the service
 * reference to the queue (the one taken through bfq_get_entity). In
 * fact, in this case, there is really no more service reference to
 * the queue, as the latter is also outside any service tree. If,
 * instead, the queue is in service, then __bfq_bfqd_reset_in_service
 * will take care of putting the reference when the queue finally
 * stops being served.
 */
static void bfq_forget_entity(struct bfq_service_tree *st,
			      struct bfq_entity *entity,
			      bool is_in_service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->on_st = false;
	st->wsum -= entity->weight;
	if (bfqq && !is_in_service)
		bfq_put_queue(bfqq);
}
/**
 * bfq_put_idle_entity - release the idle tree ref of an entity.
 * @st: service tree for the entity.
 * @entity: the entity being released.
 */
static void bfq_put_idle_entity(struct bfq_service_tree *st,
				struct bfq_entity *entity)
{
	bfq_idle_extract(st, entity);
	bfq_forget_entity(st, entity,
			  entity == entity->sched_data->in_service_entity);
}
/**
 * bfq_forget_idle - update the idle tree if necessary.
 * @st: the service tree to act upon.
 *
 * To preserve the global O(log N) complexity we only remove one entry here;
 * as the idle tree will not grow indefinitely this can be done safely.
 */
static void bfq_forget_idle(struct bfq_service_tree *st)
{
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
	    !bfq_gt(last_idle->finish, st->vtime)) {
		/*
		 * Forget the whole idle tree, increasing the vtime past
		 * the last finish time of idle entities.
		 */
		st->vtime = last_idle->finish;
	}

	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
		bfq_put_idle_entity(st, first_idle);
}
static struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
				struct bfq_entity *entity)
{
	struct bfq_service_tree *new_st = old_st;

	if (entity->prio_changed) {
		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
		unsigned int prev_weight, new_weight;
		struct bfq_data *bfqd = NULL;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		struct bfq_sched_data *sd;
		struct bfq_group *bfqg;
#endif

		if (bfqq)
			bfqd = bfqq->bfqd;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		else {
			sd = entity->my_sched_data;
			bfqg = container_of(sd, struct bfq_group, sched_data);
			bfqd = (struct bfq_data *)bfqg->bfqd;
		}
#endif

		old_st->wsum -= entity->weight;

		if (entity->new_weight != entity->orig_weight) {
			if (entity->new_weight < BFQ_MIN_WEIGHT ||
			    entity->new_weight > BFQ_MAX_WEIGHT) {
				pr_crit("update_weight_prio: new_weight %d\n",
					entity->new_weight);
				if (entity->new_weight < BFQ_MIN_WEIGHT)
					entity->new_weight = BFQ_MIN_WEIGHT;
				else
					entity->new_weight = BFQ_MAX_WEIGHT;
			}
			entity->orig_weight = entity->new_weight;
			if (bfqq)
				bfqq->ioprio =
				  bfq_weight_to_ioprio(entity->orig_weight);
		}

		if (bfqq)
			bfqq->ioprio_class = bfqq->new_ioprio_class;
		entity->prio_changed = 0;

		/*
		 * NOTE: here we may be changing the weight too early,
		 * this will cause unfairness.  The correct approach
		 * would have required additional complexity to defer
		 * weight changes to the proper time instants (i.e.,
		 * when entity->finish <= old_st->vtime).
		 */
		new_st = bfq_entity_service_tree(entity);

		prev_weight = entity->weight;
		new_weight = entity->orig_weight *
			     (bfqq ? bfqq->wr_coeff : 1);
		entity->weight = new_weight;

		new_st->wsum += entity->weight;

		if (new_st != old_st)
			entity->start = new_st->vtime;
	}

	return new_st;
}
static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);

/**
 * bfq_bfqq_served - update the scheduler status after selection for
 *                   service.
 * @bfqq: the queue being served.
 * @served: bytes to transfer.
 *
 * NOTE: this can be optimized, as the timestamps of upper level entities
 * are synchronized every time a new bfqq is selected for service.  By now,
 * we keep it to better check consistency.
 */
static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
{
	struct bfq_entity *entity = &bfqq->entity;
	struct bfq_service_tree *st;

	for_each_entity(entity) {
		st = bfq_entity_service_tree(entity);

		entity->service += served;

		st->vtime += bfq_delta(served, st->wsum);
		bfq_forget_idle(st);
	}
	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
}
/**
 * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
 *			  of the time interval during which bfqq has been in
 *			  service.
 * @bfqd: the device
 * @bfqq: the queue that needs a service update.
 * @time_ms: the amount of time during which the queue has received service
 *
 * If a queue does not consume its budget fast enough, then providing
 * the queue with service fairness may impair throughput, more or less
 * severely. For this reason, queues that consume their budget slowly
 * are provided with time fairness instead of service fairness. This
 * goal is achieved through the BFQ scheduling engine, even if such an
 * engine works in the service, and not in the time domain. The trick
 * is charging these queues with an inflated amount of service, equal
 * to the amount of service that they would have received during their
 * service slot if they had been fast, i.e., if their requests had
 * been dispatched at a rate equal to the estimated peak rate.
 *
 * It is worth noting that time fairness can cause important
 * distortions in terms of bandwidth distribution, on devices with
 * internal queueing. The reason is that I/O requests dispatched
 * during the service slot of a queue may be served after that service
 * slot is finished, and may have a total processing time loosely
 * correlated with the duration of the service slot. This is
 * especially true for short service slots.
 */
static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
				 unsigned long time_ms)
{
	struct bfq_entity *entity = &bfqq->entity;
	int tot_serv_to_charge = entity->service;
	unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);

	if (time_ms > 0 && time_ms < timeout_ms)
		tot_serv_to_charge =
			(bfqd->bfq_max_budget * time_ms) / timeout_ms;

	if (tot_serv_to_charge < entity->service)
		tot_serv_to_charge = entity->service;

	/* Increase budget to avoid inconsistencies */
	if (tot_serv_to_charge > entity->budget)
		entity->budget = tot_serv_to_charge;

	bfq_bfqq_served(bfqq,
			max_t(int, 0, tot_serv_to_charge - entity->service));
}
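
/*
 * Illustrative figures only (assuming HZ=1000, hence a ~125 ms
 * bfq_timeout, and the default maximum budget of 16384 sectors): a
 * slow queue that held the device for 25 ms is charged at least
 * (16384 * 25) / 125 = 3276 sectors, whatever amount of service it
 * actually received.
 */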
static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
					struct bfq_service_tree *st,
					bool backshifted)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	st = __bfq_entity_update_weight_prio(st, entity);
	bfq_calc_finish(entity, entity->budget);

	/*
	 * If some queues enjoy backshifting for a while, then their
	 * (virtual) finish timestamps may happen to become lower and
	 * lower than the system virtual time. In particular, if
	 * these queues often happen to be idle for short time
	 * periods, and during such time periods other queues with
	 * higher timestamps happen to be busy, then the backshifted
	 * timestamps of the former queues can become much lower than
	 * the system virtual time. In fact, to serve the queues with
	 * higher timestamps while the ones with lower timestamps are
	 * idle, the system virtual time may be pushed-up to much
	 * higher values than the finish timestamps of the idle
	 * queues. As a consequence, the finish timestamps of all new
	 * or newly activated queues may end up being much larger than
	 * those of lucky queues with backshifted timestamps. The
	 * latter queues may then monopolize the device for a lot of
	 * time. This would simply break service guarantees.
	 *
	 * To reduce this problem, push up a little bit the
	 * backshifted timestamps of the queue associated with this
	 * entity (only a queue can happen to have the backshifted
	 * flag set): just enough to let the finish timestamp of the
	 * queue be equal to the current value of the system virtual
	 * time. This may introduce a little unfairness among queues
	 * with backshifted timestamps, but it does not break
	 * worst-case fairness guarantees.
	 *
	 * As a special case, if bfqq is weight-raised, push up
	 * timestamps much less, to keep very low the probability that
	 * this push up causes the backshifted finish timestamps of
	 * weight-raised queues to become higher than the backshifted
	 * finish timestamps of non weight-raised queues.
	 */
	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
		unsigned long delta = st->vtime - entity->finish;

		if (bfqq)
			delta /= bfqq->wr_coeff;

		entity->start += delta;
		entity->finish += delta;
	}

	bfq_active_insert(st, entity);
}
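/*
 * Illustrative note: if st->vtime exceeds entity->finish by, e.g.,
 * delta = 3000 service units, a non-weight-raised queue (wr_coeff ==
 * 1) has both its timestamps pushed up by the full 3000 units, while
 * a weight-raised queue with wr_coeff == 30 is pushed up by only
 * 3000 / 30 = 100 units, which keeps its backshifted timestamps well
 * below those of non-weight-raised queues.
 */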
/**
 * __bfq_activate_entity - handle activation of entity.
 * @entity: the entity being activated.
 * @non_blocking_wait_rq: true if entity was waiting for a request
 *
 * Called for a 'true' activation, i.e., if entity is not active and
 * one of its children receives a new request.
 *
 * Basically, this function updates the timestamps of entity and
 * inserts entity into its active tree, after possibly extracting it
 * from its idle tree.
 */
static void __bfq_activate_entity(struct bfq_entity *entity,
				  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	bool backshifted = false;
	unsigned long long min_vstart;

	/* See comments on bfq_bfqq_update_budg_for_activation */
	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
		backshifted = true;
		min_vstart = entity->finish;
	} else
		min_vstart = st->vtime;

	if (entity->tree == &st->idle) {
		/*
		 * Must be on the idle tree, bfq_idle_extract() will
		 * check for that.
		 */
		bfq_idle_extract(st, entity);
		entity->start = bfq_gt(min_vstart, entity->finish) ?
			min_vstart : entity->finish;
	} else {
		/*
		 * The finish time of the entity may be invalid, and
		 * it is in the past for sure, otherwise the queue
		 * would have been on the idle tree.
		 */
		entity->start = min_vstart;
		st->wsum += entity->weight;
		/*
		 * entity is about to be inserted into a service tree,
		 * and then set in service: get a reference to make
		 * sure entity does not disappear until it is no
		 * longer in service or scheduled for service.
		 */
		bfq_get_entity(entity);

		entity->on_st = true;
	}

	bfq_update_fin_time_enqueue(entity, st, backshifted);
}
/**
 * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
 * @entity: the entity being requeued or repositioned.
 *
 * Requeueing is needed if this entity stops being served, which
 * happens if a leaf descendant entity has expired. On the other hand,
 * repositioning is needed if the next_in_service entity for the child
 * entity has changed. See the comments inside the function for
 * details.
 *
 * Basically, this function: 1) removes entity from its active tree if
 * present there, 2) updates the timestamps of entity and 3) inserts
 * entity back into its active tree (in the new, right position for
 * the new values of the timestamps).
 */
static void __bfq_requeue_entity(struct bfq_entity *entity)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (entity == sd->in_service_entity) {
		/*
		 * We are requeueing the current in-service entity,
		 * which may have to be done for one of the following
		 * reasons:
		 * - entity represents the in-service queue, and the
		 *   in-service queue is being requeued after an
		 *   expiration;
		 * - entity represents a group, and its budget has
		 *   changed because one of its child entities has
		 *   just been either activated or requeued for some
		 *   reason; the timestamps of the entity need then to
		 *   be updated, and the entity needs to be enqueued
		 *   or repositioned accordingly.
		 *
		 * In particular, before requeueing, the start time of
		 * the entity must be moved forward to account for the
		 * service that the entity has received while in
		 * service. This is done by the next instructions. The
		 * finish time will then be updated according to this
		 * new value of the start time, and to the budget of
		 * the entity.
		 */
		bfq_calc_finish(entity, entity->service);
		entity->start = entity->finish;
		/*
		 * In addition, if the entity had more than one child
		 * when set in service, then it was not extracted from
		 * the active tree. This implies that the position of
		 * the entity in the active tree may need to be
		 * changed now, because we have just updated the start
		 * time of the entity, and we will update its finish
		 * time in a moment (the requeueing is then, more
		 * precisely, a repositioning in this case). To
		 * implement this repositioning, we: 1) dequeue the
		 * entity here, 2) update the finish time and
		 * requeue the entity according to the new
		 * timestamps below.
		 */
		if (entity->tree)
			bfq_active_extract(st, entity);
	} else { /* The entity is already active, and not in service */
		/*
		 * In this case, this function gets called only if the
		 * next_in_service entity below this entity has
		 * changed, and this change has caused the budget of
		 * this entity to change, which, finally implies that
		 * the finish time of this entity must be
		 * updated. Such an update may cause the scheduling,
		 * i.e., the position in the active tree, of this
		 * entity to change. We handle this change by: 1)
		 * dequeueing the entity here, 2) updating the finish
		 * time and requeueing the entity according to the new
		 * timestamps below. This is the same approach as the
		 * non-extracted-entity sub-case above.
		 */
		bfq_active_extract(st, entity);
	}

	bfq_update_fin_time_enqueue(entity, st, false);
}
static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
					  struct bfq_sched_data *sd,
					  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (sd->in_service_entity == entity || entity->tree == &st->active)
		/*
		 * in service or already queued on the active tree,
		 * requeue or reposition
		 */
		__bfq_requeue_entity(entity);
	else
		/*
		 * Not in service and not queued on its active tree:
		 * the activity is idle and this is a true activation.
		 */
		__bfq_activate_entity(entity, non_blocking_wait_rq);
}
/**
 * bfq_activate_requeue_entity - activate or requeue an entity representing a
 *				 bfq_queue, and activate, requeue or reposition
 *				 all ancestors for which such an update becomes
 *				 necessary.
 * @entity: the entity to activate.
 * @non_blocking_wait_rq: true if this entity was waiting for a request
 * @requeue: true if this is a requeue, which implies that bfqq is
 *	     being expired; thus ALL its ancestors stop being served and must
 *	     therefore be requeued
 */
static void bfq_activate_requeue_entity(struct bfq_entity *entity,
					bool non_blocking_wait_rq,
					bool requeue)
{
	struct bfq_sched_data *sd;

	for_each_entity(entity) {
		sd = entity->sched_data;
		__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);

		if (!bfq_update_next_in_service(sd, entity) && !requeue)
			break;
	}
}
/**
 * __bfq_deactivate_entity - deactivate an entity from its service tree.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: if false, the entity will not be put into the
 *			idle tree.
 *
 * Deactivates an entity, independently of its previous state. Must
 * be invoked only if entity is on a service tree. Extracts the entity
 * from that tree, and if necessary and allowed, puts it on the idle
 * tree.
 */
static bool __bfq_deactivate_entity(struct bfq_entity *entity,
				    bool ins_into_idle_tree)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	int is_in_service = entity == sd->in_service_entity;

	if (!entity->on_st) /* entity never activated, or already inactive */
		return false;

	if (is_in_service)
		bfq_calc_finish(entity, entity->service);

	if (entity->tree == &st->active)
		bfq_active_extract(st, entity);
	else if (!is_in_service && entity->tree == &st->idle)
		bfq_idle_extract(st, entity);

	if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
		bfq_forget_entity(st, entity, is_in_service);
	else
		bfq_idle_insert(st, entity);

	return true;
}
/**
 * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: true if the entity can be put on the idle tree
 */
static void bfq_deactivate_entity(struct bfq_entity *entity,
				  bool ins_into_idle_tree,
				  bool expiration)
{
	struct bfq_sched_data *sd;
	struct bfq_entity *parent = NULL;

	for_each_entity_safe(entity, parent) {
		sd = entity->sched_data;

		if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
			/*
			 * entity is not in any tree any more, so
			 * this deactivation is a no-op, and there is
			 * nothing to change for upper-level entities
			 * (in case of expiration, this can never
			 * happen).
			 */
			return;
		}

		if (sd->next_in_service == entity)
			/*
			 * entity was the next_in_service entity,
			 * then, since entity has just been
			 * deactivated, a new one must be found.
			 */
			bfq_update_next_in_service(sd, NULL);

		if (sd->next_in_service)
			/*
			 * The parent entity is still backlogged,
			 * because next_in_service is not NULL. So, no
			 * further upwards deactivation must be
			 * performed. Yet, next_in_service has
			 * changed. Then the schedule does need to be
			 * updated upwards.
			 */
			break;

		/*
		 * If we get here, then the parent is no more
		 * backlogged and we need to propagate the
		 * deactivation upwards. Thus let the loop go on.
		 */

		/*
		 * Also let parent be queued into the idle tree on
		 * deactivation, to preserve service guarantees, and
		 * assuming that who invoked this function does not
		 * need parent entities too to be removed completely.
		 */
		ins_into_idle_tree = true;
	}

	/*
	 * If the deactivation loop is fully executed, then there are
	 * no more entities to touch and next loop is not executed at
	 * all. Otherwise, requeue remaining entities if they are
	 * about to stop receiving service, or reposition them if this
	 * is not the case.
	 */
	entity = parent;
	for_each_entity(entity) {
		/*
		 * Invoke __bfq_requeue_entity on entity, even if
		 * already active, to requeue/reposition it in the
		 * active tree (because sd->next_in_service has
		 * changed)
		 */
		__bfq_requeue_entity(entity);

		sd = entity->sched_data;
		if (!bfq_update_next_in_service(sd, entity) &&
		    !expiration)
			/*
			 * next_in_service unchanged or not causing
			 * any change in entity->parent->sd, and no
			 * requeueing needed for expiration: stop
			 * here.
			 */
			break;
	}
}
/**
 * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
 *                       if needed, to have at least one entity eligible.
 * @st: the service tree to act upon.
 *
 * Assumes that st is not empty.
 */
static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
{
	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);

	if (bfq_gt(root_entity->min_start, st->vtime))
		return root_entity->min_start;

	return st->vtime;
}

static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
{
	if (new_value > st->vtime) {
		st->vtime = new_value;
		bfq_forget_idle(st);
	}
}
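/*
 * Illustrative note: an entity is eligible when its virtual start
 * time does not exceed the system virtual time, i.e., when
 * !bfq_gt(entity->start, st->vtime). If, e.g., st->vtime == 1000
 * while the smallest start time in the active tree
 * (root_entity->min_start) is 1500, then no entity is eligible;
 * calling bfq_update_vtime(st, bfq_calc_vtime_jump(st)) jumps the
 * vtime to 1500, which makes at least one entity eligible without
 * reordering finish times.
 */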
/**
 * bfq_first_active_entity - find the eligible entity with
 *                           the smallest finish time
 * @st: the service tree to select from.
 * @vtime: the system virtual time to use as a reference for eligibility
 *
 * This function searches the first schedulable entity, starting from the
 * root of the tree and going on the left every time on this side there is
 * a subtree with at least one eligible (start <= vtime) entity. The path on
 * the right is followed only if a) the left subtree contains no eligible
 * entities and b) no eligible entity has been found yet.
 */
static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
						  u64 vtime)
{
	struct bfq_entity *entry, *first = NULL;
	struct rb_node *node = st->active.rb_node;

	while (node) {
		entry = rb_entry(node, struct bfq_entity, rb_node);
left:
		if (!bfq_gt(entry->start, vtime))
			first = entry;

		if (node->rb_left) {
			entry = rb_entry(node->rb_left,
					 struct bfq_entity, rb_node);
			if (!bfq_gt(entry->min_start, vtime)) {
				node = node->rb_left;
				goto left;
			}
		}
		if (first)
			break;
		node = node->rb_right;
	}

	return first;
}
/**
 * __bfq_lookup_next_entity - return the first eligible entity in @st.
 * @st: the service tree.
 *
 * If there is no in-service entity for the sched_data st belongs to,
 * then return the entity that will be set in service if:
 * 1) the parent entity this st belongs to is set in service;
 * 2) no entity belonging to such parent entity undergoes a state change
 * that would influence the timestamps of the entity (e.g., becomes idle,
 * becomes backlogged, changes its budget, ...).
 *
 * In this first case, update the virtual time in @st too (see the
 * comments on this update inside the function).
 *
 * In contrast, if there is an in-service entity, then return the
 * entity that would be set in service if not only the above
 * conditions, but also the next one held true: the currently
 * in-service entity, on expiration,
 * 1) gets a finish time equal to the current one, or
 * 2) is not eligible any more, or
 * 3) is idle.
 */
static struct bfq_entity *
__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
{
	struct bfq_entity *entity;
	u64 new_vtime;

	if (RB_EMPTY_ROOT(&st->active))
		return NULL;

	/*
	 * Get the value of the system virtual time for which at
	 * least one entity is eligible.
	 */
	new_vtime = bfq_calc_vtime_jump(st);

	/*
	 * If there is no in-service entity for the sched_data this
	 * active tree belongs to, then push the system virtual time
	 * up to the value that guarantees that at least one entity is
	 * eligible. If, instead, there is an in-service entity, then
	 * do not make any such update, because there is already an
	 * eligible entity, namely the in-service one (even if the
	 * entity is not on st, because it was extracted when set in
	 * service).
	 */
	if (!in_service)
		bfq_update_vtime(st, new_vtime);

	entity = bfq_first_active_entity(st, new_vtime);

	return entity;
}
/**
 * bfq_lookup_next_entity - return the first eligible entity in @sd.
 * @sd: the sched_data.
 *
 * This function is invoked when there has been a change in the trees
 * for sd, and we need to know what is the new next entity after this
 * change.
 */
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
{
	struct bfq_service_tree *st = sd->service_tree;
	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
	struct bfq_entity *entity = NULL;
	int class_idx = 0;

	/*
	 * Choose from idle class, if needed to guarantee a minimum
	 * bandwidth to this class (and if there is some active entity
	 * in idle class). This should also mitigate
	 * priority-inversion problems in case a low priority task is
	 * holding file system resources.
	 */
	if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
				   BFQ_CL_IDLE_TIMEOUT)) {
		if (!RB_EMPTY_ROOT(&idle_class_st->active))
			class_idx = BFQ_IOPRIO_CLASSES - 1;
		/* About to be served if backlogged, or not yet backlogged */
		sd->bfq_class_idle_last_service = jiffies;
	}

	/*
	 * Find the next entity to serve for the highest-priority
	 * class, unless the idle class needs to be served.
	 */
	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
		entity = __bfq_lookup_next_entity(st + class_idx,
						  sd->in_service_entity);

		if (entity)
			break;
	}

	return entity;
}
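/*
 * Illustrative note: service trees are indexed by I/O priority class
 * (RT, BE, IDLE). The loop above therefore serves CLASS_RT queues
 * first, then CLASS_BE, and falls back to CLASS_IDLE, except that,
 * roughly once per BFQ_CL_IDLE_TIMEOUT, an active idle-class entity
 * (if any) is picked first so that the idle class is not starved
 * forever.
 */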
static bool next_queue_may_preempt(struct bfq_data *bfqd)
{
	struct bfq_sched_data *sd = &bfqd->root_group->sched_data;

	return sd->next_in_service != sd->in_service_entity;
}
/*
 * Get next queue for service.
 */
static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
{
	struct bfq_entity *entity = NULL;
	struct bfq_sched_data *sd;
	struct bfq_queue *bfqq;

	if (bfqd->busy_queues == 0)
		return NULL;

	/*
	 * Traverse the path from the root to the leaf entity to
	 * serve. Set in service all the entities visited along the
	 * way.
	 */
	sd = &bfqd->root_group->sched_data;
	for (; sd ; sd = entity->my_sched_data) {
		/*
		 * WARNING. We are about to set the in-service entity
		 * to sd->next_in_service, i.e., to the (cached) value
		 * returned by bfq_lookup_next_entity(sd) the last
		 * time it was invoked, i.e., the last time when the
		 * service order in sd changed as a consequence of the
		 * activation or deactivation of an entity. In this
		 * respect, if we execute bfq_lookup_next_entity(sd)
		 * in this very moment, it may, although with low
		 * probability, yield a different entity than that
		 * pointed to by sd->next_in_service. This rare event
		 * happens in case there was no CLASS_IDLE entity to
		 * serve for sd when bfq_lookup_next_entity(sd) was
		 * invoked for the last time, while there is now one
		 * such entity.
		 *
		 * If the above event happens, then the scheduling of
		 * such entity in CLASS_IDLE is postponed until the
		 * service of the sd->next_in_service entity
		 * finishes. In fact, when the latter is expired,
		 * bfq_lookup_next_entity(sd) gets called again,
		 * exactly to update sd->next_in_service.
		 */

		/* Make next_in_service entity become in_service_entity */
		entity = sd->next_in_service;
		sd->in_service_entity = entity;

		/*
		 * Reset the accumulator of the amount of service that
		 * the entity is about to receive.
		 */
		entity->service = 0;

		/*
		 * If entity is no longer a candidate for next
		 * service, then we extract it from its active tree,
		 * for the following reason. To further boost the
		 * throughput in some special case, BFQ needs to know
		 * which is the next candidate entity to serve, while
		 * there is already an entity in service. In this
		 * respect, to make it easy to compute/update the next
		 * candidate entity to serve after the current
		 * candidate has been set in service, there is a case
		 * where it is necessary to extract the current
		 * candidate from its service tree. Such a case is
		 * when the entity just set in service cannot be also
		 * a candidate for next service. Details about when
		 * this condition holds are reported in the comments
		 * on the function bfq_no_longer_next_in_service()
		 * invoked below.
		 */
		if (bfq_no_longer_next_in_service(entity))
			bfq_active_extract(bfq_entity_service_tree(entity),
					   entity);

		/*
		 * For the same reason why we may have just extracted
		 * entity from its active tree, we may need to update
		 * next_in_service for the sched_data of entity too,
		 * regardless of whether entity has been extracted.
		 * In fact, even if entity has not been extracted, a
		 * descendant entity may get extracted. Such an event
		 * would cause a change in next_in_service for the
		 * level of the descendant entity, and thus possibly
		 * back to upper levels.
		 *
		 * We cannot perform the resulting needed update
		 * before the end of this loop, because, to know which
		 * is the correct next-to-serve candidate entity for
		 * each level, we need first to find the leaf entity
		 * to set in service. In fact, only after we know
		 * which is the next-to-serve leaf entity, we can
		 * discover whether the parent entity of the leaf
		 * entity becomes the next-to-serve, and so on.
		 */
	}

	bfqq = bfq_entity_to_bfqq(entity);

	/*
	 * We can finally update all next-to-serve entities along the
	 * path from the leaf entity just set in service to the root.
	 */
	for_each_entity(entity) {
		struct bfq_sched_data *sd = entity->sched_data;

		if (!bfq_update_next_in_service(sd, NULL))
			break;
	}

	return bfqq;
}
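/*
 * Illustrative note: with a two-level hierarchy (root group -> child
 * group -> bfq_queue), the first loop above walks the root group's
 * sched_data and then the child group's sched_data, each time
 * promoting sd->next_in_service to in_service_entity, until it
 * reaches a leaf entity, i.e., the bfq_queue to serve. The second
 * loop then walks back up from that leaf, refreshing next_in_service
 * at each level, because the identity of the next candidate at a
 * parent level may depend on which leaf has just been chosen.
 */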
static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
{
	struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
	struct bfq_entity *entity = in_serv_entity;

	if (bfqd->in_service_bic) {
		put_io_context(bfqd->in_service_bic->icq.ioc);
		bfqd->in_service_bic = NULL;
	}

	bfq_clear_bfqq_wait_request(in_serv_bfqq);
	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
	bfqd->in_service_queue = NULL;

	/*
	 * When this function is called, all in-service entities have
	 * been properly deactivated or requeued, so we can safely
	 * execute the final step: reset in_service_entity along the
	 * path from entity to the root.
	 */
	for_each_entity(entity)
		entity->sched_data->in_service_entity = NULL;

	/*
	 * in_serv_entity is no longer in service, so, if it is in no
	 * service tree either, then release the service reference to
	 * the queue it represents (taken with bfq_get_entity).
	 */
	if (!in_serv_entity->on_st)
		bfq_put_queue(in_serv_bfqq);
}
static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
				bool ins_into_idle_tree, bool expiration)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
}

static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
				    false);
	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
}

static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_activate_requeue_entity(entity, false,
				    bfqq == bfqd->in_service_queue);
}
static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);

/*
 * Called when the bfqq no longer has requests pending, remove it from
 * the service tree. As a special case, it can be invoked during an
 * expiration.
 */
static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			      bool expiration)
{
	bfq_log_bfqq(bfqd, bfqq, "del from busy");

	bfq_clear_bfqq_busy(bfqq);

	bfqd->busy_queues--;

	if (bfqq->wr_coeff > 1)
		bfqd->wr_busy_queues--;

	bfqg_stats_update_dequeue(bfqq_group(bfqq));

	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
}

/*
 * Called when an inactive queue receives a new request.
 */
static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqd, bfqq, "add to busy");

	bfq_activate_bfqq(bfqd, bfqq);

	bfq_mark_bfqq_busy(bfqq);
	bfqd->busy_queues++;

	if (bfqq->wr_coeff > 1)
		bfqd->wr_busy_queues++;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED

/* bfqg stats flags */
enum bfqg_stats_flags {
	BFQG_stats_waiting = 0,
	BFQG_stats_idling,
	BFQG_stats_empty,
};

#define BFQG_FLAG_FNS(name)						\
static void bfqg_stats_mark_##name(struct bfqg_stats *stats)		\
{									\
	stats->flags |= (1 << BFQG_stats_##name);			\
}									\
static void bfqg_stats_clear_##name(struct bfqg_stats *stats)		\
{									\
	stats->flags &= ~(1 << BFQG_stats_##name);			\
}									\
static int bfqg_stats_##name(struct bfqg_stats *stats)			\
{									\
	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\
}									\

BFQG_FLAG_FNS(waiting)
BFQG_FLAG_FNS(idling)
BFQG_FLAG_FNS(empty)
#undef BFQG_FLAG_FNS
2547 /* This should be called with the queue_lock held. */
2548 static void bfqg_stats_update_group_wait_time(struct bfqg_stats
*stats
)
2550 unsigned long long now
;
2552 if (!bfqg_stats_waiting(stats
))
2555 now
= sched_clock();
2556 if (time_after64(now
, stats
->start_group_wait_time
))
2557 blkg_stat_add(&stats
->group_wait_time
,
2558 now
- stats
->start_group_wait_time
);
2559 bfqg_stats_clear_waiting(stats
);
2562 /* This should be called with the queue_lock held. */
2563 static void bfqg_stats_set_start_group_wait_time(struct bfq_group
*bfqg
,
2564 struct bfq_group
*curr_bfqg
)
2566 struct bfqg_stats
*stats
= &bfqg
->stats
;
2568 if (bfqg_stats_waiting(stats
))
2570 if (bfqg
== curr_bfqg
)
2572 stats
->start_group_wait_time
= sched_clock();
2573 bfqg_stats_mark_waiting(stats
);
2576 /* This should be called with the queue_lock held. */
2577 static void bfqg_stats_end_empty_time(struct bfqg_stats
*stats
)
2579 unsigned long long now
;
2581 if (!bfqg_stats_empty(stats
))
2584 now
= sched_clock();
2585 if (time_after64(now
, stats
->start_empty_time
))
2586 blkg_stat_add(&stats
->empty_time
,
2587 now
- stats
->start_empty_time
);
2588 bfqg_stats_clear_empty(stats
);
2591 static void bfqg_stats_update_dequeue(struct bfq_group
*bfqg
)
2593 blkg_stat_add(&bfqg
->stats
.dequeue
, 1);
2596 static void bfqg_stats_set_start_empty_time(struct bfq_group
*bfqg
)
2598 struct bfqg_stats
*stats
= &bfqg
->stats
;
2600 if (blkg_rwstat_total(&stats
->queued
))
2604 * group is already marked empty. This can happen if bfqq got new
2605 * request in parent group and moved to this group while being added
2606 * to service tree. Just ignore the event and move on.
2608 if (bfqg_stats_empty(stats
))
2611 stats
->start_empty_time
= sched_clock();
2612 bfqg_stats_mark_empty(stats
);
2615 static void bfqg_stats_update_idle_time(struct bfq_group
*bfqg
)
2617 struct bfqg_stats
*stats
= &bfqg
->stats
;
2619 if (bfqg_stats_idling(stats
)) {
2620 unsigned long long now
= sched_clock();
2622 if (time_after64(now
, stats
->start_idle_time
))
2623 blkg_stat_add(&stats
->idle_time
,
2624 now
- stats
->start_idle_time
);
2625 bfqg_stats_clear_idling(stats
);
2629 static void bfqg_stats_set_start_idle_time(struct bfq_group
*bfqg
)
2631 struct bfqg_stats
*stats
= &bfqg
->stats
;
2633 stats
->start_idle_time
= sched_clock();
2634 bfqg_stats_mark_idling(stats
);
2637 static void bfqg_stats_update_avg_queue_size(struct bfq_group
*bfqg
)
2639 struct bfqg_stats
*stats
= &bfqg
->stats
;
2641 blkg_stat_add(&stats
->avg_queue_size_sum
,
2642 blkg_rwstat_total(&stats
->queued
));
2643 blkg_stat_add(&stats
->avg_queue_size_samples
, 1);
2644 bfqg_stats_update_group_wait_time(stats
);
2648 * blk-cgroup policy-related handlers
2649 * The following functions help in converting between blk-cgroup
2650 * internal structures and BFQ-specific structures.
2653 static struct bfq_group
*pd_to_bfqg(struct blkg_policy_data
*pd
)
2655 return pd
? container_of(pd
, struct bfq_group
, pd
) : NULL
;
2658 static struct blkcg_gq
*bfqg_to_blkg(struct bfq_group
*bfqg
)
2660 return pd_to_blkg(&bfqg
->pd
);
2663 static struct blkcg_policy blkcg_policy_bfq
;
2665 static struct bfq_group
*blkg_to_bfqg(struct blkcg_gq
*blkg
)
2667 return pd_to_bfqg(blkg_to_pd(blkg
, &blkcg_policy_bfq
));
2671 * bfq_group handlers
2672 * The following functions help in navigating the bfq_group hierarchy
2673 * by allowing to find the parent of a bfq_group or the bfq_group
2674 * associated to a bfq_queue.
2677 static struct bfq_group
*bfqg_parent(struct bfq_group
*bfqg
)
2679 struct blkcg_gq
*pblkg
= bfqg_to_blkg(bfqg
)->parent
;
2681 return pblkg
? blkg_to_bfqg(pblkg
) : NULL
;
2684 static struct bfq_group
*bfqq_group(struct bfq_queue
*bfqq
)
2686 struct bfq_entity
*group_entity
= bfqq
->entity
.parent
;
2688 return group_entity
? container_of(group_entity
, struct bfq_group
,
2690 bfqq
->bfqd
->root_group
;
2694 * The following two functions handle get and put of a bfq_group by
2695 * wrapping the related blk-cgroup hooks.
2698 static void bfqg_get(struct bfq_group
*bfqg
)
2700 return blkg_get(bfqg_to_blkg(bfqg
));
2703 static void bfqg_put(struct bfq_group
*bfqg
)
2705 return blkg_put(bfqg_to_blkg(bfqg
));
2708 static void bfqg_stats_update_io_add(struct bfq_group
*bfqg
,
2709 struct bfq_queue
*bfqq
,
2712 blkg_rwstat_add(&bfqg
->stats
.queued
, op
, 1);
2713 bfqg_stats_end_empty_time(&bfqg
->stats
);
2714 if (!(bfqq
== ((struct bfq_data
*)bfqg
->bfqd
)->in_service_queue
))
2715 bfqg_stats_set_start_group_wait_time(bfqg
, bfqq_group(bfqq
));
2718 static void bfqg_stats_update_io_remove(struct bfq_group
*bfqg
, unsigned int op
)
2720 blkg_rwstat_add(&bfqg
->stats
.queued
, op
, -1);
2723 static void bfqg_stats_update_io_merged(struct bfq_group
*bfqg
, unsigned int op
)
2725 blkg_rwstat_add(&bfqg
->stats
.merged
, op
, 1);
2728 static void bfqg_stats_update_completion(struct bfq_group
*bfqg
,
2729 uint64_t start_time
, uint64_t io_start_time
,
2732 struct bfqg_stats
*stats
= &bfqg
->stats
;
2733 unsigned long long now
= sched_clock();
2735 if (time_after64(now
, io_start_time
))
2736 blkg_rwstat_add(&stats
->service_time
, op
,
2737 now
- io_start_time
);
2738 if (time_after64(io_start_time
, start_time
))
2739 blkg_rwstat_add(&stats
->wait_time
, op
,
2740 io_start_time
- start_time
);
2744 static void bfqg_stats_reset(struct bfqg_stats
*stats
)
2746 /* queued stats shouldn't be cleared */
2747 blkg_rwstat_reset(&stats
->merged
);
2748 blkg_rwstat_reset(&stats
->service_time
);
2749 blkg_rwstat_reset(&stats
->wait_time
);
2750 blkg_stat_reset(&stats
->time
);
2751 blkg_stat_reset(&stats
->avg_queue_size_sum
);
2752 blkg_stat_reset(&stats
->avg_queue_size_samples
);
2753 blkg_stat_reset(&stats
->dequeue
);
2754 blkg_stat_reset(&stats
->group_wait_time
);
2755 blkg_stat_reset(&stats
->idle_time
);
2756 blkg_stat_reset(&stats
->empty_time
);
2760 static void bfqg_stats_add_aux(struct bfqg_stats
*to
, struct bfqg_stats
*from
)
2765 /* queued stats shouldn't be cleared */
2766 blkg_rwstat_add_aux(&to
->merged
, &from
->merged
);
2767 blkg_rwstat_add_aux(&to
->service_time
, &from
->service_time
);
2768 blkg_rwstat_add_aux(&to
->wait_time
, &from
->wait_time
);
2769 blkg_stat_add_aux(&from
->time
, &from
->time
);
2770 blkg_stat_add_aux(&to
->avg_queue_size_sum
, &from
->avg_queue_size_sum
);
2771 blkg_stat_add_aux(&to
->avg_queue_size_samples
,
2772 &from
->avg_queue_size_samples
);
2773 blkg_stat_add_aux(&to
->dequeue
, &from
->dequeue
);
2774 blkg_stat_add_aux(&to
->group_wait_time
, &from
->group_wait_time
);
2775 blkg_stat_add_aux(&to
->idle_time
, &from
->idle_time
);
2776 blkg_stat_add_aux(&to
->empty_time
, &from
->empty_time
);
2780 * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
2781 * recursive stats can still account for the amount used by this bfqg after
2784 static void bfqg_stats_xfer_dead(struct bfq_group
*bfqg
)
2786 struct bfq_group
*parent
;
2788 if (!bfqg
) /* root_group */
2791 parent
= bfqg_parent(bfqg
);
2793 lockdep_assert_held(bfqg_to_blkg(bfqg
)->q
->queue_lock
);
2795 if (unlikely(!parent
))
2798 bfqg_stats_add_aux(&parent
->stats
, &bfqg
->stats
);
2799 bfqg_stats_reset(&bfqg
->stats
);
2802 static void bfq_init_entity(struct bfq_entity
*entity
,
2803 struct bfq_group
*bfqg
)
2805 struct bfq_queue
*bfqq
= bfq_entity_to_bfqq(entity
);
2807 entity
->weight
= entity
->new_weight
;
2808 entity
->orig_weight
= entity
->new_weight
;
2810 bfqq
->ioprio
= bfqq
->new_ioprio
;
2811 bfqq
->ioprio_class
= bfqq
->new_ioprio_class
;
2814 entity
->parent
= bfqg
->my_entity
; /* NULL for root group */
2815 entity
->sched_data
= &bfqg
->sched_data
;
2818 static void bfqg_stats_exit(struct bfqg_stats
*stats
)
2820 blkg_rwstat_exit(&stats
->merged
);
2821 blkg_rwstat_exit(&stats
->service_time
);
2822 blkg_rwstat_exit(&stats
->wait_time
);
2823 blkg_rwstat_exit(&stats
->queued
);
2824 blkg_stat_exit(&stats
->time
);
2825 blkg_stat_exit(&stats
->avg_queue_size_sum
);
2826 blkg_stat_exit(&stats
->avg_queue_size_samples
);
2827 blkg_stat_exit(&stats
->dequeue
);
2828 blkg_stat_exit(&stats
->group_wait_time
);
2829 blkg_stat_exit(&stats
->idle_time
);
2830 blkg_stat_exit(&stats
->empty_time
);
2833 static int bfqg_stats_init(struct bfqg_stats
*stats
, gfp_t gfp
)
2835 if (blkg_rwstat_init(&stats
->merged
, gfp
) ||
2836 blkg_rwstat_init(&stats
->service_time
, gfp
) ||
2837 blkg_rwstat_init(&stats
->wait_time
, gfp
) ||
2838 blkg_rwstat_init(&stats
->queued
, gfp
) ||
2839 blkg_stat_init(&stats
->time
, gfp
) ||
2840 blkg_stat_init(&stats
->avg_queue_size_sum
, gfp
) ||
2841 blkg_stat_init(&stats
->avg_queue_size_samples
, gfp
) ||
2842 blkg_stat_init(&stats
->dequeue
, gfp
) ||
2843 blkg_stat_init(&stats
->group_wait_time
, gfp
) ||
2844 blkg_stat_init(&stats
->idle_time
, gfp
) ||
2845 blkg_stat_init(&stats
->empty_time
, gfp
)) {
2846 bfqg_stats_exit(stats
);
2853 static struct bfq_group_data
*cpd_to_bfqgd(struct blkcg_policy_data
*cpd
)
2855 return cpd
? container_of(cpd
, struct bfq_group_data
, pd
) : NULL
;
2858 static struct bfq_group_data
*blkcg_to_bfqgd(struct blkcg
*blkcg
)
2860 return cpd_to_bfqgd(blkcg_to_cpd(blkcg
, &blkcg_policy_bfq
));
2863 static struct blkcg_policy_data
*bfq_cpd_alloc(gfp_t gfp
)
2865 struct bfq_group_data
*bgd
;
2867 bgd
= kzalloc(sizeof(*bgd
), gfp
);
2873 static void bfq_cpd_init(struct blkcg_policy_data
*cpd
)
2875 struct bfq_group_data
*d
= cpd_to_bfqgd(cpd
);
2877 d
->weight
= cgroup_subsys_on_dfl(io_cgrp_subsys
) ?
2878 CGROUP_WEIGHT_DFL
: BFQ_WEIGHT_LEGACY_DFL
;
2881 static void bfq_cpd_free(struct blkcg_policy_data
*cpd
)
2883 kfree(cpd_to_bfqgd(cpd
));
2886 static struct blkg_policy_data
*bfq_pd_alloc(gfp_t gfp
, int node
)
2888 struct bfq_group
*bfqg
;
2890 bfqg
= kzalloc_node(sizeof(*bfqg
), gfp
, node
);
2894 if (bfqg_stats_init(&bfqg
->stats
, gfp
)) {
2902 static void bfq_pd_init(struct blkg_policy_data
*pd
)
2904 struct blkcg_gq
*blkg
= pd_to_blkg(pd
);
2905 struct bfq_group
*bfqg
= blkg_to_bfqg(blkg
);
2906 struct bfq_data
*bfqd
= blkg
->q
->elevator
->elevator_data
;
2907 struct bfq_entity
*entity
= &bfqg
->entity
;
2908 struct bfq_group_data
*d
= blkcg_to_bfqgd(blkg
->blkcg
);
2910 entity
->orig_weight
= entity
->weight
= entity
->new_weight
= d
->weight
;
2911 entity
->my_sched_data
= &bfqg
->sched_data
;
2912 bfqg
->my_entity
= entity
; /*
2913 * the root_group's will be set to NULL
2914 * in bfq_init_queue()
2919 static void bfq_pd_free(struct blkg_policy_data
*pd
)
2921 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
2923 bfqg_stats_exit(&bfqg
->stats
);
2927 static void bfq_pd_reset_stats(struct blkg_policy_data
*pd
)
2929 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
2931 bfqg_stats_reset(&bfqg
->stats
);
2934 static void bfq_group_set_parent(struct bfq_group
*bfqg
,
2935 struct bfq_group
*parent
)
2937 struct bfq_entity
*entity
;
2939 entity
= &bfqg
->entity
;
2940 entity
->parent
= parent
->my_entity
;
2941 entity
->sched_data
= &parent
->sched_data
;
2944 static struct bfq_group
*bfq_lookup_bfqg(struct bfq_data
*bfqd
,
2945 struct blkcg
*blkcg
)
2947 struct blkcg_gq
*blkg
;
2949 blkg
= blkg_lookup(blkcg
, bfqd
->queue
);
2951 return blkg_to_bfqg(blkg
);
2955 static struct bfq_group
*bfq_find_set_group(struct bfq_data
*bfqd
,
2956 struct blkcg
*blkcg
)
2958 struct bfq_group
*bfqg
, *parent
;
2959 struct bfq_entity
*entity
;
2961 bfqg
= bfq_lookup_bfqg(bfqd
, blkcg
);
2963 if (unlikely(!bfqg
))
2967 * Update chain of bfq_groups as we might be handling a leaf group
2968 * which, along with some of its relatives, has not been hooked yet
2969 * to the private hierarchy of BFQ.
2971 entity
= &bfqg
->entity
;
2972 for_each_entity(entity
) {
2973 bfqg
= container_of(entity
, struct bfq_group
, entity
);
2974 if (bfqg
!= bfqd
->root_group
) {
2975 parent
= bfqg_parent(bfqg
);
2977 parent
= bfqd
->root_group
;
2978 bfq_group_set_parent(bfqg
, parent
);
2985 static void bfq_bfqq_expire(struct bfq_data
*bfqd
,
2986 struct bfq_queue
*bfqq
,
2988 enum bfqq_expiration reason
);
2991 * bfq_bfqq_move - migrate @bfqq to @bfqg.
2992 * @bfqd: queue descriptor.
2993 * @bfqq: the queue to move.
2994 * @bfqg: the group to move to.
2996 * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
2997 * it on the new one. Avoid putting the entity on the old group idle tree.
2999 * Must be called under the queue lock; the cgroup owning @bfqg must
3000 * not disappear (by now this just means that we are called under
3003 static void bfq_bfqq_move(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
3004 struct bfq_group
*bfqg
)
3006 struct bfq_entity
*entity
= &bfqq
->entity
;
3008 /* If bfqq is empty, then bfq_bfqq_expire also invokes
3009 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
3010 * from data structures related to current group. Otherwise we
3011 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
3014 if (bfqq
== bfqd
->in_service_queue
)
3015 bfq_bfqq_expire(bfqd
, bfqd
->in_service_queue
,
3016 false, BFQQE_PREEMPTED
);
3018 if (bfq_bfqq_busy(bfqq
))
3019 bfq_deactivate_bfqq(bfqd
, bfqq
, false, false);
3020 else if (entity
->on_st
)
3021 bfq_put_idle_entity(bfq_entity_service_tree(entity
), entity
);
3022 bfqg_put(bfqq_group(bfqq
));
3025 * Here we use a reference to bfqg. We don't need a refcounter
3026 * as the cgroup reference will not be dropped, so that its
3027 * destroy() callback will not be invoked.
3029 entity
->parent
= bfqg
->my_entity
;
3030 entity
->sched_data
= &bfqg
->sched_data
;
3033 if (bfq_bfqq_busy(bfqq
))
3034 bfq_activate_bfqq(bfqd
, bfqq
);
3036 if (!bfqd
->in_service_queue
&& !bfqd
->rq_in_driver
)
3037 bfq_schedule_dispatch(bfqd
);
3041 * __bfq_bic_change_cgroup - move @bic to @cgroup.
3042 * @bfqd: the queue descriptor.
3043 * @bic: the bic to move.
3044 * @blkcg: the blk-cgroup to move to.
3046 * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
3047 * has to make sure that the reference to cgroup is valid across the call.
3049 * NOTE: an alternative approach might have been to store the current
3050 * cgroup in bfqq and getting a reference to it, reducing the lookup
3051 * time here, at the price of slightly more complex code.
3053 static struct bfq_group
*__bfq_bic_change_cgroup(struct bfq_data
*bfqd
,
3054 struct bfq_io_cq
*bic
,
3055 struct blkcg
*blkcg
)
3057 struct bfq_queue
*async_bfqq
= bic_to_bfqq(bic
, 0);
3058 struct bfq_queue
*sync_bfqq
= bic_to_bfqq(bic
, 1);
3059 struct bfq_group
*bfqg
;
3060 struct bfq_entity
*entity
;
3062 bfqg
= bfq_find_set_group(bfqd
, blkcg
);
3064 if (unlikely(!bfqg
))
3065 bfqg
= bfqd
->root_group
;
3068 entity
= &async_bfqq
->entity
;
3070 if (entity
->sched_data
!= &bfqg
->sched_data
) {
3071 bic_set_bfqq(bic
, NULL
, 0);
3072 bfq_log_bfqq(bfqd
, async_bfqq
,
3073 "bic_change_group: %p %d",
3076 bfq_put_queue(async_bfqq
);
3081 entity
= &sync_bfqq
->entity
;
3082 if (entity
->sched_data
!= &bfqg
->sched_data
)
3083 bfq_bfqq_move(bfqd
, sync_bfqq
, bfqg
);
3089 static void bfq_bic_update_cgroup(struct bfq_io_cq
*bic
, struct bio
*bio
)
3091 struct bfq_data
*bfqd
= bic_to_bfqd(bic
);
3092 struct bfq_group
*bfqg
= NULL
;
3096 serial_nr
= bio_blkcg(bio
)->css
.serial_nr
;
3099 * Check whether blkcg has changed. The condition may trigger
3100 * spuriously on a newly created cic but there's no harm.
3102 if (unlikely(!bfqd
) || likely(bic
->blkcg_serial_nr
== serial_nr
))
3105 bfqg
= __bfq_bic_change_cgroup(bfqd
, bic
, bio_blkcg(bio
));
3106 bic
->blkcg_serial_nr
= serial_nr
;
3112 * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
3113 * @st: the service tree being flushed.
3115 static void bfq_flush_idle_tree(struct bfq_service_tree
*st
)
3117 struct bfq_entity
*entity
= st
->first_idle
;
3119 for (; entity
; entity
= st
->first_idle
)
3120 __bfq_deactivate_entity(entity
, false);
3124 * bfq_reparent_leaf_entity - move leaf entity to the root_group.
3125 * @bfqd: the device data structure with the root group.
3126 * @entity: the entity to move.
3128 static void bfq_reparent_leaf_entity(struct bfq_data
*bfqd
,
3129 struct bfq_entity
*entity
)
3131 struct bfq_queue
*bfqq
= bfq_entity_to_bfqq(entity
);
3133 bfq_bfqq_move(bfqd
, bfqq
, bfqd
->root_group
);
3137 * bfq_reparent_active_entities - move to the root group all active
3139 * @bfqd: the device data structure with the root group.
3140 * @bfqg: the group to move from.
3141 * @st: the service tree with the entities.
3143 * Needs queue_lock to be taken and reference to be valid over the call.
3145 static void bfq_reparent_active_entities(struct bfq_data
*bfqd
,
3146 struct bfq_group
*bfqg
,
3147 struct bfq_service_tree
*st
)
3149 struct rb_root
*active
= &st
->active
;
3150 struct bfq_entity
*entity
= NULL
;
3152 if (!RB_EMPTY_ROOT(&st
->active
))
3153 entity
= bfq_entity_of(rb_first(active
));
3155 for (; entity
; entity
= bfq_entity_of(rb_first(active
)))
3156 bfq_reparent_leaf_entity(bfqd
, entity
);
3158 if (bfqg
->sched_data
.in_service_entity
)
3159 bfq_reparent_leaf_entity(bfqd
,
3160 bfqg
->sched_data
.in_service_entity
);
3164 * bfq_pd_offline - deactivate the entity associated with @pd,
3165 * and reparent its children entities.
3166 * @pd: descriptor of the policy going offline.
3168 * blkio already grabs the queue_lock for us, so no need to use
3171 static void bfq_pd_offline(struct blkg_policy_data
*pd
)
3173 struct bfq_service_tree
*st
;
3174 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
3175 struct bfq_data
*bfqd
= bfqg
->bfqd
;
3176 struct bfq_entity
*entity
= bfqg
->my_entity
;
3177 unsigned long flags
;
3180 if (!entity
) /* root group */
3183 spin_lock_irqsave(&bfqd
->lock
, flags
);
3185 * Empty all service_trees belonging to this group before
3186 * deactivating the group itself.
3188 for (i
= 0; i
< BFQ_IOPRIO_CLASSES
; i
++) {
3189 st
= bfqg
->sched_data
.service_tree
+ i
;
3192 * The idle tree may still contain bfq_queues belonging
3193 * to exited task because they never migrated to a different
3194 * cgroup from the one being destroyed now. No one else
3195 * can access them so it's safe to act without any lock.
3197 bfq_flush_idle_tree(st
);
3200 * It may happen that some queues are still active
3201 * (busy) upon group destruction (if the corresponding
3202 * processes have been forced to terminate). We move
3203 * all the leaf entities corresponding to these queues
3204 * to the root_group.
3205 * Also, it may happen that the group has an entity
3206 * in service, which is disconnected from the active
3207 * tree: it must be moved, too.
3208 * There is no need to put the sync queues, as the
3209 * scheduler has taken no reference.
3211 bfq_reparent_active_entities(bfqd
, bfqg
, st
);
3214 __bfq_deactivate_entity(entity
, false);
3215 bfq_put_async_queues(bfqd
, bfqg
);
3217 spin_unlock_irqrestore(&bfqd
->lock
, flags
);
3219 * @blkg is going offline and will be ignored by
3220 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
3221 * that they don't get lost. If IOs complete after this point, the
3222 * stats for them will be lost. Oh well...
3224 bfqg_stats_xfer_dead(bfqg
);
3227 static void bfq_end_wr_async(struct bfq_data
*bfqd
)
3229 struct blkcg_gq
*blkg
;
3231 list_for_each_entry(blkg
, &bfqd
->queue
->blkg_list
, q_node
) {
3232 struct bfq_group
*bfqg
= blkg_to_bfqg(blkg
);
3234 bfq_end_wr_async_queues(bfqd
, bfqg
);
3236 bfq_end_wr_async_queues(bfqd
, bfqd
->root_group
);
3239 static int bfq_io_show_weight(struct seq_file
*sf
, void *v
)
3241 struct blkcg
*blkcg
= css_to_blkcg(seq_css(sf
));
3242 struct bfq_group_data
*bfqgd
= blkcg_to_bfqgd(blkcg
);
3243 unsigned int val
= 0;
3246 val
= bfqgd
->weight
;
3248 seq_printf(sf
, "%u\n", val
);
3253 static int bfq_io_set_weight_legacy(struct cgroup_subsys_state
*css
,
3254 struct cftype
*cftype
,
3257 struct blkcg
*blkcg
= css_to_blkcg(css
);
3258 struct bfq_group_data
*bfqgd
= blkcg_to_bfqgd(blkcg
);
3259 struct blkcg_gq
*blkg
;
3262 if (val
< BFQ_MIN_WEIGHT
|| val
> BFQ_MAX_WEIGHT
)
3266 spin_lock_irq(&blkcg
->lock
);
3267 bfqgd
->weight
= (unsigned short)val
;
3268 hlist_for_each_entry(blkg
, &blkcg
->blkg_list
, blkcg_node
) {
3269 struct bfq_group
*bfqg
= blkg_to_bfqg(blkg
);
3274 * Setting the prio_changed flag of the entity
3275 * to 1 with new_weight == weight would re-set
3276 * the value of the weight to its ioprio mapping.
3277 * Set the flag only if necessary.
3279 if ((unsigned short)val
!= bfqg
->entity
.new_weight
) {
3280 bfqg
->entity
.new_weight
= (unsigned short)val
;
3282 * Make sure that the above new value has been
3283 * stored in bfqg->entity.new_weight before
3284 * setting the prio_changed flag. In fact,
3285 * this flag may be read asynchronously (in
3286 * critical sections protected by a different
3287 * lock than that held here), and finding this
3288 * flag set may cause the execution of the code
3289 * for updating parameters whose value may
3290 * depend also on bfqg->entity.new_weight (in
3291 * __bfq_entity_update_weight_prio).
3292 * This barrier makes sure that the new value
3293 * of bfqg->entity.new_weight is correctly
3294 * seen in that code.
3297 bfqg
->entity
.prio_changed
= 1;
3300 spin_unlock_irq(&blkcg
->lock
);
3305 static ssize_t
bfq_io_set_weight(struct kernfs_open_file
*of
,
3306 char *buf
, size_t nbytes
,
3310 /* First unsigned long found in the file is used */
3311 int ret
= kstrtoull(strim(buf
), 0, &weight
);
3316 return bfq_io_set_weight_legacy(of_css(of
), NULL
, weight
);
3319 static int bfqg_print_stat(struct seq_file
*sf
, void *v
)
3321 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)), blkg_prfill_stat
,
3322 &blkcg_policy_bfq
, seq_cft(sf
)->private, false);
3326 static int bfqg_print_rwstat(struct seq_file
*sf
, void *v
)
3328 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)), blkg_prfill_rwstat
,
3329 &blkcg_policy_bfq
, seq_cft(sf
)->private, true);
3333 static u64
bfqg_prfill_stat_recursive(struct seq_file
*sf
,
3334 struct blkg_policy_data
*pd
, int off
)
3336 u64 sum
= blkg_stat_recursive_sum(pd_to_blkg(pd
),
3337 &blkcg_policy_bfq
, off
);
3338 return __blkg_prfill_u64(sf
, pd
, sum
);
3341 static u64
bfqg_prfill_rwstat_recursive(struct seq_file
*sf
,
3342 struct blkg_policy_data
*pd
, int off
)
3344 struct blkg_rwstat sum
= blkg_rwstat_recursive_sum(pd_to_blkg(pd
),
3347 return __blkg_prfill_rwstat(sf
, pd
, &sum
);
3350 static int bfqg_print_stat_recursive(struct seq_file
*sf
, void *v
)
3352 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
3353 bfqg_prfill_stat_recursive
, &blkcg_policy_bfq
,
3354 seq_cft(sf
)->private, false);
3358 static int bfqg_print_rwstat_recursive(struct seq_file
*sf
, void *v
)
3360 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
3361 bfqg_prfill_rwstat_recursive
, &blkcg_policy_bfq
,
3362 seq_cft(sf
)->private, true);
3366 static u64
bfqg_prfill_sectors(struct seq_file
*sf
, struct blkg_policy_data
*pd
,
3369 u64 sum
= blkg_rwstat_total(&pd
->blkg
->stat_bytes
);
3371 return __blkg_prfill_u64(sf
, pd
, sum
>> 9);
3374 static int bfqg_print_stat_sectors(struct seq_file
*sf
, void *v
)
3376 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
3377 bfqg_prfill_sectors
, &blkcg_policy_bfq
, 0, false);
3381 static u64
bfqg_prfill_sectors_recursive(struct seq_file
*sf
,
3382 struct blkg_policy_data
*pd
, int off
)
3384 struct blkg_rwstat tmp
= blkg_rwstat_recursive_sum(pd
->blkg
, NULL
,
3385 offsetof(struct blkcg_gq
, stat_bytes
));
3386 u64 sum
= atomic64_read(&tmp
.aux_cnt
[BLKG_RWSTAT_READ
]) +
3387 atomic64_read(&tmp
.aux_cnt
[BLKG_RWSTAT_WRITE
]);
3389 return __blkg_prfill_u64(sf
, pd
, sum
>> 9);
3392 static int bfqg_print_stat_sectors_recursive(struct seq_file
*sf
, void *v
)
3394 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
3395 bfqg_prfill_sectors_recursive
, &blkcg_policy_bfq
, 0,
3400 static u64
bfqg_prfill_avg_queue_size(struct seq_file
*sf
,
3401 struct blkg_policy_data
*pd
, int off
)
3403 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
3404 u64 samples
= blkg_stat_read(&bfqg
->stats
.avg_queue_size_samples
);
3408 v
= blkg_stat_read(&bfqg
->stats
.avg_queue_size_sum
);
3409 v
= div64_u64(v
, samples
);
3411 __blkg_prfill_u64(sf
, pd
, v
);
3415 /* print avg_queue_size */
3416 static int bfqg_print_avg_queue_size(struct seq_file
*sf
, void *v
)
3418 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
3419 bfqg_prfill_avg_queue_size
, &blkcg_policy_bfq
,
3424 static struct bfq_group
*
3425 bfq_create_group_hierarchy(struct bfq_data
*bfqd
, int node
)
3429 ret
= blkcg_activate_policy(bfqd
->queue
, &blkcg_policy_bfq
);
3433 return blkg_to_bfqg(bfqd
->queue
->root_blkg
);
3436 static struct cftype bfq_blkcg_legacy_files
[] = {
3438 .name
= "bfq.weight",
3439 .flags
= CFTYPE_NOT_ON_ROOT
,
3440 .seq_show
= bfq_io_show_weight
,
3441 .write_u64
= bfq_io_set_weight_legacy
,
3444 /* statistics, covers only the tasks in the bfqg */
3447 .private = offsetof(struct bfq_group
, stats
.time
),
3448 .seq_show
= bfqg_print_stat
,
3451 .name
= "bfq.sectors",
3452 .seq_show
= bfqg_print_stat_sectors
,
3455 .name
= "bfq.io_service_bytes",
3456 .private = (unsigned long)&blkcg_policy_bfq
,
3457 .seq_show
= blkg_print_stat_bytes
,
3460 .name
= "bfq.io_serviced",
3461 .private = (unsigned long)&blkcg_policy_bfq
,
3462 .seq_show
= blkg_print_stat_ios
,
3465 .name
= "bfq.io_service_time",
3466 .private = offsetof(struct bfq_group
, stats
.service_time
),
3467 .seq_show
= bfqg_print_rwstat
,
3470 .name
= "bfq.io_wait_time",
3471 .private = offsetof(struct bfq_group
, stats
.wait_time
),
3472 .seq_show
= bfqg_print_rwstat
,
3475 .name
= "bfq.io_merged",
3476 .private = offsetof(struct bfq_group
, stats
.merged
),
3477 .seq_show
= bfqg_print_rwstat
,
3480 .name
= "bfq.io_queued",
3481 .private = offsetof(struct bfq_group
, stats
.queued
),
3482 .seq_show
= bfqg_print_rwstat
,
	/* the same statistics which cover the bfqg and its descendants */
3487 .name
= "bfq.time_recursive",
3488 .private = offsetof(struct bfq_group
, stats
.time
),
3489 .seq_show
= bfqg_print_stat_recursive
,
3492 .name
= "bfq.sectors_recursive",
3493 .seq_show
= bfqg_print_stat_sectors_recursive
,
3496 .name
= "bfq.io_service_bytes_recursive",
3497 .private = (unsigned long)&blkcg_policy_bfq
,
3498 .seq_show
= blkg_print_stat_bytes_recursive
,
3501 .name
= "bfq.io_serviced_recursive",
3502 .private = (unsigned long)&blkcg_policy_bfq
,
3503 .seq_show
= blkg_print_stat_ios_recursive
,
3506 .name
= "bfq.io_service_time_recursive",
3507 .private = offsetof(struct bfq_group
, stats
.service_time
),
3508 .seq_show
= bfqg_print_rwstat_recursive
,
3511 .name
= "bfq.io_wait_time_recursive",
3512 .private = offsetof(struct bfq_group
, stats
.wait_time
),
3513 .seq_show
= bfqg_print_rwstat_recursive
,
3516 .name
= "bfq.io_merged_recursive",
3517 .private = offsetof(struct bfq_group
, stats
.merged
),
3518 .seq_show
= bfqg_print_rwstat_recursive
,
3521 .name
= "bfq.io_queued_recursive",
3522 .private = offsetof(struct bfq_group
, stats
.queued
),
3523 .seq_show
= bfqg_print_rwstat_recursive
,
3526 .name
= "bfq.avg_queue_size",
3527 .seq_show
= bfqg_print_avg_queue_size
,
3530 .name
= "bfq.group_wait_time",
3531 .private = offsetof(struct bfq_group
, stats
.group_wait_time
),
3532 .seq_show
= bfqg_print_stat
,
3535 .name
= "bfq.idle_time",
3536 .private = offsetof(struct bfq_group
, stats
.idle_time
),
3537 .seq_show
= bfqg_print_stat
,
3540 .name
= "bfq.empty_time",
3541 .private = offsetof(struct bfq_group
, stats
.empty_time
),
3542 .seq_show
= bfqg_print_stat
,
3545 .name
= "bfq.dequeue",
3546 .private = offsetof(struct bfq_group
, stats
.dequeue
),
3547 .seq_show
= bfqg_print_stat
,
3552 static struct cftype bfq_blkg_files
[] = {
3554 .name
= "bfq.weight",
3555 .flags
= CFTYPE_NOT_ON_ROOT
,
3556 .seq_show
= bfq_io_show_weight
,
3557 .write
= bfq_io_set_weight
,
3562 #else /* CONFIG_BFQ_GROUP_IOSCHED */
3564 static inline void bfqg_stats_update_io_add(struct bfq_group
*bfqg
,
3565 struct bfq_queue
*bfqq
, unsigned int op
) { }
3567 bfqg_stats_update_io_remove(struct bfq_group
*bfqg
, unsigned int op
) { }
3569 bfqg_stats_update_io_merged(struct bfq_group
*bfqg
, unsigned int op
) { }
3570 static inline void bfqg_stats_update_completion(struct bfq_group
*bfqg
,
3571 uint64_t start_time
, uint64_t io_start_time
,
3572 unsigned int op
) { }
3574 bfqg_stats_set_start_group_wait_time(struct bfq_group
*bfqg
,
3575 struct bfq_group
*curr_bfqg
) { }
3576 static inline void bfqg_stats_end_empty_time(struct bfqg_stats
*stats
) { }
3577 static inline void bfqg_stats_update_dequeue(struct bfq_group
*bfqg
) { }
3578 static inline void bfqg_stats_set_start_empty_time(struct bfq_group
*bfqg
) { }
3579 static inline void bfqg_stats_update_idle_time(struct bfq_group
*bfqg
) { }
3580 static inline void bfqg_stats_set_start_idle_time(struct bfq_group
*bfqg
) { }
3581 static inline void bfqg_stats_update_avg_queue_size(struct bfq_group
*bfqg
) { }
3583 static void bfq_bfqq_move(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
3584 struct bfq_group
*bfqg
) {}
3586 static void bfq_init_entity(struct bfq_entity
*entity
,
3587 struct bfq_group
*bfqg
)
3589 struct bfq_queue
*bfqq
= bfq_entity_to_bfqq(entity
);
3591 entity
->weight
= entity
->new_weight
;
3592 entity
->orig_weight
= entity
->new_weight
;
3594 bfqq
->ioprio
= bfqq
->new_ioprio
;
3595 bfqq
->ioprio_class
= bfqq
->new_ioprio_class
;
3597 entity
->sched_data
= &bfqg
->sched_data
;
3600 static void bfq_bic_update_cgroup(struct bfq_io_cq
*bic
, struct bio
*bio
) {}
3602 static void bfq_end_wr_async(struct bfq_data
*bfqd
)
3604 bfq_end_wr_async_queues(bfqd
, bfqd
->root_group
);
3607 static struct bfq_group
*bfq_find_set_group(struct bfq_data
*bfqd
,
3608 struct blkcg
*blkcg
)
3610 return bfqd
->root_group
;
3613 static struct bfq_group
*bfqq_group(struct bfq_queue
*bfqq
)
3615 return bfqq
->bfqd
->root_group
;
3618 static struct bfq_group
*bfq_create_group_hierarchy(struct bfq_data
*bfqd
,
3621 struct bfq_group
*bfqg
;
3624 bfqg
= kmalloc_node(sizeof(*bfqg
), GFP_KERNEL
| __GFP_ZERO
, node
);
3628 for (i
= 0; i
< BFQ_IOPRIO_CLASSES
; i
++)
3629 bfqg
->sched_data
.service_tree
[i
] = BFQ_SERVICE_TREE_INIT
;
3633 #endif /* CONFIG_BFQ_GROUP_IOSCHED */
#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define bfq_sample_valid(samples)	((samples) > 80)

/*
 * Lifted from AS - choose which of rq1 and rq2 is best served now.
 * We choose the request that is closest to the head right now. Distance
 * behind the head is penalized and only allowed to a certain extent.
 */
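/*
 * Illustrative note (hypothetical numbers): if back_max works out to
 * 32 sectors and bfq_back_penalty is 2, a request 20 sectors behind
 * the head gets an effective distance of 20 * 2 = 40 sectors, so a
 * forward request up to 39 sectors ahead of the head is still
 * preferred to it; a request more than 32 sectors behind the head is
 * marked as wrapped and chosen only if no better candidate exists.
 */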
3645 static struct request
*bfq_choose_req(struct bfq_data
*bfqd
,
3646 struct request
*rq1
,
3647 struct request
*rq2
,
3650 sector_t s1
, s2
, d1
= 0, d2
= 0;
3651 unsigned long back_max
;
3652 #define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
3653 #define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
3654 unsigned int wrap
= 0; /* bit mask: requests behind the disk head? */
3656 if (!rq1
|| rq1
== rq2
)
3661 if (rq_is_sync(rq1
) && !rq_is_sync(rq2
))
3663 else if (rq_is_sync(rq2
) && !rq_is_sync(rq1
))
3665 if ((rq1
->cmd_flags
& REQ_META
) && !(rq2
->cmd_flags
& REQ_META
))
3667 else if ((rq2
->cmd_flags
& REQ_META
) && !(rq1
->cmd_flags
& REQ_META
))
3670 s1
= blk_rq_pos(rq1
);
3671 s2
= blk_rq_pos(rq2
);
3674 * By definition, 1KiB is 2 sectors.
3676 back_max
= bfqd
->bfq_back_max
* 2;
3679 * Strict one way elevator _except_ in the case where we allow
3680 * short backward seeks which are biased as twice the cost of a
3681 * similar forward seek.
3685 else if (s1
+ back_max
>= last
)
3686 d1
= (last
- s1
) * bfqd
->bfq_back_penalty
;
3688 wrap
|= BFQ_RQ1_WRAP
;
3692 else if (s2
+ back_max
>= last
)
3693 d2
= (last
- s2
) * bfqd
->bfq_back_penalty
;
3695 wrap
|= BFQ_RQ2_WRAP
;
3697 /* Found required data */
3700 * By doing switch() on the bit mask "wrap" we avoid having to
3701 * check two variables for all permutations: --> faster!
3704 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
3719 case BFQ_RQ1_WRAP
|BFQ_RQ2_WRAP
: /* both rqs wrapped */
3722 * Since both rqs are wrapped,
3723 * start with the one that's further behind head
3724 * (--> only *one* back seek required),
3725 * since back seek takes more time than forward.
3735 * Return expired entry, or NULL to just start from scratch in rbtree.
3737 static struct request
*bfq_check_fifo(struct bfq_queue
*bfqq
,
3738 struct request
*last
)
3742 if (bfq_bfqq_fifo_expire(bfqq
))
3745 bfq_mark_bfqq_fifo_expire(bfqq
);
3747 rq
= rq_entry_fifo(bfqq
->fifo
.next
);
3749 if (rq
== last
|| ktime_get_ns() < rq
->fifo_time
)
3752 bfq_log_bfqq(bfqq
->bfqd
, bfqq
, "check_fifo: returned %p", rq
);
static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
                                        struct bfq_queue *bfqq,
                                        struct request *last)
{
        struct rb_node *rbnext = rb_next(&last->rb_node);
        struct rb_node *rbprev = rb_prev(&last->rb_node);
        struct request *next, *prev = NULL;

        /* Follow expired path, else get first next available. */
        next = bfq_check_fifo(bfqq, last);
        if (next)
                return next;

        if (rbprev)
                prev = rb_entry_rq(rbprev);

        if (rbnext)
                next = rb_entry_rq(rbnext);
        else {
                rbnext = rb_first(&bfqq->sort_list);
                if (rbnext && rbnext != &last->rb_node)
                        next = rb_entry_rq(rbnext);
        }

        return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
}

/* see the definition of bfq_async_charge_factor for details */
static unsigned long bfq_serv_to_charge(struct request *rq,
                                        struct bfq_queue *bfqq)
{
        if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
                return blk_rq_sectors(rq);

        /*
         * If there are no weight-raised queues, then amplify service
         * by just the async charge factor; otherwise amplify service
         * by twice the async charge factor, to further reduce latency
         * for weight-raised queues.
         */
        if (bfqq->bfqd->wr_busy_queues == 0)
                return blk_rq_sectors(rq) * bfq_async_charge_factor;

        return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
}

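/*
 * Worked example (illustrative only): an async request of 512 sectors
 * is charged 512 * bfq_async_charge_factor sectors of budget while no
 * queue is weight-raised; as soon as at least one queue is
 * weight-raised, the charge doubles to
 * 512 * 2 * bfq_async_charge_factor, so async queues exhaust their
 * budget sooner and yield the device more quickly to latency-sensitive
 * queues.
 */
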
/*
 * bfq_updated_next_req - update the queue after a new next_rq selection.
 * @bfqd: the device data the queue belongs to.
 * @bfqq: the queue to update.
 *
 * If the first request of a queue changes we make sure that the queue
 * has enough budget to serve at least its first request (if the
 * request has grown). We do this because if the queue has not enough
 * budget for its first request, it has to go through two dispatch
 * rounds to actually get it dispatched.
 */
static void bfq_updated_next_req(struct bfq_data *bfqd,
                                 struct bfq_queue *bfqq)
{
        struct bfq_entity *entity = &bfqq->entity;
        struct request *next_rq = bfqq->next_rq;
        unsigned long new_budget;

        if (!next_rq)
                return;

        if (bfqq == bfqd->in_service_queue)
                /*
                 * In order not to break guarantees, budgets cannot be
                 * changed after an entity has been selected.
                 */
                return;

        new_budget = max_t(unsigned long, bfqq->max_budget,
                           bfq_serv_to_charge(next_rq, bfqq));
        if (entity->budget != new_budget) {
                entity->budget = new_budget;
                bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
                                         new_budget);
                bfq_requeue_bfqq(bfqd, bfqq);
        }
}

static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
{
        struct bfq_entity *entity = &bfqq->entity;

        return entity->budget - entity->service;
}

/*
 * If enough samples have been computed, return the current max budget
 * stored in bfqd, which is dynamically updated according to the
 * estimated disk peak rate; otherwise return the default max budget.
 */
static int bfq_max_budget(struct bfq_data *bfqd)
{
        if (bfqd->budgets_assigned < bfq_stats_min_budgets)
                return bfq_default_max_budget;

        return bfqd->bfq_max_budget;
}

/*
 * Return min budget, which is a fraction of the current or default
 * max budget (trying with 1/32).
 */
static int bfq_min_budget(struct bfq_data *bfqd)
{
        if (bfqd->budgets_assigned < bfq_stats_min_budgets)
                return bfq_default_max_budget / 32;

        return bfqd->bfq_max_budget / 32;
}

static void bfq_bfqq_expire(struct bfq_data *bfqd,
                            struct bfq_queue *bfqq,
                            bool compensate,
                            enum bfqq_expiration reason);

3878 * The next function, invoked after the input queue bfqq switches from
3879 * idle to busy, updates the budget of bfqq. The function also tells
3880 * whether the in-service queue should be expired, by returning
3881 * true. The purpose of expiring the in-service queue is to give bfqq
3882 * the chance to possibly preempt the in-service queue, and the reason
3883 * for preempting the in-service queue is to achieve one of the two
3886 * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
3887 * expired because it has remained idle. In particular, bfqq may have
3888 * expired for one of the following two reasons:
3890 * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
3891 * and did not make it to issue a new request before its last
3892 * request was served;
3894 * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
3895 * a new request before the expiration of the idling-time.
3897 * Even if bfqq has expired for one of the above reasons, the process
3898 * associated with the queue may be however issuing requests greedily,
3899 * and thus be sensitive to the bandwidth it receives (bfqq may have
3900 * remained idle for other reasons: CPU high load, bfqq not enjoying
3901 * idling, I/O throttling somewhere in the path from the process to
3902 * the I/O scheduler, ...). But if, after every expiration for one of
3903 * the above two reasons, bfqq has to wait for the service of at least
3904 * one full budget of another queue before being served again, then
3905 * bfqq is likely to get a much lower bandwidth or resource time than
3906 * its reserved ones. To address this issue, two countermeasures need
3909 * First, the budget and the timestamps of bfqq need to be updated in
3910 * a special way on bfqq reactivation: they need to be updated as if
3911 * bfqq did not remain idle and did not expire. In fact, if they are
3912 * computed as if bfqq expired and remained idle until reactivation,
3913 * then the process associated with bfqq is treated as if, instead of
3914 * being greedy, it stopped issuing requests when bfqq remained idle,
3915 * and restarts issuing requests only on this reactivation. In other
3916 * words, the scheduler does not help the process recover the "service
3917 * hole" between bfqq expiration and reactivation. As a consequence,
3918 * the process receives a lower bandwidth than its reserved one. In
3919 * contrast, to recover this hole, the budget must be updated as if
3920 * bfqq was not expired at all before this reactivation, i.e., it must
3921 * be set to the value of the remaining budget when bfqq was
3922 * expired. Along the same line, timestamps need to be assigned the
3923 * value they had the last time bfqq was selected for service, i.e.,
3924 * before last expiration. Thus timestamps need to be back-shifted
3925 * with respect to their normal computation (see [1] for more details
3926 * on this tricky aspect).
3928 * Secondly, to allow the process to recover the hole, the in-service
3929 * queue must be expired too, to give bfqq the chance to preempt it
3930 * immediately. In fact, if bfqq has to wait for a full budget of the
3931 * in-service queue to be completed, then it may become impossible to
3932 * let the process recover the hole, even if the back-shifted
3933 * timestamps of bfqq are lower than those of the in-service queue. If
3934 * this happens for most or all of the holes, then the process may not
3935 * receive its reserved bandwidth. In this respect, it is worth noting
3936 * that, being the service of outstanding requests unpreemptible, a
3937 * little fraction of the holes may however be unrecoverable, thereby
3938 * causing a little loss of bandwidth.
3940 * The last important point is detecting whether bfqq does need this
3941 * bandwidth recovery. In this respect, the next function deems the
3942 * process associated with bfqq greedy, and thus allows it to recover
3943 * the hole, if: 1) the process is waiting for the arrival of a new
3944 * request (which implies that bfqq expired for one of the above two
3945 * reasons), and 2) such a request has arrived soon. The first
3946 * condition is controlled through the flag non_blocking_wait_rq,
3947 * while the second through the flag arrived_in_time. If both
3948 * conditions hold, then the function computes the budget in the
3949 * above-described special way, and signals that the in-service queue
3950 * should be expired. Timestamp back-shifting is done later in
3951 * __bfq_activate_entity.
3953 * 2. Reduce latency. Even if timestamps are not backshifted to let
3954 * the process associated with bfqq recover a service hole, bfqq may
3955 * however happen to have, after being (re)activated, a lower finish
3956 * timestamp than the in-service queue. That is, the next budget of
3957 * bfqq may have to be completed before the one of the in-service
3958 * queue. If this is the case, then preempting the in-service queue
3959 * allows this goal to be achieved, apart from the unpreemptible,
3960 * outstanding requests mentioned above.
3962 * Unfortunately, regardless of which of the above two goals one wants
3963 * to achieve, service trees need first to be updated to know whether
3964 * the in-service queue must be preempted. To have service trees
3965 * correctly updated, the in-service queue must be expired and
3966 * rescheduled, and bfqq must be scheduled too. This is one of the
3967 * most costly operations (in future versions, the scheduling
3968 * mechanism may be re-designed in such a way to make it possible to
3969 * know whether preemption is needed without needing to update service
3970 * trees). In addition, queue preemptions almost always cause random
3971 * I/O, and thus loss of throughput. Because of these facts, the next
3972 * function adopts the following simple scheme to avoid both costly
3973 * operations and too frequent preemptions: it requests the expiration
3974 * of the in-service queue (unconditionally) only for queues that need
3975 * to recover a hole, or that either are weight-raised or deserve to
static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
                                                struct bfq_queue *bfqq,
                                                bool arrived_in_time,
                                                bool wr_or_deserves_wr)
{
        struct bfq_entity *entity = &bfqq->entity;

        if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
                /*
                 * We do not clear the flag non_blocking_wait_rq here, as
                 * the latter is used in bfq_activate_bfqq to signal
                 * that timestamps need to be back-shifted (and is
                 * cleared right after).
                 */

                /*
                 * In next assignment we rely on that either
                 * entity->service or entity->budget are not updated
                 * on expiration if bfqq is empty (see
                 * __bfq_bfqq_recalc_budget). Thus both quantities
                 * remain unchanged after such an expiration, and the
                 * following statement therefore assigns to
                 * entity->budget the remaining budget on such an
                 * expiration. For clarity, entity->service is not
                 * updated on expiration in any case, and, in normal
                 * operation, is reset only when bfqq is selected for
                 * service (see bfq_get_next_queue).
                 */
                entity->budget = min_t(unsigned long,
                                       bfq_bfqq_budget_left(bfqq),
                                       bfqq->max_budget);

                return true;
        }

        entity->budget = max_t(unsigned long, bfqq->max_budget,
                               bfq_serv_to_charge(bfqq->next_rq, bfqq));
        bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
        return wr_or_deserves_wr;
}

static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
{
        u64 dur;

        if (bfqd->bfq_wr_max_time > 0)
                return bfqd->bfq_wr_max_time;

        dur = bfqd->RT_prod;
        do_div(dur, bfqd->peak_rate);

        /*
         * Limit duration between 3 and 13 seconds. Tests show that
         * higher values than 13 seconds often yield the opposite of
         * the desired result, i.e., worsen responsiveness by letting
         * non-interactive and non-soft-real-time applications
         * preserve weight raising for a too long time interval.
         *
         * On the other end, lower values than 3 seconds make it
         * difficult for most interactive tasks to complete their jobs
         * before weight-raising finishes.
         */
        if (dur > msecs_to_jiffies(13000))
                dur = msecs_to_jiffies(13000);
        else if (dur < msecs_to_jiffies(3000))
                dur = msecs_to_jiffies(3000);

        return dur;
}

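/*
 * Numerical sketch (illustrative only): with RT_prod equal to R * T
 * for the current device class and a measured peak_rate twice the
 * reference rate R, dur evaluates to T/2; the clamping above then
 * keeps the resulting weight-raising duration within [3000, 13000] ms,
 * however far the rate estimate drifts.
 */
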
4048 static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data
*bfqd
,
4049 struct bfq_queue
*bfqq
,
4050 unsigned int old_wr_coeff
,
4051 bool wr_or_deserves_wr
,
4055 if (old_wr_coeff
== 1 && wr_or_deserves_wr
) {
4056 /* start a weight-raising period */
4058 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
;
4059 bfqq
->wr_cur_max_time
= bfq_wr_duration(bfqd
);
4061 bfqq
->wr_start_at_switch_to_srt
= jiffies
;
4062 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
*
4063 BFQ_SOFTRT_WEIGHT_FACTOR
;
4064 bfqq
->wr_cur_max_time
=
4065 bfqd
->bfq_wr_rt_max_time
;
4069 * If needed, further reduce budget to make sure it is
4070 * close to bfqq's backlog, so as to reduce the
4071 * scheduling-error component due to a too large
4072 * budget. Do not care about throughput consequences,
4073 * but only about latency. Finally, do not assign a
4074 * too small budget either, to avoid increasing
4075 * latency by causing too frequent expirations.
4077 bfqq
->entity
.budget
= min_t(unsigned long,
4078 bfqq
->entity
.budget
,
4079 2 * bfq_min_budget(bfqd
));
4080 } else if (old_wr_coeff
> 1) {
4081 if (interactive
) { /* update wr coeff and duration */
4082 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
;
4083 bfqq
->wr_cur_max_time
= bfq_wr_duration(bfqd
);
4084 } else if (soft_rt
) {
4086 * The application is now or still meeting the
4087 * requirements for being deemed soft rt. We
4088 * can then correctly and safely (re)charge
4089 * the weight-raising duration for the
4090 * application with the weight-raising
4091 * duration for soft rt applications.
4093 * In particular, doing this recharge now, i.e.,
4094 * before the weight-raising period for the
4095 * application finishes, reduces the probability
4096 * of the following negative scenario:
4097 * 1) the weight of a soft rt application is
4098 * raised at startup (as for any newly
4099 * created application),
4100 * 2) since the application is not interactive,
4101 * at a certain time weight-raising is
4102 * stopped for the application,
4103 * 3) at that time the application happens to
4104 * still have pending requests, and hence
4105 * is destined to not have a chance to be
4106 * deemed soft rt before these requests are
4107 * completed (see the comments to the
4108 * function bfq_bfqq_softrt_next_start()
4109 * for details on soft rt detection),
4110 * 4) these pending requests experience a high
4111 * latency because the application is not
4112 * weight-raised while they are pending.
4114 if (bfqq
->wr_cur_max_time
!=
4115 bfqd
->bfq_wr_rt_max_time
) {
4116 bfqq
->wr_start_at_switch_to_srt
=
4117 bfqq
->last_wr_start_finish
;
4119 bfqq
->wr_cur_max_time
=
4120 bfqd
->bfq_wr_rt_max_time
;
4121 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
*
4122 BFQ_SOFTRT_WEIGHT_FACTOR
;
4124 bfqq
->last_wr_start_finish
= jiffies
;
static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
                                        struct bfq_queue *bfqq)
{
        return bfqq->dispatched == 0 &&
                time_is_before_jiffies(
                        bfqq->budget_timeout +
                        bfqd->bfq_wr_min_idle_time);
}

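/*
 * Example (illustrative only, assuming the default
 * bfq_wr_min_idle_time of about 2000 ms): a queue with no request in
 * flight, whose budget_timeout was stamped more than two seconds ago,
 * is deemed idle for a long time and hence is a candidate for
 * interactive weight-raising on its next activation.
 */
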
4138 static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data
*bfqd
,
4139 struct bfq_queue
*bfqq
,
4144 bool soft_rt
, wr_or_deserves_wr
, bfqq_wants_to_preempt
,
4145 idle_for_long_time
= bfq_bfqq_idle_for_long_time(bfqd
, bfqq
),
4147 * See the comments on
4148 * bfq_bfqq_update_budg_for_activation for
4149 * details on the usage of the next variable.
4151 arrived_in_time
= ktime_get_ns() <=
4152 bfqq
->ttime
.last_end_request
+
4153 bfqd
->bfq_slice_idle
* 3;
4155 bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq
)), bfqq
, rq
->cmd_flags
);
4158 * bfqq deserves to be weight-raised if:
4160 * - it has been idle for enough time or is soft real-time.
4162 soft_rt
= bfqd
->bfq_wr_max_softrt_rate
> 0 &&
4163 time_is_before_jiffies(bfqq
->soft_rt_next_start
);
4164 *interactive
= idle_for_long_time
;
4165 wr_or_deserves_wr
= bfqd
->low_latency
&&
4166 (bfqq
->wr_coeff
> 1 ||
4167 (bfq_bfqq_sync(bfqq
) && (*interactive
|| soft_rt
)));
4170 * Using the last flag, update budget and check whether bfqq
4171 * may want to preempt the in-service queue.
4173 bfqq_wants_to_preempt
=
4174 bfq_bfqq_update_budg_for_activation(bfqd
, bfqq
,
4178 if (!bfq_bfqq_IO_bound(bfqq
)) {
4179 if (arrived_in_time
) {
4180 bfqq
->requests_within_timer
++;
4181 if (bfqq
->requests_within_timer
>=
4182 bfqd
->bfq_requests_within_timer
)
4183 bfq_mark_bfqq_IO_bound(bfqq
);
4185 bfqq
->requests_within_timer
= 0;
4188 if (bfqd
->low_latency
) {
4189 bfq_update_bfqq_wr_on_rq_arrival(bfqd
, bfqq
,
4195 if (old_wr_coeff
!= bfqq
->wr_coeff
)
4196 bfqq
->entity
.prio_changed
= 1;
4199 bfqq
->last_idle_bklogged
= jiffies
;
4200 bfqq
->service_from_backlogged
= 0;
4201 bfq_clear_bfqq_softrt_update(bfqq
);
4203 bfq_add_bfqq_busy(bfqd
, bfqq
);
4206 * Expire in-service queue only if preemption may be needed
4207 * for guarantees. In this respect, the function
4208 * next_queue_may_preempt just checks a simple, necessary
4209 * condition, and not a sufficient condition based on
4210 * timestamps. In fact, for the latter condition to be
4211 * evaluated, timestamps would need first to be updated, and
4212 * this operation is quite costly (see the comments on the
4213 * function bfq_bfqq_update_budg_for_activation).
4215 if (bfqd
->in_service_queue
&& bfqq_wants_to_preempt
&&
4216 bfqd
->in_service_queue
->wr_coeff
< bfqq
->wr_coeff
&&
4217 next_queue_may_preempt(bfqd
))
4218 bfq_bfqq_expire(bfqd
, bfqd
->in_service_queue
,
4219 false, BFQQE_PREEMPTED
);
static void bfq_add_request(struct request *rq)
{
        struct bfq_queue *bfqq = RQ_BFQQ(rq);
        struct bfq_data *bfqd = bfqq->bfqd;
        struct request *next_rq, *prev;
        unsigned int old_wr_coeff = bfqq->wr_coeff;
        bool interactive = false;

        bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
        bfqq->queued[rq_is_sync(rq)]++;
        bfqd->queued++;

        elv_rb_add(&bfqq->sort_list, rq);

        /*
         * Check if this request is a better next-serve candidate.
         */
        prev = bfqq->next_rq;
        next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
        bfqq->next_rq = next_rq;

        if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
                bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
                                                 rq, &interactive);
        else {
                if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
                    time_is_before_jiffies(
                                bfqq->last_wr_start_finish +
                                bfqd->bfq_wr_min_inter_arr_async)) {
                        bfqq->wr_coeff = bfqd->bfq_wr_coeff;
                        bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

                        bfqd->wr_busy_queues++;
                        bfqq->entity.prio_changed = 1;
                }
                if (prev != bfqq->next_rq)
                        bfq_updated_next_req(bfqd, bfqq);
        }

        /*
         * Assign jiffies to last_wr_start_finish in the following
         * cases:
         *
         * . if bfqq is not going to be weight-raised, because, for
         *   non weight-raised queues, last_wr_start_finish stores the
         *   arrival time of the last request; as of now, this piece
         *   of information is used only for deciding whether to
         *   weight-raise async queues
         *
         * . if bfqq is not weight-raised, because, if bfqq is now
         *   switching to weight-raised, then last_wr_start_finish
         *   stores the time when weight-raising starts
         *
         * . if bfqq is interactive, because, regardless of whether
         *   bfqq is currently weight-raised, the weight-raising
         *   period must start or restart (this case is considered
         *   separately because it is not detected by the above
         *   conditions, if bfqq is already weight-raised)
         *
         * last_wr_start_finish has to be updated also if bfqq is soft
         * real-time, because the weight-raising period is constantly
         * restarted on idle-to-busy transitions for these queues, but
         * this is already done in bfq_bfqq_handle_idle_busy_switch if
         * needed.
         */
        if (bfqd->low_latency &&
                (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
                bfqq->last_wr_start_finish = jiffies;
}

static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
                                          struct bio *bio,
                                          struct request_queue *q)
{
        struct bfq_queue *bfqq = bfqd->bio_bfqq;

        if (bfqq)
                return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));

        return NULL;
}

static sector_t
get_sdist(sector_t last_pos, struct request *rq)
{
        return abs(blk_rq_pos(rq) - last_pos);
}

#if 0 /* Still not clear if we can do without next two functions */
static void bfq_activate_request(struct request_queue *q, struct request *rq)
{
        struct bfq_data *bfqd = q->elevator->elevator_data;

        bfqd->rq_in_driver++;
}

static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
{
        struct bfq_data *bfqd = q->elevator->elevator_data;

        bfqd->rq_in_driver--;
}
#endif

static void bfq_remove_request(struct request_queue *q,
                               struct request *rq)
{
        struct bfq_queue *bfqq = RQ_BFQQ(rq);
        struct bfq_data *bfqd = bfqq->bfqd;
        const int sync = rq_is_sync(rq);

        if (bfqq->next_rq == rq) {
                bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
                bfq_updated_next_req(bfqd, bfqq);
        }

        if (rq->queuelist.prev != &rq->queuelist)
                list_del_init(&rq->queuelist);
        bfqq->queued[sync]--;
        bfqd->queued--;
        elv_rb_del(&bfqq->sort_list, rq);

        elv_rqhash_del(q, rq);
        if (q->last_merge == rq)
                q->last_merge = NULL;

        if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
                bfqq->next_rq = NULL;

                if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
                        bfq_del_bfqq_busy(bfqd, bfqq, false);
                        /*
                         * bfqq emptied. In normal operation, when
                         * bfqq is empty, bfqq->entity.service and
                         * bfqq->entity.budget must contain,
                         * respectively, the service received and the
                         * budget used last time bfqq emptied. These
                         * facts do not hold in this case, as at least
                         * this last removal occurred while bfqq is
                         * not in service. To avoid inconsistencies,
                         * reset both bfqq->entity.service and
                         * bfqq->entity.budget, if bfqq has still a
                         * process that may issue I/O requests to it.
                         */
                        bfqq->entity.budget = bfqq->entity.service = 0;
                }
        }

        if (rq->cmd_flags & REQ_META)
                bfqq->meta_pending--;

        bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
}

static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
{
        struct request_queue *q = hctx->queue;
        struct bfq_data *bfqd = q->elevator->elevator_data;
        struct request *free = NULL;
        /*
         * bfq_bic_lookup grabs the queue_lock: invoke it now and
         * store its return value for later use, to avoid nesting
         * queue_lock inside the bfqd->lock. We assume that the bic
         * returned by bfq_bic_lookup does not go away before
         * bfqd->lock is taken.
         */
        struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
        bool ret;

        spin_lock_irq(&bfqd->lock);

        if (bic)
                bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
        else
                bfqd->bio_bfqq = NULL;
        bfqd->bio_bic = bic;

        ret = blk_mq_sched_try_merge(q, bio, &free);

        if (free)
                blk_mq_free_request(free);
        spin_unlock_irq(&bfqd->lock);

        return ret;
}

static int bfq_request_merge(struct request_queue *q, struct request **req,
                             struct bio *bio)
{
        struct bfq_data *bfqd = q->elevator->elevator_data;
        struct request *__rq;

        __rq = bfq_find_rq_fmerge(bfqd, bio, q);
        if (__rq && elv_bio_merge_ok(__rq, bio)) {
                *req = __rq;
                return ELEVATOR_FRONT_MERGE;
        }

        return ELEVATOR_NO_MERGE;
}

static void bfq_request_merged(struct request_queue *q, struct request *req,
                               enum elv_merge type)
{
        if (type == ELEVATOR_FRONT_MERGE &&
            rb_prev(&req->rb_node) &&
            blk_rq_pos(req) <
            blk_rq_pos(container_of(rb_prev(&req->rb_node),
                                    struct request, rb_node))) {
                struct bfq_queue *bfqq = RQ_BFQQ(req);
                struct bfq_data *bfqd = bfqq->bfqd;
                struct request *prev, *next_rq;

                /* Reposition request in its sort_list */
                elv_rb_del(&bfqq->sort_list, req);
                elv_rb_add(&bfqq->sort_list, req);

                /* Choose next request to be served for bfqq */
                prev = bfqq->next_rq;
                next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
                                         bfqd->last_position);
                bfqq->next_rq = next_rq;
                /*
                 * If next_rq changes, update the queue's budget to fit
                 * the new request.
                 */
                if (prev != bfqq->next_rq)
                        bfq_updated_next_req(bfqd, bfqq);
        }
}

static void bfq_requests_merged(struct request_queue *q, struct request *rq,
                                struct request *next)
{
        struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);

        if (!RB_EMPTY_NODE(&rq->rb_node))
                goto end;
        spin_lock_irq(&bfqq->bfqd->lock);

        /*
         * If next and rq belong to the same bfq_queue and next is older
         * than rq, then reposition rq in the fifo (by substituting next
         * with rq). Otherwise, if next and rq belong to different
         * bfq_queues, never reposition rq: in fact, we would have to
         * reposition it with respect to next's position in its own fifo,
         * which would most certainly be too expensive with respect to
         * the benefits.
         */
        if (bfqq == next_bfqq &&
            !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
            next->fifo_time < rq->fifo_time) {
                list_del_init(&rq->queuelist);
                list_replace_init(&next->queuelist, &rq->queuelist);
                rq->fifo_time = next->fifo_time;
        }

        if (bfqq->next_rq == next)
                bfqq->next_rq = rq;

        bfq_remove_request(q, next);

        spin_unlock_irq(&bfqq->bfqd->lock);
end:
        bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
}

/* Must be called with bfqq != NULL */
static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
{
        if (bfq_bfqq_busy(bfqq))
                bfqq->bfqd->wr_busy_queues--;
        bfqq->wr_coeff = 1;
        bfqq->wr_cur_max_time = 0;
        bfqq->last_wr_start_finish = jiffies;
        /*
         * Trigger a weight change on the next invocation of
         * __bfq_entity_update_weight_prio.
         */
        bfqq->entity.prio_changed = 1;
}

static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
                                    struct bfq_group *bfqg)
{
        int i, j;

        for (i = 0; i < 2; i++)
                for (j = 0; j < IOPRIO_BE_NR; j++)
                        if (bfqg->async_bfqq[i][j])
                                bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
        if (bfqg->async_idle_bfqq)
                bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
}

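/*
 * Note (added for clarity, not in the original comments): the first
 * index of async_bfqq is expected to distinguish the RT and BE ioprio
 * classes, while the second runs over the IOPRIO_BE_NR priority
 * levels; the single idle-class async queue is handled separately
 * through async_idle_bfqq.
 */
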
static void bfq_end_wr(struct bfq_data *bfqd)
{
        struct bfq_queue *bfqq;

        spin_lock_irq(&bfqd->lock);

        list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
                bfq_bfqq_end_wr(bfqq);
        list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
                bfq_bfqq_end_wr(bfqq);
        bfq_end_wr_async(bfqd);

        spin_unlock_irq(&bfqd->lock);
}

static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
                                struct bio *bio)
{
        struct bfq_data *bfqd = q->elevator->elevator_data;
        bool is_sync = op_is_sync(bio->bi_opf);
        struct bfq_queue *bfqq = bfqd->bio_bfqq;

        /*
         * Disallow merge of a sync bio into an async request.
         */
        if (is_sync && !rq_is_sync(rq))
                return false;

        /*
         * Lookup the bfqq that this bio will be queued with. Allow
         * merge only if rq is queued there.
         */
        if (!bfqq)
                return false;

        return bfqq == RQ_BFQQ(rq);
}

/*
 * Set the maximum time for the in-service queue to consume its
 * budget. This prevents seeky processes from lowering the throughput.
 * In practice, a time-slice service scheme is used with seeky
 * processes.
 */
static void bfq_set_budget_timeout(struct bfq_data *bfqd,
                                   struct bfq_queue *bfqq)
{
        unsigned int timeout_coeff;

        if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
                timeout_coeff = 1;
        else
                timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;

        bfqd->last_budget_start = ktime_get();

        bfqq->budget_timeout = jiffies +
                bfqd->bfq_timeout * timeout_coeff;
}

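/*
 * Worked example (illustrative only, assuming a base bfq_timeout of
 * about 125 ms): a queue whose current weight is three times its
 * original weight gets timeout_coeff = 3 and therefore up to ~375 ms
 * to consume its budget, whereas a queue in its soft real-time
 * weight-raising period keeps timeout_coeff = 1 and thus the plain
 * timeout.
 */
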
4580 static void __bfq_set_in_service_queue(struct bfq_data
*bfqd
,
4581 struct bfq_queue
*bfqq
)
4584 bfqg_stats_update_avg_queue_size(bfqq_group(bfqq
));
4585 bfq_clear_bfqq_fifo_expire(bfqq
);
4587 bfqd
->budgets_assigned
= (bfqd
->budgets_assigned
* 7 + 256) / 8;
4589 if (time_is_before_jiffies(bfqq
->last_wr_start_finish
) &&
4590 bfqq
->wr_coeff
> 1 &&
4591 bfqq
->wr_cur_max_time
== bfqd
->bfq_wr_rt_max_time
&&
4592 time_is_before_jiffies(bfqq
->budget_timeout
)) {
4594 * For soft real-time queues, move the start
4595 * of the weight-raising period forward by the
4596 * time the queue has not received any
4597 * service. Otherwise, a relatively long
4598 * service delay is likely to cause the
4599 * weight-raising period of the queue to end,
4600 * because of the short duration of the
4601 * weight-raising period of a soft real-time
4602 * queue. It is worth noting that this move
4603 * is not so dangerous for the other queues,
4604 * because soft real-time queues are not
4607 * To not add a further variable, we use the
4608 * overloaded field budget_timeout to
4609 * determine for how long the queue has not
4610 * received service, i.e., how much time has
4611 * elapsed since the queue expired. However,
4612 * this is a little imprecise, because
4613 * budget_timeout is set to jiffies if bfqq
4614 * not only expires, but also remains with no
4617 if (time_after(bfqq
->budget_timeout
,
4618 bfqq
->last_wr_start_finish
))
4619 bfqq
->last_wr_start_finish
+=
4620 jiffies
- bfqq
->budget_timeout
;
4622 bfqq
->last_wr_start_finish
= jiffies
;
4625 bfq_set_budget_timeout(bfqd
, bfqq
);
4626 bfq_log_bfqq(bfqd
, bfqq
,
4627 "set_in_service_queue, cur-budget = %d",
4628 bfqq
->entity
.budget
);
4631 bfqd
->in_service_queue
= bfqq
;
/*
 * Get and set a new queue for service.
 */
static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
{
        struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);

        __bfq_set_in_service_queue(bfqd, bfqq);
        return bfqq;
}

4645 static void bfq_arm_slice_timer(struct bfq_data
*bfqd
)
4647 struct bfq_queue
*bfqq
= bfqd
->in_service_queue
;
4648 struct bfq_io_cq
*bic
;
4651 /* Processes have exited, don't wait. */
4652 bic
= bfqd
->in_service_bic
;
4653 if (!bic
|| atomic_read(&bic
->icq
.ioc
->active_ref
) == 0)
4656 bfq_mark_bfqq_wait_request(bfqq
);
4659 * We don't want to idle for seeks, but we do want to allow
4660 * fair distribution of slice time for a process doing back-to-back
4661 * seeks. So allow a little bit of time for him to submit a new rq.
4663 sl
= bfqd
->bfq_slice_idle
;
4665 * Unless the queue is being weight-raised, grant only minimum
4666 * idle time if the queue is seeky. A long idling is preserved
4667 * for a weight-raised queue, because it is needed for
4668 * guaranteeing to the queue its reserved share of the
4671 if (BFQQ_SEEKY(bfqq
) && bfqq
->wr_coeff
== 1)
4672 sl
= min_t(u64
, sl
, BFQ_MIN_TT
);
4674 bfqd
->last_idling_start
= ktime_get();
4675 hrtimer_start(&bfqd
->idle_slice_timer
, ns_to_ktime(sl
),
4677 bfqg_stats_set_start_idle_time(bfqq_group(bfqq
));
/*
 * In autotuning mode, max_budget is dynamically recomputed as the
 * amount of sectors transferred in timeout at the estimated peak
 * rate. This enables BFQ to utilize a full timeslice with a full
 * budget, even if the in-service queue is served at peak rate. And
 * this maximises throughput with sequential workloads.
 */
static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
{
        return (u64)bfqd->peak_rate * USEC_PER_MSEC *
               jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
}

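/*
 * Numerical sketch (illustrative only): for an estimated peak rate of
 * about 100 MB/s and a 125 ms timeout, the autotuned max_budget is
 * roughly the number of sectors the device can transfer in those
 * 125 ms, i.e. about 25600 sectors (12.5 MB), so a sequential queue
 * can fill a whole timeslice with a single budget.
 */
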
/*
 * Update parameters related to throughput and responsiveness, as a
 * function of the estimated peak rate. See comments on
 * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
 */
static void update_thr_responsiveness_params(struct bfq_data *bfqd)
{
        int dev_type = blk_queue_nonrot(bfqd->queue);

        if (bfqd->bfq_user_max_budget == 0)
                bfqd->bfq_max_budget =
                        bfq_calc_max_budget(bfqd);

        if (bfqd->device_speed == BFQ_BFQD_FAST &&
            bfqd->peak_rate < device_speed_thresh[dev_type]) {
                bfqd->device_speed = BFQ_BFQD_SLOW;
                bfqd->RT_prod = R_slow[dev_type] *
                        T_slow[dev_type];
        } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
                   bfqd->peak_rate > device_speed_thresh[dev_type]) {
                bfqd->device_speed = BFQ_BFQD_FAST;
                bfqd->RT_prod = R_fast[dev_type] *
                        T_fast[dev_type];
        }

        bfq_log(bfqd,
"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu sects/sec",
                dev_type == 0 ? "ROT" : "NONROT",
                bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
                bfqd->device_speed == BFQ_BFQD_FAST ?
                (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
                (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
                (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
                        BFQ_RATE_SHIFT);
}

static void bfq_reset_rate_computation(struct bfq_data *bfqd,
                                       struct request *rq)
{
        if (rq != NULL) { /* new rq dispatch now, reset accordingly */
                bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
                bfqd->peak_rate_samples = 1;
                bfqd->sequential_samples = 0;
                bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
                        blk_rq_sectors(rq);
        } else /* no new rq dispatched, just reset the number of samples */
                bfqd->peak_rate_samples = 0; /* full re-init on next disp. */

        bfq_log(bfqd,
                "reset_rate_computation at end, sample %u/%u tot_sects %llu",
                bfqd->peak_rate_samples, bfqd->sequential_samples,
                bfqd->tot_sectors_dispatched);
}

4747 static void bfq_update_rate_reset(struct bfq_data
*bfqd
, struct request
*rq
)
4749 u32 rate
, weight
, divisor
;
4752 * For the convergence property to hold (see comments on
4753 * bfq_update_peak_rate()) and for the assessment to be
4754 * reliable, a minimum number of samples must be present, and
4755 * a minimum amount of time must have elapsed. If not so, do
4756 * not compute new rate. Just reset parameters, to get ready
4757 * for a new evaluation attempt.
4759 if (bfqd
->peak_rate_samples
< BFQ_RATE_MIN_SAMPLES
||
4760 bfqd
->delta_from_first
< BFQ_RATE_MIN_INTERVAL
)
4761 goto reset_computation
;
4764 * If a new request completion has occurred after last
4765 * dispatch, then, to approximate the rate at which requests
4766 * have been served by the device, it is more precise to
4767 * extend the observation interval to the last completion.
4769 bfqd
->delta_from_first
=
4770 max_t(u64
, bfqd
->delta_from_first
,
4771 bfqd
->last_completion
- bfqd
->first_dispatch
);
4774 * Rate computed in sects/usec, and not sects/nsec, for
4777 rate
= div64_ul(bfqd
->tot_sectors_dispatched
<<BFQ_RATE_SHIFT
,
4778 div_u64(bfqd
->delta_from_first
, NSEC_PER_USEC
));
4781 * Peak rate not updated if:
4782 * - the percentage of sequential dispatches is below 3/4 of the
4783 * total, and rate is below the current estimated peak rate
4784 * - rate is unreasonably high (> 20M sectors/sec)
4786 if ((bfqd
->sequential_samples
< (3 * bfqd
->peak_rate_samples
)>>2 &&
4787 rate
<= bfqd
->peak_rate
) ||
4788 rate
> 20<<BFQ_RATE_SHIFT
)
4789 goto reset_computation
;
4792 * We have to update the peak rate, at last! To this purpose,
4793 * we use a low-pass filter. We compute the smoothing constant
4794 * of the filter as a function of the 'weight' of the new
4797 * As can be seen in next formulas, we define this weight as a
4798 * quantity proportional to how sequential the workload is,
4799 * and to how long the observation time interval is.
4801 * The weight runs from 0 to 8. The maximum value of the
4802 * weight, 8, yields the minimum value for the smoothing
4803 * constant. At this minimum value for the smoothing constant,
4804 * the measured rate contributes for half of the next value of
4805 * the estimated peak rate.
4807 * So, the first step is to compute the weight as a function
4808 * of how sequential the workload is. Note that the weight
4809 * cannot reach 9, because bfqd->sequential_samples cannot
4810 * become equal to bfqd->peak_rate_samples, which, in its
4811 * turn, holds true because bfqd->sequential_samples is not
4812 * incremented for the first sample.
4814 weight
= (9 * bfqd
->sequential_samples
) / bfqd
->peak_rate_samples
;
4817 * Second step: further refine the weight as a function of the
4818 * duration of the observation interval.
4820 weight
= min_t(u32
, 8,
4821 div_u64(weight
* bfqd
->delta_from_first
,
4822 BFQ_RATE_REF_INTERVAL
));
4825 * Divisor ranging from 10, for minimum weight, to 2, for
4828 divisor
= 10 - weight
;
4831 * Finally, update peak rate:
4833 * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
4835 bfqd
->peak_rate
*= divisor
-1;
4836 bfqd
->peak_rate
/= divisor
;
4837 rate
/= divisor
; /* smoothing constant alpha = 1/divisor */
4839 bfqd
->peak_rate
+= rate
;
4840 update_thr_responsiveness_params(bfqd
);
4843 bfq_reset_rate_computation(bfqd
, rq
);
4847 * Update the read/write peak rate (the main quantity used for
4848 * auto-tuning, see update_thr_responsiveness_params()).
4850 * It is not trivial to estimate the peak rate (correctly): because of
4851 * the presence of sw and hw queues between the scheduler and the
4852 * device components that finally serve I/O requests, it is hard to
4853 * say exactly when a given dispatched request is served inside the
4854 * device, and for how long. As a consequence, it is hard to know
4855 * precisely at what rate a given set of requests is actually served
4858 * On the opposite end, the dispatch time of any request is trivially
4859 * available, and, from this piece of information, the "dispatch rate"
4860 * of requests can be immediately computed. So, the idea in the next
4861 * function is to use what is known, namely request dispatch times
4862 * (plus, when useful, request completion times), to estimate what is
4863 * unknown, namely in-device request service rate.
4865 * The main issue is that, because of the above facts, the rate at
4866 * which a certain set of requests is dispatched over a certain time
4867 * interval can vary greatly with respect to the rate at which the
4868 * same requests are then served. But, since the size of any
4869 * intermediate queue is limited, and the service scheme is lossless
4870 * (no request is silently dropped), the following obvious convergence
4871 * property holds: the number of requests dispatched MUST become
4872 * closer and closer to the number of requests completed as the
4873 * observation interval grows. This is the key property used in
4874 * the next function to estimate the peak service rate as a function
4875 * of the observed dispatch rate. The function assumes to be invoked
4876 * on every request dispatch.
4878 static void bfq_update_peak_rate(struct bfq_data
*bfqd
, struct request
*rq
)
4880 u64 now_ns
= ktime_get_ns();
4882 if (bfqd
->peak_rate_samples
== 0) { /* first dispatch */
4883 bfq_log(bfqd
, "update_peak_rate: goto reset, samples %d",
4884 bfqd
->peak_rate_samples
);
4885 bfq_reset_rate_computation(bfqd
, rq
);
4886 goto update_last_values
; /* will add one sample */
4890 * Device idle for very long: the observation interval lasting
4891 * up to this dispatch cannot be a valid observation interval
4892 * for computing a new peak rate (similarly to the late-
4893 * completion event in bfq_completed_request()). Go to
4894 * update_rate_and_reset to have the following three steps
4896 * - close the observation interval at the last (previous)
4897 * request dispatch or completion
4898 * - compute rate, if possible, for that observation interval
4899 * - start a new observation interval with this dispatch
4901 if (now_ns
- bfqd
->last_dispatch
> 100*NSEC_PER_MSEC
&&
4902 bfqd
->rq_in_driver
== 0)
4903 goto update_rate_and_reset
;
4905 /* Update sampling information */
4906 bfqd
->peak_rate_samples
++;
4908 if ((bfqd
->rq_in_driver
> 0 ||
4909 now_ns
- bfqd
->last_completion
< BFQ_MIN_TT
)
4910 && get_sdist(bfqd
->last_position
, rq
) < BFQQ_SEEK_THR
)
4911 bfqd
->sequential_samples
++;
4913 bfqd
->tot_sectors_dispatched
+= blk_rq_sectors(rq
);
4915 /* Reset max observed rq size every 32 dispatches */
4916 if (likely(bfqd
->peak_rate_samples
% 32))
4917 bfqd
->last_rq_max_size
=
4918 max_t(u32
, blk_rq_sectors(rq
), bfqd
->last_rq_max_size
);
4920 bfqd
->last_rq_max_size
= blk_rq_sectors(rq
);
4922 bfqd
->delta_from_first
= now_ns
- bfqd
->first_dispatch
;
4924 /* Target observation interval not yet reached, go on sampling */
4925 if (bfqd
->delta_from_first
< BFQ_RATE_REF_INTERVAL
)
4926 goto update_last_values
;
4928 update_rate_and_reset
:
4929 bfq_update_rate_reset(bfqd
, rq
);
4931 bfqd
->last_position
= blk_rq_pos(rq
) + blk_rq_sectors(rq
);
4932 bfqd
->last_dispatch
= now_ns
;
/*
 * Remove request from internal lists.
 */
static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
{
        struct bfq_queue *bfqq = RQ_BFQQ(rq);

        /*
         * For consistency, the next instruction should have been
         * executed after removing the request from the queue and
         * dispatching it. We execute instead this instruction before
         * bfq_remove_request() (and hence introduce a temporary
         * inconsistency), for efficiency. In fact, should this
         * dispatch occur for a non in-service bfqq, this anticipated
         * increment prevents two counters related to bfqq->dispatched
         * from risking to be, first, uselessly decremented, and then
         * incremented again when the (new) value of bfqq->dispatched
         * happens to be taken into account.
         */
        bfqq->dispatched++;
        bfq_update_peak_rate(q->elevator->elevator_data, rq);

        bfq_remove_request(q, rq);
}

static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
        if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
                if (bfqq->dispatched == 0)
                        /*
                         * Overloading budget_timeout field to store
                         * the time at which the queue remains with no
                         * backlog and no outstanding request; used by
                         * the weight-raising mechanism.
                         */
                        bfqq->budget_timeout = jiffies;

                bfq_del_bfqq_busy(bfqd, bfqq, true);
        } else
                bfq_requeue_bfqq(bfqd, bfqq);

        /*
         * All in-service entities must have been properly deactivated
         * or requeued before executing the next function, which
         * resets all in-service entities as no more in service.
         */
        __bfq_bfqd_reset_in_service(bfqd);
}

4985 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
4986 * @bfqd: device data.
4987 * @bfqq: queue to update.
4988 * @reason: reason for expiration.
4990 * Handle the feedback on @bfqq budget at queue expiration.
4991 * See the body for detailed comments.
4993 static void __bfq_bfqq_recalc_budget(struct bfq_data
*bfqd
,
4994 struct bfq_queue
*bfqq
,
4995 enum bfqq_expiration reason
)
4997 struct request
*next_rq
;
4998 int budget
, min_budget
;
5000 min_budget
= bfq_min_budget(bfqd
);
5002 if (bfqq
->wr_coeff
== 1)
5003 budget
= bfqq
->max_budget
;
5005 * Use a constant, low budget for weight-raised queues,
5006 * to help achieve a low latency. Keep it slightly higher
5007 * than the minimum possible budget, to cause a little
5008 * bit fewer expirations.
5010 budget
= 2 * min_budget
;
5012 bfq_log_bfqq(bfqd
, bfqq
, "recalc_budg: last budg %d, budg left %d",
5013 bfqq
->entity
.budget
, bfq_bfqq_budget_left(bfqq
));
5014 bfq_log_bfqq(bfqd
, bfqq
, "recalc_budg: last max_budg %d, min budg %d",
5015 budget
, bfq_min_budget(bfqd
));
5016 bfq_log_bfqq(bfqd
, bfqq
, "recalc_budg: sync %d, seeky %d",
5017 bfq_bfqq_sync(bfqq
), BFQQ_SEEKY(bfqd
->in_service_queue
));
5019 if (bfq_bfqq_sync(bfqq
) && bfqq
->wr_coeff
== 1) {
5022 * Caveat: in all the following cases we trade latency
5025 case BFQQE_TOO_IDLE
:
5027 * This is the only case where we may reduce
5028 * the budget: if there is no request of the
5029 * process still waiting for completion, then
5030 * we assume (tentatively) that the timer has
5031 * expired because the batch of requests of
5032 * the process could have been served with a
5033 * smaller budget. Hence, betting that
5034 * process will behave in the same way when it
5035 * becomes backlogged again, we reduce its
5036 * next budget. As long as we guess right,
5037 * this budget cut reduces the latency
5038 * experienced by the process.
5040 * However, if there are still outstanding
5041 * requests, then the process may have not yet
5042 * issued its next request just because it is
5043 * still waiting for the completion of some of
5044 * the still outstanding ones. So in this
5045 * subcase we do not reduce its budget, on the
5046 * contrary we increase it to possibly boost
5047 * the throughput, as discussed in the
5048 * comments to the BUDGET_TIMEOUT case.
5050 if (bfqq
->dispatched
> 0) /* still outstanding reqs */
5051 budget
= min(budget
* 2, bfqd
->bfq_max_budget
);
5053 if (budget
> 5 * min_budget
)
5054 budget
-= 4 * min_budget
;
5056 budget
= min_budget
;
5059 case BFQQE_BUDGET_TIMEOUT
:
5061 * We double the budget here because it gives
5062 * the chance to boost the throughput if this
5063 * is not a seeky process (and has bumped into
5064 * this timeout because of, e.g., ZBR).
5066 budget
= min(budget
* 2, bfqd
->bfq_max_budget
);
5068 case BFQQE_BUDGET_EXHAUSTED
:
5070 * The process still has backlog, and did not
5071 * let either the budget timeout or the disk
5072 * idling timeout expire. Hence it is not
5073 * seeky, has a short thinktime and may be
5074 * happy with a higher budget too. So
5075 * definitely increase the budget of this good
5076 * candidate to boost the disk throughput.
5078 budget
= min(budget
* 4, bfqd
->bfq_max_budget
);
5080 case BFQQE_NO_MORE_REQUESTS
:
5082 * For queues that expire for this reason, it
5083 * is particularly important to keep the
5084 * budget close to the actual service they
5085 * need. Doing so reduces the timestamp
5086 * misalignment problem described in the
5087 * comments in the body of
5088 * __bfq_activate_entity. In fact, suppose
5089 * that a queue systematically expires for
5090 * BFQQE_NO_MORE_REQUESTS and presents a
5091 * new request in time to enjoy timestamp
5092 * back-shifting. The larger the budget of the
5093 * queue is with respect to the service the
5094 * queue actually requests in each service
5095 * slot, the more times the queue can be
5096 * reactivated with the same virtual finish
5097 * time. It follows that, even if this finish
5098 * time is pushed to the system virtual time
5099 * to reduce the consequent timestamp
5100 * misalignment, the queue unjustly enjoys for
5101 * many re-activations a lower finish time
5102 * than all newly activated queues.
5104 * The service needed by bfqq is measured
5105 * quite precisely by bfqq->entity.service.
5106 * Since bfqq does not enjoy device idling,
5107 * bfqq->entity.service is equal to the number
5108 * of sectors that the process associated with
5109 * bfqq requested to read/write before waiting
5110 * for request completions, or blocking for
5113 budget
= max_t(int, bfqq
->entity
.service
, min_budget
);
5118 } else if (!bfq_bfqq_sync(bfqq
)) {
5120 * Async queues get always the maximum possible
5121 * budget, as for them we do not care about latency
5122 * (in addition, their ability to dispatch is limited
5123 * by the charging factor).
5125 budget
= bfqd
->bfq_max_budget
;
5128 bfqq
->max_budget
= budget
;
5130 if (bfqd
->budgets_assigned
>= bfq_stats_min_budgets
&&
5131 !bfqd
->bfq_user_max_budget
)
5132 bfqq
->max_budget
= min(bfqq
->max_budget
, bfqd
->bfq_max_budget
);
5135 * If there is still backlog, then assign a new budget, making
5136 * sure that it is large enough for the next request. Since
5137 * the finish time of bfqq must be kept in sync with the
5138 * budget, be sure to call __bfq_bfqq_expire() *after* this
5141 * If there is no backlog, then no need to update the budget;
5142 * it will be updated on the arrival of a new request.
5144 next_rq
= bfqq
->next_rq
;
5146 bfqq
->entity
.budget
= max_t(unsigned long, bfqq
->max_budget
,
5147 bfq_serv_to_charge(next_rq
, bfqq
));
5149 bfq_log_bfqq(bfqd
, bfqq
, "head sect: %u, new budget %d",
5150 next_rq
? blk_rq_sectors(next_rq
) : 0,
5151 bfqq
->entity
.budget
);
5155 * Return true if the process associated with bfqq is "slow". The slow
5156 * flag is used, in addition to the budget timeout, to reduce the
5157 * amount of service provided to seeky processes, and thus reduce
5158 * their chances to lower the throughput. More details in the comments
5159 * on the function bfq_bfqq_expire().
5161 * An important observation is in order: as discussed in the comments
5162 * on the function bfq_update_peak_rate(), with devices with internal
5163 * queues, it is hard if ever possible to know when and for how long
5164 * an I/O request is processed by the device (apart from the trivial
5165 * I/O pattern where a new request is dispatched only after the
5166 * previous one has been completed). This makes it hard to evaluate
5167 * the real rate at which the I/O requests of each bfq_queue are
5168 * served. In fact, for an I/O scheduler like BFQ, serving a
5169 * bfq_queue means just dispatching its requests during its service
5170 * slot (i.e., until the budget of the queue is exhausted, or the
5171 * queue remains idle, or, finally, a timeout fires). But, during the
5172 * service slot of a bfq_queue, around 100 ms at most, the device may
5173 * be even still processing requests of bfq_queues served in previous
5174 * service slots. On the opposite end, the requests of the in-service
5175 * bfq_queue may be completed after the service slot of the queue
5178 * Anyway, unless more sophisticated solutions are used
5179 * (where possible), the sum of the sizes of the requests dispatched
5180 * during the service slot of a bfq_queue is probably the only
5181 * approximation available for the service received by the bfq_queue
5182 * during its service slot. And this sum is the quantity used in this
5183 * function to evaluate the I/O speed of a process.
5185 static bool bfq_bfqq_is_slow(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
5186 bool compensate
, enum bfqq_expiration reason
,
5187 unsigned long *delta_ms
)
5189 ktime_t delta_ktime
;
5191 bool slow
= BFQQ_SEEKY(bfqq
); /* if delta too short, use seekyness */
5193 if (!bfq_bfqq_sync(bfqq
))
5197 delta_ktime
= bfqd
->last_idling_start
;
5199 delta_ktime
= ktime_get();
5200 delta_ktime
= ktime_sub(delta_ktime
, bfqd
->last_budget_start
);
5201 delta_usecs
= ktime_to_us(delta_ktime
);
5203 /* don't use too short time intervals */
5204 if (delta_usecs
< 1000) {
5205 if (blk_queue_nonrot(bfqd
->queue
))
5207 * give same worst-case guarantees as idling
5210 *delta_ms
= BFQ_MIN_TT
/ NSEC_PER_MSEC
;
5211 else /* charge at least one seek */
5212 *delta_ms
= bfq_slice_idle
/ NSEC_PER_MSEC
;
5217 *delta_ms
= delta_usecs
/ USEC_PER_MSEC
;
5220 * Use only long (> 20ms) intervals to filter out excessive
5221 * spikes in service rate estimation.
5223 if (delta_usecs
> 20000) {
5225 * Caveat for rotational devices: processes doing I/O
5226 * in the slower disk zones tend to be slow(er) even
5227 * if not seeky. In this respect, the estimated peak
5228 * rate is likely to be an average over the disk
5229 * surface. Accordingly, to not be too harsh with
5230 * unlucky processes, a process is deemed slow only if
5231 * its rate has been lower than half of the estimated
5234 slow
= bfqq
->entity
.service
< bfqd
->bfq_max_budget
/ 2;
5237 bfq_log_bfqq(bfqd
, bfqq
, "bfq_bfqq_is_slow: slow %d", slow
);
5243 * To be deemed as soft real-time, an application must meet two
5244 * requirements. First, the application must not require an average
5245 * bandwidth higher than the approximate bandwidth required to playback or
5246 * record a compressed high-definition video.
5247 * The next function is invoked on the completion of the last request of a
5248 * batch, to compute the next-start time instant, soft_rt_next_start, such
5249 * that, if the next request of the application does not arrive before
5250 * soft_rt_next_start, then the above requirement on the bandwidth is met.
5252 * The second requirement is that the request pattern of the application is
5253 * isochronous, i.e., that, after issuing a request or a batch of requests,
5254 * the application stops issuing new requests until all its pending requests
5255 * have been completed. After that, the application may issue a new batch,
5257 * For this reason the next function is invoked to compute
5258 * soft_rt_next_start only for applications that meet this requirement,
5259 * whereas soft_rt_next_start is set to infinity for applications that do
5262 * Unfortunately, even a greedy application may happen to behave in an
5263 * isochronous way if the CPU load is high. In fact, the application may
5264 * stop issuing requests while the CPUs are busy serving other processes,
5265 * then restart, then stop again for a while, and so on. In addition, if
5266 * the disk achieves a low enough throughput with the request pattern
5267 * issued by the application (e.g., because the request pattern is random
5268 * and/or the device is slow), then the application may meet the above
5269 * bandwidth requirement too. To prevent such a greedy application to be
5270 * deemed as soft real-time, a further rule is used in the computation of
5271 * soft_rt_next_start: soft_rt_next_start must be higher than the current
5272 * time plus the maximum time for which the arrival of a request is waited
5273 * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
5274 * This filters out greedy applications, as the latter issue instead their
5275 * next request as soon as possible after the last one has been completed
5276 * (in contrast, when a batch of requests is completed, a soft real-time
5277 * application spends some time processing data).
5279 * Unfortunately, the last filter may easily generate false positives if
5280 * only bfqd->bfq_slice_idle is used as a reference time interval and one
5281 * or both the following cases occur:
5282 * 1) HZ is so low that the duration of a jiffy is comparable to or higher
5283 * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
5285 * 2) jiffies, instead of increasing at a constant rate, may stop increasing
5286 * for a while, then suddenly 'jump' by several units to recover the lost
5287 * increments. This seems to happen, e.g., inside virtual machines.
5288 * To address this issue, we do not use as a reference time interval just
5289 * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
5290 * particular we add the minimum number of jiffies for which the filter
5291 * seems to be quite precise also in embedded systems and KVM/QEMU virtual
5294 static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data
*bfqd
,
5295 struct bfq_queue
*bfqq
)
5297 return max(bfqq
->last_idle_bklogged
+
5298 HZ
* bfqq
->service_from_backlogged
/
5299 bfqd
->bfq_wr_max_softrt_rate
,
5300 jiffies
+ nsecs_to_jiffies(bfqq
->bfqd
->bfq_slice_idle
) + 4);
/*
 * Return the farthest future time instant according to jiffies
 * macros.
 */
static unsigned long bfq_greatest_from_now(void)
{
        return jiffies + MAX_JIFFY_OFFSET;
}

/*
 * Return the farthest past time instant according to jiffies
 * macros.
 */
static unsigned long bfq_smallest_from_now(void)
{
        return jiffies - MAX_JIFFY_OFFSET;
}

/**
 * bfq_bfqq_expire - expire a queue.
 * @bfqd: device owning the queue.
 * @bfqq: the queue to expire.
 * @compensate: if true, compensate for the time spent idling.
 * @reason: the reason causing the expiration.
 *
 * If the process associated with bfqq does slow I/O (e.g., because it
 * issues random requests), we charge bfqq with the time it has been
 * in service instead of the service it has received (see
 * bfq_bfqq_charge_time for details on how this goal is achieved). As
 * a consequence, bfqq will typically get higher timestamps upon
 * reactivation, and hence it will be rescheduled as if it had
 * received more service than what it has actually received. In the
 * end, bfqq receives less service in proportion to how slowly its
 * associated process consumes its budgets (and hence how seriously it
 * tends to lower the throughput). In addition, this time-charging
 * strategy guarantees time fairness among slow processes. In
 * contrast, if the process associated with bfqq is not slow, we
 * charge bfqq exactly with the service it has received.
 *
 * Charging time to the first type of queues and the exact service to
 * the other has the effect of using the WF2Q+ policy to schedule the
 * former on a timeslice basis, without violating service domain
 * guarantees among the latter.
 */
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason)
{
	bool slow;
	unsigned long delta = 0;
	struct bfq_entity *entity = &bfqq->entity;
	int ref;

	/*
	 * Check whether the process is slow (see bfq_bfqq_is_slow).
	 */
	slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);

	/*
	 * Increase service_from_backlogged before next statement,
	 * because the possible next invocation of
	 * bfq_bfqq_charge_time would likely inflate
	 * entity->service. In contrast, service_from_backlogged must
	 * contain real service, to enable the soft real-time
	 * heuristic to correctly compute the bandwidth consumed by
	 * bfqq.
	 */
	bfqq->service_from_backlogged += entity->service;

	/*
	 * As above explained, charge slow (typically seeky) and
	 * timed-out queues with the time and not the service
	 * received, to favor sequential workloads.
	 *
	 * Processes doing I/O in the slower disk zones will tend to
	 * be slow(er) even if not seeky. Therefore, since the
	 * estimated peak rate is actually an average over the disk
	 * surface, these processes may timeout just for bad luck. To
	 * avoid punishing them, do not charge time to processes that
	 * succeeded in consuming at least 2/3 of their budget. This
	 * allows BFQ to preserve enough elasticity to still perform
	 * bandwidth, and not time, distribution with little unlucky
	 * or quasi-sequential processes.
	 */
	if (bfqq->wr_coeff == 1 &&
	    (slow ||
	     (reason == BFQQE_BUDGET_TIMEOUT &&
	      bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
		bfq_bfqq_charge_time(bfqd, bfqq, delta);

	if (reason == BFQQE_TOO_IDLE &&
	    entity->service <= 2 * entity->budget / 10)
		bfq_clear_bfqq_IO_bound(bfqq);

	if (bfqd->low_latency && bfqq->wr_coeff == 1)
		bfqq->last_wr_start_finish = jiffies;

	if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
	    RB_EMPTY_ROOT(&bfqq->sort_list)) {
		/*
		 * If we get here, and there are no outstanding
		 * requests, then the request pattern is isochronous
		 * (see the comments on the function
		 * bfq_bfqq_softrt_next_start()). Thus we can compute
		 * soft_rt_next_start. If, instead, the queue still
		 * has outstanding requests, then we have to wait for
		 * the completion of all the outstanding requests to
		 * discover whether the request pattern is actually
		 * isochronous.
		 */
		if (bfqq->dispatched == 0)
			bfqq->soft_rt_next_start =
				bfq_bfqq_softrt_next_start(bfqd, bfqq);
		else {
			/*
			 * The application is still waiting for the
			 * completion of one or more requests:
			 * prevent it from possibly being incorrectly
			 * deemed as soft real-time by setting its
			 * soft_rt_next_start to infinity. In fact,
			 * without this assignment, the application
			 * would be incorrectly deemed as soft
			 * real-time if:
			 * 1) it issued a new request before the
			 *    completion of all its in-flight
			 *    requests, and
			 * 2) at that time, its soft_rt_next_start
			 *    happened to be in the past.
			 */
			bfqq->soft_rt_next_start =
				bfq_greatest_from_now();
			/*
			 * Schedule an update of soft_rt_next_start to when
			 * the task may be discovered to be isochronous.
			 */
			bfq_mark_bfqq_softrt_update(bfqq);
		}
	}

	bfq_log_bfqq(bfqd, bfqq,
		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));

	/*
	 * Increase, decrease or leave budget unchanged according to
	 * reason.
	 */
	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
	ref = bfqq->ref;
	__bfq_bfqq_expire(bfqd, bfqq);

	/* mark bfqq as waiting a request only if a bic still points to it */
	if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
	    reason != BFQQE_BUDGET_TIMEOUT &&
	    reason != BFQQE_BUDGET_EXHAUSTED)
		bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
}
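
/*
 * Worked example for the 2/3 rule applied above (numbers purely
 * illustrative): suppose entity->budget is 8192 sectors. A queue that
 * times out after receiving 6000 sectors of service has only 2192
 * sectors of budget left, which is below 8192/3 ~= 2730, so the time
 * charge is skipped and the queue keeps service-domain accounting. A
 * queue that received only 1000 sectors before the timeout still has
 * 7192 sectors left, above the threshold, so it is charged the whole
 * time it occupied the device instead.
 */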
/*
 * Budget timeout is not implemented through a dedicated timer, but
 * just checked on request arrivals and completions, as well as on
 * idle timer expirations.
 */
static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
{
	return time_is_before_eq_jiffies(bfqq->budget_timeout);
}
/*
 * If we expire a queue that is actively waiting (i.e., with the
 * device idled) for the arrival of a new request, then we may incur
 * the timestamp misalignment problem described in the body of the
 * function __bfq_activate_entity. Hence we return true only if this
 * condition does not hold, or if the queue is slow enough to deserve
 * only to be kicked off for preserving a high throughput.
 */
static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqq->bfqd, bfqq,
		"may_budget_timeout: wait_request %d left %d timeout %d",
		bfq_bfqq_wait_request(bfqq),
			bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
		bfq_bfqq_budget_timeout(bfqq));

	return (!bfq_bfqq_wait_request(bfqq) ||
		bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
		&&
		bfq_bfqq_budget_timeout(bfqq);
}
/*
 * For a queue that becomes empty, device idling is allowed only if
 * this function returns true for the queue. As a consequence, since
 * device idling plays a critical role in both throughput boosting and
 * service guarantees, the return value of this function plays a
 * critical role in both these aspects as well.
 *
 * In a nutshell, this function returns true only if idling is
 * beneficial for throughput or, even if detrimental for throughput,
 * idling is however necessary to preserve service guarantees (low
 * latency, desired throughput distribution, ...). In particular, on
 * NCQ-capable devices, this function tries to return false, so as to
 * help keep the drives' internal queues full, whenever this helps the
 * device boost the throughput without causing any service-guarantee
 * issue.
 *
 * In more detail, the return value of this function is obtained by,
 * first, computing a number of boolean variables that take into
 * account throughput and service-guarantee issues, and, then,
 * combining these variables in a logical expression. Most of the
 * issues taken into account are not trivial. We discuss these issues
 * individually while introducing the variables.
 */
static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	bool idling_boosts_thr, idling_boosts_thr_without_issues,
		asymmetric_scenario;

	if (bfqd->strict_guarantees)
		return true;

	/*
	 * The next variable takes into account the cases where idling
	 * boosts the throughput.
	 *
	 * The value of the variable is computed considering that
	 * idling is usually beneficial for the throughput if:
	 * (a) the device is not NCQ-capable, or
	 * (b) regardless of the presence of NCQ, the request pattern
	 *     for bfqq is I/O-bound (possible throughput losses
	 *     caused by granting idling to seeky queues are mitigated
	 *     by the fact that, in all scenarios where boosting
	 *     throughput is the best thing to do, i.e., in all
	 *     symmetric scenarios, only a minimal idle time is
	 *     allowed to seeky queues).
	 */
	idling_boosts_thr = !bfqd->hw_tag || bfq_bfqq_IO_bound(bfqq);

	/*
	 * The value of the next variable,
	 * idling_boosts_thr_without_issues, is equal to that of
	 * idling_boosts_thr, unless a special case holds. In this
	 * special case, described below, idling may cause problems to
	 * weight-raised queues.
	 *
	 * When the request pool is saturated (e.g., in the presence
	 * of write hogs), if the processes associated with
	 * non-weight-raised queues ask for requests at a lower rate,
	 * then processes associated with weight-raised queues have a
	 * higher probability to get a request from the pool
	 * immediately (or at least soon) when they need one. Thus
	 * they have a higher probability to actually get a fraction
	 * of the device throughput proportional to their high
	 * weight. This is especially true with NCQ-capable drives,
	 * which enqueue several requests in advance, and further
	 * reorder internally-queued requests.
	 *
	 * For this reason, we force to false the value of
	 * idling_boosts_thr_without_issues if there are weight-raised
	 * busy queues. In this case, and if bfqq is not weight-raised,
	 * this guarantees that the device is not idled for bfqq (if,
	 * instead, bfqq is weight-raised, then idling will be
	 * guaranteed by another variable, see below). Combined with
	 * the timestamping rules of BFQ (see [1] for details), this
	 * behavior causes bfqq, and hence any sync non-weight-raised
	 * queue, to get a lower number of requests served, and thus
	 * to ask for a lower number of requests from the request
	 * pool, before the busy weight-raised queues get served
	 * again. This often mitigates starvation problems in the
	 * presence of heavy write workloads and NCQ, thereby
	 * guaranteeing a higher application and system responsiveness
	 * in these hostile scenarios.
	 */
	idling_boosts_thr_without_issues = idling_boosts_thr &&
		bfqd->wr_busy_queues == 0;

	/*
	 * There is then a case where idling must be performed not for
	 * throughput concerns, but to preserve service guarantees. To
	 * introduce it, we can note that allowing the drive to
	 * enqueue more than one request at a time, and hence
	 * delegating de facto final scheduling decisions to the
	 * drive's internal scheduler, causes loss of control on the
	 * actual request service order. In particular, the critical
	 * situation is when requests from different processes happen
	 * to be present, at the same time, in the internal queue(s)
	 * of the drive. In such a situation, the drive, by deciding
	 * the service order of the internally-queued requests, does
	 * determine also the actual throughput distribution among
	 * these processes. But the drive typically has no notion or
	 * concern about per-process throughput distribution, and
	 * makes its decisions only on a per-request basis. Therefore,
	 * the service distribution enforced by the drive's internal
	 * scheduler is likely to coincide with the desired
	 * device-throughput distribution only in a completely
	 * symmetric scenario where: (i) each of these processes must
	 * get the same throughput as the others; (ii) all these
	 * processes have the same I/O pattern (either sequential or
	 * random). In fact, in such a scenario, the drive will tend
	 * to treat the requests of each of these processes in about
	 * the same way as the requests of the others, and thus to
	 * provide each of these processes with about the same
	 * throughput (which is exactly the desired throughput
	 * distribution). In contrast, in any asymmetric scenario,
	 * device idling is certainly needed to guarantee that bfqq
	 * receives its assigned fraction of the device throughput
	 * (see [1] for details).
	 *
	 * As for sub-condition (i), actually we check only whether
	 * bfqq is being weight-raised. In fact, if bfqq is not being
	 * weight-raised, we have that:
	 * - if the process associated with bfqq is not I/O-bound, then
	 *   it is not either latency- or throughput-critical; therefore
	 *   idling is not needed for bfqq;
	 * - if the process associated with bfqq is I/O-bound, then
	 *   idling is already granted with bfqq (see the comments on
	 *   idling_boosts_thr).
	 *
	 * We do not check sub-condition (ii) at all, i.e., the next
	 * variable is true if and only if bfqq is being
	 * weight-raised. We do not need to control sub-condition (ii)
	 * for the following reason:
	 * - if bfqq is being weight-raised, then idling is already
	 *   guaranteed to bfqq by sub-condition (i);
	 * - if bfqq is not being weight-raised, then idling is
	 *   already guaranteed to bfqq (only) if it matters, i.e., if
	 *   bfqq is associated to a currently I/O-bound process (see
	 *   the above comment on sub-condition (i)).
	 *
	 * As a side note, it is worth considering that the above
	 * device-idling countermeasures may however fail in the
	 * following unlucky scenario: if idling is (correctly)
	 * disabled in a time period during which the symmetry
	 * sub-condition holds, and hence the device is allowed to
	 * enqueue many requests, but at some later point in time some
	 * sub-condition stops to hold, then it may become impossible
	 * to let requests be served in the desired order until all
	 * the requests already queued in the device have been served.
	 */
	asymmetric_scenario = bfqq->wr_coeff > 1;

	/*
	 * We have now all the components we need to compute the return
	 * value of the function, which is true only if both the following
	 * conditions hold:
	 * 1) bfqq is sync, because idling makes sense only for sync queues;
	 * 2) idling either boosts the throughput (without issues), or
	 *    is necessary to preserve service guarantees.
	 */
	return bfq_bfqq_sync(bfqq) &&
		(idling_boosts_thr_without_issues || asymmetric_scenario);
}
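
/*
 * Compact, informal recap of the expression above (derived only from
 * the comments and code in this function): a sync queue gets the
 * device idled for it either (a) when no weight-raised queue is busy
 * and idling looks good for throughput (non-NCQ device, or I/O-bound
 * queue), or (b) when the queue itself is currently weight-raised, in
 * which case idling is kept to preserve its service guarantees even
 * at a possible throughput cost. Async queues never trigger idling
 * here.
 */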
/*
 * If the in-service queue is empty but the function bfq_bfqq_may_idle
 * returns true, then:
 * 1) the queue must remain in service and cannot be expired, and
 * 2) the device must be idled to wait for the possible arrival of a new
 *    request for the queue.
 * See the comments on the function bfq_bfqq_may_idle for the reasons
 * why performing device idling is the best choice to boost the throughput
 * and preserve service guarantees when bfq_bfqq_may_idle itself
 * returns true.
 */
static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;

	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
	       bfq_bfqq_may_idle(bfqq);
}
/*
 * Select a queue for service. If we have a current queue in service,
 * check whether to continue servicing it, or retrieve and set a new one.
 */
static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq;
	struct request *next_rq;
	enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;

	bfqq = bfqd->in_service_queue;
	if (!bfqq)
		goto new_queue;

	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

	if (bfq_may_expire_for_budg_timeout(bfqq) &&
	    !bfq_bfqq_wait_request(bfqq) &&
	    !bfq_bfqq_must_idle(bfqq))
		goto expire;

check_queue:
	/*
	 * This loop is rarely executed more than once. Even when it
	 * happens, it is much more convenient to re-execute this loop
	 * than to return NULL and trigger a new dispatch to get a
	 * request served.
	 */
	next_rq = bfqq->next_rq;
	/*
	 * If bfqq has requests queued and it has enough budget left to
	 * serve them, keep the queue, otherwise expire it.
	 */
	if (next_rq) {
		if (bfq_serv_to_charge(next_rq, bfqq) >
			bfq_bfqq_budget_left(bfqq)) {
			/*
			 * Expire the queue for budget exhaustion,
			 * which makes sure that the next budget is
			 * enough to serve the next request, even if
			 * it comes from the fifo expired path.
			 */
			reason = BFQQE_BUDGET_EXHAUSTED;
			goto expire;
		} else {
			/*
			 * The idle timer may be pending because we may
			 * not disable disk idling even when a new request
			 * arrives.
			 */
			if (bfq_bfqq_wait_request(bfqq)) {
				/*
				 * If we get here: 1) at least a new request
				 * has arrived but we have not disabled the
				 * timer because the request was too small,
				 * 2) then the block layer has unplugged
				 * the device, causing the dispatch to be
				 * invoked.
				 *
				 * Since the device is unplugged, now the
				 * requests are probably large enough to
				 * provide a reasonable throughput.
				 * So we disable idling.
				 */
				bfq_clear_bfqq_wait_request(bfqq);
				hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
				bfqg_stats_update_idle_time(bfqq_group(bfqq));
			}
			goto keep_queue;
		}
	}

	/*
	 * No requests pending. However, if the in-service queue is idling
	 * for a new request, or has requests waiting for a completion and
	 * may idle after their completion, then keep it anyway.
	 */
	if (bfq_bfqq_wait_request(bfqq) ||
	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
		bfqq = NULL;
		goto keep_queue;
	}

	reason = BFQQE_NO_MORE_REQUESTS;
expire:
	bfq_bfqq_expire(bfqd, bfqq, false, reason);
new_queue:
	bfqq = bfq_set_in_service_queue(bfqd);
	if (bfqq) {
		bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
		goto check_queue;
	}
keep_queue:
	if (bfqq)
		bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
	else
		bfq_log(bfqd, "select_queue: no queue returned");

	return bfqq;
}
static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
		bfq_log_bfqq(bfqd, bfqq,
			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
			jiffies_to_msecs(bfqq->wr_cur_max_time),
			bfqq->wr_coeff,
			bfqq->entity.weight, bfqq->entity.orig_weight);

		if (entity->prio_changed)
			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");

		/*
		 * If too much time has elapsed from the beginning of
		 * this weight-raising period, then end weight
		 * raising.
		 */
		if (time_is_before_jiffies(bfqq->last_wr_start_finish +
					   bfqq->wr_cur_max_time)) {
			if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
			time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
					       bfq_wr_duration(bfqd)))
				bfq_bfqq_end_wr(bfqq);
			else {
				/* switch back to interactive wr */
				bfqq->wr_coeff = bfqd->bfq_wr_coeff;
				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
				bfqq->last_wr_start_finish =
					bfqq->wr_start_at_switch_to_srt;
				bfqq->entity.prio_changed = 1;
			}
		}
	}
	/* Update weight both if it must be raised and if it must be lowered */
	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
		__bfq_entity_update_weight_prio(
			bfq_entity_service_tree(entity),
			entity);
}
/*
 * Dispatch next request from bfqq.
 */
static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
						 struct bfq_queue *bfqq)
{
	struct request *rq = bfqq->next_rq;
	unsigned long service_to_charge;

	service_to_charge = bfq_serv_to_charge(rq, bfqq);

	bfq_bfqq_served(bfqq, service_to_charge);

	bfq_dispatch_remove(bfqd->queue, rq);

	/*
	 * If weight raising has to terminate for bfqq, then next
	 * function causes an immediate update of bfqq's weight,
	 * without waiting for next activation. As a consequence, on
	 * expiration, bfqq will be timestamped as if it had never been
	 * weight-raised during this service slot, even if it has
	 * received part or even most of the service as a
	 * weight-raised queue. This inflates bfqq's timestamps, which
	 * is beneficial, as bfqq is then more willing to leave the
	 * device immediately to possible other weight-raised queues.
	 */
	bfq_update_wr_data(bfqd, bfqq);

	if (!bfqd->in_service_bic) {
		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
		bfqd->in_service_bic = RQ_BIC(rq);
	}

	/*
	 * Expire bfqq, pretending that its budget expired, if bfqq
	 * belongs to CLASS_IDLE and other queues are waiting for
	 * service.
	 */
	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
		bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);

	return rq;
}
static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;

	/*
	 * Avoiding lock: a race on bfqd->busy_queues should cause at
	 * most a call to dispatch for nothing.
	 */
	return !list_empty_careful(&bfqd->dispatch) ||
		bfqd->busy_queues > 0;
}
static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq = NULL;
	struct bfq_queue *bfqq = NULL;

	if (!list_empty(&bfqd->dispatch)) {
		rq = list_first_entry(&bfqd->dispatch, struct request,
				      queuelist);
		list_del_init(&rq->queuelist);

		bfqq = RQ_BFQQ(rq);

		if (bfqq) {
			/*
			 * Increment counters here, because this
			 * dispatch does not follow the standard
			 * dispatch flow (where counters are
			 * incremented).
			 */
			bfqq->dispatched++;

			goto inc_in_driver_start_rq;
		}

		/*
		 * We exploit the put_rq_private hook to decrement
		 * rq_in_driver, but put_rq_private will not be
		 * invoked on this request. So, to avoid unbalance,
		 * just start this request, without incrementing
		 * rq_in_driver. As a negative consequence,
		 * rq_in_driver is deceptively lower than it should be
		 * while this request is in service. This may cause
		 * bfq_schedule_dispatch to be invoked uselessly.
		 *
		 * As for implementing an exact solution, the
		 * put_request hook, if defined, is probably invoked
		 * also on this request. So, by exploiting this hook,
		 * we could 1) increment rq_in_driver here, and 2)
		 * decrement it in put_request. Such a solution would
		 * let the value of the counter be always accurate,
		 * but it would entail using an extra interface
		 * function. This cost seems higher than the benefit,
		 * being the frequency of non-elevator-private
		 * requests very low.
		 */
		goto start_rq;
	}

	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);

	if (bfqd->busy_queues == 0)
		goto exit;

	/*
	 * Force device to serve one request at a time if
	 * strict_guarantees is true. Forcing this service scheme is
	 * currently the ONLY way to guarantee that the request
	 * service order enforced by the scheduler is respected by a
	 * queueing device. Otherwise the device is free even to make
	 * some unlucky request wait for as long as the device
	 * wishes.
	 *
	 * Of course, serving one request at a time may cause loss of
	 * throughput.
	 */
	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
		goto exit;

	bfqq = bfq_select_queue(bfqd);
	if (!bfqq)
		goto exit;

	rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);

	if (rq) {
inc_in_driver_start_rq:
		bfqd->rq_in_driver++;
start_rq:
		rq->rq_flags |= RQF_STARTED;
	}
exit:
	return rq;
}
static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq;

	spin_lock_irq(&bfqd->lock);
	rq = __bfq_dispatch_request(hctx);
	spin_unlock_irq(&bfqd->lock);

	return rq;
}
/*
 * Task holds one reference to the queue, dropped when task exits. Each rq
 * in-flight on this queue also holds a reference, dropped when rq is freed.
 *
 * Scheduler lock must be held here. Recall not to use bfqq after calling
 * this function on it.
 */
static void bfq_put_queue(struct bfq_queue *bfqq)
{
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_group *bfqg = bfqq_group(bfqq);
#endif

	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
		     bfqq, bfqq->ref);

	bfqq->ref--;
	if (bfqq->ref)
		return;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);

	kmem_cache_free(bfq_pool, bfqq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	bfqg_put(bfqg);
#endif
}
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	if (bfqq == bfqd->in_service_queue) {
		__bfq_bfqq_expire(bfqd, bfqq);
		bfq_schedule_dispatch(bfqd);
	}

	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);

	bfq_put_queue(bfqq); /* release process reference */
}
static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
	struct bfq_data *bfqd;

	if (bfqq)
		bfqd = bfqq->bfqd; /* NULL if scheduler already exited */

	if (bfqq && bfqd) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);
		bfq_exit_bfqq(bfqd, bfqq);
		bic_set_bfqq(bic, NULL, is_sync);
		spin_unlock_irqrestore(&bfqd->lock, flags);
	}
}

static void bfq_exit_icq(struct io_cq *icq)
{
	struct bfq_io_cq *bic = icq_to_bic(icq);

	bfq_exit_icq_bfqq(bic, true);
	bfq_exit_icq_bfqq(bic, false);
}
/*
 * Update the entity prio values; note that the new values will not
 * be used until the next (re)activation.
 */
static void
bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
{
	struct task_struct *tsk = current;
	int ioprio_class;
	struct bfq_data *bfqd = bfqq->bfqd;

	if (!bfqd)
		return;

	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	switch (ioprio_class) {
	default:
		dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
			"bfq: bad prio class %d\n", ioprio_class);
	case IOPRIO_CLASS_NONE:
		/*
		 * No prio set, inherit CPU scheduling settings.
		 */
		bfqq->new_ioprio = task_nice_ioprio(tsk);
		bfqq->new_ioprio_class = task_nice_ioclass(tsk);
		break;
	case IOPRIO_CLASS_RT:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
		break;
	case IOPRIO_CLASS_BE:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
		break;
	case IOPRIO_CLASS_IDLE:
		bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
		bfqq->new_ioprio = 7;
		bfq_clear_bfqq_idle_window(bfqq);
		break;
	}

	if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
		pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
			bfqq->new_ioprio);
		/* clamp to the highest valid ioprio value */
		bfqq->new_ioprio = IOPRIO_BE_NR - 1;
	}

	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
	bfqq->entity.prio_changed = 1;
}
static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
{
	struct bfq_data *bfqd = bic_to_bfqd(bic);
	struct bfq_queue *bfqq;
	int ioprio = bic->icq.ioc->ioprio;

	/*
	 * This condition may trigger on a newly created bic, be sure to
	 * drop the lock before returning.
	 */
	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
		return;

	bic->ioprio = ioprio;

	bfqq = bic_to_bfqq(bic, false);
	if (bfqq) {
		/* release process reference on this queue */
		bfq_put_queue(bfqq);
		bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
		bic_set_bfqq(bic, bfqq, false);
	}

	bfqq = bic_to_bfqq(bic, true);
	if (bfqq)
		bfq_set_next_ioprio_data(bfqq, bic);
}
static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_io_cq *bic, pid_t pid, int is_sync)
{
	RB_CLEAR_NODE(&bfqq->entity.rb_node);
	INIT_LIST_HEAD(&bfqq->fifo);

	if (bic)
		bfq_set_next_ioprio_data(bfqq, bic);

	if (is_sync) {
		if (!bfq_class_idle(bfqq))
			bfq_mark_bfqq_idle_window(bfqq);
		bfq_mark_bfqq_sync(bfqq);
	} else
		bfq_clear_bfqq_sync(bfqq);

	/* set end request to minus infinity from now */
	bfqq->ttime.last_end_request = ktime_get_ns() + 1;

	bfq_mark_bfqq_IO_bound(bfqq);

	/* Tentative initial value to trade off between thr and lat */
	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
	bfqq->budget_timeout = bfq_smallest_from_now();

	bfqq->last_wr_start_finish = bfq_smallest_from_now();
	bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();

	/*
	 * Set to the value for which bfqq will not be deemed as
	 * soft rt when it becomes backlogged.
	 */
	bfqq->soft_rt_next_start = bfq_greatest_from_now();

	/* first request is almost certainly seeky */
	bfqq->seek_history = 1;
}
static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
					       struct bfq_group *bfqg,
					       int ioprio_class, int ioprio)
{
	switch (ioprio_class) {
	case IOPRIO_CLASS_RT:
		return &bfqg->async_bfqq[0][ioprio];
	case IOPRIO_CLASS_NONE:
		ioprio = IOPRIO_NORM;
		/* fall through */
	case IOPRIO_CLASS_BE:
		return &bfqg->async_bfqq[1][ioprio];
	case IOPRIO_CLASS_IDLE:
		return &bfqg->async_idle_bfqq;
	default:
		return NULL;
	}
}
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic)
{
	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	struct bfq_queue **async_bfqq = NULL;
	struct bfq_queue *bfqq;
	struct bfq_group *bfqg;

	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
	if (!bfqg) {
		bfqq = &bfqd->oom_bfqq;
		goto out;
	}

	if (!is_sync) {
		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
						  ioprio);
		bfqq = *async_bfqq;
		if (bfqq)
			goto out;
	}

	bfqq = kmem_cache_alloc_node(bfq_pool,
				     GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
				     bfqd->queue->node);

	if (bfqq) {
		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
			      is_sync);
		bfq_init_entity(&bfqq->entity, bfqg);
		bfq_log_bfqq(bfqd, bfqq, "allocated");
	} else {
		bfqq = &bfqd->oom_bfqq;
		bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
		goto out;
	}

	/*
	 * Pin the queue now that it's allocated, scheduler exit will
	 * prune it.
	 */
	if (async_bfqq) {
		bfqq->ref++; /*
			      * Extra group reference, w.r.t. sync
			      * queue. This extra reference is removed
			      * only if bfqq->bfqg disappears, to
			      * guarantee that this queue is not freed
			      * until its group goes away.
			      */
		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
			     bfqq, bfqq->ref);
		*async_bfqq = bfqq;
	}

out:
	bfqq->ref++; /* get a process reference to this queue */
	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
	return bfqq;
}
static void bfq_update_io_thinktime(struct bfq_data *bfqd,
				    struct bfq_queue *bfqq)
{
	struct bfq_ttime *ttime = &bfqq->ttime;
	u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;

	elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);

	ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
	ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
				     ttime->ttime_samples);
}
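
/*
 * The update above is an exponentially weighted moving average in
 * fixed point: both the sample count and the accumulated total decay
 * by 7/8 at each update, so the mean think time moves only about 1/8
 * of the way towards the latest gap. As a purely illustrative
 * example, if the previous mean was 500 us and the last
 * completion-to-request gap was 800 us (already capped to
 * 2 * bfq_slice_idle), the new mean is roughly 537 us; a single long
 * gap therefore does not immediately disable idling for the queue.
 */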
static void
bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		       struct request *rq)
{
	bfqq->seek_history <<= 1;
	bfqq->seek_history |=
		get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
		(!blk_queue_nonrot(bfqd->queue) ||
		 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
}
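
/*
 * seek_history thus acts as a small bitmap of the most recent
 * requests: a 1 is shifted in whenever a request lands farther than
 * BFQQ_SEEK_THR from the previous one and, on non-rotational devices,
 * only if the request is also smaller than BFQQ_SECT_THR_NONROT, so
 * that large sequential-ish requests do not mark the queue as seeky.
 * The BFQQ_SEEKY() test used elsewhere in this file then counts the
 * set bits, i.e., the queue is treated as seeky when a significant
 * fraction of its recent requests were seeks.
 */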
/*
 * Disable idle window if the process thinks too long or seeks so much that
 * it doesn't matter.
 */
static void bfq_update_idle_window(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq,
				   struct bfq_io_cq *bic)
{
	int enable_idle;

	/* Don't idle for async or idle io prio class. */
	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
		return;

	enable_idle = bfq_bfqq_idle_window(bfqq);

	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
	    bfqd->bfq_slice_idle == 0 ||
	    (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
	     bfqq->wr_coeff == 1))
		enable_idle = 0;
	else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
		if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle &&
		    bfqq->wr_coeff == 1)
			enable_idle = 0;
		else
			enable_idle = 1;
	}
	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
		     enable_idle);

	if (enable_idle)
		bfq_mark_bfqq_idle_window(bfqq);
	else
		bfq_clear_bfqq_idle_window(bfqq);
}
/*
 * Called when a new fs request (rq) is added to bfqq. Check if there's
 * something we should do about it.
 */
static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			    struct request *rq)
{
	struct bfq_io_cq *bic = RQ_BIC(rq);

	if (rq->cmd_flags & REQ_META)
		bfqq->meta_pending++;

	bfq_update_io_thinktime(bfqd, bfqq);
	bfq_update_io_seektime(bfqd, bfqq, rq);
	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
	    !BFQQ_SEEKY(bfqq))
		bfq_update_idle_window(bfqd, bfqq, bic);

	bfq_log_bfqq(bfqd, bfqq,
		     "rq_enqueued: idle_window=%d (seeky %d)",
		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));

	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
		bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
				 blk_rq_sectors(rq) < 32;
		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);

		/*
		 * There is just this request queued: if the request
		 * is small and the queue is not to be expired, then
		 * just exit.
		 *
		 * In this way, if the device is being idled to wait
		 * for a new request from the in-service queue, we
		 * avoid unplugging the device and committing the
		 * device to serve just a small request. On the
		 * contrary, we wait for the block layer to decide
		 * when to unplug the device: hopefully, new requests
		 * will be merged to this one quickly, then the device
		 * will be unplugged and larger requests will be
		 * dispatched.
		 */
		if (small_req && !budget_timeout)
			return;

		/*
		 * A large enough request arrived, or the queue is to
		 * be expired: in both cases disk idling is to be
		 * stopped, so clear wait_request flag and reset
		 * timer.
		 */
		bfq_clear_bfqq_wait_request(bfqq);
		hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
		bfqg_stats_update_idle_time(bfqq_group(bfqq));

		/*
		 * The queue is not empty, because a new request just
		 * arrived. Hence we can safely expire the queue, in
		 * case of budget timeout, without risking that the
		 * timestamps of the queue are not updated correctly.
		 * See [1] for more details.
		 */
		if (budget_timeout)
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_BUDGET_TIMEOUT);
	}
}
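
/*
 * Note on the "small request" threshold used above: 32 sectors
 * correspond to 16 KiB with 512-byte sectors. So, while the in-service
 * queue is being waited for, a lone queued request smaller than 16 KiB
 * does not by itself unplug the device, unless the queue's budget
 * timeout has already expired.
 */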
static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);

	bfq_add_request(rq);

	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
	list_add_tail(&rq->queuelist, &bfqq->fifo);

	bfq_rq_enqueued(bfqd, bfqq, rq);
}
static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
			       bool at_head)
{
	struct request_queue *q = hctx->queue;
	struct bfq_data *bfqd = q->elevator->elevator_data;

	spin_lock_irq(&bfqd->lock);
	if (blk_mq_sched_try_insert_merge(q, rq)) {
		spin_unlock_irq(&bfqd->lock);
		return;
	}

	spin_unlock_irq(&bfqd->lock);

	blk_mq_sched_request_inserted(rq);

	spin_lock_irq(&bfqd->lock);
	if (at_head || blk_rq_is_passthrough(rq)) {
		if (at_head)
			list_add(&rq->queuelist, &bfqd->dispatch);
		else
			list_add_tail(&rq->queuelist, &bfqd->dispatch);
	} else {
		__bfq_insert_request(bfqd, rq);

		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}
	}

	spin_unlock_irq(&bfqd->lock);
}
static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
				struct list_head *list, bool at_head)
{
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		bfq_insert_request(hctx, rq, at_head);
	}
}
static void bfq_update_hw_tag(struct bfq_data *bfqd)
{
	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
				       bfqd->rq_in_driver);

	if (bfqd->hw_tag == 1)
		return;

	/*
	 * This sample is valid if the number of outstanding requests
	 * is large enough to allow a queueing behavior. Note that the
	 * sum is not exact, as it's not taking into account deactivated
	 * requests.
	 */
	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
		return;

	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
		return;

	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
	bfqd->max_rq_in_driver = 0;
	bfqd->hw_tag_samples = 0;
}
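
/*
 * In other words, hw_tag is inferred by sampling: only observation
 * windows in which at least BFQ_HW_QUEUE_THRESHOLD requests were
 * outstanding are counted, and only after BFQ_HW_QUEUE_SAMPLES such
 * windows is the device flagged as queueing (NCQ-like), namely if the
 * maximum number of requests ever seen in the driver exceeded that
 * same threshold. Once set, the flag is kept (see the early return
 * above), so the detection is effectively one-way.
 */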
static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
{
	u64 now_ns;
	u32 delta_us;

	bfq_update_hw_tag(bfqd);

	bfqd->rq_in_driver--;

	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
		/*
		 * Set budget_timeout (which we overload to store the
		 * time at which the queue remains with no backlog and
		 * no outstanding request; used by the weight-raising
		 * mechanism).
		 */
		bfqq->budget_timeout = jiffies;
	}

	now_ns = ktime_get_ns();

	bfqq->ttime.last_end_request = now_ns;

	/*
	 * Using us instead of ns, to get a reasonable precision in
	 * computing rate in next check.
	 */
	delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);

	/*
	 * If the request took rather long to complete, and, according
	 * to the maximum request size recorded, this completion latency
	 * implies that the request was certainly served at a very low
	 * rate (less than 1M sectors/sec), then the whole observation
	 * interval that lasts up to this time instant cannot be a
	 * valid time interval for computing a new peak rate. Invoke
	 * bfq_update_rate_reset to have the following three steps
	 * taken:
	 * - close the observation interval at the last (previous)
	 *   request dispatch or completion
	 * - compute rate, if possible, for that observation interval
	 * - reset to zero samples, which will trigger a proper
	 *   re-initialization of the observation interval on next
	 *   dispatch
	 */
	if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
	   (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
			1UL<<(BFQ_RATE_SHIFT - 10))
		bfq_update_rate_reset(bfqd, NULL);
	bfqd->last_completion = now_ns;

	/*
	 * If we are waiting to discover whether the request pattern
	 * of the task associated with the queue is actually
	 * isochronous, and both requisites for this condition to hold
	 * are now satisfied, then compute soft_rt_next_start (see the
	 * comments on the function bfq_bfqq_softrt_next_start()). We
	 * schedule this delayed check when bfqq expires, if it still
	 * has in-flight requests.
	 */
	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
	    RB_EMPTY_ROOT(&bfqq->sort_list))
		bfqq->soft_rt_next_start =
			bfq_bfqq_softrt_next_start(bfqd, bfqq);

	/*
	 * If this is the in-service queue, check if it needs to be expired,
	 * or if we want to idle in case it has no pending requests.
	 */
	if (bfqd->in_service_queue == bfqq) {
		if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
			bfq_arm_slice_timer(bfqd);
		} else if (bfq_may_expire_for_budg_timeout(bfqq))
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_BUDGET_TIMEOUT);
		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
			 (bfqq->dispatched == 0 ||
			  !bfq_bfqq_may_idle(bfqq)))
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_NO_MORE_REQUESTS);
	}
}
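
/*
 * Rough numerical reading of the low-rate check above (values purely
 * illustrative): the condition compares last_rq_max_size / delta_us
 * against 1/1024 sectors per microsecond, i.e., roughly 1M sectors
 * (about 500 MB with 512-byte sectors) per second. For instance, a
 * 1024-sector request whose completion arrives 2000 us after the
 * previous one yields 0.5 sectors/us, well above the threshold, so
 * the ongoing peak-rate observation interval is left untouched; only
 * much slower completions force the interval to be closed and the
 * samples reset.
 */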
static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
{
	bfq_put_queue(bfqq);
}
static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;

	if (rq->rq_flags & RQF_STARTED)
		bfqg_stats_update_completion(bfqq_group(bfqq),
					     rq_start_time_ns(rq),
					     rq_io_start_time_ns(rq),
					     rq->cmd_flags);

	if (likely(rq->rq_flags & RQF_STARTED)) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);

		bfq_completed_request(bfqq, bfqd);
		bfq_put_rq_priv_body(bfqq);

		spin_unlock_irqrestore(&bfqd->lock, flags);
	} else {
		/*
		 * Request rq may be still/already in the scheduler,
		 * in which case we need to remove it. And we cannot
		 * defer such a check and removal, to avoid
		 * inconsistencies in the time interval from the end
		 * of this function to the start of the deferred work.
		 * This situation seems to occur only in process
		 * context, as a consequence of a merge. In the
		 * current version of the code, this implies that the
		 * scheduler lock is held.
		 */
		if (!RB_EMPTY_NODE(&rq->rb_node))
			bfq_remove_request(q, rq);
		bfq_put_rq_priv_body(bfqq);
	}

	rq->elv.priv[0] = NULL;
	rq->elv.priv[1] = NULL;
}
/*
 * Allocate bfq data structures associated with this request.
 */
static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
			      struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
	const int is_sync = rq_is_sync(rq);
	struct bfq_queue *bfqq;

	spin_lock_irq(&bfqd->lock);

	bfq_check_ioprio_change(bic, bio);

	if (!bic)
		goto queue_fail;

	bfq_bic_update_cgroup(bic, bio);

	bfqq = bic_to_bfqq(bic, is_sync);
	if (!bfqq || bfqq == &bfqd->oom_bfqq) {
		if (bfqq)
			bfq_put_queue(bfqq);
		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
		bic_set_bfqq(bic, bfqq, is_sync);
	}

	bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
		     rq, bfqq, bfqq->ref);

	rq->elv.priv[0] = bic;
	rq->elv.priv[1] = bfqq;

	spin_unlock_irq(&bfqd->lock);

	return 0;

queue_fail:
	spin_unlock_irq(&bfqd->lock);

	return 1;
}
static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	enum bfqq_expiration reason;
	unsigned long flags;

	spin_lock_irqsave(&bfqd->lock, flags);
	bfq_clear_bfqq_wait_request(bfqq);

	if (bfqq != bfqd->in_service_queue) {
		spin_unlock_irqrestore(&bfqd->lock, flags);
		return;
	}

	if (bfq_bfqq_budget_timeout(bfqq))
		/*
		 * Also here the queue can be safely expired
		 * for budget timeout without wasting
		 * guarantees.
		 */
		reason = BFQQE_BUDGET_TIMEOUT;
	else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
		/*
		 * The queue may not be empty upon timer expiration,
		 * because we may not disable the timer when the
		 * first request of the in-service queue arrives
		 * during disk idling.
		 */
		reason = BFQQE_TOO_IDLE;
	else
		goto schedule_dispatch;

	bfq_bfqq_expire(bfqd, bfqq, true, reason);

schedule_dispatch:
	spin_unlock_irqrestore(&bfqd->lock, flags);
	bfq_schedule_dispatch(bfqd);
}
/*
 * Handler of the expiration of the timer running if the in-service queue
 * is idling inside its time slice.
 */
static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
{
	struct bfq_data *bfqd = container_of(timer, struct bfq_data,
					     idle_slice_timer);
	struct bfq_queue *bfqq = bfqd->in_service_queue;

	/*
	 * Theoretical race here: the in-service queue can be NULL or
	 * different from the queue that was idling if a new request
	 * arrives for the current queue and there is a full dispatch
	 * cycle that changes the in-service queue. This can hardly
	 * happen, but in the worst case we just expire a queue too
	 * early.
	 */
	if (bfqq)
		bfq_idle_slice_timer_body(bfqq);

	return HRTIMER_NORESTART;
}
static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
				 struct bfq_queue **bfqq_ptr)
{
	struct bfq_queue *bfqq = *bfqq_ptr;

	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
	if (bfqq) {
		bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);

		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
			     bfqq, bfqq->ref);
		bfq_put_queue(bfqq);
		*bfqq_ptr = NULL;
	}
}
/*
 * Release all the bfqg references to its async queues. If we are
 * deallocating the group these queues may still contain requests, so
 * we reparent them to the root cgroup (i.e., the only one that will
 * exist for sure until all the requests on a device are gone).
 */
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
{
	int i, j;

	for (i = 0; i < 2; i++)
		for (j = 0; j < IOPRIO_BE_NR; j++)
			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);

	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
}
static void bfq_exit_queue(struct elevator_queue *e)
{
	struct bfq_data *bfqd = e->elevator_data;
	struct bfq_queue *bfqq, *n;

	hrtimer_cancel(&bfqd->idle_slice_timer);

	spin_lock_irq(&bfqd->lock);
	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
	spin_unlock_irq(&bfqd->lock);

	hrtimer_cancel(&bfqd->idle_slice_timer);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
#else
	spin_lock_irq(&bfqd->lock);
	bfq_put_async_queues(bfqd, bfqd->root_group);
	kfree(bfqd->root_group);
	spin_unlock_irq(&bfqd->lock);
#endif

	kfree(bfqd);
}
static void bfq_init_root_group(struct bfq_group *root_group,
				struct bfq_data *bfqd)
{
	int i;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	root_group->entity.parent = NULL;
	root_group->my_entity = NULL;
	root_group->bfqd = bfqd;
#endif
	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
	root_group->sched_data.bfq_class_idle_last_service = jiffies;
}
static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
	struct bfq_data *bfqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
	if (!bfqd) {
		kobject_put(&eq->kobj);
		return -ENOMEM;
	}
	eq->elevator_data = bfqd;

	spin_lock_irq(q->queue_lock);
	q->elevator = eq;
	spin_unlock_irq(q->queue_lock);

	/*
	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
	 * Grab a permanent reference to it, so that the normal code flow
	 * will not attempt to free it.
	 */
	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
	bfqd->oom_bfqq.ref++;
	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
	bfqd->oom_bfqq.entity.new_weight =
		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);

	/*
	 * Trigger weight initialization, according to ioprio, at the
	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
	 * class won't be changed any more.
	 */
	bfqd->oom_bfqq.entity.prio_changed = 1;

	INIT_LIST_HEAD(&bfqd->dispatch);

	hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL);
	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;

	INIT_LIST_HEAD(&bfqd->active_list);
	INIT_LIST_HEAD(&bfqd->idle_list);

	bfqd->bfq_max_budget = bfq_default_max_budget;

	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
	bfqd->bfq_back_max = bfq_back_max;
	bfqd->bfq_back_penalty = bfq_back_penalty;
	bfqd->bfq_slice_idle = bfq_slice_idle;
	bfqd->bfq_timeout = bfq_timeout;

	bfqd->bfq_requests_within_timer = 120;

	bfqd->low_latency = true;

	/*
	 * Trade-off between responsiveness and fairness.
	 */
	bfqd->bfq_wr_coeff = 30;
	bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
	bfqd->bfq_wr_max_time = 0;
	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
	bfqd->bfq_wr_max_softrt_rate = 7000; /*
					      * Approximate rate required
					      * to playback or record a
					      * high-definition compressed
					      * video.
					      */
	bfqd->wr_busy_queues = 0;

	/*
	 * Begin by assuming, optimistically, that the device is a
	 * high-speed one, and that its peak rate is equal to 2/3 of
	 * the highest reference rate.
	 */
	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
			T_fast[blk_queue_nonrot(bfqd->queue)];
	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
	bfqd->device_speed = BFQ_BFQD_FAST;

	spin_lock_init(&bfqd->lock);

	/*
	 * The invocation of the next bfq_create_group_hierarchy
	 * function is the head of a chain of function calls
	 * (bfq_create_group_hierarchy->blkcg_activate_policy->
	 * blk_mq_freeze_queue) that may lead to the invocation of the
	 * has_work hook function. For this reason,
	 * bfq_create_group_hierarchy is invoked only after all
	 * scheduler data has been initialized, apart from the fields
	 * that can be initialized only after invoking
	 * bfq_create_group_hierarchy. This, in particular, enables
	 * has_work to correctly return false. Of course, to avoid
	 * other inconsistencies, the blk-mq stack must then refrain
	 * from invoking further scheduler hooks before this init
	 * function is finished.
	 */
	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
	if (!bfqd->root_group)
		goto out_free;
	bfq_init_root_group(bfqd->root_group, bfqd);
	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);

	return 0;

out_free:
	kfree(bfqd);
	kobject_put(&eq->kobj);
	return -ENOMEM;
}
static void bfq_slab_kill(void)
{
	kmem_cache_destroy(bfq_pool);
}

static int __init bfq_slab_setup(void)
{
	bfq_pool = KMEM_CACHE(bfq_queue, 0);
	if (!bfq_pool)
		return -ENOMEM;
	return 0;
}
static ssize_t bfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%u\n", var);
}

static ssize_t bfq_var_store(unsigned long *var, const char *page,
			     size_t count)
{
	unsigned long new_val;
	int ret = kstrtoul(page, 10, &new_val);

	if (ret == 0)
		*var = new_val;

	return count;
}
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	if (__CONV == 1)						\
		__data = jiffies_to_msecs(__data);			\
	else if (__CONV == 2)						\
		__data = div_u64(__data, NSEC_PER_MSEC);		\
	return bfq_var_show(__data, (page));				\
}
SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
#undef SHOW_FUNCTION

#define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	__data = div_u64(__data, NSEC_PER_USEC);			\
	return bfq_var_show(__data, (page));				\
}
USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
#undef USEC_SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
static ssize_t								\
__FUNC(struct elevator_queue *e, const char *page, size_t count)	\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned long uninitialized_var(__data);			\
	int ret = bfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	if (__CONV == 1)						\
		*(__PTR) = msecs_to_jiffies(__data);			\
	else if (__CONV == 2)						\
		*(__PTR) = (u64)__data * NSEC_PER_MSEC;			\
	else								\
		*(__PTR) = __data;					\
	return ret;							\
}
STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
		INT_MAX, 2);
STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
		INT_MAX, 2);
STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
		INT_MAX, 0);
STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
#undef STORE_FUNCTION

#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)			\
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned long uninitialized_var(__data);			\
	int ret = bfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	*(__PTR) = (u64)__data * NSEC_PER_USEC;				\
	return ret;							\
}
USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
		    UINT_MAX);
#undef USEC_STORE_FUNCTION
static ssize_t bfq_max_budget_store(struct elevator_queue *e,
				    const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data == 0)
		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
	else {
		if (__data > INT_MAX)
			__data = INT_MAX;
		bfqd->bfq_max_budget = __data;
	}

	bfqd->bfq_user_max_budget = __data;

	return ret;
}
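
/*
 * Usage note (based only on the code above): writing 0 to the
 * max_budget attribute re-enables automatic budget sizing via
 * bfq_calc_max_budget(), while any positive value (capped to INT_MAX)
 * becomes a fixed maximum budget. The chosen value is mirrored in
 * bfq_user_max_budget so that later timeout updates know whether
 * auto-tuning is still active.
 */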
/*
 * Leaving this name to preserve name compatibility with cfq
 * parameters, but this timeout is used for both sync and async.
 */
static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
				      const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data < 1)
		__data = 1;
	else if (__data > INT_MAX)
		__data = INT_MAX;

	bfqd->bfq_timeout = msecs_to_jiffies(__data);
	if (bfqd->bfq_user_max_budget == 0)
		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);

	return ret;
}
static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
					   const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data > 1)
		__data = 1;
	if (!bfqd->strict_guarantees && __data == 1
	    && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
		bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;

	bfqd->strict_guarantees = __data;

	return ret;
}
static ssize_t bfq_low_latency_store(struct elevator_queue *e,
				     const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data > 1)
		__data = 1;
	if (__data == 0 && bfqd->low_latency != 0)
		bfq_end_wr(bfqd);
	bfqd->low_latency = __data;

	return ret;
}
#define BFQ_ATTR(name) \
	__ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)

static struct elv_fs_entry bfq_attrs[] = {
	BFQ_ATTR(fifo_expire_sync),
	BFQ_ATTR(fifo_expire_async),
	BFQ_ATTR(back_seek_max),
	BFQ_ATTR(back_seek_penalty),
	BFQ_ATTR(slice_idle),
	BFQ_ATTR(slice_idle_us),
	BFQ_ATTR(max_budget),
	BFQ_ATTR(timeout_sync),
	BFQ_ATTR(strict_guarantees),
	BFQ_ATTR(low_latency),
	__ATTR_NULL
};
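
/*
 * These attributes are exposed by the elevator core under
 * /sys/block/<disk>/queue/iosched/ once bfq is the active scheduler.
 * Illustrative shell session (device name purely hypothetical):
 *
 *   echo bfq > /sys/block/sda/queue/scheduler
 *   cat /sys/block/sda/queue/iosched/slice_idle_us
 *   echo 0 > /sys/block/sda/queue/iosched/low_latency
 */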
static struct elevator_type iosched_bfq_mq = {
	.ops.mq = {
		.get_rq_priv		= bfq_get_rq_private,
		.put_rq_priv		= bfq_put_rq_private,
		.exit_icq		= bfq_exit_icq,
		.insert_requests	= bfq_insert_requests,
		.dispatch_request	= bfq_dispatch_request,
		.next_request		= elv_rb_latter_request,
		.former_request		= elv_rb_former_request,
		.allow_merge		= bfq_allow_bio_merge,
		.bio_merge		= bfq_bio_merge,
		.request_merge		= bfq_request_merge,
		.requests_merged	= bfq_requests_merged,
		.request_merged		= bfq_request_merged,
		.has_work		= bfq_has_work,
		.init_sched		= bfq_init_queue,
		.exit_sched		= bfq_exit_queue,
	},

	.uses_mq		= true,
	.icq_size		= sizeof(struct bfq_io_cq),
	.icq_align		= __alignof__(struct bfq_io_cq),
	.elevator_attrs		= bfq_attrs,
	.elevator_name		= "bfq",
	.elevator_owner		= THIS_MODULE,
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct blkcg_policy blkcg_policy_bfq = {
	.dfl_cftypes		= bfq_blkg_files,
	.legacy_cftypes		= bfq_blkcg_legacy_files,

	.cpd_alloc_fn		= bfq_cpd_alloc,
	.cpd_init_fn		= bfq_cpd_init,
	.cpd_bind_fn		= bfq_cpd_init,
	.cpd_free_fn		= bfq_cpd_free,

	.pd_alloc_fn		= bfq_pd_alloc,
	.pd_init_fn		= bfq_pd_init,
	.pd_offline_fn		= bfq_pd_offline,
	.pd_free_fn		= bfq_pd_free,
	.pd_reset_stats_fn	= bfq_pd_reset_stats,
};
#endif
static int __init bfq_init(void)
{
	int ret;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	ret = blkcg_policy_register(&blkcg_policy_bfq);
	if (ret)
		return ret;
#endif

	ret = -ENOMEM;
	if (bfq_slab_setup())
		goto err_pol_unreg;

	/*
	 * Times to load large popular applications for the typical
	 * systems installed on the reference devices (see the
	 * comments before the definitions of the next two
	 * arrays). Actually, we use slightly slower values, as the
	 * estimated peak rate tends to be smaller than the actual
	 * peak rate. The reason for this last fact is that estimates
	 * are computed over much shorter time intervals than the long
	 * intervals typically used for benchmarking. Why? First, to
	 * adapt more quickly to variations. Second, because an I/O
	 * scheduler cannot rely on a peak-rate-evaluation workload to
	 * be run for a long time.
	 */
	T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
	T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
	T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
	T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */

	/*
	 * Thresholds that determine the switch between speed classes
	 * (see the comments before the definition of the array
	 * device_speed_thresh). These thresholds are biased towards
	 * transitions to the fast class. This is safer than the
	 * opposite bias. In fact, a wrong transition to the slow
	 * class results in short weight-raising periods, because the
	 * speed of the device then tends to be higher than the
	 * reference peak rate. On the opposite end, a wrong
	 * transition to the fast class tends to increase
	 * weight-raising periods, because of the opposite reason.
	 */
	device_speed_thresh[0] = (4 * R_slow[0]) / 3;
	device_speed_thresh[1] = (4 * R_slow[1]) / 3;

	ret = elv_register(&iosched_bfq_mq);
	if (ret)
		goto err_pol_unreg;

	return 0;

err_pol_unreg:
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	return ret;
}
static void __exit bfq_exit(void)
{
	elv_unregister(&iosched_bfq_mq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	bfq_slab_kill();
}

module_init(bfq_init);
module_exit(bfq_exit);

MODULE_AUTHOR("Paolo Valente");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");