1/*
2 * Budget Fair Queueing (BFQ) I/O scheduler.
3 *
4 * Based on ideas and code from CFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 * Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
11 * Arianna Avanzini <avanzini@google.com>
12 *
13 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
14 *
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License as
17 * published by the Free Software Foundation; either version 2 of the
18 * License, or (at your option) any later version.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 * General Public License for more details.
24 *
25 * BFQ is a proportional-share I/O scheduler, with some extra
26 * low-latency capabilities. BFQ also supports full hierarchical
27 * scheduling through cgroups. The next paragraphs provide an introduction
28 * to BFQ's inner workings. Details on BFQ benefits, usage and
29 * limitations can be found in Documentation/block/bfq-iosched.txt.
30 *
31 * BFQ is a proportional-share storage-I/O scheduling algorithm based
32 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
33 * budgets, measured in number of sectors, to processes instead of
34 * time slices. The device is not granted to the in-service process
35 * for a given time slice, but until it has exhausted its assigned
36 * budget. This change from the time to the service domain enables BFQ
37 * to distribute the device throughput among processes as desired,
38 * without any distortion due to throughput fluctuations, or to device
39 * internal queueing. BFQ uses an ad hoc internal scheduler, called
40 * B-WF2Q+, to schedule processes according to their budgets. More
41 * precisely, BFQ schedules queues associated with processes. Each
42 * process/queue is assigned a user-configurable weight, and B-WF2Q+
43 * guarantees that each queue receives a fraction of the throughput
44 * proportional to its weight. Thanks to the accurate policy of
45 * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
46 * processes issuing sequential requests (to boost the throughput),
47 * and yet guarantee a low latency to interactive and soft real-time
48 * applications.
49 *
50 * In particular, to provide these low-latency guarantees, BFQ
51 * explicitly privileges the I/O of two classes of time-sensitive
52 * applications: interactive and soft real-time. This feature enables
53 * BFQ to provide applications in these classes with a very low
54 * latency. Finally, BFQ also features additional heuristics for
55 * preserving both a low latency and a high throughput on NCQ-capable,
56 * rotational or flash-based devices, and for getting the job done quickly
57 * for applications consisting of many I/O-bound processes.
58 *
59 * BFQ is described in [1], where also a reference to the initial, more
60 * theoretical paper on BFQ can be found. The interested reader can find
61 * in the latter paper full details on the main algorithm, as well as
62 * formulas of the guarantees and formal proofs of all the properties.
63 * With respect to the version of BFQ presented in these papers, this
64 * implementation adds a few more heuristics, such as the one that
65 * guarantees a low latency to soft real-time applications, and a
66 * hierarchical extension based on H-WF2Q+.
67 *
68 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
69 * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
70 * with O(log N) complexity derives from the one introduced with EEVDF
71 * in [3].
72 *
73 * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
74 * Scheduler", Proceedings of the First Workshop on Mobile System
75 * Technologies (MST-2015), May 2015.
76 * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
77 *
78 * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
79 * Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
80 * Oct 1997.
81 *
82 * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
83 *
84 * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
85 * First: A Flexible and Accurate Mechanism for Proportional Share
86 * Resource Allocation", technical report.
87 *
88 * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
89 */
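/*
 * Illustrative note (not part of the original source): with the
 * proportional-share guarantee described above, a continuously
 * backlogged queue obtains a fraction of the device throughput equal
 * to its weight divided by the sum of the weights of all backlogged
 * queues. For example, two always-backlogged queues with weights 100
 * and 300 receive about 25% and 75% of the aggregate throughput,
 * independently of request sizes or device-internal queueing, because
 * service is accounted in sectors (budgets) rather than time slices.
 */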
90#include <linux/module.h>
91#include <linux/slab.h>
92#include <linux/blkdev.h>
93#include <linux/cgroup.h>
94#include <linux/elevator.h>
95#include <linux/ktime.h>
96#include <linux/rbtree.h>
97#include <linux/ioprio.h>
98#include <linux/sbitmap.h>
99#include <linux/delay.h>
100
101#include "blk.h"
102#include "blk-mq.h"
103#include "blk-mq-tag.h"
104#include "blk-mq-sched.h"
105#include <linux/blktrace_api.h>
106#include <linux/hrtimer.h>
107#include <linux/blk-cgroup.h>
108
109#define BFQ_IOPRIO_CLASSES 3
110#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
111
112#define BFQ_MIN_WEIGHT 1
113#define BFQ_MAX_WEIGHT 1000
114#define BFQ_WEIGHT_CONVERSION_COEFF 10
115
116#define BFQ_DEFAULT_QUEUE_IOPRIO 4
117
118#define BFQ_WEIGHT_LEGACY_DFL 100
119#define BFQ_DEFAULT_GRP_IOPRIO 0
120#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
121
122struct bfq_entity;
123
124/**
125 * struct bfq_service_tree - per ioprio_class service tree.
126 *
127 * Each service tree represents a B-WF2Q+ scheduler on its own. Each
128 * ioprio_class has its own independent scheduler, and so its own
129 * bfq_service_tree. All the fields are protected by the queue lock
130 * of the containing bfqd.
131 */
132struct bfq_service_tree {
133 /* tree for active entities (i.e., those backlogged) */
134 struct rb_root active;
135 /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
136 struct rb_root idle;
137
138 /* idle entity with minimum F_i */
139 struct bfq_entity *first_idle;
140 /* idle entity with maximum F_i */
141 struct bfq_entity *last_idle;
142
143 /* scheduler virtual time */
144 u64 vtime;
145 /* scheduler weight sum; active and idle entities contribute to it */
146 unsigned long wsum;
147};
148
149/**
150 * struct bfq_sched_data - multi-class scheduler.
151 *
152 * bfq_sched_data is the basic scheduler queue. It supports three
153 * ioprio_classes, and can be used either as a toplevel queue or as an
154 * intermediate queue on a hierarchical setup. @next_in_service
155 * points to the active entity of the sched_data service trees that
156 * will be scheduled next. It is used to reduce the number of steps
157 * needed for each hierarchical-schedule update.
158 *
159 * The supported ioprio_classes are the same as in CFQ, in descending
160 * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
161 * Requests from higher priority queues are served before all the
162 * requests from lower priority queues; among requests of the same
163 * queue requests are served according to B-WF2Q+.
164 * All the fields are protected by the queue lock of the containing bfqd.
165 */
166struct bfq_sched_data {
167 /* entity in service */
168 struct bfq_entity *in_service_entity;
169 /* head-of-line entity (see comments above) */
170 struct bfq_entity *next_in_service;
171 /* array of service trees, one per ioprio_class */
172 struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
173 /* last time CLASS_IDLE was served */
174 unsigned long bfq_class_idle_last_service;
175
176};
177
178/**
179 * struct bfq_entity - schedulable entity.
180 *
181 * A bfq_entity is used to represent either a bfq_queue (leaf node in the
182 * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
183 * entity belongs to the sched_data of the parent group in the cgroup
184 * hierarchy. Non-leaf entities also have their own sched_data, stored
185 * in @my_sched_data.
186 *
187 * Each entity stores independently its priority values; this would
188 * allow different weights on different devices, but this
189 * functionality is not exported to userspace by now. Priorities and
190 * weights are updated lazily, first storing the new values into the
191 * new_* fields, then setting the @prio_changed flag. As soon as
192 * there is a transition in the entity state that allows the priority
193 * update to take place the effective and the requested priority
194 * values are synchronized.
195 *
196 * Unless cgroups are used, the weight value is calculated from the
197 * ioprio to export the same interface as CFQ. When dealing with
198 * ``well-behaved'' queues (i.e., queues that do not spend too much
199 * time consuming their budget and have true sequential behavior, and
200 * when there are no external factors breaking anticipation) the
201 * relative weights at each level of the cgroups hierarchy should be
202 * guaranteed. All the fields are protected by the queue lock of the
203 * containing bfqd.
204 */
205struct bfq_entity {
206 /* service_tree member */
207 struct rb_node rb_node;
208
209 /*
210 * Flag, true if the entity is on a tree (either the active or
211 * the idle one of its service_tree) or is in service.
212 */
213 bool on_st;
214
215 /* B-WF2Q+ start and finish timestamps [sectors/weight] */
216 u64 start, finish;
217
218 /* tree the entity is enqueued into; %NULL if not on a tree */
219 struct rb_root *tree;
220
221 /*
222 * minimum start time of the (active) subtree rooted at this
223 * entity; used for O(log N) lookups into active trees
224 */
225 u64 min_start;
226
227 /* amount of service received during the last service slot */
228 int service;
229
230 /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
231 int budget;
232
233 /* weight of the queue */
234 int weight;
235 /* next weight if a change is in progress */
236 int new_weight;
237
238 /* original weight, used to implement weight boosting */
239 int orig_weight;
240
241 /* parent entity, for hierarchical scheduling */
242 struct bfq_entity *parent;
243
244 /*
245 * For non-leaf nodes in the hierarchy, the associated
246 * scheduler queue, %NULL on leaf nodes.
247 */
248 struct bfq_sched_data *my_sched_data;
249 /* the scheduler queue this entity belongs to */
250 struct bfq_sched_data *sched_data;
251
252 /* flag, set to request a weight, ioprio or ioprio_class change */
253 int prio_changed;
254};
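/*
 * Illustrative note (not part of the original source) on the lazy
 * updates described above: a weight change only sets
 * entity->new_weight and entity->prio_changed; the value is copied
 * into entity->orig_weight and entity->weight later, inside
 * __bfq_entity_update_weight_prio(), when the entity is re-inserted
 * into a service tree, so that the timestamps of an entity that is
 * currently queued or in service are not perturbed.
 */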
255
256struct bfq_group;
257
258/**
259 * struct bfq_ttime - per process thinktime stats.
260 */
261struct bfq_ttime {
262 /* completion time of the last request */
263 u64 last_end_request;
264
265 /* total process thinktime */
266 u64 ttime_total;
267 /* number of thinktime samples */
268 unsigned long ttime_samples;
269 /* average process thinktime */
270 u64 ttime_mean;
271};
272
273/**
274 * struct bfq_queue - leaf schedulable entity.
275 *
276 * A bfq_queue is a leaf request queue; it can be associated with an
277 * io_context or more, if it is async. @cgroup holds a reference to
278 * the cgroup, to be sure that it does not disappear while a bfqq
279 * still references it (mostly to avoid races between request issuing
280 * and task migration followed by cgroup destruction). All the fields
281 * are protected by the queue lock of the containing bfqd.
282 */
283struct bfq_queue {
284 /* reference counter */
285 int ref;
286 /* parent bfq_data */
287 struct bfq_data *bfqd;
288
289 /* current ioprio and ioprio class */
290 unsigned short ioprio, ioprio_class;
291 /* next ioprio and ioprio class if a change is in progress */
292 unsigned short new_ioprio, new_ioprio_class;
293
294 /* sorted list of pending requests */
295 struct rb_root sort_list;
296 /* if fifo isn't expired, next request to serve */
297 struct request *next_rq;
298 /* number of sync and async requests queued */
299 int queued[2];
300 /* number of requests currently allocated */
301 int allocated;
302 /* number of pending metadata requests */
303 int meta_pending;
304 /* fifo list of requests in sort_list */
305 struct list_head fifo;
306
307 /* entity representing this queue in the scheduler */
308 struct bfq_entity entity;
309
310 /* maximum budget allowed from the feedback mechanism */
311 int max_budget;
312 /* budget expiration (in jiffies) */
313 unsigned long budget_timeout;
314
315 /* number of requests on the dispatch list or inside driver */
316 int dispatched;
317
318 /* status flags */
319 unsigned long flags;
320
321 /* node for active/idle bfqq list inside parent bfqd */
322 struct list_head bfqq_list;
323
324 /* associated @bfq_ttime struct */
325 struct bfq_ttime ttime;
326
327 /* bit vector: a 1 for each seeky requests in history */
328 u32 seek_history;
329 /* position of the last request enqueued */
330 sector_t last_request_pos;
331
332 /* Number of consecutive pairs of request completion and
333 * arrival, such that the queue becomes idle after the
334 * completion, but the next request arrives within an idle
335 * time slice; used only if the queue's IO_bound flag has been
336 * cleared.
337 */
338 unsigned int requests_within_timer;
339
340 /* pid of the process owning the queue, used for logging purposes */
341 pid_t pid;
342};
343
344/**
345 * struct bfq_io_cq - per (request_queue, io_context) structure.
346 */
347struct bfq_io_cq {
348 /* associated io_cq structure */
349 struct io_cq icq; /* must be the first member */
350 /* array of two process queues, the sync and the async */
351 struct bfq_queue *bfqq[2];
352 /* per (request_queue, blkcg) ioprio */
353 int ioprio;
354#ifdef CONFIG_BFQ_GROUP_IOSCHED
355 uint64_t blkcg_serial_nr; /* the current blkcg serial */
356#endif
357};
358
359/**
360 * struct bfq_data - per-device data structure.
361 *
362 * All the fields are protected by @lock.
363 */
364struct bfq_data {
365 /* device request queue */
366 struct request_queue *queue;
367 /* dispatch queue */
368 struct list_head dispatch;
369
370 /* root bfq_group for the device */
371 struct bfq_group *root_group;
372
373 /*
374 * Number of bfq_queues containing requests (including the
375 * queue in service, even if it is idling).
376 */
377 int busy_queues;
378 /* number of queued requests */
379 int queued;
380 /* number of requests dispatched and waiting for completion */
381 int rq_in_driver;
382
383 /*
384 * Maximum number of requests in driver in the last
385 * @hw_tag_samples completed requests.
386 */
387 int max_rq_in_driver;
388 /* number of samples used to calculate hw_tag */
389 int hw_tag_samples;
390 /* flag set to one if the driver is showing a queueing behavior */
391 int hw_tag;
392
393 /* number of budgets assigned */
394 int budgets_assigned;
395
396 /*
397 * Timer set when idling (waiting) for the next request from
398 * the queue in service.
399 */
400 struct hrtimer idle_slice_timer;
401
402 /* bfq_queue in service */
403 struct bfq_queue *in_service_queue;
404 /* bfq_io_cq (bic) associated with the @in_service_queue */
405 struct bfq_io_cq *in_service_bic;
406
407 /* on-disk position of the last served request */
408 sector_t last_position;
409
410 /* beginning of the last budget */
411 ktime_t last_budget_start;
412 /* beginning of the last idle slice */
413 ktime_t last_idling_start;
414 /* number of samples used to calculate @peak_rate */
415 int peak_rate_samples;
416 /*
417 * Peak read/write rate, observed during the service of a
418 * budget [BFQ_RATE_SHIFT * sectors/usec]. The value is
419 * left-shifted by BFQ_RATE_SHIFT to increase precision in
420 * fixed-point calculations.
421 */
422 u64 peak_rate;
423 /* maximum budget allotted to a bfq_queue before rescheduling */
424 int bfq_max_budget;
425
426 /* list of all the bfq_queues active on the device */
427 struct list_head active_list;
428 /* list of all the bfq_queues idle on the device */
429 struct list_head idle_list;
430
431 /*
432 * Timeout for async/sync requests; when it fires, requests
433 * are served in fifo order.
434 */
435 u64 bfq_fifo_expire[2];
436 /* weight of backward seeks wrt forward ones */
437 unsigned int bfq_back_penalty;
438 /* maximum allowed backward seek */
439 unsigned int bfq_back_max;
440 /* maximum idling time */
441 u32 bfq_slice_idle;
442
443 /* user-configured max budget value (0 for auto-tuning) */
444 int bfq_user_max_budget;
445 /*
446 * Timeout for bfq_queues to consume their budget; used to
447 * prevent seeky queues from imposing long latencies to
448 * sequential or quasi-sequential ones (this also implies that
449 * seeky queues cannot receive guarantees in the service
450 * domain; after a timeout they are charged for the time they
451 * have been in service, to preserve fairness among them, but
452 * without service-domain guarantees).
453 */
454 unsigned int bfq_timeout;
455
456 /*
457 * Number of consecutive requests that must be issued within
458 * the idle time slice to set again idling to a queue which
459 * was marked as non-I/O-bound (see the definition of the
460 * IO_bound flag for further details).
461 */
462 unsigned int bfq_requests_within_timer;
463
464 /*
465 * Force device idling whenever needed to provide accurate
466 * service guarantees, without caring about throughput
467 * issues. CAVEAT: this may even increase latencies, in case
468 * of useless idling for processes that did stop doing I/O.
469 */
470 bool strict_guarantees;
471
472 /* fallback dummy bfqq for extreme OOM conditions */
473 struct bfq_queue oom_bfqq;
474
475 spinlock_t lock;
476
477 /*
478 * bic associated with the task issuing current bio for
479 * merging. This and the next field are used as a support to
480 * be able to perform the bic lookup, needed by bio-merge
481 * functions, before the scheduler lock is taken, and thus
482 * avoid taking the request-queue lock while the scheduler
483 * lock is being held.
484 */
485 struct bfq_io_cq *bio_bic;
486 /* bfqq associated with the task issuing current bio for merging */
487 struct bfq_queue *bio_bfqq;
488};
489
490enum bfqq_state_flags {
491 BFQQF_busy = 0, /* has requests or is in service */
492 BFQQF_wait_request, /* waiting for a request */
493 BFQQF_non_blocking_wait_rq, /*
494 * waiting for a request
495 * without idling the device
496 */
497 BFQQF_fifo_expire, /* FIFO checked in this slice */
498 BFQQF_idle_window, /* slice idling enabled */
499 BFQQF_sync, /* synchronous queue */
500 BFQQF_budget_new, /* no completion with this budget */
501 BFQQF_IO_bound, /*
502 * bfqq has timed-out at least once
503 * having consumed at most 2/10 of
504 * its budget
505 */
506};
507
508#define BFQ_BFQQ_FNS(name) \
509static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
510{ \
511 __set_bit(BFQQF_##name, &(bfqq)->flags); \
512} \
513static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
514{ \
515 __clear_bit(BFQQF_##name, &(bfqq)->flags); \
516} \
517static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
518{ \
519 return test_bit(BFQQF_##name, &(bfqq)->flags); \
520}
521
522BFQ_BFQQ_FNS(busy);
523BFQ_BFQQ_FNS(wait_request);
524BFQ_BFQQ_FNS(non_blocking_wait_rq);
525BFQ_BFQQ_FNS(fifo_expire);
526BFQ_BFQQ_FNS(idle_window);
527BFQ_BFQQ_FNS(sync);
528BFQ_BFQQ_FNS(budget_new);
529BFQ_BFQQ_FNS(IO_bound);
530#undef BFQ_BFQQ_FNS
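/*
 * Example (added for illustration): BFQ_BFQQ_FNS(busy) above expands
 * into the three helpers bfq_mark_bfqq_busy(), bfq_clear_bfqq_busy()
 * and bfq_bfqq_busy(), which respectively set, clear and test the
 * BFQQF_busy bit in bfqq->flags:
 *
 *	bfq_mark_bfqq_busy(bfqq);
 *	if (bfq_bfqq_busy(bfqq))
 *		bfq_clear_bfqq_busy(bfqq);
 */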
531
532/* Logging facilities. */
533#ifdef CONFIG_BFQ_GROUP_IOSCHED
534static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
535static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
536
537#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
538 char __pbuf[128]; \
539 \
540 blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
541 blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
542 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
543 __pbuf, ##args); \
544} while (0)
545
546#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
547 char __pbuf[128]; \
548 \
549 blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
550 blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \
551} while (0)
552
553#else /* CONFIG_BFQ_GROUP_IOSCHED */
554
555#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
556 blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \
557 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
558 ##args)
559#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
560
561#endif /* CONFIG_BFQ_GROUP_IOSCHED */
562
563#define bfq_log(bfqd, fmt, args...) \
564 blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
565
566/* Expiration reasons. */
567enum bfqq_expiration {
568 BFQQE_TOO_IDLE = 0, /*
569 * queue has been idling for
570 * too long
571 */
572 BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */
573 BFQQE_BUDGET_EXHAUSTED, /* budget consumed */
574 BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */
575 BFQQE_PREEMPTED /* preemption in progress */
576};
577
578struct bfqg_stats {
579#ifdef CONFIG_BFQ_GROUP_IOSCHED
580 /* number of ios merged */
581 struct blkg_rwstat merged;
582 /* total time spent on device in ns, may not be accurate w/ queueing */
583 struct blkg_rwstat service_time;
584 /* total time spent waiting in scheduler queue in ns */
585 struct blkg_rwstat wait_time;
586 /* number of IOs queued up */
587 struct blkg_rwstat queued;
588 /* total disk time and nr sectors dispatched by this group */
589 struct blkg_stat time;
590 /* sum of number of ios queued across all samples */
591 struct blkg_stat avg_queue_size_sum;
592 /* count of samples taken for average */
593 struct blkg_stat avg_queue_size_samples;
594 /* how many times this group has been removed from service tree */
595 struct blkg_stat dequeue;
596 /* total time spent waiting for it to be assigned a timeslice. */
597 struct blkg_stat group_wait_time;
598 /* time spent idling for this blkcg_gq */
599 struct blkg_stat idle_time;
600 /* total time with empty current active q with other requests queued */
601 struct blkg_stat empty_time;
602 /* fields after this shouldn't be cleared on stat reset */
603 uint64_t start_group_wait_time;
604 uint64_t start_idle_time;
605 uint64_t start_empty_time;
606 uint16_t flags;
607#endif /* CONFIG_BFQ_GROUP_IOSCHED */
608};
609
610#ifdef CONFIG_BFQ_GROUP_IOSCHED
611
612/*
613 * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
614 *
615 * @ps: @blkcg_policy_storage that this structure inherits
616 * @weight: weight of the bfq_group
617 */
618struct bfq_group_data {
619 /* must be the first member */
620 struct blkcg_policy_data pd;
621
622 unsigned short weight;
623};
624
625/**
626 * struct bfq_group - per (device, cgroup) data structure.
627 * @entity: schedulable entity to insert into the parent group sched_data.
628 * @sched_data: own sched_data, to contain child entities (they may be
629 * both bfq_queues and bfq_groups).
630 * @bfqd: the bfq_data for the device this group acts upon.
631 * @async_bfqq: array of async queues for all the tasks belonging to
632 * the group, one queue per ioprio value per ioprio_class,
633 * except for the idle class that has only one queue.
634 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
635 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
636 * to avoid too many special cases during group creation/
637 * migration.
638 * @stats: stats for this bfqg.
639 *
640 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
641 * there is a set of bfq_groups, each one collecting the lower-level
642 * entities belonging to the group that are acting on the same device.
643 *
644 * Locking works as follows:
645 * o @bfqd is protected by the queue lock, RCU is used to access it
646 * from the readers.
647 * o All the other fields are protected by the @bfqd queue lock.
648 */
649struct bfq_group {
650 /* must be the first member */
651 struct blkg_policy_data pd;
652
653 struct bfq_entity entity;
654 struct bfq_sched_data sched_data;
655
656 void *bfqd;
657
658 struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
659 struct bfq_queue *async_idle_bfqq;
660
661 struct bfq_entity *my_entity;
662
663 struct bfqg_stats stats;
664};
665
666#else
667struct bfq_group {
668 struct bfq_sched_data sched_data;
669
670 struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
671 struct bfq_queue *async_idle_bfqq;
672
673 struct rb_root rq_pos_tree;
674};
675#endif
676
677static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
678
679static unsigned int bfq_class_idx(struct bfq_entity *entity)
680{
681 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
682
683 return bfqq ? bfqq->ioprio_class - 1 :
684 BFQ_DEFAULT_GRP_CLASS - 1;
685}
686
687static struct bfq_service_tree *
688bfq_entity_service_tree(struct bfq_entity *entity)
689{
690 struct bfq_sched_data *sched_data = entity->sched_data;
691 unsigned int idx = bfq_class_idx(entity);
692
693 return sched_data->service_tree + idx;
694}
695
696static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
697{
698 return bic->bfqq[is_sync];
699}
700
701static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
702 bool is_sync)
703{
704 bic->bfqq[is_sync] = bfqq;
705}
706
707static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
708{
709 return bic->icq.q->elevator->elevator_data;
710}
711
712static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
713static void bfq_put_queue(struct bfq_queue *bfqq);
714static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
715 struct bio *bio, bool is_sync,
716 struct bfq_io_cq *bic);
717static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
718static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
719
720/* Expiration time of sync (0) and async (1) requests, in ns. */
721static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
722
723/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
724static const int bfq_back_max = 16 * 1024;
725
726/* Penalty of a backwards seek, in number of sectors. */
727static const int bfq_back_penalty = 2;
728
729/* Idling period duration, in ns. */
730static u64 bfq_slice_idle = NSEC_PER_SEC / 125;
731
732/* Minimum number of assigned budgets for which stats are safe to compute. */
733static const int bfq_stats_min_budgets = 194;
734
735/* Default maximum budget values, in sectors and number of requests. */
736static const int bfq_default_max_budget = 16 * 1024;
737
738/* Default timeout values, in jiffies, approximating CFQ defaults. */
739static const int bfq_timeout = HZ / 8;
740
741static struct kmem_cache *bfq_pool;
742
743/* Below this threshold (in ms), we consider thinktime immediate. */
744#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
745
746/* hw_tag detection: parallel requests threshold and min samples needed. */
747#define BFQ_HW_QUEUE_THRESHOLD 4
748#define BFQ_HW_QUEUE_SAMPLES 32
749
750#define BFQQ_SEEK_THR (sector_t)(8 * 100)
751#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
752#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
753#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
754
755/* Budget feedback step. */
756#define BFQ_BUDGET_STEP 128
757
758/* Min samples used for peak rate estimation (for autotuning). */
759#define BFQ_PEAK_RATE_SAMPLES 32
760
761/* Shift used for peak rate fixed precision calculations. */
762#define BFQ_RATE_SHIFT 16
763
764#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
765 { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
766
767#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
768#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
769
770/**
771 * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
772 * @icq: the iocontext queue.
773 */
774static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
775{
776 /* bic->icq is the first member, %NULL will convert to %NULL */
777 return container_of(icq, struct bfq_io_cq, icq);
778}
779
780/**
781 * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
782 * @bfqd: the lookup key.
783 * @ioc: the io_context of the process doing I/O.
784 * @q: the request queue.
785 */
786static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
787 struct io_context *ioc,
788 struct request_queue *q)
789{
790 if (ioc) {
791 unsigned long flags;
792 struct bfq_io_cq *icq;
793
794 spin_lock_irqsave(q->queue_lock, flags);
795 icq = icq_to_bic(ioc_lookup_icq(ioc, q));
796 spin_unlock_irqrestore(q->queue_lock, flags);
797
798 return icq;
799 }
800
801 return NULL;
802}
803
804/*
805 * Scheduler run of queue, if there are requests pending and no one in the
806 * driver that will restart queueing.
807 */
808static void bfq_schedule_dispatch(struct bfq_data *bfqd)
809{
810 if (bfqd->queued != 0) {
811 bfq_log(bfqd, "schedule dispatch");
812 blk_mq_run_hw_queues(bfqd->queue, true);
813 }
814}
815
816/**
817 * bfq_gt - compare two timestamps.
818 * @a: first ts.
819 * @b: second ts.
820 *
821 * Return @a > @b, dealing with wrapping correctly.
822 */
823static int bfq_gt(u64 a, u64 b)
824{
825 return (s64)(a - b) > 0;
826}
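/*
 * Worked example (added for illustration): the signed subtraction in
 * bfq_gt() keeps comparisons correct across wraparound of the 64-bit
 * timestamps. With a = 10 and b = ULLONG_MAX - 5, a - b equals 16
 * when interpreted as an s64, so bfq_gt(a, b) returns 1: a is
 * considered "later" than b even though it is numerically smaller,
 * which is the desired behavior once the virtual time wraps.
 */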
827
828static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
829{
830 struct rb_node *node = tree->rb_node;
831
832 return rb_entry(node, struct bfq_entity, rb_node);
833}
834
835static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
836
837static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
838
839/**
840 * bfq_update_next_in_service - update sd->next_in_service
841 * @sd: sched_data for which to perform the update.
842 * @new_entity: if not NULL, pointer to the entity whose activation,
843 * requeueing or repositioning triggered the invocation of
844 * this function.
845 *
846 * This function is called to update sd->next_in_service, which, in
847 * its turn, may change as a consequence of the insertion or
848 * extraction of an entity into/from one of the active trees of
849 * sd. These insertions/extractions occur as a consequence of
850 * activations/deactivations of entities, with some activations being
851 * 'true' activations, and other activations being requeueings (i.e.,
852 * implementing the second, requeueing phase of the mechanism used to
853 * reposition an entity in its active tree; see comments on
854 * __bfq_activate_entity and __bfq_requeue_entity for details). In
855 * both the last two activation sub-cases, new_entity points to the
856 * just activated or requeued entity.
857 *
858 * Returns true if sd->next_in_service changes in such a way that
859 * entity->parent may become the next_in_service for its parent
860 * entity.
861 */
862static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
863 struct bfq_entity *new_entity)
864{
865 struct bfq_entity *next_in_service = sd->next_in_service;
866 bool parent_sched_may_change = false;
867
868 /*
869 * If this update is triggered by the activation, requeueing
870 * or repositioning of an entity that does not coincide with
871 * sd->next_in_service, then a full lookup in the active tree
872 * can be avoided. In fact, it is enough to check whether the
873 * just-modified entity has a higher priority than
874 * sd->next_in_service, or, even if it has the same priority
875 * as sd->next_in_service, is eligible and has a lower virtual
876 * finish time than sd->next_in_service. If this compound
877 * condition holds, then the new entity becomes the new
878 * next_in_service. Otherwise no change is needed.
879 */
880 if (new_entity && new_entity != sd->next_in_service) {
881 /*
882 * Flag used to decide whether to replace
883 * sd->next_in_service with new_entity. Tentatively
884 * set to true, and left as true if
885 * sd->next_in_service is NULL.
886 */
887 bool replace_next = true;
888
889 /*
890 * If there is already a next_in_service candidate
891 * entity, then compare class priorities or timestamps
892 * to decide whether to replace sd->next_in_service with
893 * new_entity.
894 */
895 if (next_in_service) {
896 unsigned int new_entity_class_idx =
897 bfq_class_idx(new_entity);
898 struct bfq_service_tree *st =
899 sd->service_tree + new_entity_class_idx;
900
901 /*
902 * For efficiency, evaluate the most likely
903 * sub-condition first.
904 */
905 replace_next =
906 (new_entity_class_idx ==
907 bfq_class_idx(next_in_service)
908 &&
909 !bfq_gt(new_entity->start, st->vtime)
910 &&
911 bfq_gt(next_in_service->finish,
912 new_entity->finish))
913 ||
914 new_entity_class_idx <
915 bfq_class_idx(next_in_service);
916 }
917
918 if (replace_next)
919 next_in_service = new_entity;
920 } else /* invoked because of a deactivation: lookup needed */
921 next_in_service = bfq_lookup_next_entity(sd);
922
923 if (next_in_service) {
924 parent_sched_may_change = !sd->next_in_service ||
925 bfq_update_parent_budget(next_in_service);
926 }
927
928 sd->next_in_service = next_in_service;
929
930 if (!next_in_service)
931 return parent_sched_may_change;
932
933 return parent_sched_may_change;
934}
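/*
 * Illustrative example (not part of the original source) of the
 * replace_next condition above: assume the newly activated entity and
 * sd->next_in_service are both in class BE, st->vtime = 100, the new
 * entity has start = 90 and finish = 140, and the current
 * next_in_service has finish = 180. The new entity is eligible
 * (start <= vtime) and has the smaller finish time, so it becomes the
 * new sd->next_in_service.
 */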
935
936#ifdef CONFIG_BFQ_GROUP_IOSCHED
937/* both next loops stop at one of the child entities of the root group */
938#define for_each_entity(entity) \
939 for (; entity ; entity = entity->parent)
940
941/*
942 * For each iteration, compute parent in advance, so as to be safe if
943 * entity is deallocated during the iteration. Such a deallocation may
944 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
945 * containing entity.
946 */
947#define for_each_entity_safe(entity, parent) \
948 for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
949
950/*
951 * Returns true if this budget changes may let next_in_service->parent
952 * become the next_in_service entity for its parent entity.
953 */
954static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
aee69d78 955{
e21b7a0b
AA
956 struct bfq_entity *bfqg_entity;
957 struct bfq_group *bfqg;
958 struct bfq_sched_data *group_sd;
959 bool ret = false;
960
961 group_sd = next_in_service->sched_data;
962
963 bfqg = container_of(group_sd, struct bfq_group, sched_data);
964 /*
965 * bfq_group's my_entity field is not NULL only if the group
966 * is not the root group. We must not touch the root entity
967 * as it must never become an in-service entity.
968 */
969 bfqg_entity = bfqg->my_entity;
970 if (bfqg_entity) {
971 if (bfqg_entity->budget > next_in_service->budget)
972 ret = true;
973 bfqg_entity->budget = next_in_service->budget;
974 }
975
976 return ret;
977}
978
979/*
980 * This function tells whether entity stops being a candidate for next
981 * service, according to the following logic.
982 *
983 * This function is invoked for an entity that is about to be set in
984 * service. If such an entity is a queue, then the entity is no longer
985 * a candidate for next service (i.e, a candidate entity to serve
986 * after the in-service entity is expired). The function then returns
987 * true.
988 */
989static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
990{
991 if (bfq_entity_to_bfqq(entity))
992 return true;
993
994 return false;
995}
996
997#else /* CONFIG_BFQ_GROUP_IOSCHED */
998/*
999 * Next two macros are fake loops when cgroups support is not
1000 * enabled. In fact, in such a case, there is only one level to go up
1001 * (to reach the root group).
1002 */
1003#define for_each_entity(entity) \
1004 for (; entity ; entity = NULL)
1005
1006#define for_each_entity_safe(entity, parent) \
1007 for (parent = NULL; entity ; entity = parent)
1008
1009static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
1010{
1011 return false;
1012}
1013
1014static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
1015{
1016 return true;
1017}
1018
1019#endif /* CONFIG_BFQ_GROUP_IOSCHED */
1020
1021/*
1022 * Shift for timestamp calculations. This actually limits the maximum
1023 * service allowed in one timestamp delta (small shift values increase it),
1024 * the maximum total weight that can be used for the queues in the system
1025 * (big shift values increase it), and the period of virtual time
1026 * wraparounds.
1027 */
1028#define WFQ_SERVICE_SHIFT 22
1029
1030static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
1031{
1032 struct bfq_queue *bfqq = NULL;
1033
1034 if (!entity->my_sched_data)
1035 bfqq = container_of(entity, struct bfq_queue, entity);
1036
1037 return bfqq;
1038}
1039
1040
1041/**
1042 * bfq_delta - map service into the virtual time domain.
1043 * @service: amount of service.
1044 * @weight: scale factor (weight of an entity or weight sum).
1045 */
1046static u64 bfq_delta(unsigned long service, unsigned long weight)
1047{
1048 u64 d = (u64)service << WFQ_SERVICE_SHIFT;
1049
1050 do_div(d, weight);
1051 return d;
1052}
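/*
 * Worked example (added for illustration): with WFQ_SERVICE_SHIFT
 * equal to 22, mapping service = 8 sectors at weight = 4 into the
 * virtual time domain gives bfq_delta(8, 4) = (8 << 22) / 4 =
 * 8388608, i.e., service/weight scaled by 2^22 so that the integer
 * division keeps enough precision.
 */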
1053
1054/**
1055 * bfq_calc_finish - assign the finish time to an entity.
1056 * @entity: the entity to act upon.
1057 * @service: the service to be charged to the entity.
1058 */
1059static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
1060{
1061 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1062
1063 entity->finish = entity->start +
1064 bfq_delta(service, entity->weight);
1065
1066 if (bfqq) {
1067 bfq_log_bfqq(bfqq->bfqd, bfqq,
1068 "calc_finish: serv %lu, w %d",
1069 service, entity->weight);
1070 bfq_log_bfqq(bfqq->bfqd, bfqq,
1071 "calc_finish: start %llu, finish %llu, delta %llu",
1072 entity->start, entity->finish,
1073 bfq_delta(service, entity->weight));
1074 }
1075}
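/*
 * Continuing the example above (illustrative only): an entity with
 * start = 1000, weight = 4 and budget = 8 sectors is assigned
 * finish = 1000 + bfq_delta(8, 4) = 1000 + 8388608. An entity with a
 * larger weight and the same budget would get an earlier finish time,
 * and would therefore tend to be selected earlier by B-WF2Q+.
 */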
1076
1077/**
1078 * bfq_entity_of - get an entity from a node.
1079 * @node: the node field of the entity.
1080 *
1081 * Convert a node pointer to the relative entity. This is used only
1082 * to simplify the logic of some functions and not as the generic
1083 * conversion mechanism because, e.g., in the tree walking functions,
1084 * the check for a %NULL value would be redundant.
1085 */
1086static struct bfq_entity *bfq_entity_of(struct rb_node *node)
1087{
1088 struct bfq_entity *entity = NULL;
1089
1090 if (node)
1091 entity = rb_entry(node, struct bfq_entity, rb_node);
1092
1093 return entity;
1094}
1095
1096/**
1097 * bfq_extract - remove an entity from a tree.
1098 * @root: the tree root.
1099 * @entity: the entity to remove.
1100 */
1101static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
1102{
1103 entity->tree = NULL;
1104 rb_erase(&entity->rb_node, root);
1105}
1106
1107/**
1108 * bfq_idle_extract - extract an entity from the idle tree.
1109 * @st: the service tree of the owning @entity.
1110 * @entity: the entity being removed.
1111 */
1112static void bfq_idle_extract(struct bfq_service_tree *st,
1113 struct bfq_entity *entity)
1114{
1115 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1116 struct rb_node *next;
1117
1118 if (entity == st->first_idle) {
1119 next = rb_next(&entity->rb_node);
1120 st->first_idle = bfq_entity_of(next);
1121 }
1122
1123 if (entity == st->last_idle) {
1124 next = rb_prev(&entity->rb_node);
1125 st->last_idle = bfq_entity_of(next);
1126 }
1127
1128 bfq_extract(&st->idle, entity);
1129
1130 if (bfqq)
1131 list_del(&bfqq->bfqq_list);
1132}
1133
1134/**
1135 * bfq_insert - generic tree insertion.
1136 * @root: tree root.
1137 * @entity: entity to insert.
1138 *
1139 * This is used for the idle and the active tree, since they are both
1140 * ordered by finish time.
1141 */
1142static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
1143{
1144 struct bfq_entity *entry;
1145 struct rb_node **node = &root->rb_node;
1146 struct rb_node *parent = NULL;
1147
1148 while (*node) {
1149 parent = *node;
1150 entry = rb_entry(parent, struct bfq_entity, rb_node);
1151
1152 if (bfq_gt(entry->finish, entity->finish))
1153 node = &parent->rb_left;
1154 else
1155 node = &parent->rb_right;
1156 }
1157
1158 rb_link_node(&entity->rb_node, parent, node);
1159 rb_insert_color(&entity->rb_node, root);
1160
1161 entity->tree = root;
1162}
1163
1164/**
1165 * bfq_update_min - update the min_start field of an entity.
1166 * @entity: the entity to update.
1167 * @node: one of its children.
1168 *
1169 * This function is called when @entity may store an invalid value for
1170 * min_start due to updates to the active tree. The function assumes
1171 * that the subtree rooted at @node (which may be its left or its right
1172 * child) has a valid min_start value.
1173 */
1174static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
1175{
1176 struct bfq_entity *child;
1177
1178 if (node) {
1179 child = rb_entry(node, struct bfq_entity, rb_node);
1180 if (bfq_gt(entity->min_start, child->min_start))
1181 entity->min_start = child->min_start;
1182 }
1183}
1184
1185/**
1186 * bfq_update_active_node - recalculate min_start.
1187 * @node: the node to update.
1188 *
1189 * @node may have changed position or one of its children may have moved,
1190 * this function updates its min_start value. The left and right subtrees
1191 * are assumed to hold a correct min_start value.
1192 */
1193static void bfq_update_active_node(struct rb_node *node)
1194{
1195 struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
1196
1197 entity->min_start = entity->start;
1198 bfq_update_min(entity, node->rb_right);
1199 bfq_update_min(entity, node->rb_left);
1200}
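/*
 * Illustrative example (not part of the original source): if a node
 * has start = 50 and its left and right subtrees hold min_start = 30
 * and min_start = 70 respectively, bfq_update_active_node() sets the
 * node's min_start to 30, the minimum over the node itself and its
 * subtrees. Maintaining this invariant is what allows the eligible
 * entity with the smallest finish time to be found in O(log N).
 */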
1201
1202/**
1203 * bfq_update_active_tree - update min_start for the whole active tree.
1204 * @node: the starting node.
1205 *
1206 * @node must be the deepest modified node after an update. This function
1207 * updates its min_start using the values held by its children, assuming
1208 * that they did not change, and then updates all the nodes that may have
1209 * changed in the path to the root. The only nodes that may have changed
1210 * are the ones in the path or their siblings.
1211 */
1212static void bfq_update_active_tree(struct rb_node *node)
1213{
1214 struct rb_node *parent;
1215
1216up:
1217 bfq_update_active_node(node);
1218
1219 parent = rb_parent(node);
1220 if (!parent)
1221 return;
1222
1223 if (node == parent->rb_left && parent->rb_right)
1224 bfq_update_active_node(parent->rb_right);
1225 else if (parent->rb_left)
1226 bfq_update_active_node(parent->rb_left);
1227
1228 node = parent;
1229 goto up;
1230}
1231
1232/**
1233 * bfq_active_insert - insert an entity in the active tree of its
1234 * group/device.
1235 * @st: the service tree of the entity.
1236 * @entity: the entity being inserted.
1237 *
1238 * The active tree is ordered by finish time, but an extra key is kept
1239 * per each node, containing the minimum value for the start times of
1240 * its children (and the node itself), so it's possible to search for
1241 * the eligible node with the lowest finish time in logarithmic time.
1242 */
1243static void bfq_active_insert(struct bfq_service_tree *st,
1244 struct bfq_entity *entity)
1245{
1246 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1247 struct rb_node *node = &entity->rb_node;
1248#ifdef CONFIG_BFQ_GROUP_IOSCHED
1249 struct bfq_sched_data *sd = NULL;
1250 struct bfq_group *bfqg = NULL;
1251 struct bfq_data *bfqd = NULL;
1252#endif
1253
1254 bfq_insert(&st->active, entity);
1255
1256 if (node->rb_left)
1257 node = node->rb_left;
1258 else if (node->rb_right)
1259 node = node->rb_right;
1260
1261 bfq_update_active_tree(node);
1262
1263#ifdef CONFIG_BFQ_GROUP_IOSCHED
1264 sd = entity->sched_data;
1265 bfqg = container_of(sd, struct bfq_group, sched_data);
1266 bfqd = (struct bfq_data *)bfqg->bfqd;
1267#endif
1268 if (bfqq)
1269 list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
1270}
1271
1272/**
1273 * bfq_ioprio_to_weight - calc a weight from an ioprio.
1274 * @ioprio: the ioprio value to convert.
1275 */
1276static unsigned short bfq_ioprio_to_weight(int ioprio)
1277{
1278 return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
1279}
1280
1281/**
1282 * bfq_weight_to_ioprio - calc an ioprio from a weight.
1283 * @weight: the weight value to convert.
1284 *
1285 * To preserve as much as possible the old only-ioprio user interface,
1286 * 0 is used as an escape ioprio value for weights (numerically) equal or
1287 * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
1288 */
1289static unsigned short bfq_weight_to_ioprio(int weight)
1290{
1291 return max_t(int, 0,
1292 IOPRIO_BE_NR - weight / BFQ_WEIGHT_CONVERSION_COEFF);
1293}
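/*
 * Worked example (added for illustration): with IOPRIO_BE_NR = 8 and
 * BFQ_WEIGHT_CONVERSION_COEFF = 10, ioprio 4 (the default) maps to
 * weight (8 - 4) * 10 = 40, and weight 40 maps back to ioprio
 * 8 - 40 / 10 = 4; weights of 80 and above yield the escape value 0
 * mentioned in the comment above.
 */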
1294
1295static void bfq_get_entity(struct bfq_entity *entity)
1296{
1297 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1298
1299 if (bfqq) {
1300 bfqq->ref++;
1301 bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
1302 bfqq, bfqq->ref);
1303 }
1304}
1305
1306/**
1307 * bfq_find_deepest - find the deepest node that an extraction can modify.
1308 * @node: the node being removed.
1309 *
1310 * Do the first step of an extraction in an rb tree, looking for the
1311 * node that will replace @node, and returning the deepest node that
1312 * the following modifications to the tree can touch. If @node is the
1313 * last node in the tree return %NULL.
1314 */
1315static struct rb_node *bfq_find_deepest(struct rb_node *node)
1316{
1317 struct rb_node *deepest;
1318
1319 if (!node->rb_right && !node->rb_left)
1320 deepest = rb_parent(node);
1321 else if (!node->rb_right)
1322 deepest = node->rb_left;
1323 else if (!node->rb_left)
1324 deepest = node->rb_right;
1325 else {
1326 deepest = rb_next(node);
1327 if (deepest->rb_right)
1328 deepest = deepest->rb_right;
1329 else if (rb_parent(deepest) != node)
1330 deepest = rb_parent(deepest);
1331 }
1332
1333 return deepest;
1334}
1335
1336/**
1337 * bfq_active_extract - remove an entity from the active tree.
1338 * @st: the service_tree containing the tree.
1339 * @entity: the entity being removed.
1340 */
1341static void bfq_active_extract(struct bfq_service_tree *st,
1342 struct bfq_entity *entity)
1343{
1344 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1345 struct rb_node *node;
1346#ifdef CONFIG_BFQ_GROUP_IOSCHED
1347 struct bfq_sched_data *sd = NULL;
1348 struct bfq_group *bfqg = NULL;
1349 struct bfq_data *bfqd = NULL;
1350#endif
1351
1352 node = bfq_find_deepest(&entity->rb_node);
1353 bfq_extract(&st->active, entity);
1354
1355 if (node)
1356 bfq_update_active_tree(node);
1357
1358#ifdef CONFIG_BFQ_GROUP_IOSCHED
1359 sd = entity->sched_data;
1360 bfqg = container_of(sd, struct bfq_group, sched_data);
1361 bfqd = (struct bfq_data *)bfqg->bfqd;
1362#endif
1363 if (bfqq)
1364 list_del(&bfqq->bfqq_list);
1365}
1366
1367/**
1368 * bfq_idle_insert - insert an entity into the idle tree.
1369 * @st: the service tree containing the tree.
1370 * @entity: the entity to insert.
1371 */
1372static void bfq_idle_insert(struct bfq_service_tree *st,
1373 struct bfq_entity *entity)
1374{
1375 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1376 struct bfq_entity *first_idle = st->first_idle;
1377 struct bfq_entity *last_idle = st->last_idle;
1378
1379 if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
1380 st->first_idle = entity;
1381 if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
1382 st->last_idle = entity;
1383
1384 bfq_insert(&st->idle, entity);
1385
1386 if (bfqq)
1387 list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
1388}
1389
1390/**
1391 * bfq_forget_entity - do not consider entity any longer for scheduling
1392 * @st: the service tree.
1393 * @entity: the entity being removed.
1394 * @is_in_service: true if entity is currently the in-service entity.
1395 *
1396 * Forget everything about @entity. In addition, if entity represents
1397 * a queue, and the latter is not in service, then release the service
1398 * reference to the queue (the one taken through bfq_get_entity). In
1399 * fact, in this case, there is really no more service reference to
1400 * the queue, as the latter is also outside any service tree. If,
1401 * instead, the queue is in service, then __bfq_bfqd_reset_in_service
1402 * will take care of putting the reference when the queue finally
1403 * stops being served.
1404 */
1405static void bfq_forget_entity(struct bfq_service_tree *st,
1406 struct bfq_entity *entity,
1407 bool is_in_service)
1408{
1409 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1410
1411 entity->on_st = false;
1412 st->wsum -= entity->weight;
1413 if (bfqq && !is_in_service)
1414 bfq_put_queue(bfqq);
1415}
1416
1417/**
1418 * bfq_put_idle_entity - release the idle tree ref of an entity.
1419 * @st: service tree for the entity.
1420 * @entity: the entity being released.
1421 */
1422static void bfq_put_idle_entity(struct bfq_service_tree *st,
1423 struct bfq_entity *entity)
1424{
1425 bfq_idle_extract(st, entity);
1426 bfq_forget_entity(st, entity,
1427 entity == entity->sched_data->in_service_entity);
1428}
1429
1430/**
1431 * bfq_forget_idle - update the idle tree if necessary.
1432 * @st: the service tree to act upon.
1433 *
1434 * To preserve the global O(log N) complexity we only remove one entry here;
1435 * as the idle tree will not grow indefinitely this can be done safely.
1436 */
1437static void bfq_forget_idle(struct bfq_service_tree *st)
1438{
1439 struct bfq_entity *first_idle = st->first_idle;
1440 struct bfq_entity *last_idle = st->last_idle;
1441
1442 if (RB_EMPTY_ROOT(&st->active) && last_idle &&
1443 !bfq_gt(last_idle->finish, st->vtime)) {
1444 /*
1445 * Forget the whole idle tree, increasing the vtime past
1446 * the last finish time of idle entities.
1447 */
1448 st->vtime = last_idle->finish;
1449 }
1450
1451 if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
1452 bfq_put_idle_entity(st, first_idle);
1453}
1454
1455static struct bfq_service_tree *
1456__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
1457 struct bfq_entity *entity)
1458{
1459 struct bfq_service_tree *new_st = old_st;
1460
1461 if (entity->prio_changed) {
1462 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1463 unsigned short prev_weight, new_weight;
1464 struct bfq_data *bfqd = NULL;
1465#ifdef CONFIG_BFQ_GROUP_IOSCHED
1466 struct bfq_sched_data *sd;
1467 struct bfq_group *bfqg;
1468#endif
1469
1470 if (bfqq)
1471 bfqd = bfqq->bfqd;
1472#ifdef CONFIG_BFQ_GROUP_IOSCHED
1473 else {
1474 sd = entity->my_sched_data;
1475 bfqg = container_of(sd, struct bfq_group, sched_data);
1476 bfqd = (struct bfq_data *)bfqg->bfqd;
1477 }
1478#endif
1479
1480 old_st->wsum -= entity->weight;
1481
1482 if (entity->new_weight != entity->orig_weight) {
1483 if (entity->new_weight < BFQ_MIN_WEIGHT ||
1484 entity->new_weight > BFQ_MAX_WEIGHT) {
1485 pr_crit("update_weight_prio: new_weight %d\n",
1486 entity->new_weight);
1487 if (entity->new_weight < BFQ_MIN_WEIGHT)
1488 entity->new_weight = BFQ_MIN_WEIGHT;
1489 else
1490 entity->new_weight = BFQ_MAX_WEIGHT;
1491 }
1492 entity->orig_weight = entity->new_weight;
1493 if (bfqq)
1494 bfqq->ioprio =
1495 bfq_weight_to_ioprio(entity->orig_weight);
1496 }
1497
1498 if (bfqq)
1499 bfqq->ioprio_class = bfqq->new_ioprio_class;
1500 entity->prio_changed = 0;
1501
1502 /*
1503 * NOTE: here we may be changing the weight too early,
1504 * this will cause unfairness. The correct approach
1505 * would have required additional complexity to defer
1506 * weight changes to the proper time instants (i.e.,
1507 * when entity->finish <= old_st->vtime).
1508 */
1509 new_st = bfq_entity_service_tree(entity);
1510
1511 prev_weight = entity->weight;
1512 new_weight = entity->orig_weight;
1513 entity->weight = new_weight;
1514
1515 new_st->wsum += entity->weight;
1516
1517 if (new_st != old_st)
1518 entity->start = new_st->vtime;
1519 }
1520
1521 return new_st;
1522}
1523
1524static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
1525static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
1526
1527/**
1528 * bfq_bfqq_served - update the scheduler status after selection for
1529 * service.
1530 * @bfqq: the queue being served.
1531 * @served: bytes to transfer.
1532 *
1533 * NOTE: this can be optimized, as the timestamps of upper level entities
1534 * are synchronized every time a new bfqq is selected for service. By now,
1535 * we keep it to better check consistency.
1536 */
1537static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
1538{
1539 struct bfq_entity *entity = &bfqq->entity;
1540 struct bfq_service_tree *st;
1541
1542 for_each_entity(entity) {
1543 st = bfq_entity_service_tree(entity);
1544
1545 entity->service += served;
1546
1547 st->vtime += bfq_delta(served, st->wsum);
1548 bfq_forget_idle(st);
1549 }
1550 bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
1551 bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
1552}
1553
1554/**
1555 * bfq_bfqq_charge_full_budget - set the service to the entity budget.
1556 * @bfqq: the queue that needs a service update.
1557 *
1558 * When it's not possible to be fair in the service domain, because
1559 * a queue is not consuming its budget fast enough (the meaning of
1560 * fast depends on the timeout parameter), we charge it a full
1561 * budget. In this way we should obtain a sort of time-domain
1562 * fairness among all the seeky/slow queues.
1563 */
1564static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
1565{
1566 struct bfq_entity *entity = &bfqq->entity;
1567
1568 bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
1569
1570 bfq_bfqq_served(bfqq, entity->budget - entity->service);
1571}
1572
1573static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
1574 struct bfq_service_tree *st,
1575 bool backshifted)
1576{
1577 st = __bfq_entity_update_weight_prio(st, entity);
1578 bfq_calc_finish(entity, entity->budget);
1579
1580 /*
1581 * If some queues enjoy backshifting for a while, then their
1582 * (virtual) finish timestamps may happen to become lower and
1583 * lower than the system virtual time. In particular, if
1584 * these queues often happen to be idle for short time
1585 * periods, and during such time periods other queues with
1586 * higher timestamps happen to be busy, then the backshifted
1587 * timestamps of the former queues can become much lower than
1588 * the system virtual time. In fact, to serve the queues with
1589 * higher timestamps while the ones with lower timestamps are
1590 * idle, the system virtual time may be pushed up to much
1591 * higher values than the finish timestamps of the idle
1592 * queues. As a consequence, the finish timestamps of all new
1593 * or newly activated queues may end up being much larger than
1594 * those of lucky queues with backshifted timestamps. The
1595 * latter queues may then monopolize the device for a lot of
1596 * time. This would simply break service guarantees.
1597 *
1598 * To reduce this problem, push up a little bit the
1599 * backshifted timestamps of the queue associated with this
1600 * entity (only a queue can happen to have the backshifted
1601 * flag set): just enough to let the finish timestamp of the
1602 * queue be equal to the current value of the system virtual
1603 * time. This may introduce a little unfairness among queues
1604 * with backshifted timestamps, but it does not break
1605 * worst-case fairness guarantees.
1606 */
1607 if (backshifted && bfq_gt(st->vtime, entity->finish)) {
1608 unsigned long delta = st->vtime - entity->finish;
1609
1610 entity->start += delta;
1611 entity->finish += delta;
1612 }
1613
1614 bfq_active_insert(st, entity);
1615}
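/*
 * Illustrative example (not part of the original source) of the
 * push-up above: if the entity was backshifted, st->vtime = 500 and
 * entity->finish = 450, then delta = 50 is added to both timestamps,
 * so the finish time becomes exactly the current virtual time and the
 * start time moves forward by the same amount.
 */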
1616
1617/**
1618 * __bfq_activate_entity - handle activation of entity.
1619 * @entity: the entity being activated.
1620 * @non_blocking_wait_rq: true if entity was waiting for a request
1621 *
1622 * Called for a 'true' activation, i.e., if entity is not active and
1623 * one of its children receives a new request.
1624 *
1625 * Basically, this function updates the timestamps of entity and
1626 * inserts entity into its active tree, after possibly extracting it
1627 * from its idle tree.
1628 */
1629static void __bfq_activate_entity(struct bfq_entity *entity,
1630 bool non_blocking_wait_rq)
1631{
1632 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1633 bool backshifted = false;
1634 unsigned long long min_vstart;
1635
1636	/* See comments on bfq_bfqq_update_budg_for_activation */
1637 if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
1638 backshifted = true;
1639 min_vstart = entity->finish;
1640 } else
1641 min_vstart = st->vtime;
1642
1643 if (entity->tree == &st->idle) {
1644 /*
1645 * Must be on the idle tree, bfq_idle_extract() will
1646 * check for that.
1647 */
1648 bfq_idle_extract(st, entity);
1649 entity->start = bfq_gt(min_vstart, entity->finish) ?
1650 min_vstart : entity->finish;
1651 } else {
1652 /*
1653 * The finish time of the entity may be invalid, and
1654 * it is in the past for sure, otherwise the queue
1655 * would have been on the idle tree.
1656 */
1657 entity->start = min_vstart;
1658 st->wsum += entity->weight;
1659 /*
1660 * entity is about to be inserted into a service tree,
1661 * and then set in service: get a reference to make
1662 * sure entity does not disappear until it is no
1663 * longer in service or scheduled for service.
1664 */
1665 bfq_get_entity(entity);
1666
1667 entity->on_st = true;
1668 }
1669
1670 bfq_update_fin_time_enqueue(entity, st, backshifted);
1671}
1672
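/*
 * Illustrative sketch (not compiled into BFQ, helper name made up for
 * the example): the choice of the start timestamp made above on a
 * true activation. min_vstart is either the current vtime or, for a
 * backshifted queue, its old finish time.
 */
#if 0
static unsigned long long
pick_start_on_activation(int was_on_idle_tree,
			 unsigned long long min_vstart,
			 unsigned long long old_finish)
{
	/*
	 * Coming back from the idle tree: never start before the old
	 * finish time, so the queue cannot be served again before the
	 * end of its last round and get more than its fair share.
	 */
	if (was_on_idle_tree &&
	    (long long)(old_finish - min_vstart) > 0)
		return old_finish;

	return min_vstart;
}
#endif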
1673/**
1674 * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
1675 * @entity: the entity being requeued or repositioned.
1676 *
1677 * Requeueing is needed if this entity stops being served, which
1678 * happens if a leaf descendant entity has expired. On the other hand,
1679 * repositioning is needed if the next_in_service entity for the child
1680 * entity has changed. See the comments inside the function for
1681 * details.
1682 *
1683 * Basically, this function: 1) removes entity from its active tree if
1684 * present there, 2) updates the timestamps of entity and 3) inserts
1685 * entity back into its active tree (in the new, right position for
1686 * the new values of the timestamps).
1687 */
1688static void __bfq_requeue_entity(struct bfq_entity *entity)
1689{
1690 struct bfq_sched_data *sd = entity->sched_data;
1691 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1692
1693 if (entity == sd->in_service_entity) {
1694 /*
1695 * We are requeueing the current in-service entity,
1696 * which may have to be done for one of the following
1697 * reasons:
1698 * - entity represents the in-service queue, and the
1699 * in-service queue is being requeued after an
1700 * expiration;
1701 * - entity represents a group, and its budget has
1702 * changed because one of its child entities has
1703 * just been either activated or requeued for some
1704	 * reason; the timestamps of the entity then need to
1705 * be updated, and the entity needs to be enqueued
1706 * or repositioned accordingly.
1707 *
1708 * In particular, before requeueing, the start time of
1709 * the entity must be moved forward to account for the
1710 * service that the entity has received while in
1711 * service. This is done by the next instructions. The
1712 * finish time will then be updated according to this
1713 * new value of the start time, and to the budget of
1714 * the entity.
1715 */
1716 bfq_calc_finish(entity, entity->service);
1717 entity->start = entity->finish;
1718 /*
1719 * In addition, if the entity had more than one child
1720		 * when set in service, then it was not extracted from
1721 * the active tree. This implies that the position of
1722 * the entity in the active tree may need to be
1723 * changed now, because we have just updated the start
1724 * time of the entity, and we will update its finish
1725 * time in a moment (the requeueing is then, more
1726 * precisely, a repositioning in this case). To
1727 * implement this repositioning, we: 1) dequeue the
1728 * entity here, 2) update the finish time and
1729 * requeue the entity according to the new
1730 * timestamps below.
1731 */
1732 if (entity->tree)
1733 bfq_active_extract(st, entity);
1734 } else { /* The entity is already active, and not in service */
1735 /*
1736 * In this case, this function gets called only if the
1737 * next_in_service entity below this entity has
1738 * changed, and this change has caused the budget of
1739		 * this entity to change, which, in turn, implies that
1740 * the finish time of this entity must be
1741 * updated. Such an update may cause the scheduling,
1742 * i.e., the position in the active tree, of this
1743 * entity to change. We handle this change by: 1)
1744 * dequeueing the entity here, 2) updating the finish
1745 * time and requeueing the entity according to the new
1746 * timestamps below. This is the same approach as the
1747 * non-extracted-entity sub-case above.
1748 */
1749 bfq_active_extract(st, entity);
1750 }
1751
1752 bfq_update_fin_time_enqueue(entity, st, false);
1753}
1754
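/*
 * Illustrative sketch (not compiled into BFQ, helper name made up for
 * the example): the timestamp update performed above when requeueing
 * the in-service entity. The WFQ_SERVICE_SHIFT scaling used by
 * bfq_calc_finish() is omitted; in this simplified model a finish
 * time is just start + amount/weight, with weight >= 1.
 */
#if 0
static void requeue_timestamps(unsigned long long *start,
			       unsigned long long *finish,
			       unsigned long long service_received,
			       unsigned long long budget,
			       unsigned int weight)
{
	/* Account for the service already received in this round ... */
	*finish = *start + service_received / weight;
	*start = *finish;
	/* ... then recompute the finish time from the full budget. */
	*finish = *start + budget / weight;
}
#endif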
1755static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
1756 struct bfq_sched_data *sd,
1757 bool non_blocking_wait_rq)
1758{
1759 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1760
1761 if (sd->in_service_entity == entity || entity->tree == &st->active)
1762 /*
1763 * in service or already queued on the active tree,
1764 * requeue or reposition
1765 */
1766 __bfq_requeue_entity(entity);
1767 else
1768 /*
1769 * Not in service and not queued on its active tree:
1770		 * the entity is idle and this is a true activation.
1771 */
1772 __bfq_activate_entity(entity, non_blocking_wait_rq);
1773}
1774
1775
1776/**
1777 * bfq_activate_entity - activate or requeue an entity representing a bfq_queue,
1778 * and activate, requeue or reposition all ancestors
1779 * for which such an update becomes necessary.
aee69d78
PV
1780 * @entity: the entity to activate.
1781 * @non_blocking_wait_rq: true if this entity was waiting for a request
e21b7a0b
AA
1782 * @requeue: true if this is a requeue, which implies that bfqq is
1783 * being expired; thus ALL its ancestors stop being served and must
1784 * therefore be requeued
aee69d78 1785 */
e21b7a0b
AA
1786static void bfq_activate_requeue_entity(struct bfq_entity *entity,
1787 bool non_blocking_wait_rq,
1788 bool requeue)
aee69d78
PV
1789{
1790 struct bfq_sched_data *sd;
1791
1792 for_each_entity(entity) {
aee69d78 1793 sd = entity->sched_data;
e21b7a0b
AA
1794 __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
1795
1796 if (!bfq_update_next_in_service(sd, entity) && !requeue)
aee69d78
PV
1797 break;
1798 }
1799}
1800
1801/**
1802 * __bfq_deactivate_entity - deactivate an entity from its service tree.
1803 * @entity: the entity to deactivate.
e21b7a0b
AA
1804 * @ins_into_idle_tree: if false, the entity will not be put into the
1805 * idle tree.
aee69d78 1806 *
e21b7a0b
AA
1807 * Deactivates an entity, independently from its previous state. Must
1808 * be invoked only if entity is on a service tree. Extracts the entity
1809 * from that tree, and if necessary and allowed, puts it on the idle
1810 * tree.
aee69d78 1811 */
e21b7a0b
AA
1812static bool __bfq_deactivate_entity(struct bfq_entity *entity,
1813 bool ins_into_idle_tree)
aee69d78
PV
1814{
1815 struct bfq_sched_data *sd = entity->sched_data;
1816 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1817 int is_in_service = entity == sd->in_service_entity;
aee69d78 1818
e21b7a0b
AA
1819 if (!entity->on_st) /* entity never activated, or already inactive */
1820 return false;
aee69d78 1821
e21b7a0b 1822 if (is_in_service)
aee69d78 1823 bfq_calc_finish(entity, entity->service);
e21b7a0b
AA
1824
1825 if (entity->tree == &st->active)
aee69d78 1826 bfq_active_extract(st, entity);
e21b7a0b 1827 else if (!is_in_service && entity->tree == &st->idle)
aee69d78
PV
1828 bfq_idle_extract(st, entity);
1829
e21b7a0b 1830 if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
aee69d78
PV
1831 bfq_forget_entity(st, entity, is_in_service);
1832 else
1833 bfq_idle_insert(st, entity);
1834
e21b7a0b 1835 return true;
aee69d78
PV
1836}
1837
1838/**
e21b7a0b 1839 * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
aee69d78 1840 * @entity: the entity to deactivate.
e21b7a0b 1841 * @ins_into_idle_tree: true if the entity can be put on the idle tree
aee69d78 1842 */
e21b7a0b
AA
1843static void bfq_deactivate_entity(struct bfq_entity *entity,
1844 bool ins_into_idle_tree,
1845 bool expiration)
aee69d78
PV
1846{
1847 struct bfq_sched_data *sd;
1848 struct bfq_entity *parent = NULL;
1849
1850 for_each_entity_safe(entity, parent) {
1851 sd = entity->sched_data;
1852
e21b7a0b 1853 if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
aee69d78 1854 /*
e21b7a0b
AA
1855 * entity is not in any tree any more, so
1856 * this deactivation is a no-op, and there is
1857 * nothing to change for upper-level entities
1858 * (in case of expiration, this can never
1859 * happen).
aee69d78 1860 */
e21b7a0b
AA
1861 return;
1862 }
1863
1864 if (sd->next_in_service == entity)
1865 /*
1866 * entity was the next_in_service entity,
1867 * then, since entity has just been
1868 * deactivated, a new one must be found.
1869 */
1870 bfq_update_next_in_service(sd, NULL);
aee69d78
PV
1871
1872 if (sd->next_in_service)
1873 /*
e21b7a0b
AA
1874 * The parent entity is still backlogged,
1875 * because next_in_service is not NULL. So, no
1876 * further upwards deactivation must be
1877 * performed. Yet, next_in_service has
1878 * changed. Then the schedule does need to be
1879 * updated upwards.
aee69d78 1880 */
e21b7a0b 1881 break;
aee69d78
PV
1882
1883 /*
e21b7a0b
AA
1884		 * If we get here, then the parent is no longer
1885 * backlogged and we need to propagate the
1886 * deactivation upwards. Thus let the loop go on.
aee69d78 1887 */
aee69d78 1888
e21b7a0b
AA
1889 /*
1890 * Also let parent be queued into the idle tree on
1891 * deactivation, to preserve service guarantees, and
1892 * assuming that who invoked this function does not
1893 * need parent entities too to be removed completely.
1894 */
1895 ins_into_idle_tree = true;
1896 }
aee69d78 1897
e21b7a0b
AA
1898 /*
1899 * If the deactivation loop is fully executed, then there are
1900	 * no more entities to touch and the next loop is not executed at
1901 * all. Otherwise, requeue remaining entities if they are
1902 * about to stop receiving service, or reposition them if this
1903 * is not the case.
1904 */
aee69d78
PV
1905 entity = parent;
1906 for_each_entity(entity) {
e21b7a0b
AA
1907 /*
1908 * Invoke __bfq_requeue_entity on entity, even if
1909 * already active, to requeue/reposition it in the
1910 * active tree (because sd->next_in_service has
1911 * changed)
1912 */
1913 __bfq_requeue_entity(entity);
aee69d78
PV
1914
1915 sd = entity->sched_data;
e21b7a0b
AA
1916 if (!bfq_update_next_in_service(sd, entity) &&
1917 !expiration)
1918 /*
1919 * next_in_service unchanged or not causing
1920 * any change in entity->parent->sd, and no
1921 * requeueing needed for expiration: stop
1922 * here.
1923 */
aee69d78
PV
1924 break;
1925 }
1926}
1927
1928/**
e21b7a0b
AA
1929 * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
1930 * if needed, to have at least one entity eligible.
aee69d78
PV
1931 * @st: the service tree to act upon.
1932 *
e21b7a0b 1933 * Assumes that st is not empty.
aee69d78 1934 */
e21b7a0b 1935static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
aee69d78 1936{
e21b7a0b
AA
1937 struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
1938
1939 if (bfq_gt(root_entity->min_start, st->vtime))
1940 return root_entity->min_start;
1941
1942 return st->vtime;
1943}
aee69d78 1944
e21b7a0b
AA
1945static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
1946{
1947 if (new_value > st->vtime) {
1948 st->vtime = new_value;
aee69d78
PV
1949 bfq_forget_idle(st);
1950 }
1951}
1952
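/*
 * Illustrative sketch (not compiled into BFQ, helper names made up for
 * the example): wrap-safe timestamp comparison and the vtime jump
 * computed above. As in bfq_gt(), "a > b" is decided on the signed
 * difference, so the comparison keeps working when the 64-bit virtual
 * time wraps around.
 */
#if 0
static int vt_gt(unsigned long long a, unsigned long long b)
{
	return (long long)(a - b) > 0;
}

static unsigned long long vtime_jump(unsigned long long vtime,
				     unsigned long long root_min_start)
{
	/*
	 * Jump forward only: if no active entity is eligible (the
	 * smallest start time is still ahead of vtime), push vtime up
	 * to that start time; otherwise leave it alone.
	 */
	return vt_gt(root_min_start, vtime) ? root_min_start : vtime;
}
#endif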
1953/**
1954 * bfq_first_active_entity - find the eligible entity with
1955 * the smallest finish time
1956 * @st: the service tree to select from.
e21b7a0b 1957 * @vtime: the system virtual to use as a reference for eligibility
aee69d78
PV
1958 *
1959 * This function searches the first schedulable entity, starting from the
1960 * root of the tree and going on the left every time on this side there is
1961 * a subtree with at least one eligible (start <= vtime) entity. The path on
1962 * the right is followed only if a) the left subtree contains no eligible
1963 * entities and b) no eligible entity has been found yet.
1964 */
e21b7a0b
AA
1965static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
1966 u64 vtime)
aee69d78
PV
1967{
1968 struct bfq_entity *entry, *first = NULL;
1969 struct rb_node *node = st->active.rb_node;
1970
1971 while (node) {
1972 entry = rb_entry(node, struct bfq_entity, rb_node);
1973left:
e21b7a0b 1974 if (!bfq_gt(entry->start, vtime))
aee69d78
PV
1975 first = entry;
1976
1977 if (node->rb_left) {
1978 entry = rb_entry(node->rb_left,
1979 struct bfq_entity, rb_node);
e21b7a0b 1980 if (!bfq_gt(entry->min_start, vtime)) {
aee69d78
PV
1981 node = node->rb_left;
1982 goto left;
1983 }
1984 }
1985 if (first)
1986 break;
1987 node = node->rb_right;
1988 }
1989
e21b7a0b
AA
1990 return first;
1991}
1992
1993/**
1994 * __bfq_lookup_next_entity - return the first eligible entity in @st.
1995 * @st: the service tree.
1996 *
1997 * If there is no in-service entity for the sched_data st belongs to,
1998 * then return the entity that will be set in service if:
1999 * 1) the parent entity this st belongs to is set in service;
2000 * 2) no entity belonging to such parent entity undergoes a state change
2001 * that would influence the timestamps of the entity (e.g., becomes idle,
2002 * becomes backlogged, changes its budget, ...).
2003 *
2004 * In this first case, update the virtual time in @st too (see the
2005 * comments on this update inside the function).
2006 *
2007 * In contrast, if there is an in-service entity, then return the
2008 * entity that would be set in service if not only the above
2009 * conditions, but also the next one held true: the currently
2010 * in-service entity, on expiration,
2011 * 1) gets a finish time equal to the current one, or
2012 * 2) is not eligible any more, or
2013 * 3) is idle.
2014 */
2015static struct bfq_entity *
2016__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
2017{
2018 struct bfq_entity *entity;
2019 u64 new_vtime;
2020
2021 if (RB_EMPTY_ROOT(&st->active))
2022 return NULL;
2023
2024 /*
2025 * Get the value of the system virtual time for which at
2026 * least one entity is eligible.
2027 */
2028 new_vtime = bfq_calc_vtime_jump(st);
2029
2030 /*
2031 * If there is no in-service entity for the sched_data this
2032 * active tree belongs to, then push the system virtual time
2033 * up to the value that guarantees that at least one entity is
2034 * eligible. If, instead, there is an in-service entity, then
2035 * do not make any such update, because there is already an
2036 * eligible entity, namely the in-service one (even if the
2037 * entity is not on st, because it was extracted when set in
2038 * service).
2039 */
2040 if (!in_service)
2041 bfq_update_vtime(st, new_vtime);
2042
2043 entity = bfq_first_active_entity(st, new_vtime);
2044
2045 return entity;
2046}
2047
2048/**
2049 * bfq_lookup_next_entity - return the first eligible entity in @sd.
2050 * @sd: the sched_data.
2051 *
2052 * This function is invoked when there has been a change in the trees
2053 * for sd, and we need to know what the new next entity is after this
2054 * change.
2055 */
2056static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
2057{
2058 struct bfq_service_tree *st = sd->service_tree;
2059 struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
2060 struct bfq_entity *entity = NULL;
2061 int class_idx = 0;
2062
2063 /*
2064 * Choose from idle class, if needed to guarantee a minimum
2065 * bandwidth to this class (and if there is some active entity
2066 * in idle class). This should also mitigate
2067 * priority-inversion problems in case a low priority task is
2068 * holding file system resources.
2069 */
2070 if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
2071 BFQ_CL_IDLE_TIMEOUT)) {
2072 if (!RB_EMPTY_ROOT(&idle_class_st->active))
2073 class_idx = BFQ_IOPRIO_CLASSES - 1;
2074 /* About to be served if backlogged, or not yet backlogged */
2075 sd->bfq_class_idle_last_service = jiffies;
2076 }
2077
2078 /*
2079 * Find the next entity to serve for the highest-priority
2080 * class, unless the idle class needs to be served.
2081 */
2082 for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
2083 entity = __bfq_lookup_next_entity(st + class_idx,
2084 sd->in_service_entity);
2085
2086 if (entity)
2087 break;
2088 }
2089
2090 if (!entity)
2091 return NULL;
2092
2093 return entity;
2094}
2095
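/*
 * Illustrative sketch (not compiled into BFQ, helper name made up for
 * the example): the anti-starvation rule for CLASS_IDLE used above,
 * modelled with a plain monotonic counter instead of jiffies;
 * 'timeout' plays the role of BFQ_CL_IDLE_TIMEOUT.
 */
#if 0
static int pick_start_class(unsigned long now,
			    unsigned long *last_idle_service,
			    unsigned long timeout,
			    int idle_class_backlogged,
			    int nr_classes)
{
	int class_idx = 0;

	/* Has the idle class waited longer than the timeout? */
	if ((long)(now - (*last_idle_service + timeout)) > 0) {
		if (idle_class_backlogged)
			class_idx = nr_classes - 1; /* serve idle class */
		*last_idle_service = now;
	}

	return class_idx; /* then scan classes from class_idx upwards */
}
#endif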
2096static bool next_queue_may_preempt(struct bfq_data *bfqd)
2097{
2098 struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
2099
2100 return sd->next_in_service != sd->in_service_entity;
2101}
2102
2103/*
2104 * Get next queue for service.
2105 */
2106static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
2107{
2108 struct bfq_entity *entity = NULL;
2109 struct bfq_sched_data *sd;
2110 struct bfq_queue *bfqq;
2111
2112 if (bfqd->busy_queues == 0)
2113 return NULL;
2114
2115 /*
2116 * Traverse the path from the root to the leaf entity to
2117 * serve. Set in service all the entities visited along the
2118 * way.
2119 */
2120 sd = &bfqd->root_group->sched_data;
2121 for (; sd ; sd = entity->my_sched_data) {
2122 /*
2123 * WARNING. We are about to set the in-service entity
2124 * to sd->next_in_service, i.e., to the (cached) value
2125 * returned by bfq_lookup_next_entity(sd) the last
2126 * time it was invoked, i.e., the last time when the
2127 * service order in sd changed as a consequence of the
2128 * activation or deactivation of an entity. In this
2129 * respect, if we execute bfq_lookup_next_entity(sd)
2130 * in this very moment, it may, although with low
2131 * probability, yield a different entity than that
2132 * pointed to by sd->next_in_service. This rare event
2133 * happens in case there was no CLASS_IDLE entity to
2134 * serve for sd when bfq_lookup_next_entity(sd) was
2135 * invoked for the last time, while there is now one
2136 * such entity.
2137 *
2138 * If the above event happens, then the scheduling of
2139 * such entity in CLASS_IDLE is postponed until the
2140 * service of the sd->next_in_service entity
2141 * finishes. In fact, when the latter is expired,
2142 * bfq_lookup_next_entity(sd) gets called again,
2143 * exactly to update sd->next_in_service.
2144 */
2145
2146 /* Make next_in_service entity become in_service_entity */
2147 entity = sd->next_in_service;
2148 sd->in_service_entity = entity;
2149
2150 /*
2151 * Reset the accumulator of the amount of service that
2152 * the entity is about to receive.
2153 */
2154 entity->service = 0;
2155
2156 /*
2157 * If entity is no longer a candidate for next
2158 * service, then we extract it from its active tree,
2159 * for the following reason. To further boost the
2160 * throughput in some special case, BFQ needs to know
2161 * which is the next candidate entity to serve, while
2162 * there is already an entity in service. In this
2163 * respect, to make it easy to compute/update the next
2164 * candidate entity to serve after the current
2165 * candidate has been set in service, there is a case
2166 * where it is necessary to extract the current
2167 * candidate from its service tree. Such a case is
2168 * when the entity just set in service cannot be also
2169 * a candidate for next service. Details about when
2170		 * this condition holds are reported in the comments
2171 * on the function bfq_no_longer_next_in_service()
2172 * invoked below.
2173 */
2174 if (bfq_no_longer_next_in_service(entity))
2175 bfq_active_extract(bfq_entity_service_tree(entity),
2176 entity);
2177
2178 /*
2179 * For the same reason why we may have just extracted
2180 * entity from its active tree, we may need to update
2181 * next_in_service for the sched_data of entity too,
2182 * regardless of whether entity has been extracted.
2183 * In fact, even if entity has not been extracted, a
2184 * descendant entity may get extracted. Such an event
2185 * would cause a change in next_in_service for the
2186 * level of the descendant entity, and thus possibly
2187 * back to upper levels.
2188 *
2189 * We cannot perform the resulting needed update
2190 * before the end of this loop, because, to know which
2191 * is the correct next-to-serve candidate entity for
2192 * each level, we need first to find the leaf entity
2193 * to set in service. In fact, only after we know
2194 * which is the next-to-serve leaf entity, we can
2195 * discover whether the parent entity of the leaf
2196 * entity becomes the next-to-serve, and so on.
2197 */
2198
2199 }
2200
2201 bfqq = bfq_entity_to_bfqq(entity);
2202
2203 /*
2204 * We can finally update all next-to-serve entities along the
2205 * path from the leaf entity just set in service to the root.
2206 */
2207 for_each_entity(entity) {
2208 struct bfq_sched_data *sd = entity->sched_data;
2209
2210 if (!bfq_update_next_in_service(sd, NULL))
2211 break;
2212 }
2213
2214 return bfqq;
2215}
2216
2217static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
2218{
2219 struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
2220 struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
2221 struct bfq_entity *entity = in_serv_entity;
2222
2223 if (bfqd->in_service_bic) {
2224 put_io_context(bfqd->in_service_bic->icq.ioc);
2225 bfqd->in_service_bic = NULL;
2226 }
2227
2228 bfq_clear_bfqq_wait_request(in_serv_bfqq);
2229 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
2230 bfqd->in_service_queue = NULL;
2231
2232 /*
2233 * When this function is called, all in-service entities have
2234 * been properly deactivated or requeued, so we can safely
2235 * execute the final step: reset in_service_entity along the
2236 * path from entity to the root.
2237 */
2238 for_each_entity(entity)
2239 entity->sched_data->in_service_entity = NULL;
2240
2241 /*
2242 * in_serv_entity is no longer in service, so, if it is in no
2243 * service tree either, then release the service reference to
2244 * the queue it represents (taken with bfq_get_entity).
2245 */
2246 if (!in_serv_entity->on_st)
2247 bfq_put_queue(in_serv_bfqq);
2248}
2249
2250static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2251 bool ins_into_idle_tree, bool expiration)
2252{
2253 struct bfq_entity *entity = &bfqq->entity;
2254
2255 bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
2256}
2257
2258static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2259{
2260 struct bfq_entity *entity = &bfqq->entity;
2261
2262 bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
2263 false);
2264 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
2265}
2266
2267static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2268{
2269 struct bfq_entity *entity = &bfqq->entity;
2270
2271 bfq_activate_requeue_entity(entity, false,
2272 bfqq == bfqd->in_service_queue);
2273}
2274
2275static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
2276
2277/*
2278 * Called when the bfqq no longer has requests pending, remove it from
2279 * the service tree. As a special case, it can be invoked during an
2280 * expiration.
2281 */
2282static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2283 bool expiration)
2284{
2285 bfq_log_bfqq(bfqd, bfqq, "del from busy");
2286
2287 bfq_clear_bfqq_busy(bfqq);
2288
2289 bfqd->busy_queues--;
2290
2291 bfqg_stats_update_dequeue(bfqq_group(bfqq));
2292
2293 bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
2294}
2295
2296/*
2297 * Called when an inactive queue receives a new request.
2298 */
2299static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2300{
2301 bfq_log_bfqq(bfqd, bfqq, "add to busy");
2302
2303 bfq_activate_bfqq(bfqd, bfqq);
2304
2305 bfq_mark_bfqq_busy(bfqq);
2306 bfqd->busy_queues++;
2307}
2308
2309#ifdef CONFIG_BFQ_GROUP_IOSCHED
2310
2311/* bfqg stats flags */
2312enum bfqg_stats_flags {
2313 BFQG_stats_waiting = 0,
2314 BFQG_stats_idling,
2315 BFQG_stats_empty,
2316};
2317
2318#define BFQG_FLAG_FNS(name) \
2319static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
2320{ \
2321 stats->flags |= (1 << BFQG_stats_##name); \
2322} \
2323static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
2324{ \
2325 stats->flags &= ~(1 << BFQG_stats_##name); \
2326} \
2327static int bfqg_stats_##name(struct bfqg_stats *stats) \
2328{ \
2329 return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
2330} \
2331
2332BFQG_FLAG_FNS(waiting)
2333BFQG_FLAG_FNS(idling)
2334BFQG_FLAG_FNS(empty)
2335#undef BFQG_FLAG_FNS
2336
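/*
 * For reference, this is what BFQG_FLAG_FNS(waiting) expands to; a
 * sketch of the preprocessor output, not additional code.
 */
#if 0
static void bfqg_stats_mark_waiting(struct bfqg_stats *stats)
{
	stats->flags |= (1 << BFQG_stats_waiting);
}
static void bfqg_stats_clear_waiting(struct bfqg_stats *stats)
{
	stats->flags &= ~(1 << BFQG_stats_waiting);
}
static int bfqg_stats_waiting(struct bfqg_stats *stats)
{
	return (stats->flags & (1 << BFQG_stats_waiting)) != 0;
}
#endif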
2337/* This should be called with the queue_lock held. */
2338static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
2339{
2340 unsigned long long now;
2341
2342 if (!bfqg_stats_waiting(stats))
2343 return;
2344
2345 now = sched_clock();
2346 if (time_after64(now, stats->start_group_wait_time))
2347 blkg_stat_add(&stats->group_wait_time,
2348 now - stats->start_group_wait_time);
2349 bfqg_stats_clear_waiting(stats);
2350}
2351
2352/* This should be called with the queue_lock held. */
2353static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
2354 struct bfq_group *curr_bfqg)
2355{
2356 struct bfqg_stats *stats = &bfqg->stats;
2357
2358 if (bfqg_stats_waiting(stats))
2359 return;
2360 if (bfqg == curr_bfqg)
2361 return;
2362 stats->start_group_wait_time = sched_clock();
2363 bfqg_stats_mark_waiting(stats);
2364}
2365
2366/* This should be called with the queue_lock held. */
2367static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
2368{
2369 unsigned long long now;
2370
2371 if (!bfqg_stats_empty(stats))
2372 return;
2373
2374 now = sched_clock();
2375 if (time_after64(now, stats->start_empty_time))
2376 blkg_stat_add(&stats->empty_time,
2377 now - stats->start_empty_time);
2378 bfqg_stats_clear_empty(stats);
2379}
2380
2381static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
2382{
2383 blkg_stat_add(&bfqg->stats.dequeue, 1);
2384}
2385
2386static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
2387{
2388 struct bfqg_stats *stats = &bfqg->stats;
2389
2390 if (blkg_rwstat_total(&stats->queued))
2391 return;
2392
2393 /*
2394	 * group is already marked empty. This can happen if bfqq got a new
2395 * request in parent group and moved to this group while being added
2396 * to service tree. Just ignore the event and move on.
2397 */
2398 if (bfqg_stats_empty(stats))
2399 return;
2400
2401 stats->start_empty_time = sched_clock();
2402 bfqg_stats_mark_empty(stats);
2403}
2404
2405static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
2406{
2407 struct bfqg_stats *stats = &bfqg->stats;
2408
2409 if (bfqg_stats_idling(stats)) {
2410 unsigned long long now = sched_clock();
2411
2412 if (time_after64(now, stats->start_idle_time))
2413 blkg_stat_add(&stats->idle_time,
2414 now - stats->start_idle_time);
2415 bfqg_stats_clear_idling(stats);
2416 }
2417}
2418
2419static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
2420{
2421 struct bfqg_stats *stats = &bfqg->stats;
2422
2423 stats->start_idle_time = sched_clock();
2424 bfqg_stats_mark_idling(stats);
2425}
2426
2427static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
2428{
2429 struct bfqg_stats *stats = &bfqg->stats;
2430
2431 blkg_stat_add(&stats->avg_queue_size_sum,
2432 blkg_rwstat_total(&stats->queued));
2433 blkg_stat_add(&stats->avg_queue_size_samples, 1);
2434 bfqg_stats_update_group_wait_time(stats);
2435}
2436
2437/*
2438 * blk-cgroup policy-related handlers
2439 * The following functions help in converting between blk-cgroup
2440 * internal structures and BFQ-specific structures.
2441 */
2442
2443static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
2444{
2445 return pd ? container_of(pd, struct bfq_group, pd) : NULL;
2446}
2447
2448static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
2449{
2450 return pd_to_blkg(&bfqg->pd);
2451}
2452
2453static struct blkcg_policy blkcg_policy_bfq;
2454
2455static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
2456{
2457 return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
2458}
2459
2460/*
2461 * bfq_group handlers
2462 * The following functions help in navigating the bfq_group hierarchy
2463 * by allowing to find the parent of a bfq_group or the bfq_group
2464 * associated to a bfq_queue.
2465 */
2466
2467static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
2468{
2469 struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
2470
2471 return pblkg ? blkg_to_bfqg(pblkg) : NULL;
2472}
2473
2474static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
2475{
2476 struct bfq_entity *group_entity = bfqq->entity.parent;
2477
2478 return group_entity ? container_of(group_entity, struct bfq_group,
2479 entity) :
2480 bfqq->bfqd->root_group;
2481}
2482
2483/*
2484 * The following two functions handle get and put of a bfq_group by
2485 * wrapping the related blk-cgroup hooks.
2486 */
2487
2488static void bfqg_get(struct bfq_group *bfqg)
2489{
2490 return blkg_get(bfqg_to_blkg(bfqg));
2491}
2492
2493static void bfqg_put(struct bfq_group *bfqg)
2494{
2495 return blkg_put(bfqg_to_blkg(bfqg));
2496}
2497
2498static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
2499 struct bfq_queue *bfqq,
2500 unsigned int op)
2501{
2502 blkg_rwstat_add(&bfqg->stats.queued, op, 1);
2503 bfqg_stats_end_empty_time(&bfqg->stats);
2504 if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
2505 bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
2506}
2507
2508static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
2509{
2510 blkg_rwstat_add(&bfqg->stats.queued, op, -1);
2511}
2512
2513static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
2514{
2515 blkg_rwstat_add(&bfqg->stats.merged, op, 1);
2516}
2517
2518static void bfqg_stats_update_completion(struct bfq_group *bfqg,
2519 uint64_t start_time, uint64_t io_start_time,
2520 unsigned int op)
2521{
2522 struct bfqg_stats *stats = &bfqg->stats;
2523 unsigned long long now = sched_clock();
2524
2525 if (time_after64(now, io_start_time))
2526 blkg_rwstat_add(&stats->service_time, op,
2527 now - io_start_time);
2528 if (time_after64(io_start_time, start_time))
2529 blkg_rwstat_add(&stats->wait_time, op,
2530 io_start_time - start_time);
2531}
2532
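/*
 * Illustrative sketch (not compiled into BFQ, names made up for the
 * example): the two intervals accounted by
 * bfqg_stats_update_completion() above. Timestamps are plain
 * nanosecond counters standing in for sched_clock() values.
 */
#if 0
struct completion_sample {
	unsigned long long wait_time;		/* insertion -> dispatch */
	unsigned long long service_time;	/* dispatch -> completion */
};

static struct completion_sample
split_request_latency(unsigned long long start_time,	/* insertion */
		      unsigned long long io_start_time,	/* dispatch */
		      unsigned long long now)		/* completion */
{
	struct completion_sample s = { 0, 0 };

	if (now > io_start_time)
		s.service_time = now - io_start_time;
	if (io_start_time > start_time)
		s.wait_time = io_start_time - start_time;
	return s;
}
#endif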
2533/* @stats = 0 */
2534static void bfqg_stats_reset(struct bfqg_stats *stats)
2535{
2536 /* queued stats shouldn't be cleared */
2537 blkg_rwstat_reset(&stats->merged);
2538 blkg_rwstat_reset(&stats->service_time);
2539 blkg_rwstat_reset(&stats->wait_time);
2540 blkg_stat_reset(&stats->time);
2541 blkg_stat_reset(&stats->avg_queue_size_sum);
2542 blkg_stat_reset(&stats->avg_queue_size_samples);
2543 blkg_stat_reset(&stats->dequeue);
2544 blkg_stat_reset(&stats->group_wait_time);
2545 blkg_stat_reset(&stats->idle_time);
2546 blkg_stat_reset(&stats->empty_time);
2547}
2548
2549/* @to += @from */
2550static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
2551{
2552 if (!to || !from)
2553 return;
2554
2555 /* queued stats shouldn't be cleared */
2556 blkg_rwstat_add_aux(&to->merged, &from->merged);
2557 blkg_rwstat_add_aux(&to->service_time, &from->service_time);
2558 blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
2559	blkg_stat_add_aux(&to->time, &from->time);
2560 blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
2561 blkg_stat_add_aux(&to->avg_queue_size_samples,
2562 &from->avg_queue_size_samples);
2563 blkg_stat_add_aux(&to->dequeue, &from->dequeue);
2564 blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
2565 blkg_stat_add_aux(&to->idle_time, &from->idle_time);
2566 blkg_stat_add_aux(&to->empty_time, &from->empty_time);
2567}
2568
2569/*
2570 * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
2571 * recursive stats can still account for the amount used by this bfqg after
2572 * it's gone.
2573 */
2574static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
2575{
2576 struct bfq_group *parent;
2577
2578 if (!bfqg) /* root_group */
2579 return;
2580
2581 parent = bfqg_parent(bfqg);
2582
2583 lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
2584
2585 if (unlikely(!parent))
2586 return;
2587
2588 bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
2589 bfqg_stats_reset(&bfqg->stats);
2590}
2591
2592static void bfq_init_entity(struct bfq_entity *entity,
2593 struct bfq_group *bfqg)
2594{
2595 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
2596
2597 entity->weight = entity->new_weight;
2598 entity->orig_weight = entity->new_weight;
2599 if (bfqq) {
2600 bfqq->ioprio = bfqq->new_ioprio;
2601 bfqq->ioprio_class = bfqq->new_ioprio_class;
2602 bfqg_get(bfqg);
2603 }
2604 entity->parent = bfqg->my_entity; /* NULL for root group */
2605 entity->sched_data = &bfqg->sched_data;
2606}
2607
2608static void bfqg_stats_exit(struct bfqg_stats *stats)
2609{
2610 blkg_rwstat_exit(&stats->merged);
2611 blkg_rwstat_exit(&stats->service_time);
2612 blkg_rwstat_exit(&stats->wait_time);
2613 blkg_rwstat_exit(&stats->queued);
2614 blkg_stat_exit(&stats->time);
2615 blkg_stat_exit(&stats->avg_queue_size_sum);
2616 blkg_stat_exit(&stats->avg_queue_size_samples);
2617 blkg_stat_exit(&stats->dequeue);
2618 blkg_stat_exit(&stats->group_wait_time);
2619 blkg_stat_exit(&stats->idle_time);
2620 blkg_stat_exit(&stats->empty_time);
2621}
2622
2623static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
2624{
2625 if (blkg_rwstat_init(&stats->merged, gfp) ||
2626 blkg_rwstat_init(&stats->service_time, gfp) ||
2627 blkg_rwstat_init(&stats->wait_time, gfp) ||
2628 blkg_rwstat_init(&stats->queued, gfp) ||
2629 blkg_stat_init(&stats->time, gfp) ||
2630 blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
2631 blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
2632 blkg_stat_init(&stats->dequeue, gfp) ||
2633 blkg_stat_init(&stats->group_wait_time, gfp) ||
2634 blkg_stat_init(&stats->idle_time, gfp) ||
2635 blkg_stat_init(&stats->empty_time, gfp)) {
2636 bfqg_stats_exit(stats);
2637 return -ENOMEM;
2638 }
2639
2640 return 0;
2641}
2642
2643static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
2644{
2645 return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
2646}
2647
2648static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
2649{
2650 return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
2651}
2652
2653static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
2654{
2655 struct bfq_group_data *bgd;
2656
2657 bgd = kzalloc(sizeof(*bgd), gfp);
2658 if (!bgd)
2659 return NULL;
2660 return &bgd->pd;
2661}
2662
2663static void bfq_cpd_init(struct blkcg_policy_data *cpd)
2664{
2665 struct bfq_group_data *d = cpd_to_bfqgd(cpd);
2666
2667 d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
2668 CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
2669}
2670
2671static void bfq_cpd_free(struct blkcg_policy_data *cpd)
2672{
2673 kfree(cpd_to_bfqgd(cpd));
2674}
2675
2676static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
2677{
2678 struct bfq_group *bfqg;
2679
2680 bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
2681 if (!bfqg)
2682 return NULL;
2683
2684 if (bfqg_stats_init(&bfqg->stats, gfp)) {
2685 kfree(bfqg);
2686 return NULL;
2687 }
2688
2689 return &bfqg->pd;
2690}
2691
2692static void bfq_pd_init(struct blkg_policy_data *pd)
2693{
2694 struct blkcg_gq *blkg = pd_to_blkg(pd);
2695 struct bfq_group *bfqg = blkg_to_bfqg(blkg);
2696 struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
2697 struct bfq_entity *entity = &bfqg->entity;
2698 struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
2699
2700 entity->orig_weight = entity->weight = entity->new_weight = d->weight;
2701 entity->my_sched_data = &bfqg->sched_data;
2702 bfqg->my_entity = entity; /*
2703 * the root_group's will be set to NULL
2704 * in bfq_init_queue()
2705 */
2706 bfqg->bfqd = bfqd;
2707}
2708
2709static void bfq_pd_free(struct blkg_policy_data *pd)
2710{
2711 struct bfq_group *bfqg = pd_to_bfqg(pd);
2712
2713 bfqg_stats_exit(&bfqg->stats);
2714 return kfree(bfqg);
2715}
2716
2717static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
2718{
2719 struct bfq_group *bfqg = pd_to_bfqg(pd);
2720
2721 bfqg_stats_reset(&bfqg->stats);
2722}
2723
2724static void bfq_group_set_parent(struct bfq_group *bfqg,
2725 struct bfq_group *parent)
2726{
2727 struct bfq_entity *entity;
2728
2729 entity = &bfqg->entity;
2730 entity->parent = parent->my_entity;
2731 entity->sched_data = &parent->sched_data;
2732}
2733
2734static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
2735 struct blkcg *blkcg)
2736{
2737 struct blkcg_gq *blkg;
2738
2739 blkg = blkg_lookup(blkcg, bfqd->queue);
2740 if (likely(blkg))
2741 return blkg_to_bfqg(blkg);
2742 return NULL;
2743}
2744
2745static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
2746 struct blkcg *blkcg)
2747{
2748 struct bfq_group *bfqg, *parent;
2749 struct bfq_entity *entity;
2750
2751 bfqg = bfq_lookup_bfqg(bfqd, blkcg);
2752
2753 if (unlikely(!bfqg))
2754 return NULL;
2755
2756 /*
2757 * Update chain of bfq_groups as we might be handling a leaf group
2758 * which, along with some of its relatives, has not been hooked yet
2759 * to the private hierarchy of BFQ.
2760 */
2761 entity = &bfqg->entity;
2762 for_each_entity(entity) {
2763 bfqg = container_of(entity, struct bfq_group, entity);
2764 if (bfqg != bfqd->root_group) {
2765 parent = bfqg_parent(bfqg);
2766 if (!parent)
2767 parent = bfqd->root_group;
2768 bfq_group_set_parent(bfqg, parent);
2769 }
2770 }
2771
2772 return bfqg;
2773}
2774
2775static void bfq_bfqq_expire(struct bfq_data *bfqd,
2776 struct bfq_queue *bfqq,
2777 bool compensate,
2778 enum bfqq_expiration reason);
2779
2780/**
2781 * bfq_bfqq_move - migrate @bfqq to @bfqg.
2782 * @bfqd: queue descriptor.
2783 * @bfqq: the queue to move.
2784 * @bfqg: the group to move to.
2785 *
2786 * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
2787 * it on the new one. Avoid putting the entity on the old group idle tree.
2788 *
2789 * Must be called under the queue lock; the cgroup owning @bfqg must
2790 * not disappear (by now this just means that we are called under
2791 * rcu_read_lock()).
2792 */
2793static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2794 struct bfq_group *bfqg)
2795{
2796 struct bfq_entity *entity = &bfqq->entity;
2797
2798 /* If bfqq is empty, then bfq_bfqq_expire also invokes
2799 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
2800 * from data structures related to current group. Otherwise we
2801 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
2802 * we do below.
2803 */
2804 if (bfqq == bfqd->in_service_queue)
2805 bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
2806 false, BFQQE_PREEMPTED);
2807
2808 if (bfq_bfqq_busy(bfqq))
2809 bfq_deactivate_bfqq(bfqd, bfqq, false, false);
2810 else if (entity->on_st)
2811 bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
2812 bfqg_put(bfqq_group(bfqq));
2813
2814 /*
2815 * Here we use a reference to bfqg. We don't need a refcounter
2816 * as the cgroup reference will not be dropped, so that its
2817 * destroy() callback will not be invoked.
2818 */
2819 entity->parent = bfqg->my_entity;
2820 entity->sched_data = &bfqg->sched_data;
2821 bfqg_get(bfqg);
2822
2823 if (bfq_bfqq_busy(bfqq))
2824 bfq_activate_bfqq(bfqd, bfqq);
2825
2826 if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
2827 bfq_schedule_dispatch(bfqd);
2828}
2829
2830/**
2831 * __bfq_bic_change_cgroup - move @bic to @cgroup.
2832 * @bfqd: the queue descriptor.
2833 * @bic: the bic to move.
2834 * @blkcg: the blk-cgroup to move to.
2835 *
2836 * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
2837 * has to make sure that the reference to cgroup is valid across the call.
2838 *
2839 * NOTE: an alternative approach might have been to store the current
2840 * cgroup in bfqq and getting a reference to it, reducing the lookup
2841 * time here, at the price of slightly more complex code.
2842 */
2843static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
2844 struct bfq_io_cq *bic,
2845 struct blkcg *blkcg)
2846{
2847 struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
2848 struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
2849 struct bfq_group *bfqg;
2850 struct bfq_entity *entity;
2851
2852 bfqg = bfq_find_set_group(bfqd, blkcg);
2853
2854 if (unlikely(!bfqg))
2855 bfqg = bfqd->root_group;
2856
2857 if (async_bfqq) {
2858 entity = &async_bfqq->entity;
2859
2860 if (entity->sched_data != &bfqg->sched_data) {
2861 bic_set_bfqq(bic, NULL, 0);
2862 bfq_log_bfqq(bfqd, async_bfqq,
2863 "bic_change_group: %p %d",
2864 async_bfqq,
2865 async_bfqq->ref);
2866 bfq_put_queue(async_bfqq);
2867 }
2868 }
2869
2870 if (sync_bfqq) {
2871 entity = &sync_bfqq->entity;
2872 if (entity->sched_data != &bfqg->sched_data)
2873 bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
2874 }
2875
2876 return bfqg;
2877}
2878
2879static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
2880{
2881 struct bfq_data *bfqd = bic_to_bfqd(bic);
2882 struct bfq_group *bfqg = NULL;
2883 uint64_t serial_nr;
2884
2885 rcu_read_lock();
2886 serial_nr = bio_blkcg(bio)->css.serial_nr;
2887
2888 /*
2889 * Check whether blkcg has changed. The condition may trigger
2890	 * spuriously on a newly created bic but there's no harm.
2891 */
2892 if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
2893 goto out;
2894
2895 bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
2896 bic->blkcg_serial_nr = serial_nr;
2897out:
2898 rcu_read_unlock();
2899}
2900
2901/**
2902 * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
2903 * @st: the service tree being flushed.
2904 */
2905static void bfq_flush_idle_tree(struct bfq_service_tree *st)
2906{
2907 struct bfq_entity *entity = st->first_idle;
2908
2909 for (; entity ; entity = st->first_idle)
2910 __bfq_deactivate_entity(entity, false);
2911}
2912
2913/**
2914 * bfq_reparent_leaf_entity - move leaf entity to the root_group.
2915 * @bfqd: the device data structure with the root group.
2916 * @entity: the entity to move.
2917 */
2918static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
2919 struct bfq_entity *entity)
2920{
2921 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
2922
2923 bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
aee69d78
PV
2924}
2925
2926/**
e21b7a0b
AA
2927 * bfq_reparent_active_entities - move to the root group all active
2928 * entities.
2929 * @bfqd: the device data structure with the root group.
2930 * @bfqg: the group to move from.
2931 * @st: the service tree with the entities.
aee69d78 2932 *
e21b7a0b 2933 * Needs queue_lock to be taken and reference to be valid over the call.
aee69d78 2934 */
e21b7a0b
AA
2935static void bfq_reparent_active_entities(struct bfq_data *bfqd,
2936 struct bfq_group *bfqg,
2937 struct bfq_service_tree *st)
aee69d78 2938{
e21b7a0b
AA
2939 struct rb_root *active = &st->active;
2940 struct bfq_entity *entity = NULL;
aee69d78 2941
e21b7a0b
AA
2942 if (!RB_EMPTY_ROOT(&st->active))
2943 entity = bfq_entity_of(rb_first(active));
aee69d78 2944
e21b7a0b
AA
2945 for (; entity ; entity = bfq_entity_of(rb_first(active)))
2946 bfq_reparent_leaf_entity(bfqd, entity);
aee69d78 2947
e21b7a0b
AA
2948 if (bfqg->sched_data.in_service_entity)
2949 bfq_reparent_leaf_entity(bfqd,
2950 bfqg->sched_data.in_service_entity);
aee69d78
PV
2951}
2952
2953/**
e21b7a0b
AA
2954 * bfq_pd_offline - deactivate the entity associated with @pd,
2955 * and reparent its children entities.
2956 * @pd: descriptor of the policy going offline.
aee69d78 2957 *
e21b7a0b
AA
2958 * blkio already grabs the queue_lock for us, so no need to use
2959 * RCU-based magic
aee69d78 2960 */
e21b7a0b 2961static void bfq_pd_offline(struct blkg_policy_data *pd)
aee69d78 2962{
e21b7a0b
AA
2963 struct bfq_service_tree *st;
2964 struct bfq_group *bfqg = pd_to_bfqg(pd);
2965 struct bfq_data *bfqd = bfqg->bfqd;
2966 struct bfq_entity *entity = bfqg->my_entity;
2967 unsigned long flags;
2968 int i;
aee69d78 2969
e21b7a0b
AA
2970 if (!entity) /* root group */
2971 return;
2972
2973 spin_lock_irqsave(&bfqd->lock, flags);
aee69d78 2974 /*
e21b7a0b
AA
2975 * Empty all service_trees belonging to this group before
2976 * deactivating the group itself.
aee69d78 2977 */
e21b7a0b
AA
2978 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
2979 st = bfqg->sched_data.service_tree + i;
2980
2981 /*
2982 * The idle tree may still contain bfq_queues belonging
2983		 * to exited tasks because they never migrated to a different
2984 * cgroup from the one being destroyed now. No one else
2985 * can access them so it's safe to act without any lock.
2986 */
2987 bfq_flush_idle_tree(st);
2988
2989 /*
2990 * It may happen that some queues are still active
2991 * (busy) upon group destruction (if the corresponding
2992 * processes have been forced to terminate). We move
2993 * all the leaf entities corresponding to these queues
2994 * to the root_group.
2995 * Also, it may happen that the group has an entity
2996 * in service, which is disconnected from the active
2997 * tree: it must be moved, too.
2998 * There is no need to put the sync queues, as the
2999 * scheduler has taken no reference.
3000 */
3001 bfq_reparent_active_entities(bfqd, bfqg, st);
aee69d78
PV
3002 }
3003
e21b7a0b
AA
3004 __bfq_deactivate_entity(entity, false);
3005 bfq_put_async_queues(bfqd, bfqg);
3006
3007 spin_unlock_irqrestore(&bfqd->lock, flags);
3008 /*
3009 * @blkg is going offline and will be ignored by
3010 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
3011 * that they don't get lost. If IOs complete after this point, the
3012 * stats for them will be lost. Oh well...
3013 */
3014 bfqg_stats_xfer_dead(bfqg);
aee69d78
PV
3015}
3016
e21b7a0b 3017static int bfq_io_show_weight(struct seq_file *sf, void *v)
aee69d78 3018{
e21b7a0b
AA
3019 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3020 struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
3021 unsigned int val = 0;
aee69d78 3022
e21b7a0b
AA
3023 if (bfqgd)
3024 val = bfqgd->weight;
aee69d78 3025
e21b7a0b 3026 seq_printf(sf, "%u\n", val);
aee69d78 3027
e21b7a0b
AA
3028 return 0;
3029}
3030
3031static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
3032 struct cftype *cftype,
3033 u64 val)
aee69d78 3034{
e21b7a0b
AA
3035 struct blkcg *blkcg = css_to_blkcg(css);
3036 struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
3037 struct blkcg_gq *blkg;
3038 int ret = -ERANGE;
aee69d78 3039
e21b7a0b
AA
3040 if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
3041 return ret;
aee69d78 3042
e21b7a0b
AA
3043 ret = 0;
3044 spin_lock_irq(&blkcg->lock);
3045 bfqgd->weight = (unsigned short)val;
3046 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
3047 struct bfq_group *bfqg = blkg_to_bfqg(blkg);
3048
3049 if (!bfqg)
3050 continue;
3051 /*
3052 * Setting the prio_changed flag of the entity
3053 * to 1 with new_weight == weight would re-set
3054 * the value of the weight to its ioprio mapping.
3055 * Set the flag only if necessary.
3056 */
3057 if ((unsigned short)val != bfqg->entity.new_weight) {
3058 bfqg->entity.new_weight = (unsigned short)val;
3059 /*
3060 * Make sure that the above new value has been
3061 * stored in bfqg->entity.new_weight before
3062 * setting the prio_changed flag. In fact,
3063 * this flag may be read asynchronously (in
3064 * critical sections protected by a different
3065 * lock than that held here), and finding this
3066 * flag set may cause the execution of the code
3067 * for updating parameters whose value may
3068 * depend also on bfqg->entity.new_weight (in
3069 * __bfq_entity_update_weight_prio).
3070 * This barrier makes sure that the new value
3071 * of bfqg->entity.new_weight is correctly
3072 * seen in that code.
3073 */
3074 smp_wmb();
3075 bfqg->entity.prio_changed = 1;
3076 }
aee69d78 3077 }
e21b7a0b 3078 spin_unlock_irq(&blkcg->lock);
aee69d78 3079
e21b7a0b
AA
3080 return ret;
3081}
aee69d78 3082
e21b7a0b
AA
3083static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
3084 char *buf, size_t nbytes,
3085 loff_t off)
3086{
3087 u64 weight;
3088 /* First unsigned long found in the file is used */
3089 int ret = kstrtoull(strim(buf), 0, &weight);
3090
3091 if (ret)
3092 return ret;
3093
3094 return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
aee69d78
PV
3095}
3096
e21b7a0b 3097static int bfqg_print_stat(struct seq_file *sf, void *v)
aee69d78 3098{
e21b7a0b
AA
3099 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
3100 &blkcg_policy_bfq, seq_cft(sf)->private, false);
3101 return 0;
3102}
aee69d78 3103
e21b7a0b
AA
3104static int bfqg_print_rwstat(struct seq_file *sf, void *v)
3105{
3106 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
3107 &blkcg_policy_bfq, seq_cft(sf)->private, true);
3108 return 0;
3109}
aee69d78 3110
e21b7a0b
AA
3111static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
3112 struct blkg_policy_data *pd, int off)
3113{
3114 u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
3115 &blkcg_policy_bfq, off);
3116 return __blkg_prfill_u64(sf, pd, sum);
3117}
aee69d78 3118
e21b7a0b
AA
3119static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
3120 struct blkg_policy_data *pd, int off)
3121{
3122 struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
3123 &blkcg_policy_bfq,
3124 off);
3125 return __blkg_prfill_rwstat(sf, pd, &sum);
aee69d78
PV
3126}
3127
e21b7a0b 3128static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
aee69d78 3129{
e21b7a0b
AA
3130 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3131 bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
3132 seq_cft(sf)->private, false);
3133 return 0;
3134}
aee69d78 3135
e21b7a0b
AA
3136static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
3137{
3138 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3139 bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
3140 seq_cft(sf)->private, true);
3141 return 0;
aee69d78
PV
3142}
3143
e21b7a0b
AA
3144static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
3145 int off)
aee69d78 3146{
e21b7a0b 3147 u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
aee69d78 3148
e21b7a0b 3149 return __blkg_prfill_u64(sf, pd, sum >> 9);
aee69d78
PV
3150}
3151
e21b7a0b 3152static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
aee69d78 3153{
e21b7a0b
AA
3154 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3155 bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
3156 return 0;
3157}
aee69d78 3158
e21b7a0b
AA
3159static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
3160 struct blkg_policy_data *pd, int off)
3161{
3162 struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
3163 offsetof(struct blkcg_gq, stat_bytes));
3164 u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
3165 atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
aee69d78 3166
e21b7a0b
AA
3167 return __blkg_prfill_u64(sf, pd, sum >> 9);
3168}
aee69d78 3169
e21b7a0b
AA
3170static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
3171{
3172 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3173 bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
3174 false);
3175 return 0;
aee69d78
PV
3176}
3177
e21b7a0b
AA
3178static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
3179 struct blkg_policy_data *pd, int off)
aee69d78 3180{
e21b7a0b
AA
3181 struct bfq_group *bfqg = pd_to_bfqg(pd);
3182 u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
3183 u64 v = 0;
aee69d78 3184
e21b7a0b
AA
3185 if (samples) {
3186 v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
3187 v = div64_u64(v, samples);
3188 }
3189 __blkg_prfill_u64(sf, pd, v);
3190 return 0;
3191}
aee69d78 3192
e21b7a0b
AA
3193/* print avg_queue_size */
3194static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
3195{
3196 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3197 bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
3198 0, false);
3199 return 0;
3200}
3201
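/*
 * Illustrative sketch (not compiled into BFQ, helper name made up for
 * the example): how the exported avg_queue_size value relates to the
 * two counters updated in bfqg_stats_update_avg_queue_size().
 */
#if 0
static unsigned long long avg_queue_size(unsigned long long queued_sum,
					 unsigned long long samples)
{
	/* Sum of queued-request snapshots divided by the sample count. */
	return samples ? queued_sum / samples : 0;
}
#endif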
3202static struct bfq_group *
3203bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
3204{
3205 int ret;
3206
3207 ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
3208 if (ret)
3209 return NULL;
3210
3211 return blkg_to_bfqg(bfqd->queue->root_blkg);
aee69d78
PV
3212}
3213
e21b7a0b
AA
3214static struct cftype bfq_blkcg_legacy_files[] = {
3215 {
3216 .name = "bfq.weight",
3217 .flags = CFTYPE_NOT_ON_ROOT,
3218 .seq_show = bfq_io_show_weight,
3219 .write_u64 = bfq_io_set_weight_legacy,
3220 },
3221
3222 /* statistics, covers only the tasks in the bfqg */
3223 {
3224 .name = "bfq.time",
3225 .private = offsetof(struct bfq_group, stats.time),
3226 .seq_show = bfqg_print_stat,
3227 },
3228 {
3229 .name = "bfq.sectors",
3230 .seq_show = bfqg_print_stat_sectors,
3231 },
3232 {
3233 .name = "bfq.io_service_bytes",
3234 .private = (unsigned long)&blkcg_policy_bfq,
3235 .seq_show = blkg_print_stat_bytes,
3236 },
3237 {
3238 .name = "bfq.io_serviced",
3239 .private = (unsigned long)&blkcg_policy_bfq,
3240 .seq_show = blkg_print_stat_ios,
3241 },
3242 {
3243 .name = "bfq.io_service_time",
3244 .private = offsetof(struct bfq_group, stats.service_time),
3245 .seq_show = bfqg_print_rwstat,
3246 },
3247 {
3248 .name = "bfq.io_wait_time",
3249 .private = offsetof(struct bfq_group, stats.wait_time),
3250 .seq_show = bfqg_print_rwstat,
3251 },
3252 {
3253 .name = "bfq.io_merged",
3254 .private = offsetof(struct bfq_group, stats.merged),
3255 .seq_show = bfqg_print_rwstat,
3256 },
3257 {
3258 .name = "bfq.io_queued",
3259 .private = offsetof(struct bfq_group, stats.queued),
3260 .seq_show = bfqg_print_rwstat,
3261 },
3262
3263	/* the same statistics which cover the bfqg and its descendants */
3264 {
3265 .name = "bfq.time_recursive",
3266 .private = offsetof(struct bfq_group, stats.time),
3267 .seq_show = bfqg_print_stat_recursive,
3268 },
3269 {
3270 .name = "bfq.sectors_recursive",
3271 .seq_show = bfqg_print_stat_sectors_recursive,
3272 },
3273 {
3274 .name = "bfq.io_service_bytes_recursive",
3275 .private = (unsigned long)&blkcg_policy_bfq,
3276 .seq_show = blkg_print_stat_bytes_recursive,
3277 },
3278 {
3279 .name = "bfq.io_serviced_recursive",
3280 .private = (unsigned long)&blkcg_policy_bfq,
3281 .seq_show = blkg_print_stat_ios_recursive,
3282 },
3283 {
3284 .name = "bfq.io_service_time_recursive",
3285 .private = offsetof(struct bfq_group, stats.service_time),
3286 .seq_show = bfqg_print_rwstat_recursive,
3287 },
3288 {
3289 .name = "bfq.io_wait_time_recursive",
3290 .private = offsetof(struct bfq_group, stats.wait_time),
3291 .seq_show = bfqg_print_rwstat_recursive,
3292 },
3293 {
3294 .name = "bfq.io_merged_recursive",
3295 .private = offsetof(struct bfq_group, stats.merged),
3296 .seq_show = bfqg_print_rwstat_recursive,
3297 },
3298 {
3299 .name = "bfq.io_queued_recursive",
3300 .private = offsetof(struct bfq_group, stats.queued),
3301 .seq_show = bfqg_print_rwstat_recursive,
3302 },
3303 {
3304 .name = "bfq.avg_queue_size",
3305 .seq_show = bfqg_print_avg_queue_size,
3306 },
3307 {
3308 .name = "bfq.group_wait_time",
3309 .private = offsetof(struct bfq_group, stats.group_wait_time),
3310 .seq_show = bfqg_print_stat,
3311 },
3312 {
3313 .name = "bfq.idle_time",
3314 .private = offsetof(struct bfq_group, stats.idle_time),
3315 .seq_show = bfqg_print_stat,
3316 },
3317 {
3318 .name = "bfq.empty_time",
3319 .private = offsetof(struct bfq_group, stats.empty_time),
3320 .seq_show = bfqg_print_stat,
3321 },
3322 {
3323 .name = "bfq.dequeue",
3324 .private = offsetof(struct bfq_group, stats.dequeue),
3325 .seq_show = bfqg_print_stat,
3326 },
3327 { } /* terminate */
3328};
3329
3330static struct cftype bfq_blkg_files[] = {
3331 {
3332 .name = "bfq.weight",
3333 .flags = CFTYPE_NOT_ON_ROOT,
3334 .seq_show = bfq_io_show_weight,
3335 .write = bfq_io_set_weight,
3336 },
3337 {} /* terminate */
3338};
3339
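/*
 * Usage sketch (user space, not part of this file): setting the BFQ
 * weight of a cgroup through the "bfq.weight" file declared above.
 * The mount point and the "blkio."/"io." controller prefix depend on
 * whether the legacy or the unified hierarchy is in use; the path and
 * group name below are only an example.
 */
#if 0
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/fs/cgroup/blkio/mygroup/blkio.bfq.weight"; /* example */
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Any value in [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT] is accepted. */
	fprintf(f, "500\n");
	fclose(f);
	return 0;
}
#endif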
3340#else /* CONFIG_BFQ_GROUP_IOSCHED */
3341
3342static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
3343 struct bfq_queue *bfqq, unsigned int op) { }
3344static inline void
3345bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
3346static inline void
3347bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
3348static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
3349 uint64_t start_time, uint64_t io_start_time,
3350 unsigned int op) { }
3351static inline void
3352bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
3353 struct bfq_group *curr_bfqg) { }
3354static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
3355static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
3356static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
3357static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
3358static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
3359static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
3360
3361static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3362 struct bfq_group *bfqg) {}
3363
3364static void bfq_init_entity(struct bfq_entity *entity,
3365 struct bfq_group *bfqg)
3366{
3367 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
3368
3369 entity->weight = entity->new_weight;
3370 entity->orig_weight = entity->new_weight;
3371 if (bfqq) {
3372 bfqq->ioprio = bfqq->new_ioprio;
3373 bfqq->ioprio_class = bfqq->new_ioprio_class;
3374 }
3375 entity->sched_data = &bfqg->sched_data;
3376}
3377
3378static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}
3379
3380static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
3381 struct blkcg *blkcg)
3382{
3383 return bfqd->root_group;
3384}
3385
3386static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
3387{
3388 return bfqq->bfqd->root_group;
3389}
3390
3391static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd,
3392 int node)
3393{
3394 struct bfq_group *bfqg;
3395 int i;
3396
3397 bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
3398 if (!bfqg)
3399 return NULL;
3400
3401 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
3402 bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
3403
3404 return bfqg;
3405}
3406#endif /* CONFIG_BFQ_GROUP_IOSCHED */
3407
3408#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
3409#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
3410
3411#define bfq_sample_valid(samples) ((samples) > 80)
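/*
 * A rough reading of the 80 threshold (illustrative, based on the update
 * rule used in bfq_update_io_thinktime() below): ttime_samples is refreshed
 * as (7 * samples + 256) / 8, so it saturates towards 256 and crosses 80
 * only after about three updates when starting from zero. The macro thus
 * just filters out think-time means computed over too short a history.
 */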
3412
3413/*
3414 * Lifted from AS - choose which of rq1 and rq2 is best served now.
3415 * We choose the request that is closest to the head right now. Distance
3416 * behind the head is penalized and only allowed to a certain extent.
3417 */
3418static struct request *bfq_choose_req(struct bfq_data *bfqd,
3419 struct request *rq1,
3420 struct request *rq2,
3421 sector_t last)
3422{
3423 sector_t s1, s2, d1 = 0, d2 = 0;
3424 unsigned long back_max;
3425#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
3426#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
3427 unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
3428
3429 if (!rq1 || rq1 == rq2)
3430 return rq2;
3431 if (!rq2)
3432 return rq1;
3433
3434 if (rq_is_sync(rq1) && !rq_is_sync(rq2))
3435 return rq1;
3436 else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
3437 return rq2;
3438 if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
3439 return rq1;
3440 else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
3441 return rq2;
3442
3443 s1 = blk_rq_pos(rq1);
3444 s2 = blk_rq_pos(rq2);
3445
3446 /*
3447 * By definition, 1KiB is 2 sectors.
3448 */
3449 back_max = bfqd->bfq_back_max * 2;
3450
3451 /*
3452 * Strict one way elevator _except_ in the case where we allow
3453 * short backward seeks which are biased as twice the cost of a
3454 * similar forward seek.
3455 */
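	/*
	 * Illustrative numbers, assuming the default bfq_back_max of
	 * 16384 KiB (32768 sectors) and the default bfq_back_penalty of 2:
	 * a request 1000 sectors behind the head gets d = 2000, a request
	 * 1000 sectors ahead gets d = 1000, and a request more than 32768
	 * sectors behind is flagged as wrapped and loses to any
	 * non-wrapped request.
	 */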
3456 if (s1 >= last)
3457 d1 = s1 - last;
3458 else if (s1 + back_max >= last)
3459 d1 = (last - s1) * bfqd->bfq_back_penalty;
3460 else
3461 wrap |= BFQ_RQ1_WRAP;
3462
3463 if (s2 >= last)
3464 d2 = s2 - last;
3465 else if (s2 + back_max >= last)
3466 d2 = (last - s2) * bfqd->bfq_back_penalty;
3467 else
3468 wrap |= BFQ_RQ2_WRAP;
3469
3470 /* Found required data */
3471
3472 /*
3473 * By doing switch() on the bit mask "wrap" we avoid having to
3474 * check two variables for all permutations: --> faster!
3475 */
3476 switch (wrap) {
3477 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
3478 if (d1 < d2)
3479 return rq1;
3480 else if (d2 < d1)
3481 return rq2;
3482
3483 if (s1 >= s2)
3484 return rq1;
3485 else
3486 return rq2;
3487
3488 case BFQ_RQ2_WRAP:
3489 return rq1;
3490 case BFQ_RQ1_WRAP:
3491 return rq2;
3492 case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */
3493 default:
3494 /*
3495 * Since both rqs are wrapped,
3496 * start with the one that's further behind head
3497 * (--> only *one* back seek required),
3498 * since back seek takes more time than forward.
3499 */
3500 if (s1 <= s2)
3501 return rq1;
3502 else
3503 return rq2;
3504 }
3505}
3506
3507/*
3508 * Return expired entry, or NULL to just start from scratch in rbtree.
3509 */
3510static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
3511 struct request *last)
3512{
3513 struct request *rq;
3514
3515 if (bfq_bfqq_fifo_expire(bfqq))
3516 return NULL;
3517
3518 bfq_mark_bfqq_fifo_expire(bfqq);
3519
3520 rq = rq_entry_fifo(bfqq->fifo.next);
3521
3522 if (rq == last || ktime_get_ns() < rq->fifo_time)
3523 return NULL;
3524
3525 bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
3526 return rq;
3527}
3528
3529static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
3530 struct bfq_queue *bfqq,
3531 struct request *last)
3532{
3533 struct rb_node *rbnext = rb_next(&last->rb_node);
3534 struct rb_node *rbprev = rb_prev(&last->rb_node);
3535 struct request *next, *prev = NULL;
3536
3537 /* Follow expired path, else get first next available. */
3538 next = bfq_check_fifo(bfqq, last);
3539 if (next)
3540 return next;
3541
3542 if (rbprev)
3543 prev = rb_entry_rq(rbprev);
3544
3545 if (rbnext)
3546 next = rb_entry_rq(rbnext);
3547 else {
3548 rbnext = rb_first(&bfqq->sort_list);
3549 if (rbnext && rbnext != &last->rb_node)
3550 next = rb_entry_rq(rbnext);
3551 }
3552
3553 return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
3554}
3555
3556static unsigned long bfq_serv_to_charge(struct request *rq,
3557 struct bfq_queue *bfqq)
3558{
3559 return blk_rq_sectors(rq);
3560}
3561
3562/**
3563 * bfq_updated_next_req - update the queue after a new next_rq selection.
3564 * @bfqd: the device data the queue belongs to.
3565 * @bfqq: the queue to update.
3566 *
3567 * If the first request of a queue changes we make sure that the queue
3568 * has enough budget to serve at least its first request (if the
3569 * request has grown). We do this because, if the queue does not have
3570 * enough budget for its first request, it has to go through two dispatch
3571 * rounds to actually get it dispatched.
3572 */
3573static void bfq_updated_next_req(struct bfq_data *bfqd,
3574 struct bfq_queue *bfqq)
3575{
3576 struct bfq_entity *entity = &bfqq->entity;
3577 struct request *next_rq = bfqq->next_rq;
3578 unsigned long new_budget;
3579
3580 if (!next_rq)
3581 return;
3582
3583 if (bfqq == bfqd->in_service_queue)
3584 /*
3585 * In order not to break guarantees, budgets cannot be
3586 * changed after an entity has been selected.
3587 */
3588 return;
3589
3590 new_budget = max_t(unsigned long, bfqq->max_budget,
3591 bfq_serv_to_charge(next_rq, bfqq));
3592 if (entity->budget != new_budget) {
3593 entity->budget = new_budget;
3594 bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
3595 new_budget);
3596		bfq_requeue_bfqq(bfqd, bfqq);
3597 }
3598}
3599
3600static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
3601{
3602 struct bfq_entity *entity = &bfqq->entity;
3603
3604 return entity->budget - entity->service;
3605}
3606
3607/*
3608 * If enough samples have been computed, return the current max budget
3609 * stored in bfqd, which is dynamically updated according to the
3610 * estimated disk peak rate; otherwise return the default max budget
3611 */
3612static int bfq_max_budget(struct bfq_data *bfqd)
3613{
3614 if (bfqd->budgets_assigned < bfq_stats_min_budgets)
3615 return bfq_default_max_budget;
3616 else
3617 return bfqd->bfq_max_budget;
3618}
3619
3620/*
3621 * Return min budget, which is a fraction of the current or default
3622 * max budget (trying with 1/32)
3623 */
3624static int bfq_min_budget(struct bfq_data *bfqd)
3625{
3626 if (bfqd->budgets_assigned < bfq_stats_min_budgets)
3627 return bfq_default_max_budget / 32;
3628 else
3629 return bfqd->bfq_max_budget / 32;
3630}
3631
3632static void bfq_bfqq_expire(struct bfq_data *bfqd,
3633 struct bfq_queue *bfqq,
3634 bool compensate,
3635 enum bfqq_expiration reason);
3636
3637/*
3638 * The next function, invoked after the input queue bfqq switches from
3639 * idle to busy, updates the budget of bfqq. The function also tells
3640 * whether the in-service queue should be expired, by returning
3641 * true. The purpose of expiring the in-service queue is to give bfqq
3642 * the chance to possibly preempt the in-service queue, and the reason
3643 * for preempting the in-service queue is to achieve the following
3644 * goal: guarantee to bfqq its reserved bandwidth even if bfqq has
3645 * expired because it has remained idle.
3646 *
3647 * In particular, bfqq may have expired for one of the following two
3648 * reasons:
3649 *
3650 * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
3651 * and did not make it to issue a new request before its last
3652 * request was served;
3653 *
3654 * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
3655 * a new request before the expiration of the idling-time.
3656 *
3657 * Even if bfqq has expired for one of the above reasons, the process
3658 * associated with the queue may however be issuing requests greedily,
3659 * and thus be sensitive to the bandwidth it receives (bfqq may have
3660 * remained idle for other reasons: CPU high load, bfqq not enjoying
3661 * idling, I/O throttling somewhere in the path from the process to
3662 * the I/O scheduler, ...). But if, after every expiration for one of
3663 * the above two reasons, bfqq has to wait for the service of at least
3664 * one full budget of another queue before being served again, then
3665 * bfqq is likely to get a much lower bandwidth or resource time than
3666 * its reserved ones. To address this issue, two countermeasures need
3667 * to be taken.
3668 *
3669 * First, the budget and the timestamps of bfqq need to be updated in
3670 * a special way on bfqq reactivation: they need to be updated as if
3671 * bfqq did not remain idle and did not expire. In fact, if they are
3672 * computed as if bfqq expired and remained idle until reactivation,
3673 * then the process associated with bfqq is treated as if, instead of
3674 * being greedy, it stopped issuing requests when bfqq remained idle,
3675 * and restarts issuing requests only on this reactivation. In other
3676 * words, the scheduler does not help the process recover the "service
3677 * hole" between bfqq expiration and reactivation. As a consequence,
3678 * the process receives a lower bandwidth than its reserved one. In
3679 * contrast, to recover this hole, the budget must be updated as if
3680 * bfqq was not expired at all before this reactivation, i.e., it must
3681 * be set to the value of the remaining budget when bfqq was
3682 * expired. Along the same line, timestamps need to be assigned the
3683 * value they had the last time bfqq was selected for service, i.e.,
3684 * before last expiration. Thus timestamps need to be back-shifted
3685 * with respect to their normal computation (see [1] for more details
3686 * on this tricky aspect).
3687 *
3688 * Secondly, to allow the process to recover the hole, the in-service
3689 * queue must be expired too, to give bfqq the chance to preempt it
3690 * immediately. In fact, if bfqq has to wait for a full budget of the
3691 * in-service queue to be completed, then it may become impossible to
3692 * let the process recover the hole, even if the back-shifted
3693 * timestamps of bfqq are lower than those of the in-service queue. If
3694 * this happens for most or all of the holes, then the process may not
3695 * receive its reserved bandwidth. In this respect, it is worth noting
3696 * that, since the service of outstanding requests is not preemptible,
3697 * a small fraction of the holes may still be unrecoverable, thereby
3698 * causing a small loss of bandwidth.
3699 *
3700 * The last important point is detecting whether bfqq does need this
3701 * bandwidth recovery. In this respect, the next function deems the
3702 * process associated with bfqq greedy, and thus allows it to recover
3703 * the hole, if: 1) the process is waiting for the arrival of a new
3704 * request (which implies that bfqq expired for one of the above two
3705 * reasons), and 2) such a request has arrived soon. The first
3706 * condition is controlled through the flag non_blocking_wait_rq,
3707 * while the second through the flag arrived_in_time. If both
3708 * conditions hold, then the function computes the budget in the
3709 * above-described special way, and signals that the in-service queue
3710 * should be expired. Timestamp back-shifting is done later in
3711 * __bfq_activate_entity.
3712 */
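/*
 * A small worked example of the special budget update described above
 * (numbers purely illustrative): suppose bfqq expired for
 * BFQQE_NO_MORE_REQUESTS with entity->budget = 8000 sectors and
 * entity->service = 2000, and that a new request arrives within the idle
 * window. The function below then sets entity->budget to
 * min(8000 - 2000, bfqq->max_budget), i.e., to the budget left at the
 * last expiration rather than to a fresh full budget, and returns true
 * so that the in-service queue can be expired and possibly preempted.
 */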
3713static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
3714 struct bfq_queue *bfqq,
3715 bool arrived_in_time)
3716{
3717 struct bfq_entity *entity = &bfqq->entity;
3718
3719 if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
3720 /*
3721 * We do not clear the flag non_blocking_wait_rq here, as
3722 * the latter is used in bfq_activate_bfqq to signal
3723 * that timestamps need to be back-shifted (and is
3724 * cleared right after).
3725 */
3726
3727 /*
3728		 * In the next assignment we rely on the fact that
3729		 * neither entity->service nor entity->budget is updated
3730 * on expiration if bfqq is empty (see
3731 * __bfq_bfqq_recalc_budget). Thus both quantities
3732 * remain unchanged after such an expiration, and the
3733 * following statement therefore assigns to
3734 * entity->budget the remaining budget on such an
3735 * expiration. For clarity, entity->service is not
3736 * updated on expiration in any case, and, in normal
3737 * operation, is reset only when bfqq is selected for
3738 * service (see bfq_get_next_queue).
3739 */
3740 entity->budget = min_t(unsigned long,
3741 bfq_bfqq_budget_left(bfqq),
3742 bfqq->max_budget);
3743
3744 return true;
3745 }
3746
3747 entity->budget = max_t(unsigned long, bfqq->max_budget,
3748 bfq_serv_to_charge(bfqq->next_rq, bfqq));
3749 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
3750 return false;
3751}
3752
3753static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
3754 struct bfq_queue *bfqq,
3755 struct request *rq)
3756{
3757 bool bfqq_wants_to_preempt,
3758 /*
3759 * See the comments on
3760 * bfq_bfqq_update_budg_for_activation for
3761 * details on the usage of the next variable.
3762 */
3763 arrived_in_time = ktime_get_ns() <=
3764 bfqq->ttime.last_end_request +
3765 bfqd->bfq_slice_idle * 3;
3766
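	/*
	 * For reference: with the default bfq_slice_idle of 8 ms (the
	 * value is tunable, so this is only indicative), arrived_in_time
	 * above is true when the request arrives within about 24 ms of
	 * the last completion.
	 */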
3767 bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);
3768
3769 /*
3770 * Update budget and check whether bfqq may want to preempt
3771 * the in-service queue.
3772 */
3773 bfqq_wants_to_preempt =
3774 bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
3775 arrived_in_time);
3776
3777 if (!bfq_bfqq_IO_bound(bfqq)) {
3778 if (arrived_in_time) {
3779 bfqq->requests_within_timer++;
3780 if (bfqq->requests_within_timer >=
3781 bfqd->bfq_requests_within_timer)
3782 bfq_mark_bfqq_IO_bound(bfqq);
3783 } else
3784 bfqq->requests_within_timer = 0;
3785 }
3786
3787 bfq_add_bfqq_busy(bfqd, bfqq);
3788
3789 /*
3790 * Expire in-service queue only if preemption may be needed
3791 * for guarantees. In this respect, the function
3792 * next_queue_may_preempt just checks a simple, necessary
3793 * condition, and not a sufficient condition based on
3794 * timestamps. In fact, for the latter condition to be
3795 * evaluated, timestamps would need first to be updated, and
3796 * this operation is quite costly (see the comments on the
3797 * function bfq_bfqq_update_budg_for_activation).
3798 */
3799 if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
3800 next_queue_may_preempt(bfqd))
3801 bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
3802 false, BFQQE_PREEMPTED);
3803}
3804
3805static void bfq_add_request(struct request *rq)
3806{
3807 struct bfq_queue *bfqq = RQ_BFQQ(rq);
3808 struct bfq_data *bfqd = bfqq->bfqd;
3809 struct request *next_rq, *prev;
3810
3811 bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
3812 bfqq->queued[rq_is_sync(rq)]++;
3813 bfqd->queued++;
3814
3815 elv_rb_add(&bfqq->sort_list, rq);
3816
3817 /*
3818 * Check if this request is a better next-serve candidate.
3819 */
3820 prev = bfqq->next_rq;
3821 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
3822 bfqq->next_rq = next_rq;
3823
3824 if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
3825 bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, rq);
3826 else if (prev != bfqq->next_rq)
3827 bfq_updated_next_req(bfqd, bfqq);
3828}
3829
3830static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
3831 struct bio *bio,
3832 struct request_queue *q)
3833{
3834 struct bfq_queue *bfqq = bfqd->bio_bfqq;
3835
3836
3837 if (bfqq)
3838 return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
3839
3840 return NULL;
3841}
3842
3843#if 0 /* Still not clear if we can do without next two functions */
3844static void bfq_activate_request(struct request_queue *q, struct request *rq)
3845{
3846 struct bfq_data *bfqd = q->elevator->elevator_data;
3847
3848 bfqd->rq_in_driver++;
3849 bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
3850 bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
3851 (unsigned long long)bfqd->last_position);
3852}
3853
3854static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
3855{
3856 struct bfq_data *bfqd = q->elevator->elevator_data;
3857
3858 bfqd->rq_in_driver--;
3859}
3860#endif
3861
3862static void bfq_remove_request(struct request_queue *q,
3863 struct request *rq)
3864{
3865 struct bfq_queue *bfqq = RQ_BFQQ(rq);
3866 struct bfq_data *bfqd = bfqq->bfqd;
3867 const int sync = rq_is_sync(rq);
3868
3869 if (bfqq->next_rq == rq) {
3870 bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
3871 bfq_updated_next_req(bfqd, bfqq);
3872 }
3873
3874 if (rq->queuelist.prev != &rq->queuelist)
3875 list_del_init(&rq->queuelist);
3876 bfqq->queued[sync]--;
3877 bfqd->queued--;
3878 elv_rb_del(&bfqq->sort_list, rq);
3879
3880 elv_rqhash_del(q, rq);
3881 if (q->last_merge == rq)
3882 q->last_merge = NULL;
3883
3884 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
3885 bfqq->next_rq = NULL;
3886
3887 if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
3888			bfq_del_bfqq_busy(bfqd, bfqq, false);
3889 /*
3890 * bfqq emptied. In normal operation, when
3891 * bfqq is empty, bfqq->entity.service and
3892 * bfqq->entity.budget must contain,
3893 * respectively, the service received and the
3894 * budget used last time bfqq emptied. These
3895 * facts do not hold in this case, as at least
3896 * this last removal occurred while bfqq is
3897 * not in service. To avoid inconsistencies,
3898 * reset both bfqq->entity.service and
3899 * bfqq->entity.budget, if bfqq has still a
3900 * process that may issue I/O requests to it.
3901 */
3902 bfqq->entity.budget = bfqq->entity.service = 0;
3903 }
3904 }
3905
3906 if (rq->cmd_flags & REQ_META)
3907 bfqq->meta_pending--;
3908
3909 bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
3910}
3911
3912static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
3913{
3914 struct request_queue *q = hctx->queue;
3915 struct bfq_data *bfqd = q->elevator->elevator_data;
3916 struct request *free = NULL;
3917 /*
3918 * bfq_bic_lookup grabs the queue_lock: invoke it now and
3919 * store its return value for later use, to avoid nesting
3920 * queue_lock inside the bfqd->lock. We assume that the bic
3921 * returned by bfq_bic_lookup does not go away before
3922 * bfqd->lock is taken.
3923 */
3924 struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
3925 bool ret;
3926
3927 spin_lock_irq(&bfqd->lock);
3928
3929 if (bic)
3930 bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
3931 else
3932 bfqd->bio_bfqq = NULL;
3933 bfqd->bio_bic = bic;
3934
3935 ret = blk_mq_sched_try_merge(q, bio, &free);
3936
3937 if (free)
3938 blk_mq_free_request(free);
3939 spin_unlock_irq(&bfqd->lock);
3940
3941 return ret;
3942}
3943
3944static int bfq_request_merge(struct request_queue *q, struct request **req,
3945 struct bio *bio)
3946{
3947 struct bfq_data *bfqd = q->elevator->elevator_data;
3948 struct request *__rq;
3949
3950 __rq = bfq_find_rq_fmerge(bfqd, bio, q);
3951 if (__rq && elv_bio_merge_ok(__rq, bio)) {
3952 *req = __rq;
3953 return ELEVATOR_FRONT_MERGE;
3954 }
3955
3956 return ELEVATOR_NO_MERGE;
3957}
3958
3959static void bfq_request_merged(struct request_queue *q, struct request *req,
3960 enum elv_merge type)
3961{
3962 if (type == ELEVATOR_FRONT_MERGE &&
3963 rb_prev(&req->rb_node) &&
3964 blk_rq_pos(req) <
3965 blk_rq_pos(container_of(rb_prev(&req->rb_node),
3966 struct request, rb_node))) {
3967 struct bfq_queue *bfqq = RQ_BFQQ(req);
3968 struct bfq_data *bfqd = bfqq->bfqd;
3969 struct request *prev, *next_rq;
3970
3971 /* Reposition request in its sort_list */
3972 elv_rb_del(&bfqq->sort_list, req);
3973 elv_rb_add(&bfqq->sort_list, req);
3974
3975 /* Choose next request to be served for bfqq */
3976 prev = bfqq->next_rq;
3977 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
3978 bfqd->last_position);
3979 bfqq->next_rq = next_rq;
3980 /*
3981 * If next_rq changes, update the queue's budget to fit
3982 * the new request.
3983 */
3984 if (prev != bfqq->next_rq)
3985 bfq_updated_next_req(bfqd, bfqq);
3986 }
3987}
3988
3989static void bfq_requests_merged(struct request_queue *q, struct request *rq,
3990 struct request *next)
3991{
3992 struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
3993
3994 if (!RB_EMPTY_NODE(&rq->rb_node))
3995		goto end;
3996 spin_lock_irq(&bfqq->bfqd->lock);
3997
3998 /*
3999 * If next and rq belong to the same bfq_queue and next is older
4000 * than rq, then reposition rq in the fifo (by substituting next
4001 * with rq). Otherwise, if next and rq belong to different
4002 * bfq_queues, never reposition rq: in fact, we would have to
4003 * reposition it with respect to next's position in its own fifo,
4004 * which would most certainly be too expensive with respect to
4005 * the benefits.
4006 */
4007 if (bfqq == next_bfqq &&
4008 !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
4009 next->fifo_time < rq->fifo_time) {
4010 list_del_init(&rq->queuelist);
4011 list_replace_init(&next->queuelist, &rq->queuelist);
4012 rq->fifo_time = next->fifo_time;
4013 }
4014
4015 if (bfqq->next_rq == next)
4016 bfqq->next_rq = rq;
4017
4018 bfq_remove_request(q, next);
4019
4020 spin_unlock_irq(&bfqq->bfqd->lock);
4021end:
4022 bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
4023}
4024
4025static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
4026 struct bio *bio)
4027{
4028 struct bfq_data *bfqd = q->elevator->elevator_data;
4029 bool is_sync = op_is_sync(bio->bi_opf);
4030 struct bfq_queue *bfqq = bfqd->bio_bfqq;
4031
4032 /*
4033 * Disallow merge of a sync bio into an async request.
4034 */
4035 if (is_sync && !rq_is_sync(rq))
4036 return false;
4037
4038 /*
4039 * Lookup the bfqq that this bio will be queued with. Allow
4040 * merge only if rq is queued there.
4041 */
4042 if (!bfqq)
4043 return false;
4044
4045 return bfqq == RQ_BFQQ(rq);
4046}
4047
4048static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
4049 struct bfq_queue *bfqq)
4050{
4051 if (bfqq) {
4052		bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
4053 bfq_mark_bfqq_budget_new(bfqq);
4054 bfq_clear_bfqq_fifo_expire(bfqq);
4055
4056 bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;
4057
4058 bfq_log_bfqq(bfqd, bfqq,
4059 "set_in_service_queue, cur-budget = %d",
4060 bfqq->entity.budget);
4061 }
4062
4063 bfqd->in_service_queue = bfqq;
4064}
4065
4066/*
4067 * Get and set a new queue for service.
4068 */
4069static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
4070{
4071 struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
4072
4073 __bfq_set_in_service_queue(bfqd, bfqq);
4074 return bfqq;
4075}
4076
4077/*
4078 * bfq_default_budget - return the default budget for @bfqq on @bfqd.
4079 * @bfqd: the device descriptor.
4080 * @bfqq: the queue to consider.
4081 *
4082 * We use 3/4 of the @bfqd maximum budget as the default value
4083 * for the max_budget field of the queues. This lets the feedback
4084 * mechanism start from some middle ground; the behavior of the
4085 * process then drives the heuristics towards high values, if
4086 * it behaves as a greedy sequential reader, or towards small values
4087 * if it shows a more intermittent behavior.
4088 */
4089static unsigned long bfq_default_budget(struct bfq_data *bfqd,
4090 struct bfq_queue *bfqq)
4091{
4092 unsigned long budget;
4093
4094 /*
4095 * When we need an estimate of the peak rate we need to avoid
4096 * to give budgets that are too short due to previous
4097 * measurements. So, in the first 10 assignments use a
4098	 * ``safe'' budget value. For such first assignments the value
4099 * of bfqd->budgets_assigned happens to be lower than 194.
4100 * See __bfq_set_in_service_queue for the formula by which
4101 * this field is computed.
4102 */
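	/*
	 * For a rough feel of the 194 threshold: starting from zero, the
	 * integer sequence produced by (x * 7 + 256) / 8 is 32, 60, 84,
	 * 105, 123, 139, 153, 165, 176, 186, 194, ..., so the value stays
	 * below 194 exactly for the first ten assignments.
	 */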
4103 if (bfqd->budgets_assigned < 194 && bfqd->bfq_user_max_budget == 0)
4104 budget = bfq_default_max_budget;
4105 else
4106 budget = bfqd->bfq_max_budget;
4107
4108 return budget - budget / 4;
4109}
4110
4111static void bfq_arm_slice_timer(struct bfq_data *bfqd)
4112{
4113 struct bfq_queue *bfqq = bfqd->in_service_queue;
4114 struct bfq_io_cq *bic;
4115 u32 sl;
4116
4117 /* Processes have exited, don't wait. */
4118 bic = bfqd->in_service_bic;
4119 if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
4120 return;
4121
4122 bfq_mark_bfqq_wait_request(bfqq);
4123
4124 /*
4125 * We don't want to idle for seeks, but we do want to allow
4126 * fair distribution of slice time for a process doing back-to-back
4127	 * seeks. So allow a little bit of time for it to submit a new rq.
4128 */
4129 sl = bfqd->bfq_slice_idle;
4130 /*
4131 * Grant only minimum idle time if the queue is seeky.
4132 */
4133 if (BFQQ_SEEKY(bfqq))
4134 sl = min_t(u64, sl, BFQ_MIN_TT);
4135
4136 bfqd->last_idling_start = ktime_get();
4137 hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
4138 HRTIMER_MODE_REL);
4139	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
4140}
4141
4142/*
4143 * Set the maximum time for the in-service queue to consume its
4144 * budget. This prevents seeky processes from lowering the disk
4145 * throughput (always guaranteed with a time slice scheme as in CFQ).
4146 */
4147static void bfq_set_budget_timeout(struct bfq_data *bfqd)
4148{
4149 struct bfq_queue *bfqq = bfqd->in_service_queue;
4150 unsigned int timeout_coeff = bfqq->entity.weight /
4151 bfqq->entity.orig_weight;
4152
4153 bfqd->last_budget_start = ktime_get();
4154
4155 bfq_clear_bfqq_budget_new(bfqq);
4156 bfqq->budget_timeout = jiffies +
4157 bfqd->bfq_timeout * timeout_coeff;
4158
4159 bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
4160 jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff));
4161}
4162
4163/*
4164 * Remove request from internal lists.
4165 */
4166static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
4167{
4168 struct bfq_queue *bfqq = RQ_BFQQ(rq);
4169
4170 /*
4171 * For consistency, the next instruction should have been
4172 * executed after removing the request from the queue and
4173 * dispatching it. We execute instead this instruction before
4174 * bfq_remove_request() (and hence introduce a temporary
4175 * inconsistency), for efficiency. In fact, should this
4176 * dispatch occur for a non in-service bfqq, this anticipated
4177	 * dispatch occur for a non-in-service bfqq, this anticipated
4178	 * increment prevents two counters related to bfqq->dispatched
4179	 * from being, first, uselessly decremented, and then
4180 * happens to be taken into account.
4181 */
4182 bfqq->dispatched++;
4183
4184 bfq_remove_request(q, rq);
4185}
4186
4187static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
4188{
4189	if (RB_EMPTY_ROOT(&bfqq->sort_list))
4190		bfq_del_bfqq_busy(bfqd, bfqq, true);
4191	else
4192 bfq_requeue_bfqq(bfqd, bfqq);
4193
4194 /*
4195 * All in-service entities must have been properly deactivated
4196 * or requeued before executing the next function, which
4197	 * resets all in-service entities as no more in service.
4198 */
4199 __bfq_bfqd_reset_in_service(bfqd);
4200}
4201
4202/**
4203 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
4204 * @bfqd: device data.
4205 * @bfqq: queue to update.
4206 * @reason: reason for expiration.
4207 *
4208 * Handle the feedback on @bfqq budget at queue expiration.
4209 * See the body for detailed comments.
4210 */
4211static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
4212 struct bfq_queue *bfqq,
4213 enum bfqq_expiration reason)
4214{
4215 struct request *next_rq;
4216 int budget, min_budget;
4217
4218 budget = bfqq->max_budget;
4219 min_budget = bfq_min_budget(bfqd);
4220
4221 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
4222 bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
4223 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
4224 budget, bfq_min_budget(bfqd));
4225 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
4226 bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
4227
4228 if (bfq_bfqq_sync(bfqq)) {
4229 switch (reason) {
4230 /*
4231 * Caveat: in all the following cases we trade latency
4232 * for throughput.
4233 */
4234 case BFQQE_TOO_IDLE:
4235 if (budget > min_budget + BFQ_BUDGET_STEP)
4236 budget -= BFQ_BUDGET_STEP;
4237 else
4238 budget = min_budget;
4239 break;
4240 case BFQQE_BUDGET_TIMEOUT:
4241 budget = bfq_default_budget(bfqd, bfqq);
4242 break;
4243 case BFQQE_BUDGET_EXHAUSTED:
4244 /*
4245 * The process still has backlog, and did not
4246 * let either the budget timeout or the disk
4247 * idling timeout expire. Hence it is not
4248 * seeky, has a short thinktime and may be
4249 * happy with a higher budget too. So
4250 * definitely increase the budget of this good
4251 * candidate to boost the disk throughput.
4252 */
4253 budget = min(budget + 8 * BFQ_BUDGET_STEP,
4254 bfqd->bfq_max_budget);
4255 break;
4256 case BFQQE_NO_MORE_REQUESTS:
4257 /*
4258 * For queues that expire for this reason, it
4259 * is particularly important to keep the
4260 * budget close to the actual service they
4261 * need. Doing so reduces the timestamp
4262 * misalignment problem described in the
4263 * comments in the body of
4264 * __bfq_activate_entity. In fact, suppose
4265 * that a queue systematically expires for
4266 * BFQQE_NO_MORE_REQUESTS and presents a
4267 * new request in time to enjoy timestamp
4268 * back-shifting. The larger the budget of the
4269 * queue is with respect to the service the
4270 * queue actually requests in each service
4271 * slot, the more times the queue can be
4272 * reactivated with the same virtual finish
4273 * time. It follows that, even if this finish
4274 * time is pushed to the system virtual time
4275 * to reduce the consequent timestamp
4276 * misalignment, the queue unjustly enjoys for
4277 * many re-activations a lower finish time
4278 * than all newly activated queues.
4279 *
4280 * The service needed by bfqq is measured
4281 * quite precisely by bfqq->entity.service.
4282 * Since bfqq does not enjoy device idling,
4283 * bfqq->entity.service is equal to the number
4284 * of sectors that the process associated with
4285 * bfqq requested to read/write before waiting
4286 * for request completions, or blocking for
4287 * other reasons.
4288 */
4289 budget = max_t(int, bfqq->entity.service, min_budget);
4290 break;
4291 default:
4292 return;
4293 }
4294 } else {
4295 /*
4296 * Async queues get always the maximum possible
4297 * budget, as for them we do not care about latency
4298 * (in addition, their ability to dispatch is limited
4299 * by the charging factor).
4300 */
4301 budget = bfqd->bfq_max_budget;
4302 }
4303
4304 bfqq->max_budget = budget;
4305
4306 if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
4307 !bfqd->bfq_user_max_budget)
4308 bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
4309
4310 /*
4311 * If there is still backlog, then assign a new budget, making
4312 * sure that it is large enough for the next request. Since
4313 * the finish time of bfqq must be kept in sync with the
4314 * budget, be sure to call __bfq_bfqq_expire() *after* this
4315 * update.
4316 *
4317 * If there is no backlog, then no need to update the budget;
4318 * it will be updated on the arrival of a new request.
4319 */
4320 next_rq = bfqq->next_rq;
4321 if (next_rq)
4322 bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
4323 bfq_serv_to_charge(next_rq, bfqq));
4324
4325 bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
4326 next_rq ? blk_rq_sectors(next_rq) : 0,
4327 bfqq->entity.budget);
4328}
4329
4330static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
4331{
4332 unsigned long max_budget;
4333
4334 /*
4335 * The max_budget calculated when autotuning is equal to the
4336 * amount of sectors transferred in timeout at the estimated
4337 * peak rate. To get this value, peak_rate is, first,
4338 * multiplied by 1000, because timeout is measured in ms,
4339 * while peak_rate is measured in sectors/usecs. Then the
4340 * result of this multiplication is right-shifted by
4341 * BFQ_RATE_SHIFT, because peak_rate is equal to the value of
4342 * the peak rate left-shifted by BFQ_RATE_SHIFT.
4343 */
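	/*
	 * Illustrative figures: a device sustaining about 100 MB/s serves
	 * roughly 0.2 sectors/usec, i.e., peak_rate ~ 0.2 << BFQ_RATE_SHIFT;
	 * with a 125 ms timeout the computation below then yields about
	 * 0.2 * 1000 * 125 = 25000 sectors (~12 MB) of budget.
	 */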
4344 max_budget = (unsigned long)(peak_rate * 1000 *
4345 timeout >> BFQ_RATE_SHIFT);
4346
4347 return max_budget;
4348}
4349
4350/*
4351 * In addition to updating the peak rate, checks whether the process
4352 * is "slow", and returns 1 if so. This slow flag is used, in addition
4353 * to the budget timeout, to reduce the amount of service provided to
4354 * seeky processes, and hence reduce their chances to lower the
4355 * throughput. See the code for more details.
4356 */
4357static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
4358 bool compensate)
4359{
4360 u64 bw, usecs, expected, timeout;
4361 ktime_t delta;
4362 int update = 0;
4363
4364 if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
4365 return false;
4366
4367 if (compensate)
4368 delta = bfqd->last_idling_start;
4369 else
4370 delta = ktime_get();
4371 delta = ktime_sub(delta, bfqd->last_budget_start);
4372 usecs = ktime_to_us(delta);
4373
4374 /* don't use too short time intervals */
4375 if (usecs < 1000)
4376 return false;
4377
4378 /*
4379 * Calculate the bandwidth for the last slice. We use a 64 bit
4380 * value to store the peak rate, in sectors per usec in fixed
4381 * point math. We do so to have enough precision in the estimate
4382 * and to avoid overflows.
4383 */
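	/*
	 * Example of this fixed-point representation, assuming
	 * BFQ_RATE_SHIFT is 16: 2048 sectors served in 10000 usecs give
	 * bw = (2048 << 16) / 10000 ~= 13421, which stands for about
	 * 0.2 sectors/usec, i.e., roughly 100 MB/s with 512-byte sectors.
	 */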
4384 bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
4385 do_div(bw, (unsigned long)usecs);
4386
4387 timeout = jiffies_to_msecs(bfqd->bfq_timeout);
4388
4389 /*
4390 * Use only long (> 20ms) intervals to filter out spikes for
4391 * the peak rate estimation.
4392 */
4393 if (usecs > 20000) {
4394 if (bw > bfqd->peak_rate) {
4395 bfqd->peak_rate = bw;
4396 update = 1;
4397 bfq_log(bfqd, "new peak_rate=%llu", bw);
4398 }
4399
4400 update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
4401
4402 if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
4403 bfqd->peak_rate_samples++;
4404
4405 if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
4406 update && bfqd->bfq_user_max_budget == 0) {
4407 bfqd->bfq_max_budget =
4408 bfq_calc_max_budget(bfqd->peak_rate,
4409 timeout);
4410 bfq_log(bfqd, "new max_budget=%d",
4411 bfqd->bfq_max_budget);
4412 }
4413 }
4414
4415 /*
4416 * A process is considered ``slow'' (i.e., seeky, so that we
4417 * cannot treat it fairly in the service domain, as it would
4418 * slow down the other processes too much) if, when a slice
4419 * ends for whatever reason, it has received service at a
4420 * rate that would not be high enough to complete the budget
4421 * before the budget timeout expiration.
4422 */
4423 expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
4424
4425 /*
4426 * Caveat: processes doing IO in the slower disk zones will
4427 * tend to be slow(er) even if not seeky. And the estimated
4428 * peak rate will actually be an average over the disk
4429 * surface. Hence, to not be too harsh with unlucky processes,
4430 * we keep a budget/3 margin of safety before declaring a
4431 * process slow.
4432 */
4433 return expected > (4 * bfqq->entity.budget) / 3;
4434}
4435
4436/*
4437 * Return the farthest past time instant according to jiffies
4438 * macros.
4439 */
4440static unsigned long bfq_smallest_from_now(void)
4441{
4442 return jiffies - MAX_JIFFY_OFFSET;
4443}
4444
4445/**
4446 * bfq_bfqq_expire - expire a queue.
4447 * @bfqd: device owning the queue.
4448 * @bfqq: the queue to expire.
4449 * @compensate: if true, compensate for the time spent idling.
4450 * @reason: the reason causing the expiration.
4451 *
4452 *
4453 * If the process associated with the queue is slow (i.e., seeky), or
4454 * in case of budget timeout, or, finally, if it is async, we
4455 * artificially charge it an entire budget (independently of the
4456 * actual service it received). As a consequence, the queue will get
4457 * higher timestamps than the correct ones upon reactivation, and
4458 * hence it will be rescheduled as if it had received more service
4459 * than what it actually received. In the end, this class of processes
4460 * will receive less service in proportion to how slowly they consume
4461 * their budgets (and hence how seriously they tend to lower the
4462 * throughput).
4463 *
4464 * In contrast, when a queue expires because it has been idling for
4465 * too much or because it exhausted its budget, we do not touch the
4466 * amount of service it has received. Hence when the queue will be
4467 * reactivated and its timestamps updated, the latter will be in sync
4468 * with the actual service received by the queue until expiration.
4469 *
4470 * Charging a full budget to the first type of queues and the exact
4471 * service to the others has the effect of using the WF2Q+ policy to
4472 * schedule the former on a timeslice basis, without violating the
4473 * service domain guarantees of the latter.
4474 */
4475static void bfq_bfqq_expire(struct bfq_data *bfqd,
4476 struct bfq_queue *bfqq,
4477 bool compensate,
4478 enum bfqq_expiration reason)
4479{
4480 bool slow;
4481 int ref;
4482
4483 /*
4484 * Update device peak rate for autotuning and check whether the
4485 * process is slow (see bfq_update_peak_rate).
4486 */
4487 slow = bfq_update_peak_rate(bfqd, bfqq, compensate);
4488
4489 /*
4490 * As above explained, 'punish' slow (i.e., seeky), timed-out
4491 * and async queues, to favor sequential sync workloads.
4492 */
4493 if (slow || reason == BFQQE_BUDGET_TIMEOUT)
4494 bfq_bfqq_charge_full_budget(bfqq);
4495
4496 if (reason == BFQQE_TOO_IDLE &&
4497 bfqq->entity.service <= 2 * bfqq->entity.budget / 10)
4498 bfq_clear_bfqq_IO_bound(bfqq);
4499
4500 bfq_log_bfqq(bfqd, bfqq,
4501 "expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
4502 slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
4503
4504 /*
4505 * Increase, decrease or leave budget unchanged according to
4506 * reason.
4507 */
4508 __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
4509 ref = bfqq->ref;
4510 __bfq_bfqq_expire(bfqd, bfqq);
4511
4512 /* mark bfqq as waiting a request only if a bic still points to it */
4513 if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
4514 reason != BFQQE_BUDGET_TIMEOUT &&
4515 reason != BFQQE_BUDGET_EXHAUSTED)
4516 bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
4517}
4518
4519/*
4520 * Budget timeout is not implemented through a dedicated timer, but
4521 * just checked on request arrivals and completions, as well as on
4522 * idle timer expirations.
4523 */
4524static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
4525{
4526 if (bfq_bfqq_budget_new(bfqq) ||
4527 time_is_after_jiffies(bfqq->budget_timeout))
4528 return false;
4529 return true;
4530}
4531
4532/*
4533 * If we expire a queue that is actively waiting (i.e., with the
4534 * device idled) for the arrival of a new request, then we may incur
4535 * the timestamp misalignment problem described in the body of the
4536 * function __bfq_activate_entity. Hence we return true only if this
4537 * condition does not hold, or if the queue is slow enough to deserve
4538 * only to be kicked off for preserving a high throughput.
4539 */
4540static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
4541{
4542 bfq_log_bfqq(bfqq->bfqd, bfqq,
4543 "may_budget_timeout: wait_request %d left %d timeout %d",
4544 bfq_bfqq_wait_request(bfqq),
4545 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
4546 bfq_bfqq_budget_timeout(bfqq));
4547
4548 return (!bfq_bfqq_wait_request(bfqq) ||
4549 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
4550 &&
4551 bfq_bfqq_budget_timeout(bfqq);
4552}
4553
4554/*
4555 * For a queue that becomes empty, device idling is allowed only if
4556 * this function returns true for the queue. And this function returns
4557 * true only if idling is beneficial for throughput.
4558 */
4559static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
4560{
4561 struct bfq_data *bfqd = bfqq->bfqd;
4562 bool idling_boosts_thr;
4563
4564 if (bfqd->strict_guarantees)
4565 return true;
4566
4567 /*
4568 * The value of the next variable is computed considering that
4569 * idling is usually beneficial for the throughput if:
4570 * (a) the device is not NCQ-capable, or
4571 * (b) regardless of the presence of NCQ, the request pattern
4572 * for bfqq is I/O-bound (possible throughput losses
4573 * caused by granting idling to seeky queues are mitigated
4574 * by the fact that, in all scenarios where boosting
4575 * throughput is the best thing to do, i.e., in all
4576 * symmetric scenarios, only a minimal idle time is
4577 * allowed to seeky queues).
4578 */
4579 idling_boosts_thr = !bfqd->hw_tag || bfq_bfqq_IO_bound(bfqq);
4580
4581 /*
4582 * We have now the components we need to compute the return
4583 * value of the function, which is true only if both the
4584 * following conditions hold:
4585	 * 1) bfqq is sync, because idling makes sense only for sync queues;
4586 * 2) idling boosts the throughput.
4587 */
4588 return bfq_bfqq_sync(bfqq) && idling_boosts_thr;
4589}
4590
4591/*
4592 * If the in-service queue is empty but the function bfq_bfqq_may_idle
4593 * returns true, then:
4594 * 1) the queue must remain in service and cannot be expired, and
4595 * 2) the device must be idled to wait for the possible arrival of a new
4596 * request for the queue.
4597 * See the comments on the function bfq_bfqq_may_idle for the reasons
4598 * why performing device idling is the best choice to boost the throughput
4599 * and preserve service guarantees when bfq_bfqq_may_idle itself
4600 * returns true.
4601 */
4602static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
4603{
4604 struct bfq_data *bfqd = bfqq->bfqd;
4605
4606 return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
4607 bfq_bfqq_may_idle(bfqq);
4608}
4609
4610/*
4611 * Select a queue for service. If we have a current queue in service,
4612 * check whether to continue servicing it, or retrieve and set a new one.
4613 */
4614static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
4615{
4616 struct bfq_queue *bfqq;
4617 struct request *next_rq;
4618 enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;
4619
4620 bfqq = bfqd->in_service_queue;
4621 if (!bfqq)
4622 goto new_queue;
4623
4624 bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
4625
4626 if (bfq_may_expire_for_budg_timeout(bfqq) &&
4627 !bfq_bfqq_wait_request(bfqq) &&
4628 !bfq_bfqq_must_idle(bfqq))
4629 goto expire;
4630
4631check_queue:
4632 /*
4633 * This loop is rarely executed more than once. Even when it
4634 * happens, it is much more convenient to re-execute this loop
4635 * than to return NULL and trigger a new dispatch to get a
4636 * request served.
4637 */
4638 next_rq = bfqq->next_rq;
4639 /*
4640 * If bfqq has requests queued and it has enough budget left to
4641 * serve them, keep the queue, otherwise expire it.
4642 */
4643 if (next_rq) {
4644 if (bfq_serv_to_charge(next_rq, bfqq) >
4645 bfq_bfqq_budget_left(bfqq)) {
4646 /*
4647 * Expire the queue for budget exhaustion,
4648 * which makes sure that the next budget is
4649 * enough to serve the next request, even if
4650 * it comes from the fifo expired path.
4651 */
4652 reason = BFQQE_BUDGET_EXHAUSTED;
4653 goto expire;
4654 } else {
4655 /*
4656 * The idle timer may be pending because we may
4657 * not disable disk idling even when a new request
4658 * arrives.
4659 */
4660 if (bfq_bfqq_wait_request(bfqq)) {
4661 /*
4662 * If we get here: 1) at least a new request
4663 * has arrived but we have not disabled the
4664 * timer because the request was too small,
4665 * 2) then the block layer has unplugged
4666 * the device, causing the dispatch to be
4667 * invoked.
4668 *
4669 * Since the device is unplugged, now the
4670 * requests are probably large enough to
4671 * provide a reasonable throughput.
4672 * So we disable idling.
4673 */
4674 bfq_clear_bfqq_wait_request(bfqq);
4675 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
4676				bfqg_stats_update_idle_time(bfqq_group(bfqq));
4677 }
4678 goto keep_queue;
4679 }
4680 }
4681
4682 /*
4683 * No requests pending. However, if the in-service queue is idling
4684 * for a new request, or has requests waiting for a completion and
4685 * may idle after their completion, then keep it anyway.
4686 */
4687 if (bfq_bfqq_wait_request(bfqq) ||
4688 (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
4689 bfqq = NULL;
4690 goto keep_queue;
4691 }
4692
4693 reason = BFQQE_NO_MORE_REQUESTS;
4694expire:
4695 bfq_bfqq_expire(bfqd, bfqq, false, reason);
4696new_queue:
4697 bfqq = bfq_set_in_service_queue(bfqd);
4698 if (bfqq) {
4699 bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
4700 goto check_queue;
4701 }
4702keep_queue:
4703 if (bfqq)
4704 bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
4705 else
4706 bfq_log(bfqd, "select_queue: no queue returned");
4707
4708 return bfqq;
4709}
4710
4711/*
4712 * Dispatch next request from bfqq.
4713 */
4714static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
4715 struct bfq_queue *bfqq)
4716{
4717 struct request *rq = bfqq->next_rq;
4718 unsigned long service_to_charge;
4719
4720 service_to_charge = bfq_serv_to_charge(rq, bfqq);
4721
4722 bfq_bfqq_served(bfqq, service_to_charge);
4723
4724 bfq_dispatch_remove(bfqd->queue, rq);
4725
4726 if (!bfqd->in_service_bic) {
4727 atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
4728 bfqd->in_service_bic = RQ_BIC(rq);
4729 }
4730
4731 /*
4732 * Expire bfqq, pretending that its budget expired, if bfqq
4733 * belongs to CLASS_IDLE and other queues are waiting for
4734 * service.
4735 */
4736 if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
4737 goto expire;
4738
4739 return rq;
4740
4741expire:
4742 bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
4743 return rq;
4744}
4745
4746static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
4747{
4748 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
4749
4750 /*
4751 * Avoiding lock: a race on bfqd->busy_queues should cause at
4752 * most a call to dispatch for nothing
4753 */
4754 return !list_empty_careful(&bfqd->dispatch) ||
4755 bfqd->busy_queues > 0;
4756}
4757
4758static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
4759{
4760 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
4761 struct request *rq = NULL;
4762 struct bfq_queue *bfqq = NULL;
4763
4764 if (!list_empty(&bfqd->dispatch)) {
4765 rq = list_first_entry(&bfqd->dispatch, struct request,
4766 queuelist);
4767 list_del_init(&rq->queuelist);
4768
4769 bfqq = RQ_BFQQ(rq);
4770
4771 if (bfqq) {
4772 /*
4773 * Increment counters here, because this
4774 * dispatch does not follow the standard
4775 * dispatch flow (where counters are
4776 * incremented)
4777 */
4778 bfqq->dispatched++;
4779
4780 goto inc_in_driver_start_rq;
4781 }
4782
4783 /*
4784 * We exploit the put_rq_private hook to decrement
4785 * rq_in_driver, but put_rq_private will not be
4786 * invoked on this request. So, to avoid unbalance,
4787 * just start this request, without incrementing
4788 * rq_in_driver. As a negative consequence,
4789 * rq_in_driver is deceptively lower than it should be
4790 * while this request is in service. This may cause
4791 * bfq_schedule_dispatch to be invoked uselessly.
4792 *
4793 * As for implementing an exact solution, the
4794 * put_request hook, if defined, is probably invoked
4795 * also on this request. So, by exploiting this hook,
4796 * we could 1) increment rq_in_driver here, and 2)
4797 * decrement it in put_request. Such a solution would
4798 * let the value of the counter be always accurate,
4799 * but it would entail using an extra interface
4800 * function. This cost seems higher than the benefit,
4801 * being the frequency of non-elevator-private
4802		 * since the frequency of non-elevator-private
4803		 * requests is very low.
4804 goto start_rq;
4805 }
4806
4807 bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
4808
4809 if (bfqd->busy_queues == 0)
4810 goto exit;
4811
4812 /*
4813 * Force device to serve one request at a time if
4814 * strict_guarantees is true. Forcing this service scheme is
4815 * currently the ONLY way to guarantee that the request
4816 * service order enforced by the scheduler is respected by a
4817 * queueing device. Otherwise the device is free even to make
4818 * some unlucky request wait for as long as the device
4819 * wishes.
4820 *
4821	 * Of course, serving one request at a time may cause loss of
4822 * throughput.
4823 */
4824 if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
4825 goto exit;
4826
4827 bfqq = bfq_select_queue(bfqd);
4828 if (!bfqq)
4829 goto exit;
4830
4831 rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);
4832
4833 if (rq) {
4834inc_in_driver_start_rq:
4835 bfqd->rq_in_driver++;
4836start_rq:
4837 rq->rq_flags |= RQF_STARTED;
4838 }
4839exit:
4840 return rq;
4841}
4842
4843static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
4844{
4845 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
4846 struct request *rq;
4847
4848 spin_lock_irq(&bfqd->lock);
4849 rq = __bfq_dispatch_request(hctx);
4850 spin_unlock_irq(&bfqd->lock);
4851
4852 return rq;
4853}
4854
4855/*
4856 * Task holds one reference to the queue, dropped when task exits. Each rq
4857 * in-flight on this queue also holds a reference, dropped when rq is freed.
4858 *
4859 * Scheduler lock must be held here. Recall not to use bfqq after calling
4860 * this function on it.
4861 */
4862static void bfq_put_queue(struct bfq_queue *bfqq)
4863{
4864#ifdef CONFIG_BFQ_GROUP_IOSCHED
4865 struct bfq_group *bfqg = bfqq_group(bfqq);
4866#endif
4867
4868 if (bfqq->bfqd)
4869 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
4870 bfqq, bfqq->ref);
4871
4872 bfqq->ref--;
4873 if (bfqq->ref)
4874 return;
4875
4876	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
4877
4878	kmem_cache_free(bfq_pool, bfqq);
4879#ifdef CONFIG_BFQ_GROUP_IOSCHED
4880 bfqg_put(bfqg);
4881#endif
4882}
4883
4884static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
4885{
4886 if (bfqq == bfqd->in_service_queue) {
4887 __bfq_bfqq_expire(bfqd, bfqq);
4888 bfq_schedule_dispatch(bfqd);
4889 }
4890
4891 bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
4892
4893 bfq_put_queue(bfqq); /* release process reference */
4894}
4895
4896static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
4897{
4898 struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
4899 struct bfq_data *bfqd;
4900
4901 if (bfqq)
4902 bfqd = bfqq->bfqd; /* NULL if scheduler already exited */
4903
4904 if (bfqq && bfqd) {
4905 unsigned long flags;
4906
4907 spin_lock_irqsave(&bfqd->lock, flags);
4908 bfq_exit_bfqq(bfqd, bfqq);
4909 bic_set_bfqq(bic, NULL, is_sync);
4910		spin_unlock_irqrestore(&bfqd->lock, flags);
4911 }
4912}
4913
4914static void bfq_exit_icq(struct io_cq *icq)
4915{
4916 struct bfq_io_cq *bic = icq_to_bic(icq);
4917
4918 bfq_exit_icq_bfqq(bic, true);
4919 bfq_exit_icq_bfqq(bic, false);
4920}
4921
4922/*
4923 * Update the entity prio values; note that the new values will not
4924 * be used until the next (re)activation.
4925 */
4926static void
4927bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
4928{
4929 struct task_struct *tsk = current;
4930 int ioprio_class;
4931 struct bfq_data *bfqd = bfqq->bfqd;
4932
4933 if (!bfqd)
4934 return;
4935
4936 ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
4937 switch (ioprio_class) {
4938 default:
4939 dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
4940 "bfq: bad prio class %d\n", ioprio_class);
4941 case IOPRIO_CLASS_NONE:
4942 /*
4943 * No prio set, inherit CPU scheduling settings.
4944 */
4945 bfqq->new_ioprio = task_nice_ioprio(tsk);
4946 bfqq->new_ioprio_class = task_nice_ioclass(tsk);
4947 break;
4948 case IOPRIO_CLASS_RT:
4949 bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
4950 bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
4951 break;
4952 case IOPRIO_CLASS_BE:
4953 bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
4954 bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
4955 break;
4956 case IOPRIO_CLASS_IDLE:
4957 bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
4958 bfqq->new_ioprio = 7;
4959 bfq_clear_bfqq_idle_window(bfqq);
4960 break;
4961 }
4962
4963 if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
4964 pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
4965 bfqq->new_ioprio);
4966		bfqq->new_ioprio = IOPRIO_BE_NR - 1;
4967 }
4968
4969 bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
4970 bfqq->entity.prio_changed = 1;
4971}
4972
4973static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
4974{
4975 struct bfq_data *bfqd = bic_to_bfqd(bic);
4976 struct bfq_queue *bfqq;
4977 int ioprio = bic->icq.ioc->ioprio;
4978
4979 /*
4980 * This condition may trigger on a newly created bic, be sure to
4981 * drop the lock before returning.
4982 */
4983 if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
4984 return;
4985
4986 bic->ioprio = ioprio;
4987
4988 bfqq = bic_to_bfqq(bic, false);
4989 if (bfqq) {
4990 /* release process reference on this queue */
4991 bfq_put_queue(bfqq);
4992 bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
4993 bic_set_bfqq(bic, bfqq, false);
4994 }
4995
4996 bfqq = bic_to_bfqq(bic, true);
4997 if (bfqq)
4998 bfq_set_next_ioprio_data(bfqq, bic);
4999}
5000
5001static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5002 struct bfq_io_cq *bic, pid_t pid, int is_sync)
5003{
5004 RB_CLEAR_NODE(&bfqq->entity.rb_node);
5005 INIT_LIST_HEAD(&bfqq->fifo);
5006
5007 bfqq->ref = 0;
5008 bfqq->bfqd = bfqd;
5009
5010 if (bic)
5011 bfq_set_next_ioprio_data(bfqq, bic);
5012
5013 if (is_sync) {
5014 if (!bfq_class_idle(bfqq))
5015 bfq_mark_bfqq_idle_window(bfqq);
5016 bfq_mark_bfqq_sync(bfqq);
5017 } else
5018 bfq_clear_bfqq_sync(bfqq);
5019
5020 /* set end request to minus infinity from now */
5021 bfqq->ttime.last_end_request = ktime_get_ns() + 1;
5022
5023 bfq_mark_bfqq_IO_bound(bfqq);
5024
5025 bfqq->pid = pid;
5026
5027 /* Tentative initial value to trade off between thr and lat */
5028 bfqq->max_budget = bfq_default_budget(bfqd, bfqq);
5029 bfqq->budget_timeout = bfq_smallest_from_now();
5030 bfqq->pid = pid;
5031
5032 /* first request is almost certainly seeky */
5033 bfqq->seek_history = 1;
5034}
5035
5036static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
5037					       struct bfq_group *bfqg,
5038 int ioprio_class, int ioprio)
5039{
5040 switch (ioprio_class) {
5041 case IOPRIO_CLASS_RT:
5042		return &bfqg->async_bfqq[0][ioprio];
5043 case IOPRIO_CLASS_NONE:
5044 ioprio = IOPRIO_NORM;
5045 /* fall through */
5046 case IOPRIO_CLASS_BE:
5047		return &bfqg->async_bfqq[1][ioprio];
5048	case IOPRIO_CLASS_IDLE:
5049		return &bfqg->async_idle_bfqq;
5050 default:
5051 return NULL;
5052 }
5053}
5054
5055static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
5056 struct bio *bio, bool is_sync,
5057 struct bfq_io_cq *bic)
5058{
5059 const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
5060 const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
5061 struct bfq_queue **async_bfqq = NULL;
5062 struct bfq_queue *bfqq;
5063 struct bfq_group *bfqg;
5064
5065 rcu_read_lock();
5066
5067 bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
5068 if (!bfqg) {
5069 bfqq = &bfqd->oom_bfqq;
5070 goto out;
5071 }
5072
5073 if (!is_sync) {
5074 async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
5075 ioprio);
5076 bfqq = *async_bfqq;
5077 if (bfqq)
5078 goto out;
5079 }
5080
5081 bfqq = kmem_cache_alloc_node(bfq_pool,
5082 GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
5083 bfqd->queue->node);
5084
5085 if (bfqq) {
5086 bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
5087 is_sync);
5088 bfq_init_entity(&bfqq->entity, bfqg);
5089 bfq_log_bfqq(bfqd, bfqq, "allocated");
5090 } else {
5091 bfqq = &bfqd->oom_bfqq;
5092 bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
5093 goto out;
5094 }
5095
5096 /*
5097 * Pin the queue now that it's allocated, scheduler exit will
5098 * prune it.
5099 */
5100 if (async_bfqq) {
5101 bfqq->ref++; /*
5102 * Extra group reference, w.r.t. sync
5103 * queue. This extra reference is removed
5104 * only if bfqq->bfqg disappears, to
5105 * guarantee that this queue is not freed
5106 * until its group goes away.
5107 */
5108 bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
5109 bfqq, bfqq->ref);
5110 *async_bfqq = bfqq;
5111 }
5112
5113out:
5114 bfqq->ref++; /* get a process reference to this queue */
5115 bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
5116 rcu_read_unlock();
5117 return bfqq;
5118}
5119
5120static void bfq_update_io_thinktime(struct bfq_data *bfqd,
5121 struct bfq_queue *bfqq)
5122{
5123 struct bfq_ttime *ttime = &bfqq->ttime;
5124 u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
5125
5126 elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
5127
5128 ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
5129 ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
5130 ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
5131 ttime->ttime_samples);
5132}
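The three updates above maintain a fixed-point, exponentially weighted average of the "think time" between a completion and the next request from the same queue: the sample weight decays toward 256 (1.0 in 1/256 units), the total decays at the same 7/8 rate, and the mean is their rounded quotient; the min_t() above additionally caps each sample at twice the idle slice. A small user-space sketch of the same arithmetic, with the kernel division helpers replaced by plain division:

/* User-space sketch of the 7/8 fixed-point think-time average. */
#include <stdio.h>
#include <stdint.h>

struct ex_ttime {
	uint64_t samples;	/* decays toward 256 == weight 1.0 */
	uint64_t total;		/* decayed sum of elapsed * 256 */
	uint64_t mean;		/* rounded total / samples */
};

static void ex_update(struct ex_ttime *t, uint64_t elapsed_ns)
{
	t->samples = (7 * t->samples + 256) / 8;
	t->total   = (7 * t->total + 256 * elapsed_ns) / 8;
	t->mean    = (t->total + 128) / t->samples;
}

int main(void)
{
	struct ex_ttime t = { 0, 0, 0 };
	int i;

	/* A steady 2 ms think time converges to a ~2 ms mean. */
	for (i = 0; i < 64; i++)
		ex_update(&t, 2000000);
	printf("samples=%llu mean=%llu ns\n",
	       (unsigned long long)t.samples, (unsigned long long)t.mean);
	return 0;
}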
5133
5134static void
5135bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5136 struct request *rq)
5137{
5138 sector_t sdist = 0;
5139
5140 if (bfqq->last_request_pos) {
5141 if (bfqq->last_request_pos < blk_rq_pos(rq))
5142 sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
5143 else
5144 sdist = bfqq->last_request_pos - blk_rq_pos(rq);
5145 }
5146
5147 bfqq->seek_history <<= 1;
5148 bfqq->seek_history |= sdist > BFQQ_SEEK_THR &&
5149 (!blk_queue_nonrot(bfqd->queue) ||
5150 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
5151}
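Each incoming request shifts one verdict bit ("this was a long seek") into the queue's 32-bit seek_history; the BFQQ_SEEKY() test used elsewhere in this file then looks at how many of those bits are set. The sketch below illustrates that sliding-window idea; the threshold value and the way the verdict bits are counted are illustrative assumptions, not the real BFQQ_SEEK_THR / BFQQ_SEEKY definitions.

/* Sketch of a shift-register seekiness detector; values illustrative. */
#include <stdio.h>
#include <stdint.h>

#define EX_SEEK_THR	(8 * 1024)	/* sectors; stand-in for BFQQ_SEEK_THR */

static uint32_t ex_history = 1;		/* first request assumed seeky, as above */
static uint64_t ex_last_pos;

static int ex_popcount(uint32_t v)
{
	int n = 0;

	for (; v; v &= v - 1)
		n++;
	return n;
}

static void ex_account(uint64_t pos, uint32_t sectors)
{
	uint64_t sdist = pos > ex_last_pos ? pos - ex_last_pos
					   : ex_last_pos - pos;

	ex_history = (ex_history << 1) | (sdist > EX_SEEK_THR);
	ex_last_pos = pos + sectors;	/* end of this request */
}

int main(void)
{
	uint64_t pos = 0;
	int i;

	for (i = 0; i < 16; i++) {		/* sequential run ... */
		ex_account(pos, 8);
		pos += 8;
	}
	ex_account(pos + 10 * EX_SEEK_THR, 8);	/* ... then one long seek */
	printf("seeky bits set: %d of 32\n", ex_popcount(ex_history));
	return 0;
}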
5152
5153/*
5154 * Disable idle window if the process thinks too long or seeks so much that
5155 * it doesn't matter.
5156 */
5157static void bfq_update_idle_window(struct bfq_data *bfqd,
5158 struct bfq_queue *bfqq,
5159 struct bfq_io_cq *bic)
5160{
5161 int enable_idle;
5162
5163 /* Don't idle for async or idle io prio class. */
5164 if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
5165 return;
5166
5167 enable_idle = bfq_bfqq_idle_window(bfqq);
5168
5169 if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
5170 bfqd->bfq_slice_idle == 0 ||
5171 (bfqd->hw_tag && BFQQ_SEEKY(bfqq)))
5172 enable_idle = 0;
5173 else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
5174 if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)
5175 enable_idle = 0;
5176 else
5177 enable_idle = 1;
5178 }
5179 bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
5180 enable_idle);
5181
5182 if (enable_idle)
5183 bfq_mark_bfqq_idle_window(bfqq);
5184 else
5185 bfq_clear_bfqq_idle_window(bfqq);
5186}
5187
5188/*
5189 * Called when a new fs request (rq) is added to bfqq. Check if there's
5190 * something we should do about it.
5191 */
5192static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5193 struct request *rq)
5194{
5195 struct bfq_io_cq *bic = RQ_BIC(rq);
5196
5197 if (rq->cmd_flags & REQ_META)
5198 bfqq->meta_pending++;
5199
5200 bfq_update_io_thinktime(bfqd, bfqq);
5201 bfq_update_io_seektime(bfqd, bfqq, rq);
5202 if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
5203 !BFQQ_SEEKY(bfqq))
5204 bfq_update_idle_window(bfqd, bfqq, bic);
5205
5206 bfq_log_bfqq(bfqd, bfqq,
5207 "rq_enqueued: idle_window=%d (seeky %d)",
5208 bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
5209
5210 bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
5211
5212 if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
5213 bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
5214 blk_rq_sectors(rq) < 32;
5215 bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
5216
5217 /*
5218 * There is just this request queued: if the request
5219 * is small and the queue is not to be expired, then
5220 * just exit.
5221 *
5222 * In this way, if the device is being idled to wait
5223 * for a new request from the in-service queue, we
5224 * avoid unplugging the device and committing the
5225 * device to serve just a small request. On the
5226 * contrary, we wait for the block layer to decide
5227 * when to unplug the device: hopefully, new requests
5228 * will be merged to this one quickly, then the device
5229 * will be unplugged and larger requests will be
5230 * dispatched.
5231 */
5232 if (small_req && !budget_timeout)
5233 return;
5234
5235 /*
5236 * A large enough request arrived, or the queue is to
5237 * be expired: in both cases disk idling is to be
5238 * stopped, so clear wait_request flag and reset
5239 * timer.
5240 */
5241 bfq_clear_bfqq_wait_request(bfqq);
5242 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
5243 bfqg_stats_update_idle_time(bfqq_group(bfqq));
5244
5245 /*
5246 * The queue is not empty, because a new request just
5247 * arrived. Hence we can safely expire the queue, in
5248 * case of budget timeout, without risking that the
5249 * timestamps of the queue are not updated correctly.
5250 * See [1] for more details.
5251 */
5252 if (budget_timeout)
5253 bfq_bfqq_expire(bfqd, bfqq, false,
5254 BFQQE_BUDGET_TIMEOUT);
5255 }
5256}
5257
5258static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
5259{
5260 struct bfq_queue *bfqq = RQ_BFQQ(rq);
5261
5262 bfq_add_request(rq);
5263
5264 rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
5265 list_add_tail(&rq->queuelist, &bfqq->fifo);
5266
5267 bfq_rq_enqueued(bfqd, bfqq, rq);
5268}
5269
5270static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
5271 bool at_head)
5272{
5273 struct request_queue *q = hctx->queue;
5274 struct bfq_data *bfqd = q->elevator->elevator_data;
5275
5276 spin_lock_irq(&bfqd->lock);
5277 if (blk_mq_sched_try_insert_merge(q, rq)) {
5278 spin_unlock_irq(&bfqd->lock);
5279 return;
5280 }
5281
5282 spin_unlock_irq(&bfqd->lock);
5283
5284 blk_mq_sched_request_inserted(rq);
5285
5286 spin_lock_irq(&bfqd->lock);
5287 if (at_head || blk_rq_is_passthrough(rq)) {
5288 if (at_head)
5289 list_add(&rq->queuelist, &bfqd->dispatch);
5290 else
5291 list_add_tail(&rq->queuelist, &bfqd->dispatch);
5292 } else {
5293 __bfq_insert_request(bfqd, rq);
5294
5295 if (rq_mergeable(rq)) {
5296 elv_rqhash_add(q, rq);
5297 if (!q->last_merge)
5298 q->last_merge = rq;
5299 }
5300 }
5301
5302 spin_unlock_irq(&bfqd->lock);
5303}
5304
5305static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
5306 struct list_head *list, bool at_head)
5307{
5308 while (!list_empty(list)) {
5309 struct request *rq;
5310
5311 rq = list_first_entry(list, struct request, queuelist);
5312 list_del_init(&rq->queuelist);
5313 bfq_insert_request(hctx, rq, at_head);
5314 }
5315}
5316
5317static void bfq_update_hw_tag(struct bfq_data *bfqd)
5318{
5319 bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
5320 bfqd->rq_in_driver);
5321
5322 if (bfqd->hw_tag == 1)
5323 return;
5324
5325 /*
5326 * This sample is valid if the number of outstanding requests
5327 * is large enough to allow a queueing behavior. Note that the
5328 * sum is not exact, as it's not taking into account deactivated
5329 * requests.
5330 */
5331 if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
5332 return;
5333
5334 if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
5335 return;
5336
5337 bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
5338 bfqd->max_rq_in_driver = 0;
5339 bfqd->hw_tag_samples = 0;
5340}
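bfq_update_hw_tag() infers whether the drive does its own internal queueing (NCQ) by watching, over a number of sufficiently loaded samples, the maximum number of requests that were ever in flight at once, and then latching hw_tag to 0 or 1. The toy model below replays that sampling logic; the constants are made up and only stand in for BFQ_HW_QUEUE_THRESHOLD and BFQ_HW_QUEUE_SAMPLES.

/* Toy model of the hw_tag sampling heuristic; constants are made up. */
#include <stdio.h>

#define EX_QUEUE_THRESHOLD	4
#define EX_QUEUE_SAMPLES	32

static int ex_hw_tag = -1;	/* -1: undecided, like bfqd->hw_tag */
static int ex_max_in_driver;
static int ex_samples;

static void ex_sample(int rq_in_driver, int queued)
{
	if (rq_in_driver > ex_max_in_driver)
		ex_max_in_driver = rq_in_driver;

	if (ex_hw_tag == 1)
		return;
	if (rq_in_driver + queued < EX_QUEUE_THRESHOLD)
		return;		/* not enough load to learn anything */
	if (++ex_samples < EX_QUEUE_SAMPLES)
		return;

	ex_hw_tag = ex_max_in_driver > EX_QUEUE_THRESHOLD;
	ex_max_in_driver = 0;
	ex_samples = 0;
}

int main(void)
{
	int i;

	for (i = 0; i < 64; i++)
		ex_sample(8 /* in flight */, 2 /* queued */);
	printf("hw_tag=%d\n", ex_hw_tag);	/* 1: device keeps a deep queue */
	return 0;
}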
5341
5342static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
5343{
5344 bfq_update_hw_tag(bfqd);
5345
5346 bfqd->rq_in_driver--;
5347 bfqq->dispatched--;
5348
5349 bfqq->ttime.last_end_request = ktime_get_ns();
5350
5351 /*
5352 * If this is the in-service queue, check if it needs to be expired,
5353 * or if we want to idle in case it has no pending requests.
5354 */
5355 if (bfqd->in_service_queue == bfqq) {
5356 if (bfq_bfqq_budget_new(bfqq))
5357 bfq_set_budget_timeout(bfqd);
5358
5359 if (bfq_bfqq_must_idle(bfqq)) {
5360 bfq_arm_slice_timer(bfqd);
5361 return;
5362 } else if (bfq_may_expire_for_budg_timeout(bfqq))
5363 bfq_bfqq_expire(bfqd, bfqq, false,
5364 BFQQE_BUDGET_TIMEOUT);
5365 else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
5366 (bfqq->dispatched == 0 ||
5367 !bfq_bfqq_may_idle(bfqq)))
5368 bfq_bfqq_expire(bfqd, bfqq, false,
5369 BFQQE_NO_MORE_REQUESTS);
5370 }
5371}
5372
5373static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
5374{
5375 bfqq->allocated--;
5376
5377 bfq_put_queue(bfqq);
5378}
5379
5380static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
5381{
5382 struct bfq_queue *bfqq = RQ_BFQQ(rq);
5383 struct bfq_data *bfqd = bfqq->bfqd;
5384
5385 if (rq->rq_flags & RQF_STARTED)
5386 bfqg_stats_update_completion(bfqq_group(bfqq),
5387 rq_start_time_ns(rq),
5388 rq_io_start_time_ns(rq),
5389 rq->cmd_flags);
5390
5391 if (likely(rq->rq_flags & RQF_STARTED)) {
5392 unsigned long flags;
5393
5394 spin_lock_irqsave(&bfqd->lock, flags);
5395
5396 bfq_completed_request(bfqq, bfqd);
5397 bfq_put_rq_priv_body(bfqq);
5398
5399 spin_unlock_irqrestore(&bfqd->lock, flags);
5400 } else {
5401 /*
5402 * Request rq may be still/already in the scheduler,
5403 * in which case we need to remove it. And we cannot
5404 * defer such a check and removal, to avoid
5405 * inconsistencies in the time interval from the end
5406 * of this function to the start of the deferred work.
5407 * This situation seems to occur only in process
5408 * context, as a consequence of a merge. In the
5409 * current version of the code, this implies that the
5410 * lock is held.
5411 */
5412
5413 if (!RB_EMPTY_NODE(&rq->rb_node))
5414 bfq_remove_request(q, rq);
5415 bfq_put_rq_priv_body(bfqq);
5416 }
5417
5418 rq->elv.priv[0] = NULL;
5419 rq->elv.priv[1] = NULL;
5420}
5421
5422/*
5423 * Allocate bfq data structures associated with this request.
5424 */
5425static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
5426 struct bio *bio)
5427{
5428 struct bfq_data *bfqd = q->elevator->elevator_data;
5429 struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
5430 const int is_sync = rq_is_sync(rq);
5431 struct bfq_queue *bfqq;
5432
5433 spin_lock_irq(&bfqd->lock);
5434
5435 if (!bic) /* bic is dereferenced below, so check it first */
5436 goto queue_fail;
5437
5438 bfq_check_ioprio_change(bic, bio);
5439
5440 bfq_bic_update_cgroup(bic, bio);
5441
5442 bfqq = bic_to_bfqq(bic, is_sync);
5443 if (!bfqq || bfqq == &bfqd->oom_bfqq) {
5444 if (bfqq)
5445 bfq_put_queue(bfqq);
5446 bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
5447 bic_set_bfqq(bic, bfqq, is_sync);
5448 }
5449
5450 bfqq->allocated++;
5451 bfqq->ref++;
5452 bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
5453 rq, bfqq, bfqq->ref);
5454
5455 rq->elv.priv[0] = bic;
5456 rq->elv.priv[1] = bfqq;
5457
5458 spin_unlock_irq(&bfqd->lock);
5459
5460 return 0;
5461
5462queue_fail:
5463 spin_unlock_irq(&bfqd->lock);
5464
5465 return 1;
5466}
5467
5468static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
5469{
5470 struct bfq_data *bfqd = bfqq->bfqd;
5471 enum bfqq_expiration reason;
5472 unsigned long flags;
5473
5474 spin_lock_irqsave(&bfqd->lock, flags);
5475 bfq_clear_bfqq_wait_request(bfqq);
5476
5477 if (bfqq != bfqd->in_service_queue) {
5478 spin_unlock_irqrestore(&bfqd->lock, flags);
5479 return;
5480 }
5481
5482 if (bfq_bfqq_budget_timeout(bfqq))
5483 /*
5484 * Also here the queue can be safely expired
5485 * for budget timeout without wasting
5486 * guarantees
5487 */
5488 reason = BFQQE_BUDGET_TIMEOUT;
5489 else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
5490 /*
5491 * The queue may not be empty upon timer expiration,
5492 * because we may not disable the timer when the
5493 * first request of the in-service queue arrives
5494 * during disk idling.
5495 */
5496 reason = BFQQE_TOO_IDLE;
5497 else
5498 goto schedule_dispatch;
5499
5500 bfq_bfqq_expire(bfqd, bfqq, true, reason);
5501
5502schedule_dispatch:
5503 spin_unlock_irqrestore(&bfqd->lock, flags);
5504 bfq_schedule_dispatch(bfqd);
5505}
5506
5507/*
5508 * Handler of the expiration of the timer running if the in-service queue
5509 * is idling inside its time slice.
5510 */
5511static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
5512{
5513 struct bfq_data *bfqd = container_of(timer, struct bfq_data,
5514 idle_slice_timer);
5515 struct bfq_queue *bfqq = bfqd->in_service_queue;
5516
5517 /*
5518 * Theoretical race here: the in-service queue can be NULL or
5519 * different from the queue that was idling if a new request
5520 * arrives for the current queue and there is a full dispatch
5521 * cycle that changes the in-service queue. This can hardly
5522 * happen, but in the worst case we just expire a queue too
5523 * early.
5524 */
5525 if (bfqq)
5526 bfq_idle_slice_timer_body(bfqq);
5527
5528 return HRTIMER_NORESTART;
5529}
5530
5531static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
5532 struct bfq_queue **bfqq_ptr)
5533{
5534 struct bfq_queue *bfqq = *bfqq_ptr;
5535
5536 bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
5537 if (bfqq) {
5538 bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
5539
5540 bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
5541 bfqq, bfqq->ref);
5542 bfq_put_queue(bfqq);
5543 *bfqq_ptr = NULL;
5544 }
5545}
5546
5547/*
5548 * Release all the bfqg references to its async queues. If we are
5549 * deallocating the group these queues may still contain requests, so
5550 * we reparent them to the root cgroup (i.e., the only one that will
5551 * exist for sure until all the requests on a device are gone).
5552 */
5553static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
5554{
5555 int i, j;
5556
5557 for (i = 0; i < 2; i++)
5558 for (j = 0; j < IOPRIO_BE_NR; j++)
5559 __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
5560
5561 __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
5562}
5563
5564static void bfq_exit_queue(struct elevator_queue *e)
5565{
5566 struct bfq_data *bfqd = e->elevator_data;
5567 struct bfq_queue *bfqq, *n;
5568
5569 hrtimer_cancel(&bfqd->idle_slice_timer);
5570
5571 spin_lock_irq(&bfqd->lock);
5572 list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
5573 bfq_deactivate_bfqq(bfqd, bfqq, false, false);
5574 spin_unlock_irq(&bfqd->lock);
5575
5576 hrtimer_cancel(&bfqd->idle_slice_timer);
5577
5578#ifdef CONFIG_BFQ_GROUP_IOSCHED
5579 blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
5580#else
5581 spin_lock_irq(&bfqd->lock);
5582 bfq_put_async_queues(bfqd, bfqd->root_group);
5583 kfree(bfqd->root_group);
5584 spin_unlock_irq(&bfqd->lock);
5585#endif
5586
5587 kfree(bfqd);
5588}
5589
5590static void bfq_init_root_group(struct bfq_group *root_group,
5591 struct bfq_data *bfqd)
5592{
5593 int i;
5594
5595#ifdef CONFIG_BFQ_GROUP_IOSCHED
5596 root_group->entity.parent = NULL;
5597 root_group->my_entity = NULL;
5598 root_group->bfqd = bfqd;
5599#endif
5600 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
5601 root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
5602 root_group->sched_data.bfq_class_idle_last_service = jiffies;
5603}
5604
5605static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
5606{
5607 struct bfq_data *bfqd;
5608 struct elevator_queue *eq;
5609
5610 eq = elevator_alloc(q, e);
5611 if (!eq)
5612 return -ENOMEM;
5613
5614 bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
5615 if (!bfqd) {
5616 kobject_put(&eq->kobj);
5617 return -ENOMEM;
5618 }
5619 eq->elevator_data = bfqd;
5620
5621 spin_lock_irq(q->queue_lock);
5622 q->elevator = eq;
5623 spin_unlock_irq(q->queue_lock);
5624
5625 /*
5626 * Our fallback bfqq if bfq_get_queue() runs into OOM issues.
5627 * Grab a permanent reference to it, so that the normal code flow
5628 * will not attempt to free it.
5629 */
5630 bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
5631 bfqd->oom_bfqq.ref++;
5632 bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
5633 bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
5634 bfqd->oom_bfqq.entity.new_weight =
5635 bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
5636 /*
5637 * Trigger weight initialization, according to ioprio, at the
5638 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
5639 * class won't be changed any more.
5640 */
5641 bfqd->oom_bfqq.entity.prio_changed = 1;
5642
5643 bfqd->queue = q;
5644
5645 INIT_LIST_HEAD(&bfqd->dispatch);
5646
5647 hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
5648 HRTIMER_MODE_REL);
5649 bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
5650
5651 INIT_LIST_HEAD(&bfqd->active_list);
5652 INIT_LIST_HEAD(&bfqd->idle_list);
5653
5654 bfqd->hw_tag = -1;
5655
5656 bfqd->bfq_max_budget = bfq_default_max_budget;
5657
5658 bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
5659 bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
5660 bfqd->bfq_back_max = bfq_back_max;
5661 bfqd->bfq_back_penalty = bfq_back_penalty;
5662 bfqd->bfq_slice_idle = bfq_slice_idle;
5663 bfqd->bfq_timeout = bfq_timeout;
5664
5665 bfqd->bfq_requests_within_timer = 120;
5666
5667 spin_lock_init(&bfqd->lock);
5668
5669 /*
5670 * The invocation of the next bfq_create_group_hierarchy
5671 * function is the head of a chain of function calls
5672 * (bfq_create_group_hierarchy->blkcg_activate_policy->
5673 * blk_mq_freeze_queue) that may lead to the invocation of the
5674 * has_work hook function. For this reason,
5675 * bfq_create_group_hierarchy is invoked only after all
5676 * scheduler data has been initialized, apart from the fields
5677 * that can be initialized only after invoking
5678 * bfq_create_group_hierarchy. This, in particular, enables
5679 * has_work to correctly return false. Of course, to avoid
5680 * other inconsistencies, the blk-mq stack must then refrain
5681 * from invoking further scheduler hooks before this init
5682 * function is finished.
5683 */
5684 bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
5685 if (!bfqd->root_group)
5686 goto out_free;
5687 bfq_init_root_group(bfqd->root_group, bfqd);
5688 bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
5689
5690
5691 return 0;
5692
5693out_free:
5694 kfree(bfqd);
5695 kobject_put(&eq->kobj);
5696 return -ENOMEM;
5697}
5698
5699static void bfq_slab_kill(void)
5700{
5701 kmem_cache_destroy(bfq_pool);
5702}
5703
5704static int __init bfq_slab_setup(void)
5705{
5706 bfq_pool = KMEM_CACHE(bfq_queue, 0);
5707 if (!bfq_pool)
5708 return -ENOMEM;
5709 return 0;
5710}
5711
5712static ssize_t bfq_var_show(unsigned int var, char *page)
5713{
5714 return sprintf(page, "%u\n", var);
5715}
5716
5717static ssize_t bfq_var_store(unsigned long *var, const char *page,
5718 size_t count)
5719{
5720 unsigned long new_val;
5721 int ret = kstrtoul(page, 10, &new_val);
5722
5723 if (ret == 0)
5724 *var = new_val;
5725
5726 return count;
5727}
5728
5729#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
5730static ssize_t __FUNC(struct elevator_queue *e, char *page) \
5731{ \
5732 struct bfq_data *bfqd = e->elevator_data; \
5733 u64 __data = __VAR; \
5734 if (__CONV == 1) \
5735 __data = jiffies_to_msecs(__data); \
5736 else if (__CONV == 2) \
5737 __data = div_u64(__data, NSEC_PER_MSEC); \
5738 return bfq_var_show(__data, (page)); \
5739}
5740SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
5741SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
5742SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
5743SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
5744SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
5745SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
5746SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
5747SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
5748#undef SHOW_FUNCTION
5749
5750#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
5751static ssize_t __FUNC(struct elevator_queue *e, char *page) \
5752{ \
5753 struct bfq_data *bfqd = e->elevator_data; \
5754 u64 __data = __VAR; \
5755 __data = div_u64(__data, NSEC_PER_USEC); \
5756 return bfq_var_show(__data, (page)); \
5757}
5758USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
5759#undef USEC_SHOW_FUNCTION
5760
5761#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
5762static ssize_t \
5763__FUNC(struct elevator_queue *e, const char *page, size_t count) \
5764{ \
5765 struct bfq_data *bfqd = e->elevator_data; \
5766 unsigned long uninitialized_var(__data); \
5767 int ret = bfq_var_store(&__data, (page), count); \
5768 if (__data < (MIN)) \
5769 __data = (MIN); \
5770 else if (__data > (MAX)) \
5771 __data = (MAX); \
5772 if (__CONV == 1) \
5773 *(__PTR) = msecs_to_jiffies(__data); \
5774 else if (__CONV == 2) \
5775 *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
5776 else \
5777 *(__PTR) = __data; \
5778 return ret; \
5779}
5780STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
5781 INT_MAX, 2);
5782STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
5783 INT_MAX, 2);
5784STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
5785STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
5786 INT_MAX, 0);
5787STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
5788#undef STORE_FUNCTION
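Each STORE_FUNCTION() line above stamps out one sysfs store handler; __CONV selects whether the written value is converted to jiffies (1), from milliseconds to nanoseconds (2), or stored as-is. As a reading aid, this is roughly what the slice_idle case expands to, hand-expanded here and therefore illustrative rather than the preprocessor's literal output:

/*
 * Hand expansion of
 *     STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2)
 * The dead __CONV branches are dropped and uninitialized_var() is written
 * as a plain declaration.
 */
static ssize_t
bfq_slice_idle_store(struct elevator_queue *e, const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long __data;
	int ret = bfq_var_store(&__data, page, count);

	if (__data < 0)			/* MIN == 0: never true for unsigned */
		__data = 0;
	else if (__data > INT_MAX)	/* MAX == INT_MAX */
		__data = INT_MAX;

	/* __CONV == 2: the user writes milliseconds, the field holds ns */
	bfqd->bfq_slice_idle = (u64)__data * NSEC_PER_MSEC;

	return ret;
}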
5789
5790#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
5791static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
5792{ \
5793 struct bfq_data *bfqd = e->elevator_data; \
5794 unsigned long uninitialized_var(__data); \
5795 int ret = bfq_var_store(&__data, (page), count); \
5796 if (__data < (MIN)) \
5797 __data = (MIN); \
5798 else if (__data > (MAX)) \
5799 __data = (MAX); \
5800 *(__PTR) = (u64)__data * NSEC_PER_USEC; \
5801 return ret; \
5802}
5803USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
5804 UINT_MAX);
5805#undef USEC_STORE_FUNCTION
5806
5807static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
5808{
5809 u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout);
5810
5811 if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
5812 return bfq_calc_max_budget(bfqd->peak_rate, timeout);
5813 else
5814 return bfq_default_max_budget;
5815}
5816
5817static ssize_t bfq_max_budget_store(struct elevator_queue *e,
5818 const char *page, size_t count)
5819{
5820 struct bfq_data *bfqd = e->elevator_data;
5821 unsigned long uninitialized_var(__data);
5822 int ret = bfq_var_store(&__data, (page), count);
5823
5824 if (__data == 0)
5825 bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
5826 else {
5827 if (__data > INT_MAX)
5828 __data = INT_MAX;
5829 bfqd->bfq_max_budget = __data;
5830 }
5831
5832 bfqd->bfq_user_max_budget = __data;
5833
5834 return ret;
5835}
5836
5837/*
5838 * Leaving this name to preserve name compatibility with cfq
5839 * parameters, but this timeout is used for both sync and async.
5840 */
5841static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
5842 const char *page, size_t count)
5843{
5844 struct bfq_data *bfqd = e->elevator_data;
5845 unsigned long uninitialized_var(__data);
5846 int ret = bfq_var_store(&__data, (page), count);
5847
5848 if (__data < 1)
5849 __data = 1;
5850 else if (__data > INT_MAX)
5851 __data = INT_MAX;
5852
5853 bfqd->bfq_timeout = msecs_to_jiffies(__data);
5854 if (bfqd->bfq_user_max_budget == 0)
5855 bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
5856
5857 return ret;
5858}
5859
5860static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
5861 const char *page, size_t count)
5862{
5863 struct bfq_data *bfqd = e->elevator_data;
5864 unsigned long uninitialized_var(__data);
5865 int ret = bfq_var_store(&__data, (page), count);
5866
5867 if (__data > 1)
5868 __data = 1;
5869 if (!bfqd->strict_guarantees && __data == 1
5870 && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
5871 bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
5872
5873 bfqd->strict_guarantees = __data;
5874
5875 return ret;
5876}
5877
5878#define BFQ_ATTR(name) \
5879 __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
5880
5881static struct elv_fs_entry bfq_attrs[] = {
5882 BFQ_ATTR(fifo_expire_sync),
5883 BFQ_ATTR(fifo_expire_async),
5884 BFQ_ATTR(back_seek_max),
5885 BFQ_ATTR(back_seek_penalty),
5886 BFQ_ATTR(slice_idle),
5887 BFQ_ATTR(slice_idle_us),
5888 BFQ_ATTR(max_budget),
5889 BFQ_ATTR(timeout_sync),
5890 BFQ_ATTR(strict_guarantees),
5891 __ATTR_NULL
5892};
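Once the elevator is registered and selected for a request queue, these attributes show up in sysfs, conventionally under /sys/block/<dev>/queue/iosched/. A minimal user-space sketch of writing one of them follows; the device name sda and the 8000 us value are assumptions made only for the example.

/* Minimal sketch: set slice_idle_us for an assumed device "sda". */
#include <stdio.h>

int main(void)
{
	/* Path layout assumed; adjust the device name as needed. */
	const char *path = "/sys/block/sda/queue/iosched/slice_idle_us";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "%d\n", 8000);	/* 8 ms of idling, in microseconds */
	fclose(f);
	return 0;
}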
5893
5894static struct elevator_type iosched_bfq_mq = {
5895 .ops.mq = {
5896 .get_rq_priv = bfq_get_rq_private,
5897 .put_rq_priv = bfq_put_rq_private,
5898 .exit_icq = bfq_exit_icq,
5899 .insert_requests = bfq_insert_requests,
5900 .dispatch_request = bfq_dispatch_request,
5901 .next_request = elv_rb_latter_request,
5902 .former_request = elv_rb_former_request,
5903 .allow_merge = bfq_allow_bio_merge,
5904 .bio_merge = bfq_bio_merge,
5905 .request_merge = bfq_request_merge,
5906 .requests_merged = bfq_requests_merged,
5907 .request_merged = bfq_request_merged,
5908 .has_work = bfq_has_work,
5909 .init_sched = bfq_init_queue,
5910 .exit_sched = bfq_exit_queue,
5911 },
5912
5913 .uses_mq = true,
5914 .icq_size = sizeof(struct bfq_io_cq),
5915 .icq_align = __alignof__(struct bfq_io_cq),
5916 .elevator_attrs = bfq_attrs,
5917 .elevator_name = "bfq",
5918 .elevator_owner = THIS_MODULE,
5919};
5920
5921#ifdef CONFIG_BFQ_GROUP_IOSCHED
5922static struct blkcg_policy blkcg_policy_bfq = {
5923 .dfl_cftypes = bfq_blkg_files,
5924 .legacy_cftypes = bfq_blkcg_legacy_files,
5925
5926 .cpd_alloc_fn = bfq_cpd_alloc,
5927 .cpd_init_fn = bfq_cpd_init,
5928 .cpd_bind_fn = bfq_cpd_init,
5929 .cpd_free_fn = bfq_cpd_free,
5930
5931 .pd_alloc_fn = bfq_pd_alloc,
5932 .pd_init_fn = bfq_pd_init,
5933 .pd_offline_fn = bfq_pd_offline,
5934 .pd_free_fn = bfq_pd_free,
5935 .pd_reset_stats_fn = bfq_pd_reset_stats,
5936};
5937#endif
5938
5939static int __init bfq_init(void)
5940{
5941 int ret;
5942
5943#ifdef CONFIG_BFQ_GROUP_IOSCHED
5944 ret = blkcg_policy_register(&blkcg_policy_bfq);
5945 if (ret)
5946 return ret;
5947#endif
5948
5949 ret = -ENOMEM;
5950 if (bfq_slab_setup())
5951 goto err_pol_unreg;
5952
5953 ret = elv_register(&iosched_bfq_mq);
5954 if (ret)
5955 goto err_pol_unreg;
5956
5957 return 0;
5958
5959err_pol_unreg:
5960#ifdef CONFIG_BFQ_GROUP_IOSCHED
5961 blkcg_policy_unregister(&blkcg_policy_bfq);
5962#endif
5963 return ret;
5964}
5965
5966static void __exit bfq_exit(void)
5967{
5968 elv_unregister(&iosched_bfq_mq);
5969#ifdef CONFIG_BFQ_GROUP_IOSCHED
5970 blkcg_policy_unregister(&blkcg_policy_bfq);
5971#endif
5972 bfq_slab_kill();
5973}
5974
5975module_init(bfq_init);
5976module_exit(bfq_exit);
5977
5978MODULE_AUTHOR("Paolo Valente");
5979MODULE_LICENSE("GPL");
5980MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");