1/*
2 * Budget Fair Queueing (BFQ) I/O scheduler.
3 *
4 * Based on ideas and code from CFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 * Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
11 * Arianna Avanzini <avanzini@google.com>
12 *
13 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
14 *
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License as
17 * published by the Free Software Foundation; either version 2 of the
18 * License, or (at your option) any later version.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 * General Public License for more details.
24 *
25 * BFQ is a proportional-share I/O scheduler, with some extra
26 * low-latency capabilities. BFQ also supports full hierarchical
27 * scheduling through cgroups. The next paragraphs provide an introduction
28 * to BFQ's inner workings. Details on BFQ benefits, usage and
29 * limitations can be found in Documentation/block/bfq-iosched.txt.
30 *
31 * BFQ is a proportional-share storage-I/O scheduling algorithm based
32 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
33 * budgets, measured in number of sectors, to processes instead of
34 * time slices. The device is not granted to the in-service process
35 * for a given time slice, but until it has exhausted its assigned
36 * budget. This change from the time to the service domain enables BFQ
37 * to distribute the device throughput among processes as desired,
38 * without any distortion due to throughput fluctuations, or to device
39 * internal queueing. BFQ uses an ad hoc internal scheduler, called
40 * B-WF2Q+, to schedule processes according to their budgets. More
41 * precisely, BFQ schedules queues associated with processes. Each
42 * process/queue is assigned a user-configurable weight, and B-WF2Q+
43 * guarantees that each queue receives a fraction of the throughput
44 * proportional to its weight. Thanks to the accurate policy of
45 * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
46 * processes issuing sequential requests (to boost the throughput),
47 * and yet guarantee a low latency to interactive and soft real-time
48 * applications.
49 *
50 * In particular, to provide these low-latency guarantees, BFQ
51 * explicitly privileges the I/O of two classes of time-sensitive
52 * applications: interactive and soft real-time. This feature enables
53 * BFQ to provide applications in these classes with a very low
54 * latency. Finally, BFQ also features additional heuristics for
55 * preserving both a low latency and a high throughput on NCQ-capable,
56 * rotational or flash-based devices, and to get the job done quickly
57 * for applications consisting of many I/O-bound processes.
58 *
59 * BFQ is described in [1], which also contains a reference to the
60 * initial, more theoretical paper on BFQ. In the latter paper, the
61 * interested reader can find full details on the main algorithm, as well as
62 * formulas of the guarantees and formal proofs of all the properties.
63 * With respect to the version of BFQ presented in these papers, this
64 * implementation adds a few more heuristics, such as the one that
65 * guarantees a low latency to soft real-time applications, and a
66 * hierarchical extension based on H-WF2Q+.
67 *
68 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
69 * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
70 * with O(log N) complexity derives from the one introduced with EEVDF
71 * in [3].
72 *
73 * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
74 * Scheduler", Proceedings of the First Workshop on Mobile System
75 * Technologies (MST-2015), May 2015.
76 * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
77 *
78 * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
79 * Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
80 * Oct 1997.
81 *
82 * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
83 *
84 * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
85 * First: A Flexible and Accurate Mechanism for Proportional Share
86 * Resource Allocation", technical report.
87 *
88 * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
89 */
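/*
 * A small worked example of the service-domain guarantee described
 * above (the numbers are illustrative only): consider two
 * continuously-backlogged processes A and B with weights 100 and 200.
 * Over any long enough interval, B-WF2Q+ grants them the fractions
 *
 *   A: 100 / (100 + 200) = 1/3 of the delivered sectors
 *   B: 200 / (100 + 200) = 2/3 of the delivered sectors
 *
 * regardless of how the device throughput fluctuates, precisely
 * because the distribution is enforced in the service (sector) domain
 * and not in the time domain.
 */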
90#include <linux/module.h>
91#include <linux/slab.h>
92#include <linux/blkdev.h>
93#include <linux/cgroup.h>
94#include <linux/elevator.h>
95#include <linux/ktime.h>
96#include <linux/rbtree.h>
97#include <linux/ioprio.h>
98#include <linux/sbitmap.h>
99#include <linux/delay.h>
100
101#include "blk.h"
102#include "blk-mq.h"
103#include "blk-mq-tag.h"
104#include "blk-mq-sched.h"
105#include <linux/blktrace_api.h>
106#include <linux/hrtimer.h>
107#include <linux/blk-cgroup.h>
108
109#define BFQ_IOPRIO_CLASSES 3
110#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
111
112#define BFQ_MIN_WEIGHT 1
113#define BFQ_MAX_WEIGHT 1000
114#define BFQ_WEIGHT_CONVERSION_COEFF 10
115
116#define BFQ_DEFAULT_QUEUE_IOPRIO 4
117
118#define BFQ_WEIGHT_LEGACY_DFL 100
119#define BFQ_DEFAULT_GRP_IOPRIO 0
120#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
121
122/*
123 * Soft real-time applications are far more latency-sensitive than
124 * interactive ones. Over-raise the weight of the former to privilege
125 * them over the latter.
126 */
127#define BFQ_SOFTRT_WEIGHT_FACTOR 100
128
129struct bfq_entity;
130
131/**
132 * struct bfq_service_tree - per ioprio_class service tree.
133 *
134 * Each service tree represents a B-WF2Q+ scheduler on its own. Each
135 * ioprio_class has its own independent scheduler, and so its own
136 * bfq_service_tree. All the fields are protected by the queue lock
137 * of the containing bfqd.
138 */
139struct bfq_service_tree {
140 /* tree for active entities (i.e., those backlogged) */
141 struct rb_root active;
142 /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
143 struct rb_root idle;
144
145 /* idle entity with minimum F_i */
146 struct bfq_entity *first_idle;
147 /* idle entity with maximum F_i */
148 struct bfq_entity *last_idle;
149
150 /* scheduler virtual time */
151 u64 vtime;
152 /* scheduler weight sum; active and idle entities contribute to it */
153 unsigned long wsum;
154};
155
156/**
157 * struct bfq_sched_data - multi-class scheduler.
158 *
159 * bfq_sched_data is the basic scheduler queue. It supports three
160 * ioprio_classes, and can be used either as a toplevel queue or as an
161 * intermediate queue in a hierarchical setup. @next_in_service
162 * points to the active entity of the sched_data service trees that
163 * will be scheduled next. It is used to reduce the number of steps
164 * needed for each hierarchical-schedule update.
165 *
166 * The supported ioprio_classes are the same as in CFQ, in descending
167 * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
168 * Requests from higher priority queues are served before all the
169 * requests from lower priority queues; among queues of the same
170 * class, requests are served according to B-WF2Q+.
171 * All the fields are protected by the queue lock of the containing bfqd.
172 */
173struct bfq_sched_data {
174 /* entity in service */
175 struct bfq_entity *in_service_entity;
176 /* head-of-line entity (see comments above) */
177 struct bfq_entity *next_in_service;
178 /* array of service trees, one per ioprio_class */
179 struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
180 /* last time CLASS_IDLE was served */
181 unsigned long bfq_class_idle_last_service;
182
183};
184
185/**
186 * struct bfq_entity - schedulable entity.
187 *
188 * A bfq_entity is used to represent either a bfq_queue (leaf node in the
189 * cgroup hierarchy) or a bfq_group in the upper-level scheduler. Each
190 * entity belongs to the sched_data of the parent group in the cgroup
191 * hierarchy. Non-leaf entities also have their own sched_data, stored
192 * in @my_sched_data.
193 *
194 * Each entity independently stores its priority values; this would
195 * allow different weights on different devices, but this
196 * functionality is not yet exported to userspace. Priorities and
197 * weights are updated lazily, first storing the new values into the
198 * new_* fields, then setting the @prio_changed flag. As soon as
199 * there is a transition in the entity state that allows the priority
200 * update to take place the effective and the requested priority
201 * values are synchronized.
202 *
203 * Unless cgroups are used, the weight value is calculated from the
204 * ioprio to export the same interface as CFQ. When dealing with
205 * ``well-behaved'' queues (i.e., queues that do not take too long to
206 * consume their budget and have true sequential behavior, and
207 * when there are no external factors breaking anticipation) the
208 * relative weights at each level of the cgroups hierarchy should be
209 * guaranteed. All the fields are protected by the queue lock of the
210 * containing bfqd.
211 */
212struct bfq_entity {
213 /* service_tree member */
214 struct rb_node rb_node;
215
216 /*
217 * Flag, true if the entity is on a tree (either the active or
218 * the idle one of its service_tree) or is in service.
219 */
220 bool on_st;
221
222 /* B-WF2Q+ start and finish timestamps [sectors/weight] */
223 u64 start, finish;
224
225 /* tree the entity is enqueued into; %NULL if not on a tree */
226 struct rb_root *tree;
227
228 /*
229 * minimum start time of the (active) subtree rooted at this
230 * entity; used for O(log N) lookups into active trees
231 */
232 u64 min_start;
233
234 /* amount of service received during the last service slot */
235 int service;
236
237 /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
238 int budget;
239
240 /* weight of the queue */
241 int weight;
242 /* next weight if a change is in progress */
243 int new_weight;
244
245 /* original weight, used to implement weight boosting */
246 int orig_weight;
247
248 /* parent entity, for hierarchical scheduling */
249 struct bfq_entity *parent;
250
251 /*
252 * For non-leaf nodes in the hierarchy, the associated
253 * scheduler queue, %NULL on leaf nodes.
254 */
255 struct bfq_sched_data *my_sched_data;
256 /* the scheduler queue this entity belongs to */
257 struct bfq_sched_data *sched_data;
258
259 /* flag, set to request a weight, ioprio or ioprio_class change */
260 int prio_changed;
261};
262
263struct bfq_group;
264
265/**
266 * struct bfq_ttime - per process thinktime stats.
267 */
268struct bfq_ttime {
269 /* completion time of the last request */
270 u64 last_end_request;
271
272 /* total process thinktime */
273 u64 ttime_total;
274 /* number of thinktime samples */
275 unsigned long ttime_samples;
276 /* average process thinktime */
277 u64 ttime_mean;
278};
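/*
 * Simplified sketch of how the fields above fit together (an
 * illustrative approximation; the code that actually updates these
 * fields is outside this excerpt and may use decaying, fixed-point
 * averages): a thinktime sample is the gap between the completion of
 * the last request (last_end_request) and the arrival of the next one
 * from the same process. Each sample increments ttime_samples and is
 * added to ttime_total, and the mean is then roughly
 *
 *   ttime_mean = ttime_total / ttime_samples;
 */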
279
280/**
281 * struct bfq_queue - leaf schedulable entity.
282 *
283 * A bfq_queue is a leaf request queue; it can be associated with one
284 * io_context or more, if it is async. @cgroup holds a reference to
285 * the cgroup, to be sure that it does not disappear while a bfqq
286 * still references it (mostly to avoid races between request issuing
287 * and task migration followed by cgroup destruction). All the fields
288 * are protected by the queue lock of the containing bfqd.
289 */
290struct bfq_queue {
291 /* reference counter */
292 int ref;
293 /* parent bfq_data */
294 struct bfq_data *bfqd;
295
296 /* current ioprio and ioprio class */
297 unsigned short ioprio, ioprio_class;
298 /* next ioprio and ioprio class if a change is in progress */
299 unsigned short new_ioprio, new_ioprio_class;
300
301 /* sorted list of pending requests */
302 struct rb_root sort_list;
303 /* if fifo isn't expired, next request to serve */
304 struct request *next_rq;
305 /* number of sync and async requests queued */
306 int queued[2];
307 /* number of requests currently allocated */
308 int allocated;
309 /* number of pending metadata requests */
310 int meta_pending;
311 /* fifo list of requests in sort_list */
312 struct list_head fifo;
313
314 /* entity representing this queue in the scheduler */
315 struct bfq_entity entity;
316
317 /* maximum budget allowed from the feedback mechanism */
318 int max_budget;
319 /* budget expiration (in jiffies) */
320 unsigned long budget_timeout;
321
322 /* number of requests on the dispatch list or inside driver */
323 int dispatched;
324
325 /* status flags */
326 unsigned long flags;
327
328 /* node for active/idle bfqq list inside parent bfqd */
329 struct list_head bfqq_list;
330
331 /* associated @bfq_ttime struct */
332 struct bfq_ttime ttime;
333
334 /* bit vector: a 1 for each seeky requests in history */
335 u32 seek_history;
336 /* position of the last request enqueued */
337 sector_t last_request_pos;
338
339 /* Number of consecutive pairs of request completion and
340 * arrival, such that the queue becomes idle after the
341 * completion, but the next request arrives within an idle
342 * time slice; used only if the queue's IO_bound flag has been
343 * cleared.
344 */
345 unsigned int requests_within_timer;
346
347 /* pid of the process owning the queue, used for logging purposes */
348 pid_t pid;
349
350 /* current maximum weight-raising time for this queue */
351 unsigned long wr_cur_max_time;
352 /*
353 * Minimum time instant such that, only if a new request is
354 * enqueued after this time instant in an idle @bfq_queue with
355 * no outstanding requests, then the task associated with the
356 * queue is deemed soft real-time (see the comments on
357 * the function bfq_bfqq_softrt_next_start())
358 */
359 unsigned long soft_rt_next_start;
360 /*
361 * Start time of the current weight-raising period if
362 * the @bfq_queue is being weight-raised, otherwise
363 * finish time of the last weight-raising period.
364 */
365 unsigned long last_wr_start_finish;
366 /* factor by which the weight of this queue is multiplied */
367 unsigned int wr_coeff;
368 /*
369 * Time of the last transition of the @bfq_queue from idle to
370 * backlogged.
371 */
372 unsigned long last_idle_bklogged;
373 /*
374 * Cumulative service received from the @bfq_queue since the
375 * last transition from idle to backlogged.
376 */
377 unsigned long service_from_backlogged;
378 /*
379 * Value of wr start time when switching to soft rt
380 */
381 unsigned long wr_start_at_switch_to_srt;
382};
383
384/**
385 * struct bfq_io_cq - per (request_queue, io_context) structure.
386 */
387struct bfq_io_cq {
388 /* associated io_cq structure */
389 struct io_cq icq; /* must be the first member */
390 /* array of two process queues, the sync and the async */
391 struct bfq_queue *bfqq[2];
392 /* per (request_queue, blkcg) ioprio */
393 int ioprio;
394#ifdef CONFIG_BFQ_GROUP_IOSCHED
395 uint64_t blkcg_serial_nr; /* the current blkcg serial */
396#endif
397};
398
399enum bfq_device_speed {
400 BFQ_BFQD_FAST,
401 BFQ_BFQD_SLOW,
402};
403
404/**
405 * struct bfq_data - per-device data structure.
406 *
407 * All the fields are protected by @lock.
408 */
409struct bfq_data {
410 /* device request queue */
411 struct request_queue *queue;
412 /* dispatch queue */
413 struct list_head dispatch;
414
415 /* root bfq_group for the device */
416 struct bfq_group *root_group;
417
418 /*
419 * Number of bfq_queues containing requests (including the
420 * queue in service, even if it is idling).
421 */
422 int busy_queues;
423 /* number of weight-raised busy @bfq_queues */
424 int wr_busy_queues;
425 /* number of queued requests */
426 int queued;
427 /* number of requests dispatched and waiting for completion */
428 int rq_in_driver;
429
430 /*
431 * Maximum number of requests in driver in the last
432 * @hw_tag_samples completed requests.
433 */
434 int max_rq_in_driver;
435 /* number of samples used to calculate hw_tag */
436 int hw_tag_samples;
437 /* flag set to one if the driver is showing a queueing behavior */
438 int hw_tag;
439
440 /* number of budgets assigned */
441 int budgets_assigned;
442
443 /*
444 * Timer set when idling (waiting) for the next request from
445 * the queue in service.
446 */
447 struct hrtimer idle_slice_timer;
448
449 /* bfq_queue in service */
450 struct bfq_queue *in_service_queue;
451 /* bfq_io_cq (bic) associated with the @in_service_queue */
452 struct bfq_io_cq *in_service_bic;
453
454 /* on-disk position of the last served request */
455 sector_t last_position;
456
457 /* time of last request completion (ns) */
458 u64 last_completion;
459
460 /* time of first rq dispatch in current observation interval (ns) */
461 u64 first_dispatch;
462 /* time of last rq dispatch in current observation interval (ns) */
463 u64 last_dispatch;
464
465 /* beginning of the last budget */
466 ktime_t last_budget_start;
467 /* beginning of the last idle slice */
468 ktime_t last_idling_start;
469
470 /* number of samples in current observation interval */
471 int peak_rate_samples;
472 /* num of samples of seq dispatches in current observation interval */
473 u32 sequential_samples;
474 /* total num of sectors transferred in current observation interval */
475 u64 tot_sectors_dispatched;
476 /* max rq size seen during current observation interval (sectors) */
477 u32 last_rq_max_size;
478 /* time elapsed from first dispatch in current observ. interval (us) */
479 u64 delta_from_first;
480 /*
481 * Current estimate of the device peak rate, measured in
482 * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
483 * BFQ_RATE_SHIFT is performed to increase precision in
484 * fixed-point calculations.
485 */
486 u32 peak_rate;
487
488 /* maximum budget allotted to a bfq_queue before rescheduling */
489 int bfq_max_budget;
490
491 /* list of all the bfq_queues active on the device */
492 struct list_head active_list;
493 /* list of all the bfq_queues idle on the device */
494 struct list_head idle_list;
495
496 /*
497 * Timeout for async/sync requests; when it fires, requests
498 * are served in fifo order.
499 */
500 u64 bfq_fifo_expire[2];
501 /* weight of backward seeks wrt forward ones */
502 unsigned int bfq_back_penalty;
503 /* maximum allowed backward seek */
504 unsigned int bfq_back_max;
505 /* maximum idling time */
506 u32 bfq_slice_idle;
507
508 /* user-configured max budget value (0 for auto-tuning) */
509 int bfq_user_max_budget;
510 /*
511 * Timeout for bfq_queues to consume their budget; used to
512 * prevent seeky queues from imposing long latencies on
513 * sequential or quasi-sequential ones (this also implies that
514 * seeky queues cannot receive guarantees in the service
515 * domain; after a timeout they are charged for the time they
516 * have been in service, to preserve fairness among them, but
517 * without service-domain guarantees).
518 */
519 unsigned int bfq_timeout;
520
521 /*
522 * Number of consecutive requests that must be issued within
523 * the idle time slice to re-enable idling for a queue that
524 * was marked as non-I/O-bound (see the definition of the
525 * IO_bound flag for further details).
526 */
527 unsigned int bfq_requests_within_timer;
528
529 /*
530 * Force device idling whenever needed to provide accurate
531 * service guarantees, without caring about throughput
532 * issues. CAVEAT: this may even increase latencies, in case
533 * of useless idling for processes that have stopped doing I/O.
534 */
535 bool strict_guarantees;
536
537 /* if set to true, low-latency heuristics are enabled */
538 bool low_latency;
539 /*
540 * Maximum factor by which the weight of a weight-raised queue
541 * is multiplied.
542 */
543 unsigned int bfq_wr_coeff;
544 /* maximum duration of a weight-raising period (jiffies) */
545 unsigned int bfq_wr_max_time;
546
547 /* Maximum weight-raising duration for soft real-time processes */
548 unsigned int bfq_wr_rt_max_time;
549 /*
550 * Minimum idle period after which weight-raising may be
551 * reactivated for a queue (in jiffies).
552 */
553 unsigned int bfq_wr_min_idle_time;
554 /*
555 * Minimum period between request arrivals after which
556 * weight-raising may be reactivated for an already busy async
557 * queue (in jiffies).
558 */
559 unsigned long bfq_wr_min_inter_arr_async;
560
561 /* Max service-rate for a soft real-time queue, in sectors/sec */
562 unsigned int bfq_wr_max_softrt_rate;
563 /*
564 * Cached value of the product R*T, used for computing the
565 * maximum duration of weight raising automatically.
566 */
567 u64 RT_prod;
568 /* device-speed class for the low-latency heuristic */
569 enum bfq_device_speed device_speed;
570
571 /* fallback dummy bfqq for extreme OOM conditions */
572 struct bfq_queue oom_bfqq;
573
574 spinlock_t lock;
575
576 /*
577 * bic associated with the task issuing current bio for
578 * merging. This and the next field allow the bic lookup,
579 * needed by bio-merge functions, to be performed before the
580 * scheduler lock is taken, and thus
581 * avoid taking the request-queue lock while the scheduler
582 * lock is being held.
583 */
584 struct bfq_io_cq *bio_bic;
585 /* bfqq associated with the task issuing current bio for merging */
586 struct bfq_queue *bio_bfqq;
587};
588
589enum bfqq_state_flags {
590 BFQQF_busy = 0, /* has requests or is in service */
591 BFQQF_wait_request, /* waiting for a request */
592 BFQQF_non_blocking_wait_rq, /*
593 * waiting for a request
594 * without idling the device
595 */
596 BFQQF_fifo_expire, /* FIFO checked in this slice */
597 BFQQF_idle_window, /* slice idling enabled */
598 BFQQF_sync, /* synchronous queue */
599 BFQQF_IO_bound, /*
600 * bfqq has timed-out at least once
601 * having consumed at most 2/10 of
602 * its budget
603 */
604 BFQQF_softrt_update, /*
605 * may need softrt-next-start
606 * update
607 */
608};
609
610#define BFQ_BFQQ_FNS(name) \
611static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
612{ \
613 __set_bit(BFQQF_##name, &(bfqq)->flags); \
614} \
615static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
616{ \
617 __clear_bit(BFQQF_##name, &(bfqq)->flags); \
618} \
619static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
620{ \
621 return test_bit(BFQQF_##name, &(bfqq)->flags); \
622}
623
624BFQ_BFQQ_FNS(busy);
625BFQ_BFQQ_FNS(wait_request);
626BFQ_BFQQ_FNS(non_blocking_wait_rq);
627BFQ_BFQQ_FNS(fifo_expire);
628BFQ_BFQQ_FNS(idle_window);
629BFQ_BFQQ_FNS(sync);
630BFQ_BFQQ_FNS(IO_bound);
631BFQ_BFQQ_FNS(softrt_update);
632#undef BFQ_BFQQ_FNS
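/*
 * For reference, BFQ_BFQQ_FNS(busy) above expands to the following
 * three helpers (expansion written out here purely for illustration):
 *
 *   static void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
 *   {
 *           __set_bit(BFQQF_busy, &(bfqq)->flags);
 *   }
 *   static void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
 *   {
 *           __clear_bit(BFQQF_busy, &(bfqq)->flags);
 *   }
 *   static int bfq_bfqq_busy(const struct bfq_queue *bfqq)
 *   {
 *           return test_bit(BFQQF_busy, &(bfqq)->flags);
 *   }
 *
 * i.e., bfq_mark_bfqq_busy(bfqq) sets the BFQQF_busy flag,
 * bfq_clear_bfqq_busy(bfqq) clears it and bfq_bfqq_busy(bfqq) tests it.
 */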
633
634/* Logging facilities. */
635#ifdef CONFIG_BFQ_GROUP_IOSCHED
636static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
637static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
638
639#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
640 char __pbuf[128]; \
641 \
642 blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
643 blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
644 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
645 __pbuf, ##args); \
646} while (0)
647
648#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
649 char __pbuf[128]; \
650 \
651 blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
652 blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \
653} while (0)
654
655#else /* CONFIG_BFQ_GROUP_IOSCHED */
656
657#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
658 blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \
659 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
660 ##args)
661#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
662
663#endif /* CONFIG_BFQ_GROUP_IOSCHED */
664
665#define bfq_log(bfqd, fmt, args...) \
666 blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
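/*
 * Typical usage of the logging macros above (both calls are taken from
 * functions later in this file); the messages end up in the blktrace
 * stream of the request queue:
 *
 *   bfq_log(bfqd, "schedule dispatch");
 *   bfq_log_bfqq(bfqq->bfqd, bfqq, "calc_finish: serv %lu, w %d",
 *                service, entity->weight);
 */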
667
668/* Expiration reasons. */
669enum bfqq_expiration {
670 BFQQE_TOO_IDLE = 0, /*
671 * queue has been idling for
672 * too long
673 */
674 BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */
675 BFQQE_BUDGET_EXHAUSTED, /* budget consumed */
676 BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */
677 BFQQE_PREEMPTED /* preemption in progress */
678};
679
680struct bfqg_stats {
681#ifdef CONFIG_BFQ_GROUP_IOSCHED
682 /* number of ios merged */
683 struct blkg_rwstat merged;
684 /* total time spent on device in ns, may not be accurate w/ queueing */
685 struct blkg_rwstat service_time;
686 /* total time spent waiting in scheduler queue in ns */
687 struct blkg_rwstat wait_time;
688 /* number of IOs queued up */
689 struct blkg_rwstat queued;
690 /* total disk time and nr sectors dispatched by this group */
691 struct blkg_stat time;
692 /* sum of number of ios queued across all samples */
693 struct blkg_stat avg_queue_size_sum;
694 /* count of samples taken for average */
695 struct blkg_stat avg_queue_size_samples;
696 /* how many times this group has been removed from service tree */
697 struct blkg_stat dequeue;
698 /* total time spent waiting for it to be assigned a timeslice. */
699 struct blkg_stat group_wait_time;
700 /* time spent idling for this blkcg_gq */
701 struct blkg_stat idle_time;
702 /* total time with empty current active q with other requests queued */
703 struct blkg_stat empty_time;
704 /* fields after this shouldn't be cleared on stat reset */
705 uint64_t start_group_wait_time;
706 uint64_t start_idle_time;
707 uint64_t start_empty_time;
708 uint16_t flags;
709#endif /* CONFIG_BFQ_GROUP_IOSCHED */
710};
711
712#ifdef CONFIG_BFQ_GROUP_IOSCHED
713
714/*
715 * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
716 *
717 * @pd: blkcg_policy_data that this structure inherits
718 * @weight: weight of the bfq_group
719 */
720struct bfq_group_data {
721 /* must be the first member */
722 struct blkcg_policy_data pd;
723
724 unsigned int weight;
725};
726
727/**
728 * struct bfq_group - per (device, cgroup) data structure.
729 * @entity: schedulable entity to insert into the parent group sched_data.
730 * @sched_data: own sched_data, to contain child entities (they may be
731 * both bfq_queues and bfq_groups).
732 * @bfqd: the bfq_data for the device this group acts upon.
733 * @async_bfqq: array of async queues for all the tasks belonging to
734 * the group, one queue per ioprio value per ioprio_class,
735 * except for the idle class that has only one queue.
736 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
737 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
738 * to avoid too many special cases during group creation/
739 * migration.
740 * @stats: stats for this bfqg.
741 *
742 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
743 * there is a set of bfq_groups, each one collecting the lower-level
744 * entities belonging to the group that are acting on the same device.
745 *
746 * Locking works as follows:
747 * o @bfqd is protected by the queue lock, RCU is used to access it
748 * from the readers.
749 * o All the other fields are protected by the @bfqd queue lock.
750 */
751struct bfq_group {
752 /* must be the first member */
753 struct blkg_policy_data pd;
754
755 struct bfq_entity entity;
756 struct bfq_sched_data sched_data;
757
758 void *bfqd;
759
760 struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
761 struct bfq_queue *async_idle_bfqq;
762
763 struct bfq_entity *my_entity;
764
765 struct bfqg_stats stats;
766};
767
768#else
769struct bfq_group {
770 struct bfq_sched_data sched_data;
771
772 struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
773 struct bfq_queue *async_idle_bfqq;
774
775 struct rb_root rq_pos_tree;
776};
777#endif
778
779static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
780
781static unsigned int bfq_class_idx(struct bfq_entity *entity)
782{
783 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
784
785 return bfqq ? bfqq->ioprio_class - 1 :
786 BFQ_DEFAULT_GRP_CLASS - 1;
787}
788
789static struct bfq_service_tree *
790bfq_entity_service_tree(struct bfq_entity *entity)
791{
792 struct bfq_sched_data *sched_data = entity->sched_data;
793 unsigned int idx = bfq_class_idx(entity);
794
795 return sched_data->service_tree + idx;
796}
797
798static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
799{
800 return bic->bfqq[is_sync];
801}
802
803static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
804 bool is_sync)
805{
806 bic->bfqq[is_sync] = bfqq;
807}
808
809static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
810{
811 return bic->icq.q->elevator->elevator_data;
812}
813
814static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
815static void bfq_put_queue(struct bfq_queue *bfqq);
816static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
817 struct bio *bio, bool is_sync,
818 struct bfq_io_cq *bic);
819static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
820 struct bfq_group *bfqg);
821static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
822static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
823
824/* Expiration time of sync (0) and async (1) requests, in ns. */
825static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
826
827/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
828static const int bfq_back_max = 16 * 1024;
829
830/* Penalty of a backwards seek, in number of sectors. */
831static const int bfq_back_penalty = 2;
832
833/* Idling period duration, in ns. */
834static u64 bfq_slice_idle = NSEC_PER_SEC / 125;
835
836/* Minimum number of assigned budgets for which stats are safe to compute. */
837static const int bfq_stats_min_budgets = 194;
838
839/* Default maximum budget values, in sectors and number of requests. */
840static const int bfq_default_max_budget = 16 * 1024;
841
842/*
843 * Async to sync throughput distribution is controlled as follows:
844 * when an async request is served, the entity is charged the number
845 * of sectors of the request, multiplied by the factor below
846 */
847static const int bfq_async_charge_factor = 10;
848
849/* Default timeout values, in jiffies, approximating CFQ defaults. */
850static const int bfq_timeout = HZ / 8;
851
852static struct kmem_cache *bfq_pool;
853
854/* Below this threshold (in ns), we consider thinktime immediate. */
855#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
856
857/* hw_tag detection: parallel requests threshold and min samples needed. */
858#define BFQ_HW_QUEUE_THRESHOLD 4
859#define BFQ_HW_QUEUE_SAMPLES 32
860
861#define BFQQ_SEEK_THR (sector_t)(8 * 100)
862#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
863#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
864#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
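/*
 * Example of how BFQQ_SEEKY() above evaluates: seek_history is a
 * 32-bit sliding window with one bit per recent request (1 = seeky),
 * and hweight32() counts the bits that are set. A queue is therefore
 * classified as seeky as soon as more than 32/8 = 4 of its last 32
 * requests were seeky: seek_history == 0x1f (five bits set) makes
 * BFQQ_SEEKY() true, while 0xf (four bits set) does not.
 */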
865
866/* Min number of samples required to perform peak-rate update */
867#define BFQ_RATE_MIN_SAMPLES 32
868/* Min observation time interval required to perform a peak-rate update (ns) */
869#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC)
870/* Target observation time interval for a peak-rate update (ns) */
871#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC
872
873/* Shift used for peak rate fixed precision calculations. */
874#define BFQ_RATE_SHIFT 16
875
876/*
877 * By default, BFQ computes the duration of the weight raising for
878 * interactive applications automatically, using the following formula:
879 * duration = (R / r) * T, where r is the peak rate of the device, and
880 * R and T are two reference parameters.
881 * In particular, R is the peak rate of the reference device (see below),
882 * and T is a reference time: given the systems that are likely to be
883 * installed on the reference device according to its speed class, T is
884 * about the maximum time needed, under BFQ and while reading two files in
885 * parallel, to load typical large applications on these systems.
886 * In practice, the slower/faster the device at hand is, the more/less it
887 * takes to load applications with respect to the reference device.
888 * Accordingly, the longer/shorter BFQ grants weight raising to interactive
889 * applications.
890 *
891 * BFQ uses four different reference pairs (R, T), depending on:
892 * . whether the device is rotational or non-rotational;
893 * . whether the device is slow, such as old or portable HDDs, as well as
894 * SD cards, or fast, such as newer HDDs and SSDs.
895 *
896 * The device's speed class is dynamically (re)detected in
897 * bfq_update_peak_rate() every time the estimated peak rate is updated.
898 *
899 * In the following definitions, R_slow[0]/R_fast[0] and
900 * T_slow[0]/T_fast[0] are the reference values for a slow/fast
901 * rotational device, whereas R_slow[1]/R_fast[1] and
902 * T_slow[1]/T_fast[1] are the reference values for a slow/fast
903 * non-rotational device. Finally, device_speed_thresh are the
904 * thresholds used to switch between speed classes. The reference
905 * rates are not the actual peak rates of the devices used as a
906 * reference, but slightly lower values. The reason for using these
907 * slightly lower values is that the peak-rate estimator tends to
908 * yield slightly lower values than the actual peak rate (it can yield
909 * the actual peak rate only if there is only one process doing I/O,
910 * and the process does sequential I/O).
911 *
912 * Both the reference peak rates and the thresholds are measured in
913 * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
914 */
915static int R_slow[2] = {1000, 10700};
916static int R_fast[2] = {14000, 33000};
917/*
918 * To improve readability, a conversion function is used to initialize the
919 * following arrays, which entails that they can be initialized only in a
920 * function.
921 */
922static int T_slow[2];
923static int T_fast[2];
924static int device_speed_thresh[2];
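/*
 * Worked example of the duration formula described above (numbers are
 * illustrative): with reference parameters R and T for the device's
 * speed class, the interactive weight-raising duration is
 *
 *   duration = (R / r) * T,
 *
 * where r is the estimated peak rate of the device at hand. A device
 * whose measured peak rate is half the reference rate (r = R/2) thus
 * gets weight raising for 2*T, while a device twice as fast as the
 * reference one gets it only for T/2.
 */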
925
926#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
927 { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
928
929#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
930#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
931
932/**
933 * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
934 * @icq: the iocontext queue.
935 */
936static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
937{
938 /* bic->icq is the first member, %NULL will convert to %NULL */
939 return container_of(icq, struct bfq_io_cq, icq);
940}
941
942/**
943 * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
944 * @bfqd: the lookup key.
945 * @ioc: the io_context of the process doing I/O.
946 * @q: the request queue.
947 */
948static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
949 struct io_context *ioc,
950 struct request_queue *q)
951{
952 if (ioc) {
953 unsigned long flags;
954 struct bfq_io_cq *icq;
955
956 spin_lock_irqsave(q->queue_lock, flags);
957 icq = icq_to_bic(ioc_lookup_icq(ioc, q));
958 spin_unlock_irqrestore(q->queue_lock, flags);
959
960 return icq;
961 }
962
963 return NULL;
964}
965
966/*
967 * Schedule a run of the queue if there are pending requests and there is
968 * nothing in the driver that will restart queueing.
969 */
970static void bfq_schedule_dispatch(struct bfq_data *bfqd)
971{
972 if (bfqd->queued != 0) {
973 bfq_log(bfqd, "schedule dispatch");
974 blk_mq_run_hw_queues(bfqd->queue, true);
975 }
976}
977
978/**
979 * bfq_gt - compare two timestamps.
980 * @a: first ts.
981 * @b: second ts.
982 *
983 * Return @a > @b, dealing with wrapping correctly.
984 */
985static int bfq_gt(u64 a, u64 b)
986{
987 return (s64)(a - b) > 0;
988}
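/*
 * Note on the subtraction above: computing a - b in u64 and then
 * interpreting the result as s64 keeps the comparison correct across
 * timestamp wraparound. For instance, with a = 10 and
 * b = U64_MAX - 5 (a timestamp that has just wrapped compared against
 * one that has not yet), a - b == 16, so bfq_gt(a, b) correctly
 * returns true, whereas a plain "a > b" would return false.
 */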
989
990static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
991{
992 struct rb_node *node = tree->rb_node;
993
994 return rb_entry(node, struct bfq_entity, rb_node);
995}
996
997static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
998
999static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
1000
1001/**
1002 * bfq_update_next_in_service - update sd->next_in_service
1003 * @sd: sched_data for which to perform the update.
1004 * @new_entity: if not NULL, pointer to the entity whose activation,
1005 * requeueing or repositioning triggered the invocation of
1006 * this function.
1007 *
1008 * This function is called to update sd->next_in_service, which, in
1009 * its turn, may change as a consequence of the insertion or
1010 * extraction of an entity into/from one of the active trees of
1011 * sd. These insertions/extractions occur as a consequence of
1012 * activations/deactivations of entities, with some activations being
1013 * 'true' activations, and other activations being requeueings (i.e.,
1014 * implementing the second, requeueing phase of the mechanism used to
1015 * reposition an entity in its active tree; see comments on
1016 * __bfq_activate_entity and __bfq_requeue_entity for details). In
1017 * both the last two activation sub-cases, new_entity points to the
1018 * just activated or requeued entity.
1019 *
1020 * Returns true if sd->next_in_service changes in such a way that
1021 * entity->parent may become the next_in_service for its parent
1022 * entity.
1023 */
1024static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
1025 struct bfq_entity *new_entity)
1026{
1027 struct bfq_entity *next_in_service = sd->next_in_service;
1028 bool parent_sched_may_change = false;
1029
1030 /*
1031 * If this update is triggered by the activation, requeueing
1032 * or repositioning of an entity that does not coincide with
1033 * sd->next_in_service, then a full lookup in the active tree
1034 * can be avoided. In fact, it is enough to check whether the
1035 * just-modified entity has a higher priority than
1036 * sd->next_in_service, or, even if it has the same priority
1037 * as sd->next_in_service, is eligible and has a lower virtual
1038 * finish time than sd->next_in_service. If this compound
1039 * condition holds, then the new entity becomes the new
1040 * next_in_service. Otherwise no change is needed.
1041 */
1042 if (new_entity && new_entity != sd->next_in_service) {
1043 /*
1044 * Flag used to decide whether to replace
1045 * sd->next_in_service with new_entity. Tentatively
1046 * set to true, and left as true if
1047 * sd->next_in_service is NULL.
1048 */
1049 bool replace_next = true;
1050
1051 /*
1052 * If there is already a next_in_service candidate
1053 * entity, then compare class priorities or timestamps
1054 * to decide whether to replace sd->next_in_service with
1055 * new_entity.
1056 */
1057 if (next_in_service) {
1058 unsigned int new_entity_class_idx =
1059 bfq_class_idx(new_entity);
1060 struct bfq_service_tree *st =
1061 sd->service_tree + new_entity_class_idx;
1062
1063 /*
1064 * For efficiency, evaluate the most likely
1065 * sub-condition first.
1066 */
1067 replace_next =
1068 (new_entity_class_idx ==
1069 bfq_class_idx(next_in_service)
1070 &&
1071 !bfq_gt(new_entity->start, st->vtime)
1072 &&
1073 bfq_gt(next_in_service->finish,
1074 new_entity->finish))
1075 ||
1076 new_entity_class_idx <
1077 bfq_class_idx(next_in_service);
1078 }
1079
1080 if (replace_next)
1081 next_in_service = new_entity;
1082 } else /* invoked because of a deactivation: lookup needed */
1083 next_in_service = bfq_lookup_next_entity(sd);
1084
1085 if (next_in_service) {
1086 parent_sched_may_change = !sd->next_in_service ||
1087 bfq_update_parent_budget(next_in_service);
1088 }
1089
1090 sd->next_in_service = next_in_service;
1091
1092 if (!next_in_service)
1093 return parent_sched_may_change;
1094
1095 return parent_sched_may_change;
1096}
1097
1098#ifdef CONFIG_BFQ_GROUP_IOSCHED
1099/* both next loops stop at one of the child entities of the root group */
1100#define for_each_entity(entity) \
1101 for (; entity ; entity = entity->parent)
1102
1103/*
1104 * For each iteration, compute parent in advance, so as to be safe if
1105 * entity is deallocated during the iteration. Such a deallocation may
1106 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
1107 * containing entity.
1108 */
1109#define for_each_entity_safe(entity, parent) \
1110 for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
1111
1112/*
1113 * Returns true if this budget change may let next_in_service->parent
1114 * become the next_in_service entity for its parent entity.
1115 */
1116static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
1117{
1118 struct bfq_entity *bfqg_entity;
1119 struct bfq_group *bfqg;
1120 struct bfq_sched_data *group_sd;
1121 bool ret = false;
1122
1123 group_sd = next_in_service->sched_data;
1124
1125 bfqg = container_of(group_sd, struct bfq_group, sched_data);
1126 /*
1127 * bfq_group's my_entity field is not NULL only if the group
1128 * is not the root group. We must not touch the root entity
1129 * as it must never become an in-service entity.
1130 */
1131 bfqg_entity = bfqg->my_entity;
1132 if (bfqg_entity) {
1133 if (bfqg_entity->budget > next_in_service->budget)
1134 ret = true;
1135 bfqg_entity->budget = next_in_service->budget;
1136 }
1137
1138 return ret;
1139}
1140
1141/*
1142 * This function tells whether entity stops being a candidate for next
1143 * service, according to the following logic.
1144 *
1145 * This function is invoked for an entity that is about to be set in
1146 * service. If such an entity is a queue, then the entity is no longer
1147 * a candidate for next service (i.e., a candidate entity to serve
1148 * after the in-service entity is expired). The function then returns
1149 * true.
1150 */
1151static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
1152{
1153 if (bfq_entity_to_bfqq(entity))
1154 return true;
1155
1156 return false;
1157}
1158
1159#else /* CONFIG_BFQ_GROUP_IOSCHED */
1160/*
1161 * Next two macros are fake loops when cgroups support is not
1162 * enabled. In fact, in such a case, there is only one level to go up
1163 * (to reach the root group).
1164 */
1165#define for_each_entity(entity) \
1166 for (; entity ; entity = NULL)
1167
1168#define for_each_entity_safe(entity, parent) \
1169 for (parent = NULL; entity ; entity = parent)
1170
1171static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
1172{
1173 return false;
1174}
1175
1176static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
1177{
1178 return true;
1179}
1180
1181#endif /* CONFIG_BFQ_GROUP_IOSCHED */
1182
1183/*
1184 * Shift for timestamp calculations. This actually limits the maximum
1185 * service allowed in one timestamp delta (small shift values increase it),
1186 * the maximum total weight that can be used for the queues in the system
1187 * (big shift values increase it), and the period of virtual time
1188 * wraparounds.
1189 */
1190#define WFQ_SERVICE_SHIFT 22
1191
1192static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
1193{
1194 struct bfq_queue *bfqq = NULL;
1195
1196 if (!entity->my_sched_data)
1197 bfqq = container_of(entity, struct bfq_queue, entity);
1198
1199 return bfqq;
1200}
1201
1202
1203/**
1204 * bfq_delta - map service into the virtual time domain.
1205 * @service: amount of service.
1206 * @weight: scale factor (weight of an entity or weight sum).
1207 */
1208static u64 bfq_delta(unsigned long service, unsigned long weight)
1209{
1210 u64 d = (u64)service << WFQ_SERVICE_SHIFT;
1211
1212 do_div(d, weight);
1213 return d;
1214}
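/*
 * Numerical example of the mapping above (illustrative values): with
 * WFQ_SERVICE_SHIFT = 22, charging a service of 8 sectors to an entity
 * of weight 10 advances its timestamps by
 *
 *   (8 << 22) / 10 = 33554432 / 10 = 3355443
 *
 * units of virtual time; the same service costs proportionally less
 * virtual time to entities with larger weights.
 */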
1215
1216/**
1217 * bfq_calc_finish - assign the finish time to an entity.
1218 * @entity: the entity to act upon.
1219 * @service: the service to be charged to the entity.
1220 */
1221static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
1222{
1223 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1224
1225 entity->finish = entity->start +
1226 bfq_delta(service, entity->weight);
1227
1228 if (bfqq) {
1229 bfq_log_bfqq(bfqq->bfqd, bfqq,
1230 "calc_finish: serv %lu, w %d",
1231 service, entity->weight);
1232 bfq_log_bfqq(bfqq->bfqd, bfqq,
1233 "calc_finish: start %llu, finish %llu, delta %llu",
1234 entity->start, entity->finish,
1235 bfq_delta(service, entity->weight));
1236 }
1237}
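/*
 * Continuing the example given after bfq_delta() (illustrative values):
 * an entity with start = 1000 and weight = 10 that is charged a
 * service of 8 obtains
 *
 *   finish = 1000 + bfq_delta(8, 10) = 1000 + 3355443 = 3356443,
 *
 * while a competing entity of weight 20 charged the same service gets
 * a finish timestamp only about half as far from its start, and so
 * tends to be selected earlier by B-WF2Q+.
 */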
1238
1239/**
1240 * bfq_entity_of - get an entity from a node.
1241 * @node: the node field of the entity.
1242 *
1243 * Convert a node pointer to the corresponding entity. This is used only
1244 * to simplify the logic of some functions and not as the generic
1245 * conversion mechanism because, e.g., in the tree walking functions,
1246 * the check for a %NULL value would be redundant.
1247 */
1248static struct bfq_entity *bfq_entity_of(struct rb_node *node)
1249{
1250 struct bfq_entity *entity = NULL;
1251
1252 if (node)
1253 entity = rb_entry(node, struct bfq_entity, rb_node);
1254
1255 return entity;
1256}
1257
1258/**
1259 * bfq_extract - remove an entity from a tree.
1260 * @root: the tree root.
1261 * @entity: the entity to remove.
1262 */
1263static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
1264{
1265 entity->tree = NULL;
1266 rb_erase(&entity->rb_node, root);
1267}
1268
1269/**
1270 * bfq_idle_extract - extract an entity from the idle tree.
1271 * @st: the service tree of the owning @entity.
1272 * @entity: the entity being removed.
1273 */
1274static void bfq_idle_extract(struct bfq_service_tree *st,
1275 struct bfq_entity *entity)
1276{
1277 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1278 struct rb_node *next;
1279
1280 if (entity == st->first_idle) {
1281 next = rb_next(&entity->rb_node);
1282 st->first_idle = bfq_entity_of(next);
1283 }
1284
1285 if (entity == st->last_idle) {
1286 next = rb_prev(&entity->rb_node);
1287 st->last_idle = bfq_entity_of(next);
1288 }
1289
1290 bfq_extract(&st->idle, entity);
1291
1292 if (bfqq)
1293 list_del(&bfqq->bfqq_list);
1294}
1295
1296/**
1297 * bfq_insert - generic tree insertion.
1298 * @root: tree root.
1299 * @entity: entity to insert.
1300 *
1301 * This is used for the idle and the active tree, since they are both
1302 * ordered by finish time.
1303 */
1304static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
1305{
1306 struct bfq_entity *entry;
1307 struct rb_node **node = &root->rb_node;
1308 struct rb_node *parent = NULL;
1309
1310 while (*node) {
1311 parent = *node;
1312 entry = rb_entry(parent, struct bfq_entity, rb_node);
1313
1314 if (bfq_gt(entry->finish, entity->finish))
1315 node = &parent->rb_left;
1316 else
1317 node = &parent->rb_right;
1318 }
1319
1320 rb_link_node(&entity->rb_node, parent, node);
1321 rb_insert_color(&entity->rb_node, root);
1322
1323 entity->tree = root;
1324}
1325
1326/**
1327 * bfq_update_min - update the min_start field of an entity.
1328 * @entity: the entity to update.
1329 * @node: one of its children.
1330 *
1331 * This function is called when @entity may store an invalid value for
1332 * min_start due to updates to the active tree. The function assumes
1333 * that the subtree rooted at @node (which may be its left or its right
1334 * child) has a valid min_start value.
1335 */
1336static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
1337{
1338 struct bfq_entity *child;
1339
1340 if (node) {
1341 child = rb_entry(node, struct bfq_entity, rb_node);
1342 if (bfq_gt(entity->min_start, child->min_start))
1343 entity->min_start = child->min_start;
1344 }
1345}
1346
1347/**
1348 * bfq_update_active_node - recalculate min_start.
1349 * @node: the node to update.
1350 *
1351 * @node may have changed position or one of its children may have moved,
1352 * this function updates its min_start value. The left and right subtrees
1353 * are assumed to hold a correct min_start value.
1354 */
1355static void bfq_update_active_node(struct rb_node *node)
1356{
1357 struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
1358
1359 entity->min_start = entity->start;
1360 bfq_update_min(entity, node->rb_right);
1361 bfq_update_min(entity, node->rb_left);
1362}
1363
1364/**
1365 * bfq_update_active_tree - update min_start for the whole active tree.
1366 * @node: the starting node.
1367 *
1368 * @node must be the deepest modified node after an update. This function
1369 * updates its min_start using the values held by its children, assuming
1370 * that they did not change, and then updates all the nodes that may have
1371 * changed in the path to the root. The only nodes that may have changed
1372 * are the ones in the path or their siblings.
1373 */
1374static void bfq_update_active_tree(struct rb_node *node)
1375{
1376 struct rb_node *parent;
1377
1378up:
1379 bfq_update_active_node(node);
1380
1381 parent = rb_parent(node);
1382 if (!parent)
1383 return;
1384
1385 if (node == parent->rb_left && parent->rb_right)
1386 bfq_update_active_node(parent->rb_right);
1387 else if (parent->rb_left)
1388 bfq_update_active_node(parent->rb_left);
1389
1390 node = parent;
1391 goto up;
1392}
1393
1394/**
1395 * bfq_active_insert - insert an entity in the active tree of its
1396 * group/device.
1397 * @st: the service tree of the entity.
1398 * @entity: the entity being inserted.
1399 *
1400 * The active tree is ordered by finish time, but an extra key is kept
1401 * for each node, containing the minimum value for the start times of
1402 * its children (and the node itself), so it's possible to search for
1403 * the eligible node with the lowest finish time in logarithmic time.
1404 */
1405static void bfq_active_insert(struct bfq_service_tree *st,
1406 struct bfq_entity *entity)
1407{
1408 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1409 struct rb_node *node = &entity->rb_node;
1410#ifdef CONFIG_BFQ_GROUP_IOSCHED
1411 struct bfq_sched_data *sd = NULL;
1412 struct bfq_group *bfqg = NULL;
1413 struct bfq_data *bfqd = NULL;
1414#endif
1415
1416 bfq_insert(&st->active, entity);
1417
1418 if (node->rb_left)
1419 node = node->rb_left;
1420 else if (node->rb_right)
1421 node = node->rb_right;
1422
1423 bfq_update_active_tree(node);
1424
1425#ifdef CONFIG_BFQ_GROUP_IOSCHED
1426 sd = entity->sched_data;
1427 bfqg = container_of(sd, struct bfq_group, sched_data);
1428 bfqd = (struct bfq_data *)bfqg->bfqd;
1429#endif
1430 if (bfqq)
1431 list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
1432}
1433
1434/**
1435 * bfq_ioprio_to_weight - calc a weight from an ioprio.
1436 * @ioprio: the ioprio value to convert.
1437 */
1438static unsigned short bfq_ioprio_to_weight(int ioprio)
1439{
1440 return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
1441}
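/*
 * Resulting mapping, assuming the usual IOPRIO_BE_NR = 8 (ioprio
 * levels 0..7) and BFQ_WEIGHT_CONVERSION_COEFF = 10:
 *
 *   ioprio 0 (highest priority) -> weight (8 - 0) * 10 = 80
 *   ioprio 4 (default)          -> weight (8 - 4) * 10 = 40
 *   ioprio 7 (lowest priority)  -> weight (8 - 7) * 10 = 10
 *
 * i.e., lower ioprio values (higher priorities) translate into larger
 * B-WF2Q+ weights.
 */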
1442
1443/**
1444 * bfq_weight_to_ioprio - calc an ioprio from a weight.
1445 * @weight: the weight value to convert.
1446 *
1447 * To preserve as much as possible the old only-ioprio user interface,
1448 * 0 is used as an escape ioprio value for weights (numerically) equal to or
1449 * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
1450 */
1451static unsigned short bfq_weight_to_ioprio(int weight)
1452{
1453 return max_t(int, 0,
1454 IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
1455}
1456
1457static void bfq_get_entity(struct bfq_entity *entity)
1458{
1459 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1460
1461 if (bfqq) {
1462 bfqq->ref++;
1463 bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
1464 bfqq, bfqq->ref);
1465 }
1466}
1467
1468/**
1469 * bfq_find_deepest - find the deepest node that an extraction can modify.
1470 * @node: the node being removed.
1471 *
1472 * Do the first step of an extraction in an rb tree, looking for the
1473 * node that will replace @node, and returning the deepest node that
1474 * the following modifications to the tree can touch. If @node is the
1475 * last node in the tree return %NULL.
1476 */
1477static struct rb_node *bfq_find_deepest(struct rb_node *node)
1478{
1479 struct rb_node *deepest;
1480
1481 if (!node->rb_right && !node->rb_left)
1482 deepest = rb_parent(node);
1483 else if (!node->rb_right)
1484 deepest = node->rb_left;
1485 else if (!node->rb_left)
1486 deepest = node->rb_right;
1487 else {
1488 deepest = rb_next(node);
1489 if (deepest->rb_right)
1490 deepest = deepest->rb_right;
1491 else if (rb_parent(deepest) != node)
1492 deepest = rb_parent(deepest);
1493 }
1494
1495 return deepest;
1496}
1497
1498/**
1499 * bfq_active_extract - remove an entity from the active tree.
1500 * @st: the service_tree containing the tree.
1501 * @entity: the entity being removed.
1502 */
1503static void bfq_active_extract(struct bfq_service_tree *st,
1504 struct bfq_entity *entity)
1505{
1506 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1507 struct rb_node *node;
1508#ifdef CONFIG_BFQ_GROUP_IOSCHED
1509 struct bfq_sched_data *sd = NULL;
1510 struct bfq_group *bfqg = NULL;
1511 struct bfq_data *bfqd = NULL;
1512#endif
1513
1514 node = bfq_find_deepest(&entity->rb_node);
1515 bfq_extract(&st->active, entity);
1516
1517 if (node)
1518 bfq_update_active_tree(node);
1519
1520#ifdef CONFIG_BFQ_GROUP_IOSCHED
1521 sd = entity->sched_data;
1522 bfqg = container_of(sd, struct bfq_group, sched_data);
1523 bfqd = (struct bfq_data *)bfqg->bfqd;
1524#endif
1525 if (bfqq)
1526 list_del(&bfqq->bfqq_list);
1527}
1528
1529/**
1530 * bfq_idle_insert - insert an entity into the idle tree.
1531 * @st: the service tree containing the tree.
1532 * @entity: the entity to insert.
1533 */
1534static void bfq_idle_insert(struct bfq_service_tree *st,
1535 struct bfq_entity *entity)
1536{
1537 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1538 struct bfq_entity *first_idle = st->first_idle;
1539 struct bfq_entity *last_idle = st->last_idle;
1540
1541 if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
1542 st->first_idle = entity;
1543 if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
1544 st->last_idle = entity;
1545
1546 bfq_insert(&st->idle, entity);
1547
1548 if (bfqq)
1549 list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
1550}
1551
1552/**
1553 * bfq_forget_entity - do not consider entity any longer for scheduling
1554 * @st: the service tree.
1555 * @entity: the entity being removed.
1556 * @is_in_service: true if entity is currently the in-service entity.
1557 *
1558 * Forget everything about @entity. In addition, if entity represents
1559 * a queue, and the latter is not in service, then release the service
1560 * reference to the queue (the one taken through bfq_get_entity). In
1561 * fact, in this case, there is really no more service reference to
1562 * the queue, as the latter is also outside any service tree. If,
1563 * instead, the queue is in service, then __bfq_bfqd_reset_in_service
1564 * will take care of putting the reference when the queue finally
1565 * stops being served.
1566 */
1567static void bfq_forget_entity(struct bfq_service_tree *st,
1568 struct bfq_entity *entity,
1569 bool is_in_service)
1570{
1571 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1572
1573 entity->on_st = false;
1574 st->wsum -= entity->weight;
1575 if (bfqq && !is_in_service)
1576 bfq_put_queue(bfqq);
1577}
1578
1579/**
1580 * bfq_put_idle_entity - release the idle tree ref of an entity.
1581 * @st: service tree for the entity.
1582 * @entity: the entity being released.
1583 */
1584static void bfq_put_idle_entity(struct bfq_service_tree *st,
1585 struct bfq_entity *entity)
1586{
1587 bfq_idle_extract(st, entity);
1588 bfq_forget_entity(st, entity,
1589 entity == entity->sched_data->in_service_entity);
1590}
1591
1592/**
1593 * bfq_forget_idle - update the idle tree if necessary.
1594 * @st: the service tree to act upon.
1595 *
1596 * To preserve the global O(log N) complexity we only remove one entry here;
1597 * as the idle tree will not grow indefinitely this can be done safely.
1598 */
1599static void bfq_forget_idle(struct bfq_service_tree *st)
1600{
1601 struct bfq_entity *first_idle = st->first_idle;
1602 struct bfq_entity *last_idle = st->last_idle;
1603
1604 if (RB_EMPTY_ROOT(&st->active) && last_idle &&
1605 !bfq_gt(last_idle->finish, st->vtime)) {
1606 /*
1607 * Forget the whole idle tree, increasing the vtime past
1608 * the last finish time of idle entities.
1609 */
1610 st->vtime = last_idle->finish;
1611 }
1612
1613 if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
1614 bfq_put_idle_entity(st, first_idle);
1615}
1616
1617static struct bfq_service_tree *
1618__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
e21b7a0b 1619 struct bfq_entity *entity)
aee69d78
PV
1620{
1621 struct bfq_service_tree *new_st = old_st;
1622
1623 if (entity->prio_changed) {
1624 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
44e44a1b 1625 unsigned int prev_weight, new_weight;
aee69d78 1626 struct bfq_data *bfqd = NULL;
e21b7a0b
AA
1627#ifdef CONFIG_BFQ_GROUP_IOSCHED
1628 struct bfq_sched_data *sd;
1629 struct bfq_group *bfqg;
1630#endif
aee69d78
PV
1631
1632 if (bfqq)
1633 bfqd = bfqq->bfqd;
e21b7a0b
AA
1634#ifdef CONFIG_BFQ_GROUP_IOSCHED
1635 else {
1636 sd = entity->my_sched_data;
1637 bfqg = container_of(sd, struct bfq_group, sched_data);
1638 bfqd = (struct bfq_data *)bfqg->bfqd;
1639 }
1640#endif
aee69d78
PV
1641
1642 old_st->wsum -= entity->weight;
1643
1644 if (entity->new_weight != entity->orig_weight) {
1645 if (entity->new_weight < BFQ_MIN_WEIGHT ||
1646 entity->new_weight > BFQ_MAX_WEIGHT) {
1647 pr_crit("update_weight_prio: new_weight %d\n",
1648 entity->new_weight);
1649 if (entity->new_weight < BFQ_MIN_WEIGHT)
1650 entity->new_weight = BFQ_MIN_WEIGHT;
1651 else
1652 entity->new_weight = BFQ_MAX_WEIGHT;
1653 }
1654 entity->orig_weight = entity->new_weight;
1655 if (bfqq)
1656 bfqq->ioprio =
1657 bfq_weight_to_ioprio(entity->orig_weight);
1658 }
1659
1660 if (bfqq)
1661 bfqq->ioprio_class = bfqq->new_ioprio_class;
1662 entity->prio_changed = 0;
1663
1664 /*
1665 * NOTE: here we may be changing the weight too early;
1666 * this will cause unfairness. The correct approach
1667 * would have required additional complexity to defer
1668 * weight changes to the proper time instants (i.e.,
1669 * when entity->finish <= old_st->vtime).
1670 */
1671 new_st = bfq_entity_service_tree(entity);
1672
1673 prev_weight = entity->weight;
44e44a1b
PV
1674 new_weight = entity->orig_weight *
1675 (bfqq ? bfqq->wr_coeff : 1);
aee69d78
PV
1676 entity->weight = new_weight;
1677
1678 new_st->wsum += entity->weight;
1679
1680 if (new_st != old_st)
1681 entity->start = new_st->vtime;
1682 }
1683
1684 return new_st;
1685}
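/*
 * Illustrative numbers for the effective-weight computation above (not
 * from the original source): a queue with orig_weight = 100 that is
 * currently weight-raised with wr_coeff = 30 is scheduled as if
 *
 *   entity->weight = 100 * 30 = 3000,
 *
 * while any new_weight outside [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT] is
 * first clamped to the nearest bound (and logged with pr_crit).
 */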
1686
e21b7a0b
AA
1687static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
1688static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
1689
aee69d78
PV
1690/**
1691 * bfq_bfqq_served - update the scheduler status after selection for
1692 * service.
1693 * @bfqq: the queue being served.
1694 * @served: bytes to transfer.
1695 *
1696 * NOTE: this can be optimized, as the timestamps of upper level entities
1697 * are synchronized every time a new bfqq is selected for service. For now,
1698 * we keep it this way, to better check consistency.
1699 */
1700static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
1701{
1702 struct bfq_entity *entity = &bfqq->entity;
1703 struct bfq_service_tree *st;
1704
1705 for_each_entity(entity) {
1706 st = bfq_entity_service_tree(entity);
1707
1708 entity->service += served;
1709
1710 st->vtime += bfq_delta(served, st->wsum);
1711 bfq_forget_idle(st);
1712 }
e21b7a0b 1713 bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
aee69d78
PV
1714 bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
1715}
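/*
 * Illustrative sketch (not from the original source), assuming
 * bfq_delta() maps service into the virtual-time domain as the
 * fixed-point ratio service/weight defined earlier in this file: with
 * a weight sum of st->wsum = 300 and served = 8 sectors, the virtual
 * time advances by 8/300 of a virtual unit; the larger the aggregate
 * weight of the backlogged entities, the slower the virtual clock
 * ticks for a given amount of service.
 */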
1716
1717/**
c074170e
PV
1718 * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
1719 * of the time interval during which bfqq has been in
1720 * service.
1721 * @bfqd: the device
aee69d78 1722 * @bfqq: the queue that needs a service update.
c074170e 1723 * @time_ms: the amount of time during which the queue has received service
aee69d78 1724 *
c074170e
PV
1725 * If a queue does not consume its budget fast enough, then providing
1726 * the queue with service fairness may impair throughput, more or less
1727 * severely. For this reason, queues that consume their budget slowly
1728 * are provided with time fairness instead of service fairness. This
1729 * goal is achieved through the BFQ scheduling engine, even if such an
1730 * engine works in the service, and not in the time domain. The trick
1731 * is charging these queues with an inflated amount of service, equal
1732 * to the amount of service that they would have received during their
1733 * service slot if they had been fast, i.e., if their requests had
1734 * been dispatched at a rate equal to the estimated peak rate.
1735 *
1736 * It is worth noting that time fairness can cause important
1737 * distortions in terms of bandwidth distribution, on devices with
1738 * internal queueing. The reason is that I/O requests dispatched
1739 * during the service slot of a queue may be served after that service
1740 * slot is finished, and may have a total processing time loosely
1741 * correlated with the duration of the service slot. This is
1742 * especially true for short service slots.
aee69d78 1743 */
c074170e
PV
1744static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1745 unsigned long time_ms)
aee69d78
PV
1746{
1747 struct bfq_entity *entity = &bfqq->entity;
c074170e
PV
1748 int tot_serv_to_charge = entity->service;
1749 unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
1750
1751 if (time_ms > 0 && time_ms < timeout_ms)
1752 tot_serv_to_charge =
1753 (bfqd->bfq_max_budget * time_ms) / timeout_ms;
aee69d78 1754
c074170e
PV
1755 if (tot_serv_to_charge < entity->service)
1756 tot_serv_to_charge = entity->service;
aee69d78 1757
c074170e
PV
1758 /* Increase budget to avoid inconsistencies */
1759 if (tot_serv_to_charge > entity->budget)
1760 entity->budget = tot_serv_to_charge;
1761
1762 bfq_bfqq_served(bfqq,
1763 max_t(int, 0, tot_serv_to_charge - entity->service));
aee69d78
PV
1764}
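/*
 * Worked example for the charge above (hypothetical figures): with
 * bfqd->bfq_max_budget = 16384 sectors, timeout_ms = 125 and a slow
 * queue that held the device for time_ms = 25,
 *
 *   tot_serv_to_charge = (16384 * 25) / 125 = 3276 sectors,
 *
 * i.e., the queue is charged the service it would have received in
 * that time at the estimated peak rate, regardless of how little it
 * actually transferred.
 */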
1765
e21b7a0b
AA
1766static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
1767 struct bfq_service_tree *st,
1768 bool backshifted)
aee69d78 1769{
44e44a1b
PV
1770 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1771
aee69d78
PV
1772 st = __bfq_entity_update_weight_prio(st, entity);
1773 bfq_calc_finish(entity, entity->budget);
1774
1775 /*
1776 * If some queues enjoy backshifting for a while, then their
1777 * (virtual) finish timestamps may happen to become lower and
1778 * lower than the system virtual time. In particular, if
1779 * these queues often happen to be idle for short time
1780 * periods, and during such time periods other queues with
1781 * higher timestamps happen to be busy, then the backshifted
1782 * timestamps of the former queues can become much lower than
1783 * the system virtual time. In fact, to serve the queues with
1784 * higher timestamps while the ones with lower timestamps are
1785 * idle, the system virtual time may be pushed-up to much
1786 * higher values than the finish timestamps of the idle
1787 * queues. As a consequence, the finish timestamps of all new
1788 * or newly activated queues may end up being much larger than
1789 * those of lucky queues with backshifted timestamps. The
1790 * latter queues may then monopolize the device for a lot of
1791 * time. This would simply break service guarantees.
1792 *
1793 * To reduce this problem, push up a little bit the
1794 * backshifted timestamps of the queue associated with this
1795 * entity (only a queue can happen to have the backshifted
1796 * flag set): just enough to let the finish timestamp of the
1797 * queue be equal to the current value of the system virtual
1798 * time. This may introduce a little unfairness among queues
1799 * with backshifted timestamps, but it does not break
1800 * worst-case fairness guarantees.
44e44a1b
PV
1801 *
1802 * As a special case, if bfqq is weight-raised, push up
1803 * timestamps much less, to keep very low the probability that
1804 * this push up causes the backshifted finish timestamps of
1805 * weight-raised queues to become higher than the backshifted
1806 * finish timestamps of non weight-raised queues.
aee69d78
PV
1807 */
1808 if (backshifted && bfq_gt(st->vtime, entity->finish)) {
1809 unsigned long delta = st->vtime - entity->finish;
1810
44e44a1b
PV
1811 if (bfqq)
1812 delta /= bfqq->wr_coeff;
1813
aee69d78
PV
1814 entity->start += delta;
1815 entity->finish += delta;
1816 }
1817
1818 bfq_active_insert(st, entity);
1819}
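/*
 * Illustrative numbers for the push-up above (not from the original
 * source): if the system virtual time has overtaken a backshifted
 * queue by st->vtime - entity->finish = 3000, a non-weight-raised
 * queue (wr_coeff == 1) has both timestamps pushed up by the full
 * 3000, while a weight-raised queue with wr_coeff == 30 is pushed up
 * by only 3000 / 30 = 100, so it retains most of its backshifting
 * privilege.
 */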
1820
1821/**
e21b7a0b
AA
1822 * __bfq_activate_entity - handle activation of entity.
1823 * @entity: the entity being activated.
1824 * @non_blocking_wait_rq: true if entity was waiting for a request
1825 *
1826 * Called for a 'true' activation, i.e., if entity is not active and
1827 * one of its children receives a new request.
1828 *
1829 * Basically, this function updates the timestamps of entity and
1830 * inserts entity into its active tree, after possibly extracting it
1831 * from its idle tree.
1832 */
1833static void __bfq_activate_entity(struct bfq_entity *entity,
1834 bool non_blocking_wait_rq)
1835{
1836 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1837 bool backshifted = false;
1838 unsigned long long min_vstart;
1839
1840 /* See comments on bfq_bfqq_update_budg_for_activation */
1841 if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
1842 backshifted = true;
1843 min_vstart = entity->finish;
1844 } else
1845 min_vstart = st->vtime;
1846
1847 if (entity->tree == &st->idle) {
1848 /*
1849 * Must be on the idle tree, bfq_idle_extract() will
1850 * check for that.
1851 */
1852 bfq_idle_extract(st, entity);
1853 entity->start = bfq_gt(min_vstart, entity->finish) ?
1854 min_vstart : entity->finish;
1855 } else {
1856 /*
1857 * The finish time of the entity may be invalid, and
1858 * it is in the past for sure, otherwise the queue
1859 * would have been on the idle tree.
1860 */
1861 entity->start = min_vstart;
1862 st->wsum += entity->weight;
1863 /*
1864 * entity is about to be inserted into a service tree,
1865 * and then set in service: get a reference to make
1866 * sure entity does not disappear until it is no
1867 * longer in service or scheduled for service.
1868 */
1869 bfq_get_entity(entity);
1870
1871 entity->on_st = true;
1872 }
1873
1874 bfq_update_fin_time_enqueue(entity, st, backshifted);
1875}
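/*
 * Editorial note on min_vstart above: when the entity being activated
 * was waiting for a request (non_blocking_wait_rq) and the virtual
 * time has meanwhile overtaken its old finish time, that old finish
 * time is kept as the floor for the new start timestamp; this is what
 * makes the backshifting handled in bfq_update_fin_time_enqueue()
 * possible. In every other case the entity restarts no earlier than
 * the current virtual time, so it cannot reclaim the service it
 * missed while idle.
 */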
1876
1877/**
1878 * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
1879 * @entity: the entity being requeued or repositioned.
1880 *
1881 * Requeueing is needed if this entity stops being served, which
1882 * happens if a leaf descendant entity has expired. On the other hand,
1883 * repositioning is needed if the next_in_service entity for the child
1884 * entity has changed. See the comments inside the function for
1885 * details.
1886 *
1887 * Basically, this function: 1) removes entity from its active tree if
1888 * present there, 2) updates the timestamps of entity and 3) inserts
1889 * entity back into its active tree (in the new, right position for
1890 * the new values of the timestamps).
1891 */
1892static void __bfq_requeue_entity(struct bfq_entity *entity)
1893{
1894 struct bfq_sched_data *sd = entity->sched_data;
1895 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1896
1897 if (entity == sd->in_service_entity) {
1898 /*
1899 * We are requeueing the current in-service entity,
1900 * which may have to be done for one of the following
1901 * reasons:
1902 * - entity represents the in-service queue, and the
1903 * in-service queue is being requeued after an
1904 * expiration;
1905 * - entity represents a group, and its budget has
1906 * changed because one of its child entities has
1907 * just been either activated or requeued for some
1908 * reason; the timestamps of the entity need then to
1909 * be updated, and the entity needs to be enqueued
1910 * or repositioned accordingly.
1911 *
1912 * In particular, before requeueing, the start time of
1913 * the entity must be moved forward to account for the
1914 * service that the entity has received while in
1915 * service. This is done by the next instructions. The
1916 * finish time will then be updated according to this
1917 * new value of the start time, and to the budget of
1918 * the entity.
1919 */
1920 bfq_calc_finish(entity, entity->service);
1921 entity->start = entity->finish;
1922 /*
1923 * In addition, if the entity had more than one child
1924 * when set in service, then it was not extracted from
1925 * the active tree. This implies that the position of
1926 * the entity in the active tree may need to be
1927 * changed now, because we have just updated the start
1928 * time of the entity, and we will update its finish
1929 * time in a moment (the requeueing is then, more
1930 * precisely, a repositioning in this case). To
1931 * implement this repositioning, we: 1) dequeue the
1932 * entity here, 2) update the finish time and
1933 * requeue the entity according to the new
1934 * timestamps below.
1935 */
1936 if (entity->tree)
1937 bfq_active_extract(st, entity);
1938 } else { /* The entity is already active, and not in service */
1939 /*
1940 * In this case, this function gets called only if the
1941 * next_in_service entity below this entity has
1942 * changed, and this change has caused the budget of
1943 * this entity to change, which, finally implies that
1944 * the finish time of this entity must be
1945 * updated. Such an update may cause the scheduling,
1946 * i.e., the position in the active tree, of this
1947 * entity to change. We handle this change by: 1)
1948 * dequeueing the entity here, 2) updating the finish
1949 * time and requeueing the entity according to the new
1950 * timestamps below. This is the same approach as the
1951 * non-extracted-entity sub-case above.
1952 */
1953 bfq_active_extract(st, entity);
1954 }
1955
1956 bfq_update_fin_time_enqueue(entity, st, false);
1957}
1958
1959static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
1960 struct bfq_sched_data *sd,
1961 bool non_blocking_wait_rq)
1962{
1963 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1964
1965 if (sd->in_service_entity == entity || entity->tree == &st->active)
1966 /*
1967 * in service or already queued on the active tree,
1968 * requeue or reposition
1969 */
1970 __bfq_requeue_entity(entity);
1971 else
1972 /*
1973 * Not in service and not queued on its active tree:
1974 * the entity is idle and this is a true activation.
1975 */
1976 __bfq_activate_entity(entity, non_blocking_wait_rq);
1977}
1978
1979
1980/**
1981 * bfq_activate_entity - activate or requeue an entity representing a bfq_queue,
1982 * and activate, requeue or reposition all ancestors
1983 * for which such an update becomes necessary.
aee69d78
PV
1984 * @entity: the entity to activate.
1985 * @non_blocking_wait_rq: true if this entity was waiting for a request
e21b7a0b
AA
1986 * @requeue: true if this is a requeue, which implies that bfqq is
1987 * being expired; thus ALL its ancestors stop being served and must
1988 * therefore be requeued
aee69d78 1989 */
e21b7a0b
AA
1990static void bfq_activate_requeue_entity(struct bfq_entity *entity,
1991 bool non_blocking_wait_rq,
1992 bool requeue)
aee69d78
PV
1993{
1994 struct bfq_sched_data *sd;
1995
1996 for_each_entity(entity) {
aee69d78 1997 sd = entity->sched_data;
e21b7a0b
AA
1998 __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
1999
2000 if (!bfq_update_next_in_service(sd, entity) && !requeue)
aee69d78
PV
2001 break;
2002 }
2003}
2004
2005/**
2006 * __bfq_deactivate_entity - deactivate an entity from its service tree.
2007 * @entity: the entity to deactivate.
e21b7a0b
AA
2008 * @ins_into_idle_tree: if false, the entity will not be put into the
2009 * idle tree.
aee69d78 2010 *
e21b7a0b
AA
2011 * Deactivates an entity, independently from its previous state. Must
2012 * be invoked only if entity is on a service tree. Extracts the entity
2013 * from that tree, and if necessary and allowed, puts it on the idle
2014 * tree.
aee69d78 2015 */
e21b7a0b
AA
2016static bool __bfq_deactivate_entity(struct bfq_entity *entity,
2017 bool ins_into_idle_tree)
aee69d78
PV
2018{
2019 struct bfq_sched_data *sd = entity->sched_data;
2020 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
2021 int is_in_service = entity == sd->in_service_entity;
aee69d78 2022
e21b7a0b
AA
2023 if (!entity->on_st) /* entity never activated, or already inactive */
2024 return false;
aee69d78 2025
e21b7a0b 2026 if (is_in_service)
aee69d78 2027 bfq_calc_finish(entity, entity->service);
e21b7a0b
AA
2028
2029 if (entity->tree == &st->active)
aee69d78 2030 bfq_active_extract(st, entity);
e21b7a0b 2031 else if (!is_in_service && entity->tree == &st->idle)
aee69d78
PV
2032 bfq_idle_extract(st, entity);
2033
e21b7a0b 2034 if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
aee69d78
PV
2035 bfq_forget_entity(st, entity, is_in_service);
2036 else
2037 bfq_idle_insert(st, entity);
2038
e21b7a0b 2039 return true;
aee69d78
PV
2040}
2041
2042/**
e21b7a0b 2043 * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
aee69d78 2044 * @entity: the entity to deactivate.
e21b7a0b 2045 * @ins_into_idle_tree: true if the entity can be put on the idle tree
aee69d78 2046 */
e21b7a0b
AA
2047static void bfq_deactivate_entity(struct bfq_entity *entity,
2048 bool ins_into_idle_tree,
2049 bool expiration)
aee69d78
PV
2050{
2051 struct bfq_sched_data *sd;
2052 struct bfq_entity *parent = NULL;
2053
2054 for_each_entity_safe(entity, parent) {
2055 sd = entity->sched_data;
2056
e21b7a0b 2057 if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
aee69d78 2058 /*
e21b7a0b
AA
2059 * entity is not in any tree any more, so
2060 * this deactivation is a no-op, and there is
2061 * nothing to change for upper-level entities
2062 * (in case of expiration, this can never
2063 * happen).
aee69d78 2064 */
e21b7a0b
AA
2065 return;
2066 }
2067
2068 if (sd->next_in_service == entity)
2069 /*
2070 * entity was the next_in_service entity,
2071 * then, since entity has just been
2072 * deactivated, a new one must be found.
2073 */
2074 bfq_update_next_in_service(sd, NULL);
aee69d78
PV
2075
2076 if (sd->next_in_service)
2077 /*
e21b7a0b
AA
2078 * The parent entity is still backlogged,
2079 * because next_in_service is not NULL. So, no
2080 * further upwards deactivation must be
2081 * performed. Yet, next_in_service has
2082 * changed. Then the schedule does need to be
2083 * updated upwards.
aee69d78 2084 */
e21b7a0b 2085 break;
aee69d78
PV
2086
2087 /*
e21b7a0b
AA
2088 * If we get here, then the parent is no longer
2089 * backlogged and we need to propagate the
2090 * deactivation upwards. Thus let the loop go on.
aee69d78 2091 */
aee69d78 2092
e21b7a0b
AA
2093 /*
2094 * Also let parent be queued into the idle tree on
2095 * deactivation, to preserve service guarantees, and
2096 * assuming that whoever invoked this function does not
2097 * need the parent entities to be removed completely as well.
2098 */
2099 ins_into_idle_tree = true;
2100 }
aee69d78 2101
e21b7a0b
AA
2102 /*
2103 * If the deactivation loop is fully executed, then there are
2104 * no more entities to touch and next loop is not executed at
2105 * all. Otherwise, requeue remaining entities if they are
2106 * about to stop receiving service, or reposition them if this
2107 * is not the case.
2108 */
aee69d78
PV
2109 entity = parent;
2110 for_each_entity(entity) {
e21b7a0b
AA
2111 /*
2112 * Invoke __bfq_requeue_entity on entity, even if
2113 * already active, to requeue/reposition it in the
2114 * active tree (because sd->next_in_service has
2115 * changed)
2116 */
2117 __bfq_requeue_entity(entity);
aee69d78
PV
2118
2119 sd = entity->sched_data;
e21b7a0b
AA
2120 if (!bfq_update_next_in_service(sd, entity) &&
2121 !expiration)
2122 /*
2123 * next_in_service unchanged or not causing
2124 * any change in entity->parent->sd, and no
2125 * requeueing needed for expiration: stop
2126 * here.
2127 */
aee69d78
PV
2128 break;
2129 }
2130}
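/*
 * Editorial sketch of the two loops above: the first loop climbs the
 * hierarchy deactivating entities until it deactivates one whose
 * sched_data still has a non-NULL next_in_service, i.e., whose parent
 * keeps some backlog; the second loop then starts from that parent and
 * only requeues/repositions, because from that level upwards the
 * entities stay active and merely need fresh timestamps or a new
 * position in their active trees.
 */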
2131
2132/**
e21b7a0b
AA
2133 * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
2134 * if needed, to have at least one entity eligible.
aee69d78
PV
2135 * @st: the service tree to act upon.
2136 *
e21b7a0b 2137 * Assumes that st is not empty.
aee69d78 2138 */
e21b7a0b 2139static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
aee69d78 2140{
e21b7a0b
AA
2141 struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
2142
2143 if (bfq_gt(root_entity->min_start, st->vtime))
2144 return root_entity->min_start;
2145
2146 return st->vtime;
2147}
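/*
 * Worked example (illustrative values): if st->vtime = 1000 while the
 * smallest start time in the whole active tree, cached in
 * root_entity->min_start, is 1250, then no active entity is eligible
 * yet; returning 1250 lets the caller jump the virtual time forward
 * just enough to make at least one entity schedulable.
 */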
aee69d78 2148
e21b7a0b
AA
2149static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
2150{
2151 if (new_value > st->vtime) {
2152 st->vtime = new_value;
aee69d78
PV
2153 bfq_forget_idle(st);
2154 }
2155}
2156
2157/**
2158 * bfq_first_active_entity - find the eligible entity with
2159 * the smallest finish time
2160 * @st: the service tree to select from.
e21b7a0b 2161 * @vtime: the system virtual to use as a reference for eligibility
aee69d78
PV
2162 *
2163 * This function searches the first schedulable entity, starting from the
2164 * root of the tree and going on the left every time on this side there is
2165 * a subtree with at least one eligible (start >= vtime) entity. The path on
2166 * the right is followed only if a) the left subtree contains no eligible
2167 * entities and b) no eligible entity has been found yet.
2168 */
e21b7a0b
AA
2169static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
2170 u64 vtime)
aee69d78
PV
2171{
2172 struct bfq_entity *entry, *first = NULL;
2173 struct rb_node *node = st->active.rb_node;
2174
2175 while (node) {
2176 entry = rb_entry(node, struct bfq_entity, rb_node);
2177left:
e21b7a0b 2178 if (!bfq_gt(entry->start, vtime))
aee69d78
PV
2179 first = entry;
2180
2181 if (node->rb_left) {
2182 entry = rb_entry(node->rb_left,
2183 struct bfq_entity, rb_node);
e21b7a0b 2184 if (!bfq_gt(entry->min_start, vtime)) {
aee69d78
PV
2185 node = node->rb_left;
2186 goto left;
2187 }
2188 }
2189 if (first)
2190 break;
2191 node = node->rb_right;
2192 }
2193
e21b7a0b
AA
2194 return first;
2195}
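/*
 * Editorial note on the search above: the active tree is ordered by
 * finish time, and min_start caches, for each node, the smallest start
 * time in its subtree. The descent goes left whenever the left subtree
 * is known to contain an eligible entity (min_start not after vtime);
 * since going left also means moving towards smaller finish times, the
 * first eligible node met on this path is the eligible entity with the
 * smallest finish time, which is what B-WF2Q+ must pick.
 */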
2196
2197/**
2198 * __bfq_lookup_next_entity - return the first eligible entity in @st.
2199 * @st: the service tree.
2200 *
2201 * If there is no in-service entity for the sched_data st belongs to,
2202 * then return the entity that will be set in service if:
2203 * 1) the parent entity this st belongs to is set in service;
2204 * 2) no entity belonging to such parent entity undergoes a state change
2205 * that would influence the timestamps of the entity (e.g., becomes idle,
2206 * becomes backlogged, changes its budget, ...).
2207 *
2208 * In this first case, update the virtual time in @st too (see the
2209 * comments on this update inside the function).
2210 *
2211 * In contrast, if there is an in-service entity, then return the
2212 * entity that would be set in service if not only the above
2213 * conditions, but also the next one held true: the currently
2214 * in-service entity, on expiration,
2215 * 1) gets a finish time equal to the current one, or
2216 * 2) is not eligible any more, or
2217 * 3) is idle.
2218 */
2219static struct bfq_entity *
2220__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
2221{
2222 struct bfq_entity *entity;
2223 u64 new_vtime;
2224
2225 if (RB_EMPTY_ROOT(&st->active))
2226 return NULL;
2227
2228 /*
2229 * Get the value of the system virtual time for which at
2230 * least one entity is eligible.
2231 */
2232 new_vtime = bfq_calc_vtime_jump(st);
2233
2234 /*
2235 * If there is no in-service entity for the sched_data this
2236 * active tree belongs to, then push the system virtual time
2237 * up to the value that guarantees that at least one entity is
2238 * eligible. If, instead, there is an in-service entity, then
2239 * do not make any such update, because there is already an
2240 * eligible entity, namely the in-service one (even if the
2241 * entity is not on st, because it was extracted when set in
2242 * service).
2243 */
2244 if (!in_service)
2245 bfq_update_vtime(st, new_vtime);
2246
2247 entity = bfq_first_active_entity(st, new_vtime);
2248
2249 return entity;
2250}
2251
2252/**
2253 * bfq_lookup_next_entity - return the first eligible entity in @sd.
2254 * @sd: the sched_data.
2255 *
2256 * This function is invoked when there has been a change in the trees
2257 * for sd, and we need to know what the new next entity is after this
2258 * change.
2259 */
2260static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
2261{
2262 struct bfq_service_tree *st = sd->service_tree;
2263 struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
2264 struct bfq_entity *entity = NULL;
2265 int class_idx = 0;
2266
2267 /*
2268 * Choose from idle class, if needed to guarantee a minimum
2269 * bandwidth to this class (and if there is some active entity
2270 * in idle class). This should also mitigate
2271 * priority-inversion problems in case a low priority task is
2272 * holding file system resources.
2273 */
2274 if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
2275 BFQ_CL_IDLE_TIMEOUT)) {
2276 if (!RB_EMPTY_ROOT(&idle_class_st->active))
2277 class_idx = BFQ_IOPRIO_CLASSES - 1;
2278 /* About to be served if backlogged, or not yet backlogged */
2279 sd->bfq_class_idle_last_service = jiffies;
2280 }
2281
2282 /*
2283 * Find the next entity to serve for the highest-priority
2284 * class, unless the idle class needs to be served.
2285 */
2286 for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
2287 entity = __bfq_lookup_next_entity(st + class_idx,
2288 sd->in_service_entity);
2289
2290 if (entity)
2291 break;
2292 }
2293
2294 if (!entity)
2295 return NULL;
2296
2297 return entity;
2298}
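/*
 * Editorial note on the class scan above: the idle class gets the head
 * of the scan at most once per BFQ_CL_IDLE_TIMEOUT, and only if it
 * actually has an active entity; in every other invocation the scan
 * starts from class_idx 0, so the higher-priority classes (RT, then
 * BE) are probed first and the idle class can only briefly borrow the
 * device, never starve the other classes.
 */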
2299
2300static bool next_queue_may_preempt(struct bfq_data *bfqd)
2301{
2302 struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
2303
2304 return sd->next_in_service != sd->in_service_entity;
2305}
2306
2307/*
2308 * Get next queue for service.
2309 */
2310static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
2311{
2312 struct bfq_entity *entity = NULL;
2313 struct bfq_sched_data *sd;
2314 struct bfq_queue *bfqq;
2315
2316 if (bfqd->busy_queues == 0)
2317 return NULL;
2318
2319 /*
2320 * Traverse the path from the root to the leaf entity to
2321 * serve. Set in service all the entities visited along the
2322 * way.
2323 */
2324 sd = &bfqd->root_group->sched_data;
2325 for (; sd ; sd = entity->my_sched_data) {
2326 /*
2327 * WARNING. We are about to set the in-service entity
2328 * to sd->next_in_service, i.e., to the (cached) value
2329 * returned by bfq_lookup_next_entity(sd) the last
2330 * time it was invoked, i.e., the last time when the
2331 * service order in sd changed as a consequence of the
2332 * activation or deactivation of an entity. In this
2333 * respect, if we execute bfq_lookup_next_entity(sd)
2334 * in this very moment, it may, although with low
2335 * probability, yield a different entity than that
2336 * pointed to by sd->next_in_service. This rare event
2337 * happens in case there was no CLASS_IDLE entity to
2338 * serve for sd when bfq_lookup_next_entity(sd) was
2339 * invoked for the last time, while there is now one
2340 * such entity.
2341 *
2342 * If the above event happens, then the scheduling of
2343 * such entity in CLASS_IDLE is postponed until the
2344 * service of the sd->next_in_service entity
2345 * finishes. In fact, when the latter is expired,
2346 * bfq_lookup_next_entity(sd) gets called again,
2347 * exactly to update sd->next_in_service.
2348 */
2349
2350 /* Make next_in_service entity become in_service_entity */
2351 entity = sd->next_in_service;
2352 sd->in_service_entity = entity;
2353
2354 /*
2355 * Reset the accumulator of the amount of service that
2356 * the entity is about to receive.
2357 */
2358 entity->service = 0;
2359
2360 /*
2361 * If entity is no longer a candidate for next
2362 * service, then we extract it from its active tree,
2363 * for the following reason. To further boost the
2364 * throughput in some special case, BFQ needs to know
2365 * which is the next candidate entity to serve, while
2366 * there is already an entity in service. In this
2367 * respect, to make it easy to compute/update the next
2368 * candidate entity to serve after the current
2369 * candidate has been set in service, there is a case
2370 * where it is necessary to extract the current
2371 * candidate from its service tree. Such a case is
2372 * when the entity just set in service cannot be also
2373 * a candidate for next service. Details about when
2374 * this condition holds are reported in the comments
2375 * on the function bfq_no_longer_next_in_service()
2376 * invoked below.
2377 */
2378 if (bfq_no_longer_next_in_service(entity))
2379 bfq_active_extract(bfq_entity_service_tree(entity),
2380 entity);
2381
2382 /*
2383 * For the same reason why we may have just extracted
2384 * entity from its active tree, we may need to update
2385 * next_in_service for the sched_data of entity too,
2386 * regardless of whether entity has been extracted.
2387 * In fact, even if entity has not been extracted, a
2388 * descendant entity may get extracted. Such an event
2389 * would cause a change in next_in_service for the
2390 * level of the descendant entity, and thus possibly
2391 * back to upper levels.
2392 *
2393 * We cannot perform the resulting needed update
2394 * before the end of this loop, because, to know which
2395 * is the correct next-to-serve candidate entity for
2396 * each level, we need first to find the leaf entity
2397 * to set in service. In fact, only after we know
2398 * which is the next-to-serve leaf entity, we can
2399 * discover whether the parent entity of the leaf
2400 * entity becomes the next-to-serve, and so on.
2401 */
2402
2403 }
2404
2405 bfqq = bfq_entity_to_bfqq(entity);
2406
2407 /*
2408 * We can finally update all next-to-serve entities along the
2409 * path from the leaf entity just set in service to the root.
2410 */
2411 for_each_entity(entity) {
2412 struct bfq_sched_data *sd = entity->sched_data;
2413
2414 if (!bfq_update_next_in_service(sd, NULL))
2415 break;
2416 }
2417
2418 return bfqq;
2419}
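/*
 * Editorial summary of the two passes above: the first, top-down pass
 * walks from the root group to a leaf bfq_queue, promoting each
 * level's cached next_in_service to in_service_entity and resetting
 * its service counter; only once the leaf is known can the second,
 * bottom-up pass recompute next_in_service at every level, stopping as
 * soon as an update no longer propagates upwards.
 */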
2420
2421static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
2422{
2423 struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
2424 struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
2425 struct bfq_entity *entity = in_serv_entity;
2426
2427 if (bfqd->in_service_bic) {
2428 put_io_context(bfqd->in_service_bic->icq.ioc);
2429 bfqd->in_service_bic = NULL;
2430 }
2431
2432 bfq_clear_bfqq_wait_request(in_serv_bfqq);
2433 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
2434 bfqd->in_service_queue = NULL;
2435
2436 /*
2437 * When this function is called, all in-service entities have
2438 * been properly deactivated or requeued, so we can safely
2439 * execute the final step: reset in_service_entity along the
2440 * path from entity to the root.
2441 */
2442 for_each_entity(entity)
2443 entity->sched_data->in_service_entity = NULL;
2444
2445 /*
2446 * in_serv_entity is no longer in service, so, if it is in no
2447 * service tree either, then release the service reference to
2448 * the queue it represents (taken with bfq_get_entity).
2449 */
2450 if (!in_serv_entity->on_st)
2451 bfq_put_queue(in_serv_bfqq);
2452}
2453
2454static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2455 bool ins_into_idle_tree, bool expiration)
2456{
2457 struct bfq_entity *entity = &bfqq->entity;
2458
2459 bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
2460}
2461
2462static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2463{
2464 struct bfq_entity *entity = &bfqq->entity;
2465
2466 bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
2467 false);
2468 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
2469}
2470
2471static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2472{
2473 struct bfq_entity *entity = &bfqq->entity;
2474
2475 bfq_activate_requeue_entity(entity, false,
2476 bfqq == bfqd->in_service_queue);
2477}
2478
2479static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
2480
2481/*
2482 * Called when the bfqq no longer has requests pending, remove it from
2483 * the service tree. As a special case, it can be invoked during an
2484 * expiration.
2485 */
2486static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2487 bool expiration)
2488{
2489 bfq_log_bfqq(bfqd, bfqq, "del from busy");
2490
2491 bfq_clear_bfqq_busy(bfqq);
2492
2493 bfqd->busy_queues--;
2494
cfd69712
PV
2495 if (bfqq->wr_coeff > 1)
2496 bfqd->wr_busy_queues--;
2497
e21b7a0b
AA
2498 bfqg_stats_update_dequeue(bfqq_group(bfqq));
2499
2500 bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
2501}
2502
2503/*
2504 * Called when an inactive queue receives a new request.
2505 */
2506static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2507{
2508 bfq_log_bfqq(bfqd, bfqq, "add to busy");
2509
2510 bfq_activate_bfqq(bfqd, bfqq);
2511
2512 bfq_mark_bfqq_busy(bfqq);
2513 bfqd->busy_queues++;
cfd69712
PV
2514
2515 if (bfqq->wr_coeff > 1)
2516 bfqd->wr_busy_queues++;
e21b7a0b
AA
2517}
2518
2519#ifdef CONFIG_BFQ_GROUP_IOSCHED
2520
2521/* bfqg stats flags */
2522enum bfqg_stats_flags {
2523 BFQG_stats_waiting = 0,
2524 BFQG_stats_idling,
2525 BFQG_stats_empty,
2526};
2527
2528#define BFQG_FLAG_FNS(name) \
2529static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
2530{ \
2531 stats->flags |= (1 << BFQG_stats_##name); \
2532} \
2533static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
2534{ \
2535 stats->flags &= ~(1 << BFQG_stats_##name); \
2536} \
2537static int bfqg_stats_##name(struct bfqg_stats *stats) \
2538{ \
2539 return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
2540} \
2541
2542BFQG_FLAG_FNS(waiting)
2543BFQG_FLAG_FNS(idling)
2544BFQG_FLAG_FNS(empty)
2545#undef BFQG_FLAG_FNS
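/*
 * For reference (expansion implied by the macro above, not extra code
 * in the original): BFQG_FLAG_FNS(waiting) generates
 *
 *   static void bfqg_stats_mark_waiting(struct bfqg_stats *stats)
 *   { stats->flags |= (1 << BFQG_stats_waiting); }
 *   static void bfqg_stats_clear_waiting(struct bfqg_stats *stats)
 *   { stats->flags &= ~(1 << BFQG_stats_waiting); }
 *   static int bfqg_stats_waiting(struct bfqg_stats *stats)
 *   { return (stats->flags & (1 << BFQG_stats_waiting)) != 0; }
 *
 * and analogously for "idling" and "empty".
 */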
2546
2547/* This should be called with the queue_lock held. */
2548static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
2549{
2550 unsigned long long now;
2551
2552 if (!bfqg_stats_waiting(stats))
2553 return;
2554
2555 now = sched_clock();
2556 if (time_after64(now, stats->start_group_wait_time))
2557 blkg_stat_add(&stats->group_wait_time,
2558 now - stats->start_group_wait_time);
2559 bfqg_stats_clear_waiting(stats);
2560}
2561
2562/* This should be called with the queue_lock held. */
2563static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
2564 struct bfq_group *curr_bfqg)
2565{
2566 struct bfqg_stats *stats = &bfqg->stats;
2567
2568 if (bfqg_stats_waiting(stats))
2569 return;
2570 if (bfqg == curr_bfqg)
2571 return;
2572 stats->start_group_wait_time = sched_clock();
2573 bfqg_stats_mark_waiting(stats);
2574}
2575
2576/* This should be called with the queue_lock held. */
2577static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
2578{
2579 unsigned long long now;
2580
2581 if (!bfqg_stats_empty(stats))
2582 return;
2583
2584 now = sched_clock();
2585 if (time_after64(now, stats->start_empty_time))
2586 blkg_stat_add(&stats->empty_time,
2587 now - stats->start_empty_time);
2588 bfqg_stats_clear_empty(stats);
2589}
2590
2591static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
2592{
2593 blkg_stat_add(&bfqg->stats.dequeue, 1);
2594}
2595
2596static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
2597{
2598 struct bfqg_stats *stats = &bfqg->stats;
2599
2600 if (blkg_rwstat_total(&stats->queued))
2601 return;
2602
2603 /*
2604 * The group is already marked empty. This can happen if bfqq got a new
2605 * request in the parent group and moved to this group while being added
2606 * to the service tree. Just ignore the event and move on.
2607 */
2608 if (bfqg_stats_empty(stats))
2609 return;
2610
2611 stats->start_empty_time = sched_clock();
2612 bfqg_stats_mark_empty(stats);
2613}
2614
2615static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
2616{
2617 struct bfqg_stats *stats = &bfqg->stats;
2618
2619 if (bfqg_stats_idling(stats)) {
2620 unsigned long long now = sched_clock();
2621
2622 if (time_after64(now, stats->start_idle_time))
2623 blkg_stat_add(&stats->idle_time,
2624 now - stats->start_idle_time);
2625 bfqg_stats_clear_idling(stats);
2626 }
2627}
2628
2629static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
2630{
2631 struct bfqg_stats *stats = &bfqg->stats;
2632
2633 stats->start_idle_time = sched_clock();
2634 bfqg_stats_mark_idling(stats);
2635}
2636
2637static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
2638{
2639 struct bfqg_stats *stats = &bfqg->stats;
2640
2641 blkg_stat_add(&stats->avg_queue_size_sum,
2642 blkg_rwstat_total(&stats->queued));
2643 blkg_stat_add(&stats->avg_queue_size_samples, 1);
2644 bfqg_stats_update_group_wait_time(stats);
2645}
2646
2647/*
2648 * blk-cgroup policy-related handlers
2649 * The following functions help in converting between blk-cgroup
2650 * internal structures and BFQ-specific structures.
2651 */
2652
2653static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
2654{
2655 return pd ? container_of(pd, struct bfq_group, pd) : NULL;
2656}
2657
2658static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
2659{
2660 return pd_to_blkg(&bfqg->pd);
2661}
2662
2663static struct blkcg_policy blkcg_policy_bfq;
2664
2665static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
2666{
2667 return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
2668}
2669
2670/*
2671 * bfq_group handlers
2672 * The following functions help in navigating the bfq_group hierarchy
2673 * by allowing to find the parent of a bfq_group or the bfq_group
2674 * associated to a bfq_queue.
2675 */
2676
2677static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
2678{
2679 struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
2680
2681 return pblkg ? blkg_to_bfqg(pblkg) : NULL;
2682}
2683
2684static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
2685{
2686 struct bfq_entity *group_entity = bfqq->entity.parent;
2687
2688 return group_entity ? container_of(group_entity, struct bfq_group,
2689 entity) :
2690 bfqq->bfqd->root_group;
2691}
2692
2693/*
2694 * The following two functions handle get and put of a bfq_group by
2695 * wrapping the related blk-cgroup hooks.
2696 */
2697
2698static void bfqg_get(struct bfq_group *bfqg)
2699{
2700 return blkg_get(bfqg_to_blkg(bfqg));
2701}
2702
2703static void bfqg_put(struct bfq_group *bfqg)
2704{
2705 return blkg_put(bfqg_to_blkg(bfqg));
2706}
2707
2708static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
2709 struct bfq_queue *bfqq,
2710 unsigned int op)
2711{
2712 blkg_rwstat_add(&bfqg->stats.queued, op, 1);
2713 bfqg_stats_end_empty_time(&bfqg->stats);
2714 if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
2715 bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
2716}
2717
2718static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
2719{
2720 blkg_rwstat_add(&bfqg->stats.queued, op, -1);
2721}
2722
2723static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
2724{
2725 blkg_rwstat_add(&bfqg->stats.merged, op, 1);
2726}
2727
2728static void bfqg_stats_update_completion(struct bfq_group *bfqg,
2729 uint64_t start_time, uint64_t io_start_time,
2730 unsigned int op)
2731{
2732 struct bfqg_stats *stats = &bfqg->stats;
2733 unsigned long long now = sched_clock();
2734
2735 if (time_after64(now, io_start_time))
2736 blkg_rwstat_add(&stats->service_time, op,
2737 now - io_start_time);
2738 if (time_after64(io_start_time, start_time))
2739 blkg_rwstat_add(&stats->wait_time, op,
2740 io_start_time - start_time);
2741}
2742
2743/* @stats = 0 */
2744static void bfqg_stats_reset(struct bfqg_stats *stats)
2745{
2746 /* queued stats shouldn't be cleared */
2747 blkg_rwstat_reset(&stats->merged);
2748 blkg_rwstat_reset(&stats->service_time);
2749 blkg_rwstat_reset(&stats->wait_time);
2750 blkg_stat_reset(&stats->time);
2751 blkg_stat_reset(&stats->avg_queue_size_sum);
2752 blkg_stat_reset(&stats->avg_queue_size_samples);
2753 blkg_stat_reset(&stats->dequeue);
2754 blkg_stat_reset(&stats->group_wait_time);
2755 blkg_stat_reset(&stats->idle_time);
2756 blkg_stat_reset(&stats->empty_time);
2757}
2758
2759/* @to += @from */
2760static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
2761{
2762 if (!to || !from)
2763 return;
2764
2765 /* queued stats shouldn't be cleared */
2766 blkg_rwstat_add_aux(&to->merged, &from->merged);
2767 blkg_rwstat_add_aux(&to->service_time, &from->service_time);
2768 blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
2769 blkg_stat_add_aux(&from->time, &from->time);
2770 blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
2771 blkg_stat_add_aux(&to->avg_queue_size_samples,
2772 &from->avg_queue_size_samples);
2773 blkg_stat_add_aux(&to->dequeue, &from->dequeue);
2774 blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
2775 blkg_stat_add_aux(&to->idle_time, &from->idle_time);
2776 blkg_stat_add_aux(&to->empty_time, &from->empty_time);
2777}
2778
2779/*
2780 * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
2781 * recursive stats can still account for the amount used by this bfqg after
2782 * it's gone.
2783 */
2784static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
2785{
2786 struct bfq_group *parent;
2787
2788 if (!bfqg) /* root_group */
2789 return;
2790
2791 parent = bfqg_parent(bfqg);
2792
2793 lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
2794
2795 if (unlikely(!parent))
2796 return;
2797
2798 bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
2799 bfqg_stats_reset(&bfqg->stats);
2800}
2801
2802static void bfq_init_entity(struct bfq_entity *entity,
2803 struct bfq_group *bfqg)
2804{
2805 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
2806
2807 entity->weight = entity->new_weight;
2808 entity->orig_weight = entity->new_weight;
2809 if (bfqq) {
2810 bfqq->ioprio = bfqq->new_ioprio;
2811 bfqq->ioprio_class = bfqq->new_ioprio_class;
2812 bfqg_get(bfqg);
2813 }
2814 entity->parent = bfqg->my_entity; /* NULL for root group */
2815 entity->sched_data = &bfqg->sched_data;
2816}
2817
2818static void bfqg_stats_exit(struct bfqg_stats *stats)
2819{
2820 blkg_rwstat_exit(&stats->merged);
2821 blkg_rwstat_exit(&stats->service_time);
2822 blkg_rwstat_exit(&stats->wait_time);
2823 blkg_rwstat_exit(&stats->queued);
2824 blkg_stat_exit(&stats->time);
2825 blkg_stat_exit(&stats->avg_queue_size_sum);
2826 blkg_stat_exit(&stats->avg_queue_size_samples);
2827 blkg_stat_exit(&stats->dequeue);
2828 blkg_stat_exit(&stats->group_wait_time);
2829 blkg_stat_exit(&stats->idle_time);
2830 blkg_stat_exit(&stats->empty_time);
2831}
2832
2833static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
2834{
2835 if (blkg_rwstat_init(&stats->merged, gfp) ||
2836 blkg_rwstat_init(&stats->service_time, gfp) ||
2837 blkg_rwstat_init(&stats->wait_time, gfp) ||
2838 blkg_rwstat_init(&stats->queued, gfp) ||
2839 blkg_stat_init(&stats->time, gfp) ||
2840 blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
2841 blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
2842 blkg_stat_init(&stats->dequeue, gfp) ||
2843 blkg_stat_init(&stats->group_wait_time, gfp) ||
2844 blkg_stat_init(&stats->idle_time, gfp) ||
2845 blkg_stat_init(&stats->empty_time, gfp)) {
2846 bfqg_stats_exit(stats);
2847 return -ENOMEM;
2848 }
2849
2850 return 0;
2851}
2852
2853static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
2854{
2855 return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
2856}
2857
2858static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
2859{
2860 return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
2861}
2862
2863static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
2864{
2865 struct bfq_group_data *bgd;
2866
2867 bgd = kzalloc(sizeof(*bgd), gfp);
2868 if (!bgd)
2869 return NULL;
2870 return &bgd->pd;
2871}
2872
2873static void bfq_cpd_init(struct blkcg_policy_data *cpd)
2874{
2875 struct bfq_group_data *d = cpd_to_bfqgd(cpd);
2876
2877 d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
2878 CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
2879}
2880
2881static void bfq_cpd_free(struct blkcg_policy_data *cpd)
2882{
2883 kfree(cpd_to_bfqgd(cpd));
2884}
2885
2886static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
2887{
2888 struct bfq_group *bfqg;
2889
2890 bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
2891 if (!bfqg)
2892 return NULL;
2893
2894 if (bfqg_stats_init(&bfqg->stats, gfp)) {
2895 kfree(bfqg);
2896 return NULL;
2897 }
2898
2899 return &bfqg->pd;
2900}
2901
2902static void bfq_pd_init(struct blkg_policy_data *pd)
2903{
2904 struct blkcg_gq *blkg = pd_to_blkg(pd);
2905 struct bfq_group *bfqg = blkg_to_bfqg(blkg);
2906 struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
2907 struct bfq_entity *entity = &bfqg->entity;
2908 struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
2909
2910 entity->orig_weight = entity->weight = entity->new_weight = d->weight;
2911 entity->my_sched_data = &bfqg->sched_data;
2912 bfqg->my_entity = entity; /*
2913 * the root_group's will be set to NULL
2914 * in bfq_init_queue()
2915 */
2916 bfqg->bfqd = bfqd;
2917}
2918
2919static void bfq_pd_free(struct blkg_policy_data *pd)
2920{
2921 struct bfq_group *bfqg = pd_to_bfqg(pd);
2922
2923 bfqg_stats_exit(&bfqg->stats);
2924 return kfree(bfqg);
2925}
2926
2927static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
2928{
2929 struct bfq_group *bfqg = pd_to_bfqg(pd);
2930
2931 bfqg_stats_reset(&bfqg->stats);
2932}
2933
2934static void bfq_group_set_parent(struct bfq_group *bfqg,
2935 struct bfq_group *parent)
2936{
2937 struct bfq_entity *entity;
2938
2939 entity = &bfqg->entity;
2940 entity->parent = parent->my_entity;
2941 entity->sched_data = &parent->sched_data;
2942}
2943
2944static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
2945 struct blkcg *blkcg)
2946{
2947 struct blkcg_gq *blkg;
2948
2949 blkg = blkg_lookup(blkcg, bfqd->queue);
2950 if (likely(blkg))
2951 return blkg_to_bfqg(blkg);
2952 return NULL;
2953}
2954
2955static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
2956 struct blkcg *blkcg)
2957{
2958 struct bfq_group *bfqg, *parent;
2959 struct bfq_entity *entity;
2960
2961 bfqg = bfq_lookup_bfqg(bfqd, blkcg);
2962
2963 if (unlikely(!bfqg))
2964 return NULL;
2965
2966 /*
2967 * Update chain of bfq_groups as we might be handling a leaf group
2968 * which, along with some of its relatives, has not been hooked yet
2969 * to the private hierarchy of BFQ.
2970 */
2971 entity = &bfqg->entity;
2972 for_each_entity(entity) {
2973 bfqg = container_of(entity, struct bfq_group, entity);
2974 if (bfqg != bfqd->root_group) {
2975 parent = bfqg_parent(bfqg);
2976 if (!parent)
2977 parent = bfqd->root_group;
2978 bfq_group_set_parent(bfqg, parent);
2979 }
2980 }
2981
2982 return bfqg;
2983}
2984
2985static void bfq_bfqq_expire(struct bfq_data *bfqd,
2986 struct bfq_queue *bfqq,
2987 bool compensate,
2988 enum bfqq_expiration reason);
2989
2990/**
2991 * bfq_bfqq_move - migrate @bfqq to @bfqg.
2992 * @bfqd: queue descriptor.
2993 * @bfqq: the queue to move.
2994 * @bfqg: the group to move to.
2995 *
2996 * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
2997 * it on the new one. Avoid putting the entity on the old group idle tree.
2998 *
2999 * Must be called under the queue lock; the cgroup owning @bfqg must
3000 * not disappear (for now this just means that we are called under
3001 * rcu_read_lock()).
3002 */
3003static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3004 struct bfq_group *bfqg)
3005{
3006 struct bfq_entity *entity = &bfqq->entity;
3007
3008 /* If bfqq is empty, then bfq_bfqq_expire also invokes
3009 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
3010 * from data structures related to current group. Otherwise we
3011 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
3012 * we do below.
3013 */
3014 if (bfqq == bfqd->in_service_queue)
3015 bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
3016 false, BFQQE_PREEMPTED);
3017
3018 if (bfq_bfqq_busy(bfqq))
3019 bfq_deactivate_bfqq(bfqd, bfqq, false, false);
3020 else if (entity->on_st)
3021 bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
3022 bfqg_put(bfqq_group(bfqq));
3023
3024 /*
3025 * Here we use a reference to bfqg. We don't need a refcounter
3026 * as the cgroup reference will not be dropped, so that its
3027 * destroy() callback will not be invoked.
3028 */
3029 entity->parent = bfqg->my_entity;
3030 entity->sched_data = &bfqg->sched_data;
3031 bfqg_get(bfqg);
3032
3033 if (bfq_bfqq_busy(bfqq))
3034 bfq_activate_bfqq(bfqd, bfqq);
3035
3036 if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
3037 bfq_schedule_dispatch(bfqd);
3038}
3039
3040/**
3041 * __bfq_bic_change_cgroup - move @bic to @cgroup.
3042 * @bfqd: the queue descriptor.
3043 * @bic: the bic to move.
3044 * @blkcg: the blk-cgroup to move to.
3045 *
3046 * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
3047 * has to make sure that the reference to cgroup is valid across the call.
3048 *
3049 * NOTE: an alternative approach might have been to store the current
3050 * cgroup in bfqq and get a reference to it, reducing the lookup
3051 * time here, at the price of slightly more complex code.
3052 */
3053static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
3054 struct bfq_io_cq *bic,
3055 struct blkcg *blkcg)
3056{
3057 struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
3058 struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
3059 struct bfq_group *bfqg;
3060 struct bfq_entity *entity;
3061
3062 bfqg = bfq_find_set_group(bfqd, blkcg);
3063
3064 if (unlikely(!bfqg))
3065 bfqg = bfqd->root_group;
3066
3067 if (async_bfqq) {
3068 entity = &async_bfqq->entity;
3069
3070 if (entity->sched_data != &bfqg->sched_data) {
3071 bic_set_bfqq(bic, NULL, 0);
3072 bfq_log_bfqq(bfqd, async_bfqq,
3073 "bic_change_group: %p %d",
3074 async_bfqq,
3075 async_bfqq->ref);
3076 bfq_put_queue(async_bfqq);
3077 }
3078 }
3079
3080 if (sync_bfqq) {
3081 entity = &sync_bfqq->entity;
3082 if (entity->sched_data != &bfqg->sched_data)
3083 bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
3084 }
3085
3086 return bfqg;
3087}
3088
3089static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
3090{
3091 struct bfq_data *bfqd = bic_to_bfqd(bic);
3092 struct bfq_group *bfqg = NULL;
3093 uint64_t serial_nr;
3094
3095 rcu_read_lock();
3096 serial_nr = bio_blkcg(bio)->css.serial_nr;
3097
3098 /*
3099 * Check whether blkcg has changed. The condition may trigger
3100 * spuriously on a newly created cic but there's no harm.
3101 */
3102 if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
3103 goto out;
3104
3105 bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
3106 bic->blkcg_serial_nr = serial_nr;
3107out:
3108 rcu_read_unlock();
3109}
3110
3111/**
3112 * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
3113 * @st: the service tree being flushed.
3114 */
3115static void bfq_flush_idle_tree(struct bfq_service_tree *st)
3116{
3117 struct bfq_entity *entity = st->first_idle;
3118
3119 for (; entity ; entity = st->first_idle)
3120 __bfq_deactivate_entity(entity, false);
3121}
3122
3123/**
3124 * bfq_reparent_leaf_entity - move leaf entity to the root_group.
3125 * @bfqd: the device data structure with the root group.
3126 * @entity: the entity to move.
3127 */
3128static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
3129 struct bfq_entity *entity)
3130{
3131 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
3132
3133 bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
aee69d78
PV
3134}
3135
3136/**
e21b7a0b
AA
3137 * bfq_reparent_active_entities - move to the root group all active
3138 * entities.
3139 * @bfqd: the device data structure with the root group.
3140 * @bfqg: the group to move from.
3141 * @st: the service tree with the entities.
aee69d78 3142 *
e21b7a0b 3143 * Needs queue_lock to be taken and reference to be valid over the call.
aee69d78 3144 */
e21b7a0b
AA
3145static void bfq_reparent_active_entities(struct bfq_data *bfqd,
3146 struct bfq_group *bfqg,
3147 struct bfq_service_tree *st)
aee69d78 3148{
e21b7a0b
AA
3149 struct rb_root *active = &st->active;
3150 struct bfq_entity *entity = NULL;
aee69d78 3151
e21b7a0b
AA
3152 if (!RB_EMPTY_ROOT(&st->active))
3153 entity = bfq_entity_of(rb_first(active));
aee69d78 3154
e21b7a0b
AA
3155 for (; entity ; entity = bfq_entity_of(rb_first(active)))
3156 bfq_reparent_leaf_entity(bfqd, entity);
aee69d78 3157
e21b7a0b
AA
3158 if (bfqg->sched_data.in_service_entity)
3159 bfq_reparent_leaf_entity(bfqd,
3160 bfqg->sched_data.in_service_entity);
aee69d78
PV
3161}
3162
3163/**
e21b7a0b
AA
3164 * bfq_pd_offline - deactivate the entity associated with @pd,
3165 * and reparent its children entities.
3166 * @pd: descriptor of the policy going offline.
aee69d78 3167 *
e21b7a0b
AA
3168 * blkio already grabs the queue_lock for us, so no need to use
3169 * RCU-based magic
aee69d78 3170 */
e21b7a0b 3171static void bfq_pd_offline(struct blkg_policy_data *pd)
aee69d78 3172{
e21b7a0b
AA
3173 struct bfq_service_tree *st;
3174 struct bfq_group *bfqg = pd_to_bfqg(pd);
3175 struct bfq_data *bfqd = bfqg->bfqd;
3176 struct bfq_entity *entity = bfqg->my_entity;
3177 unsigned long flags;
3178 int i;
aee69d78 3179
e21b7a0b
AA
3180 if (!entity) /* root group */
3181 return;
3182
3183 spin_lock_irqsave(&bfqd->lock, flags);
aee69d78 3184 /*
e21b7a0b
AA
3185 * Empty all service_trees belonging to this group before
3186 * deactivating the group itself.
aee69d78 3187 */
e21b7a0b
AA
3188 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
3189 st = bfqg->sched_data.service_tree + i;
3190
3191 /*
3192 * The idle tree may still contain bfq_queues belonging
3193 * to exited tasks because they never migrated to a different
3194 * cgroup from the one being destroyed now. No one else
3195 * can access them so it's safe to act without any lock.
3196 */
3197 bfq_flush_idle_tree(st);
3198
3199 /*
3200 * It may happen that some queues are still active
3201 * (busy) upon group destruction (if the corresponding
3202 * processes have been forced to terminate). We move
3203 * all the leaf entities corresponding to these queues
3204 * to the root_group.
3205 * Also, it may happen that the group has an entity
3206 * in service, which is disconnected from the active
3207 * tree: it must be moved, too.
3208 * There is no need to put the sync queues, as the
3209 * scheduler has taken no reference.
3210 */
3211 bfq_reparent_active_entities(bfqd, bfqg, st);
aee69d78
PV
3212 }
3213
e21b7a0b
AA
3214 __bfq_deactivate_entity(entity, false);
3215 bfq_put_async_queues(bfqd, bfqg);
3216
3217 spin_unlock_irqrestore(&bfqd->lock, flags);
3218 /*
3219 * @blkg is going offline and will be ignored by
3220 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
3221 * that they don't get lost. If IOs complete after this point, the
3222 * stats for them will be lost. Oh well...
3223 */
3224 bfqg_stats_xfer_dead(bfqg);
aee69d78
PV
3225}
3226
44e44a1b
PV
3227static void bfq_end_wr_async(struct bfq_data *bfqd)
3228{
3229 struct blkcg_gq *blkg;
3230
3231 list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
3232 struct bfq_group *bfqg = blkg_to_bfqg(blkg);
3233
3234 bfq_end_wr_async_queues(bfqd, bfqg);
3235 }
3236 bfq_end_wr_async_queues(bfqd, bfqd->root_group);
3237}
3238
e21b7a0b 3239static int bfq_io_show_weight(struct seq_file *sf, void *v)
aee69d78 3240{
e21b7a0b
AA
3241 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3242 struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
3243 unsigned int val = 0;
aee69d78 3244
e21b7a0b
AA
3245 if (bfqgd)
3246 val = bfqgd->weight;
aee69d78 3247
e21b7a0b 3248 seq_printf(sf, "%u\n", val);
aee69d78 3249
e21b7a0b
AA
3250 return 0;
3251}
3252
3253static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
3254 struct cftype *cftype,
3255 u64 val)
aee69d78 3256{
e21b7a0b
AA
3257 struct blkcg *blkcg = css_to_blkcg(css);
3258 struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
3259 struct blkcg_gq *blkg;
3260 int ret = -ERANGE;
aee69d78 3261
e21b7a0b
AA
3262 if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
3263 return ret;
aee69d78 3264
e21b7a0b
AA
3265 ret = 0;
3266 spin_lock_irq(&blkcg->lock);
3267 bfqgd->weight = (unsigned short)val;
3268 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
3269 struct bfq_group *bfqg = blkg_to_bfqg(blkg);
3270
3271 if (!bfqg)
3272 continue;
3273 /*
3274 * Setting the prio_changed flag of the entity
3275 * to 1 with new_weight == weight would re-set
3276 * the value of the weight to its ioprio mapping.
3277 * Set the flag only if necessary.
3278 */
3279 if ((unsigned short)val != bfqg->entity.new_weight) {
3280 bfqg->entity.new_weight = (unsigned short)val;
3281 /*
3282 * Make sure that the above new value has been
3283 * stored in bfqg->entity.new_weight before
3284 * setting the prio_changed flag. In fact,
3285 * this flag may be read asynchronously (in
3286 * critical sections protected by a different
3287 * lock than that held here), and finding this
3288 * flag set may cause the execution of the code
3289 * for updating parameters whose value may
3290 * depend also on bfqg->entity.new_weight (in
3291 * __bfq_entity_update_weight_prio).
3292 * This barrier makes sure that the new value
3293 * of bfqg->entity.new_weight is correctly
3294 * seen in that code.
3295 */
3296 smp_wmb();
3297 bfqg->entity.prio_changed = 1;
3298 }
aee69d78 3299 }
e21b7a0b 3300 spin_unlock_irq(&blkcg->lock);
aee69d78 3301
e21b7a0b
AA
3302 return ret;
3303}
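/*
 * Hedged, illustrative sketch only (not used anywhere in BFQ): the
 * smp_wmb() above is assumed to pair with a read side of roughly the
 * shape below in __bfq_entity_update_weight_prio(), which lives
 * elsewhere; the helper name and the exact placement of smp_rmb() are
 * assumptions made here just to show the intended ordering.
 */
static inline unsigned short
bfq_example_read_new_weight(struct bfq_entity *entity)
{
	unsigned short w = 0;

	if (entity->prio_changed) {
		/* pairs with the smp_wmb() in bfq_io_set_weight_legacy() */
		smp_rmb();
		w = entity->new_weight;
	}

	return w;
}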
aee69d78 3304
e21b7a0b
AA
3305static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
3306 char *buf, size_t nbytes,
3307 loff_t off)
3308{
3309 u64 weight;
3310 /* First unsigned long found in the file is used */
3311 int ret = kstrtoull(strim(buf), 0, &weight);
3312
3313 if (ret)
3314 return ret;
3315
3316 return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
aee69d78
PV
3317}
3318
e21b7a0b 3319static int bfqg_print_stat(struct seq_file *sf, void *v)
aee69d78 3320{
e21b7a0b
AA
3321 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
3322 &blkcg_policy_bfq, seq_cft(sf)->private, false);
3323 return 0;
3324}
aee69d78 3325
e21b7a0b
AA
3326static int bfqg_print_rwstat(struct seq_file *sf, void *v)
3327{
3328 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
3329 &blkcg_policy_bfq, seq_cft(sf)->private, true);
3330 return 0;
3331}
aee69d78 3332
e21b7a0b
AA
3333static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
3334 struct blkg_policy_data *pd, int off)
3335{
3336 u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
3337 &blkcg_policy_bfq, off);
3338 return __blkg_prfill_u64(sf, pd, sum);
3339}
aee69d78 3340
e21b7a0b
AA
3341static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
3342 struct blkg_policy_data *pd, int off)
3343{
3344 struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
3345 &blkcg_policy_bfq,
3346 off);
3347 return __blkg_prfill_rwstat(sf, pd, &sum);
aee69d78
PV
3348}
3349
e21b7a0b 3350static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
aee69d78 3351{
e21b7a0b
AA
3352 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3353 bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
3354 seq_cft(sf)->private, false);
3355 return 0;
3356}
aee69d78 3357
e21b7a0b
AA
3358static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
3359{
3360 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3361 bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
3362 seq_cft(sf)->private, true);
3363 return 0;
aee69d78
PV
3364}
3365
e21b7a0b
AA
3366static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
3367 int off)
aee69d78 3368{
e21b7a0b 3369 u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
aee69d78 3370
e21b7a0b 3371 return __blkg_prfill_u64(sf, pd, sum >> 9);
aee69d78
PV
3372}
3373
e21b7a0b 3374static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
aee69d78 3375{
e21b7a0b
AA
3376 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3377 bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
3378 return 0;
3379}
aee69d78 3380
e21b7a0b
AA
3381static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
3382 struct blkg_policy_data *pd, int off)
3383{
3384 struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
3385 offsetof(struct blkcg_gq, stat_bytes));
3386 u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
3387 atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
aee69d78 3388
e21b7a0b
AA
3389 return __blkg_prfill_u64(sf, pd, sum >> 9);
3390}
aee69d78 3391
e21b7a0b
AA
3392static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
3393{
3394 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3395 bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
3396 false);
3397 return 0;
aee69d78
PV
3398}
3399
e21b7a0b
AA
3400static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
3401 struct blkg_policy_data *pd, int off)
aee69d78 3402{
e21b7a0b
AA
3403 struct bfq_group *bfqg = pd_to_bfqg(pd);
3404 u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
3405 u64 v = 0;
aee69d78 3406
e21b7a0b
AA
3407 if (samples) {
3408 v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
3409 v = div64_u64(v, samples);
3410 }
3411 __blkg_prfill_u64(sf, pd, v);
3412 return 0;
3413}
aee69d78 3414
e21b7a0b
AA
3415/* print avg_queue_size */
3416static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
3417{
3418 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3419 bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
3420 0, false);
3421 return 0;
3422}
3423
3424static struct bfq_group *
3425bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
3426{
3427 int ret;
3428
3429 ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
3430 if (ret)
3431 return NULL;
3432
3433 return blkg_to_bfqg(bfqd->queue->root_blkg);
aee69d78
PV
3434}
3435
e21b7a0b
AA
3436static struct cftype bfq_blkcg_legacy_files[] = {
3437 {
3438 .name = "bfq.weight",
3439 .flags = CFTYPE_NOT_ON_ROOT,
3440 .seq_show = bfq_io_show_weight,
3441 .write_u64 = bfq_io_set_weight_legacy,
3442 },
3443
3444 /* statistics, covers only the tasks in the bfqg */
3445 {
3446 .name = "bfq.time",
3447 .private = offsetof(struct bfq_group, stats.time),
3448 .seq_show = bfqg_print_stat,
3449 },
3450 {
3451 .name = "bfq.sectors",
3452 .seq_show = bfqg_print_stat_sectors,
3453 },
3454 {
3455 .name = "bfq.io_service_bytes",
3456 .private = (unsigned long)&blkcg_policy_bfq,
3457 .seq_show = blkg_print_stat_bytes,
3458 },
3459 {
3460 .name = "bfq.io_serviced",
3461 .private = (unsigned long)&blkcg_policy_bfq,
3462 .seq_show = blkg_print_stat_ios,
3463 },
3464 {
3465 .name = "bfq.io_service_time",
3466 .private = offsetof(struct bfq_group, stats.service_time),
3467 .seq_show = bfqg_print_rwstat,
3468 },
3469 {
3470 .name = "bfq.io_wait_time",
3471 .private = offsetof(struct bfq_group, stats.wait_time),
3472 .seq_show = bfqg_print_rwstat,
3473 },
3474 {
3475 .name = "bfq.io_merged",
3476 .private = offsetof(struct bfq_group, stats.merged),
3477 .seq_show = bfqg_print_rwstat,
3478 },
3479 {
3480 .name = "bfq.io_queued",
3481 .private = offsetof(struct bfq_group, stats.queued),
3482 .seq_show = bfqg_print_rwstat,
3483 },
3484
3485	/* the same statistics, which cover the bfqg and its descendants */
3486 {
3487 .name = "bfq.time_recursive",
3488 .private = offsetof(struct bfq_group, stats.time),
3489 .seq_show = bfqg_print_stat_recursive,
3490 },
3491 {
3492 .name = "bfq.sectors_recursive",
3493 .seq_show = bfqg_print_stat_sectors_recursive,
3494 },
3495 {
3496 .name = "bfq.io_service_bytes_recursive",
3497 .private = (unsigned long)&blkcg_policy_bfq,
3498 .seq_show = blkg_print_stat_bytes_recursive,
3499 },
3500 {
3501 .name = "bfq.io_serviced_recursive",
3502 .private = (unsigned long)&blkcg_policy_bfq,
3503 .seq_show = blkg_print_stat_ios_recursive,
3504 },
3505 {
3506 .name = "bfq.io_service_time_recursive",
3507 .private = offsetof(struct bfq_group, stats.service_time),
3508 .seq_show = bfqg_print_rwstat_recursive,
3509 },
3510 {
3511 .name = "bfq.io_wait_time_recursive",
3512 .private = offsetof(struct bfq_group, stats.wait_time),
3513 .seq_show = bfqg_print_rwstat_recursive,
3514 },
3515 {
3516 .name = "bfq.io_merged_recursive",
3517 .private = offsetof(struct bfq_group, stats.merged),
3518 .seq_show = bfqg_print_rwstat_recursive,
3519 },
3520 {
3521 .name = "bfq.io_queued_recursive",
3522 .private = offsetof(struct bfq_group, stats.queued),
3523 .seq_show = bfqg_print_rwstat_recursive,
3524 },
3525 {
3526 .name = "bfq.avg_queue_size",
3527 .seq_show = bfqg_print_avg_queue_size,
3528 },
3529 {
3530 .name = "bfq.group_wait_time",
3531 .private = offsetof(struct bfq_group, stats.group_wait_time),
3532 .seq_show = bfqg_print_stat,
3533 },
3534 {
3535 .name = "bfq.idle_time",
3536 .private = offsetof(struct bfq_group, stats.idle_time),
3537 .seq_show = bfqg_print_stat,
3538 },
3539 {
3540 .name = "bfq.empty_time",
3541 .private = offsetof(struct bfq_group, stats.empty_time),
3542 .seq_show = bfqg_print_stat,
3543 },
3544 {
3545 .name = "bfq.dequeue",
3546 .private = offsetof(struct bfq_group, stats.dequeue),
3547 .seq_show = bfqg_print_stat,
3548 },
3549 { } /* terminate */
3550};
3551
3552static struct cftype bfq_blkg_files[] = {
3553 {
3554 .name = "bfq.weight",
3555 .flags = CFTYPE_NOT_ON_ROOT,
3556 .seq_show = bfq_io_show_weight,
3557 .write = bfq_io_set_weight,
3558 },
3559 {} /* terminate */
3560};
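/*
 * Usage note (illustrative; exact paths depend on how the cgroup
 * hierarchy is mounted): with the legacy blkio controller the first
 * entry of the table above is typically exposed as blkio.bfq.weight,
 * while on the unified hierarchy the second table yields io.bfq.weight.
 * Writing a value in [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT] to either file
 * eventually reaches bfq_io_set_weight_legacy() through the handlers
 * shown above.
 */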
3561
3562#else /* CONFIG_BFQ_GROUP_IOSCHED */
3563
3564static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
3565 struct bfq_queue *bfqq, unsigned int op) { }
3566static inline void
3567bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
3568static inline void
3569bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
3570static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
3571 uint64_t start_time, uint64_t io_start_time,
3572 unsigned int op) { }
3573static inline void
3574bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
3575 struct bfq_group *curr_bfqg) { }
3576static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
3577static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
3578static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
3579static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
3580static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
3581static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
3582
3583static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3584 struct bfq_group *bfqg) {}
3585
3586static void bfq_init_entity(struct bfq_entity *entity,
3587 struct bfq_group *bfqg)
aee69d78
PV
3588{
3589 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
3590
3591 entity->weight = entity->new_weight;
3592 entity->orig_weight = entity->new_weight;
e21b7a0b
AA
3593 if (bfqq) {
3594 bfqq->ioprio = bfqq->new_ioprio;
3595 bfqq->ioprio_class = bfqq->new_ioprio_class;
3596 }
3597 entity->sched_data = &bfqg->sched_data;
3598}
3599
3600static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}
3601
44e44a1b
PV
3602static void bfq_end_wr_async(struct bfq_data *bfqd)
3603{
3604 bfq_end_wr_async_queues(bfqd, bfqd->root_group);
3605}
3606
e21b7a0b
AA
3607static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
3608 struct blkcg *blkcg)
3609{
3610 return bfqd->root_group;
3611}
3612
3613static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
3614{
3615 return bfqq->bfqd->root_group;
3616}
aee69d78 3617
e21b7a0b
AA
3618static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd,
3619 int node)
3620{
3621 struct bfq_group *bfqg;
3622 int i;
3623
3624 bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
3625 if (!bfqg)
3626 return NULL;
aee69d78 3627
e21b7a0b
AA
3628 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
3629 bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
3630
3631 return bfqg;
aee69d78 3632}
e21b7a0b 3633#endif /* CONFIG_BFQ_GROUP_IOSCHED */
aee69d78
PV
3634
3635#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
3636#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
3637
3638#define bfq_sample_valid(samples) ((samples) > 80)
3639
aee69d78
PV
3640/*
3641 * Lifted from AS - choose which of rq1 and rq2 is best served now.
3642 * We choose the request that is closest to the head right now. Distance
3643 * behind the head is penalized and only allowed to a certain extent.
3644 */
3645static struct request *bfq_choose_req(struct bfq_data *bfqd,
3646 struct request *rq1,
3647 struct request *rq2,
3648 sector_t last)
3649{
3650 sector_t s1, s2, d1 = 0, d2 = 0;
3651 unsigned long back_max;
3652#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
3653#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
3654 unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
3655
3656 if (!rq1 || rq1 == rq2)
3657 return rq2;
3658 if (!rq2)
3659 return rq1;
3660
3661 if (rq_is_sync(rq1) && !rq_is_sync(rq2))
3662 return rq1;
3663 else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
3664 return rq2;
3665 if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
3666 return rq1;
3667 else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
3668 return rq2;
3669
3670 s1 = blk_rq_pos(rq1);
3671 s2 = blk_rq_pos(rq2);
3672
3673 /*
3674 * By definition, 1KiB is 2 sectors.
3675 */
3676 back_max = bfqd->bfq_back_max * 2;
3677
3678 /*
3679 * Strict one way elevator _except_ in the case where we allow
3680 * short backward seeks which are biased as twice the cost of a
3681 * similar forward seek.
3682 */
3683 if (s1 >= last)
3684 d1 = s1 - last;
3685 else if (s1 + back_max >= last)
3686 d1 = (last - s1) * bfqd->bfq_back_penalty;
3687 else
3688 wrap |= BFQ_RQ1_WRAP;
3689
3690 if (s2 >= last)
3691 d2 = s2 - last;
3692 else if (s2 + back_max >= last)
3693 d2 = (last - s2) * bfqd->bfq_back_penalty;
3694 else
3695 wrap |= BFQ_RQ2_WRAP;
3696
3697 /* Found required data */
3698
3699 /*
3700 * By doing switch() on the bit mask "wrap" we avoid having to
3701 * check two variables for all permutations: --> faster!
3702 */
3703 switch (wrap) {
3704 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
3705 if (d1 < d2)
3706 return rq1;
3707 else if (d2 < d1)
3708 return rq2;
3709
3710 if (s1 >= s2)
3711 return rq1;
3712 else
3713 return rq2;
3714
3715 case BFQ_RQ2_WRAP:
3716 return rq1;
3717 case BFQ_RQ1_WRAP:
3718 return rq2;
3719 case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */
3720 default:
3721 /*
3722 * Since both rqs are wrapped,
3723 * start with the one that's further behind head
3724 * (--> only *one* back seek required),
3725 * since back seek takes more time than forward.
3726 */
3727 if (s1 <= s2)
3728 return rq1;
3729 else
3730 return rq2;
3731 }
3732}
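/*
 * Illustrative restatement of the distance metric used above, not code
 * BFQ actually runs: a request in front of the head costs its forward
 * distance, a request within back_max sectors behind the head costs its
 * backward distance multiplied by the back-seek penalty, and anything
 * farther behind is treated as "wrapped" and picked only as a last
 * resort.
 */
static inline sector_t bfq_example_seek_cost(sector_t pos, sector_t head,
					     sector_t back_max,
					     unsigned int back_penalty,
					     bool *wrapped)
{
	*wrapped = false;

	if (pos >= head)			/* forward seek */
		return pos - head;
	if (pos + back_max >= head)		/* short, penalized back seek */
		return (head - pos) * back_penalty;

	*wrapped = true;			/* too far behind the head */
	return 0;
}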
3733
3734/*
3735 * Return expired entry, or NULL to just start from scratch in rbtree.
3736 */
3737static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
3738 struct request *last)
3739{
3740 struct request *rq;
3741
3742 if (bfq_bfqq_fifo_expire(bfqq))
3743 return NULL;
3744
3745 bfq_mark_bfqq_fifo_expire(bfqq);
3746
3747 rq = rq_entry_fifo(bfqq->fifo.next);
3748
3749 if (rq == last || ktime_get_ns() < rq->fifo_time)
3750 return NULL;
3751
3752 bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
3753 return rq;
3754}
3755
3756static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
3757 struct bfq_queue *bfqq,
3758 struct request *last)
3759{
3760 struct rb_node *rbnext = rb_next(&last->rb_node);
3761 struct rb_node *rbprev = rb_prev(&last->rb_node);
3762 struct request *next, *prev = NULL;
3763
3764 /* Follow expired path, else get first next available. */
3765 next = bfq_check_fifo(bfqq, last);
3766 if (next)
3767 return next;
3768
3769 if (rbprev)
3770 prev = rb_entry_rq(rbprev);
3771
3772 if (rbnext)
3773 next = rb_entry_rq(rbnext);
3774 else {
3775 rbnext = rb_first(&bfqq->sort_list);
3776 if (rbnext && rbnext != &last->rb_node)
3777 next = rb_entry_rq(rbnext);
3778 }
3779
3780 return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
3781}
3782
c074170e 3783/* see the definition of bfq_async_charge_factor for details */
aee69d78
PV
3784static unsigned long bfq_serv_to_charge(struct request *rq,
3785 struct bfq_queue *bfqq)
3786{
44e44a1b 3787 if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
c074170e
PV
3788 return blk_rq_sectors(rq);
3789
cfd69712
PV
3790 /*
3791 * If there are no weight-raised queues, then amplify service
3792 * by just the async charge factor; otherwise amplify service
3793 * by twice the async charge factor, to further reduce latency
3794 * for weight-raised queues.
3795 */
3796 if (bfqq->bfqd->wr_busy_queues == 0)
3797 return blk_rq_sectors(rq) * bfq_async_charge_factor;
3798
3799 return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
aee69d78
PV
3800}
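/*
 * Bare-bones sketch of the charging rule above (illustration only, not
 * used by the scheduler): sync or weight-raised queues are charged the
 * actual request size, async queues are charged an amplified size, and
 * the amplification doubles when some queue is currently weight-raised.
 */
static inline unsigned long
bfq_example_charge(unsigned long sectors, bool sync_or_wr,
		   bool some_queue_is_wr, unsigned long charge_factor)
{
	if (sync_or_wr)
		return sectors;

	return sectors * (some_queue_is_wr ? 2 : 1) * charge_factor;
}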
3801
3802/**
3803 * bfq_updated_next_req - update the queue after a new next_rq selection.
3804 * @bfqd: the device data the queue belongs to.
3805 * @bfqq: the queue to update.
3806 *
3807 * If the first request of a queue changes we make sure that the queue
3808 * has enough budget to serve at least its first request (if the
3809 * request has grown). We do this because if the queue has not enough
3810 * budget for its first request, it has to go through two dispatch
3811 * rounds to actually get it dispatched.
3812 */
3813static void bfq_updated_next_req(struct bfq_data *bfqd,
3814 struct bfq_queue *bfqq)
3815{
3816 struct bfq_entity *entity = &bfqq->entity;
3817 struct request *next_rq = bfqq->next_rq;
3818 unsigned long new_budget;
3819
3820 if (!next_rq)
3821 return;
3822
3823 if (bfqq == bfqd->in_service_queue)
3824 /*
3825 * In order not to break guarantees, budgets cannot be
3826 * changed after an entity has been selected.
3827 */
3828 return;
3829
3830 new_budget = max_t(unsigned long, bfqq->max_budget,
3831 bfq_serv_to_charge(next_rq, bfqq));
3832 if (entity->budget != new_budget) {
3833 entity->budget = new_budget;
3834 bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
3835 new_budget);
e21b7a0b 3836 bfq_requeue_bfqq(bfqd, bfqq);
aee69d78
PV
3837 }
3838}
3839
3840static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
3841{
3842 struct bfq_entity *entity = &bfqq->entity;
3843
3844 return entity->budget - entity->service;
3845}
3846
3847/*
3848 * If enough samples have been computed, return the current max budget
3849 * stored in bfqd, which is dynamically updated according to the
3850 * estimated disk peak rate; otherwise return the default max budget
3851 */
3852static int bfq_max_budget(struct bfq_data *bfqd)
3853{
3854 if (bfqd->budgets_assigned < bfq_stats_min_budgets)
3855 return bfq_default_max_budget;
3856 else
3857 return bfqd->bfq_max_budget;
3858}
3859
3860/*
3861 * Return min budget, which is a fraction of the current or default
3862 * max budget (trying with 1/32)
3863 */
3864static int bfq_min_budget(struct bfq_data *bfqd)
3865{
3866 if (bfqd->budgets_assigned < bfq_stats_min_budgets)
3867 return bfq_default_max_budget / 32;
3868 else
3869 return bfqd->bfq_max_budget / 32;
3870}
3871
3872static void bfq_bfqq_expire(struct bfq_data *bfqd,
3873 struct bfq_queue *bfqq,
3874 bool compensate,
3875 enum bfqq_expiration reason);
3876
3877/*
3878 * The next function, invoked after the input queue bfqq switches from
3879 * idle to busy, updates the budget of bfqq. The function also tells
3880 * whether the in-service queue should be expired, by returning
3881 * true. The purpose of expiring the in-service queue is to give bfqq
3882 * the chance to possibly preempt the in-service queue, and the reason
44e44a1b
PV
3883 * for preempting the in-service queue is to achieve one of the two
3884 * goals below.
aee69d78 3885 *
44e44a1b
PV
3886 * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
3887 * expired because it has remained idle. In particular, bfqq may have
3888 * expired for one of the following two reasons:
aee69d78
PV
3889 *
3890 * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
3891 * and did not make it to issue a new request before its last
3892 * request was served;
3893 *
3894 * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
3895 * a new request before the expiration of the idling-time.
3896 *
3897 * Even if bfqq has expired for one of the above reasons, the process
3898 * associated with the queue may however be issuing requests greedily,
3899 * and thus be sensitive to the bandwidth it receives (bfqq may have
3900 * remained idle for other reasons: high CPU load, bfqq not enjoying
3901 * idling, I/O throttling somewhere in the path from the process to
3902 * the I/O scheduler, ...). But if, after every expiration for one of
3903 * the above two reasons, bfqq has to wait for the service of at least
3904 * one full budget of another queue before being served again, then
3905 * bfqq is likely to get a much lower bandwidth or resource time than
3906 * its reserved ones. To address this issue, two countermeasures need
3907 * to be taken.
3908 *
3909 * First, the budget and the timestamps of bfqq need to be updated in
3910 * a special way on bfqq reactivation: they need to be updated as if
3911 * bfqq did not remain idle and did not expire. In fact, if they are
3912 * computed as if bfqq expired and remained idle until reactivation,
3913 * then the process associated with bfqq is treated as if, instead of
3914 * being greedy, it stopped issuing requests when bfqq remained idle,
3915 * and restarts issuing requests only on this reactivation. In other
3916 * words, the scheduler does not help the process recover the "service
3917 * hole" between bfqq expiration and reactivation. As a consequence,
3918 * the process receives a lower bandwidth than its reserved one. In
3919 * contrast, to recover this hole, the budget must be updated as if
3920 * bfqq was not expired at all before this reactivation, i.e., it must
3921 * be set to the value of the remaining budget when bfqq was
3922 * expired. Along the same line, timestamps need to be assigned the
3923 * value they had the last time bfqq was selected for service, i.e.,
3924 * before last expiration. Thus timestamps need to be back-shifted
3925 * with respect to their normal computation (see [1] for more details
3926 * on this tricky aspect).
3927 *
3928 * Secondly, to allow the process to recover the hole, the in-service
3929 * queue must be expired too, to give bfqq the chance to preempt it
3930 * immediately. In fact, if bfqq has to wait for a full budget of the
3931 * in-service queue to be completed, then it may become impossible to
3932 * let the process recover the hole, even if the back-shifted
3933 * timestamps of bfqq are lower than those of the in-service queue. If
3934 * this happens for most or all of the holes, then the process may not
3935 * receive its reserved bandwidth. In this respect, it is worth noting
3936 * that, since the service of outstanding requests is not preemptible,
3937 * a small fraction of the holes may nevertheless be unrecoverable,
3938 * thereby causing a small loss of bandwidth.
3939 *
3940 * The last important point is detecting whether bfqq does need this
3941 * bandwidth recovery. In this respect, the next function deems the
3942 * process associated with bfqq greedy, and thus allows it to recover
3943 * the hole, if: 1) the process is waiting for the arrival of a new
3944 * request (which implies that bfqq expired for one of the above two
3945 * reasons), and 2) such a request has arrived soon enough. The first
3946 * condition is controlled through the flag non_blocking_wait_rq,
3947 * while the second through the flag arrived_in_time. If both
3948 * conditions hold, then the function computes the budget in the
3949 * above-described special way, and signals that the in-service queue
3950 * should be expired. Timestamp back-shifting is done later in
3951 * __bfq_activate_entity.
44e44a1b
PV
3952 *
3953 * 2. Reduce latency. Even if timestamps are not backshifted to let
3954 * the process associated with bfqq recover a service hole, bfqq may
3955 * however happen to have, after being (re)activated, a lower finish
3956 * timestamp than the in-service queue. That is, the next budget of
3957 * bfqq may have to be completed before the one of the in-service
3958 * queue. If this is the case, then preempting the in-service queue
3959 * allows this goal to be achieved, apart from the unpreemptible,
3960 * outstanding requests mentioned above.
3961 *
3962 * Unfortunately, regardless of which of the above two goals one wants
3963 * to achieve, service trees need first to be updated to know whether
3964 * the in-service queue must be preempted. To have service trees
3965 * correctly updated, the in-service queue must be expired and
3966 * rescheduled, and bfqq must be scheduled too. This is one of the
3967 * most costly operations (in future versions, the scheduling
3968 * mechanism may be re-designed in such a way to make it possible to
3969 * know whether preemption is needed without needing to update service
3970 * trees). In addition, queue preemptions almost always cause random
3971 * I/O, and thus loss of throughput. Because of these facts, the next
3972 * function adopts the following simple scheme to avoid both costly
3973 * operations and too frequent preemptions: it requests the expiration
3974 * of the in-service queue (unconditionally) only for queues that need
3975 * to recover a hole, or that either are weight-raised or deserve to
3976 * be weight-raised.
aee69d78
PV
3977 */
3978static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
3979 struct bfq_queue *bfqq,
44e44a1b
PV
3980 bool arrived_in_time,
3981 bool wr_or_deserves_wr)
aee69d78
PV
3982{
3983 struct bfq_entity *entity = &bfqq->entity;
3984
3985 if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
3986 /*
3987 * We do not clear the flag non_blocking_wait_rq here, as
3988 * the latter is used in bfq_activate_bfqq to signal
3989 * that timestamps need to be back-shifted (and is
3990 * cleared right after).
3991 */
3992
3993 /*
3994		 * In the next assignment we rely on the fact that
3995		 * neither entity->service nor entity->budget is updated
3996 * on expiration if bfqq is empty (see
3997 * __bfq_bfqq_recalc_budget). Thus both quantities
3998 * remain unchanged after such an expiration, and the
3999 * following statement therefore assigns to
4000 * entity->budget the remaining budget on such an
4001 * expiration. For clarity, entity->service is not
4002 * updated on expiration in any case, and, in normal
4003 * operation, is reset only when bfqq is selected for
4004 * service (see bfq_get_next_queue).
4005 */
4006 entity->budget = min_t(unsigned long,
4007 bfq_bfqq_budget_left(bfqq),
4008 bfqq->max_budget);
4009
4010 return true;
4011 }
4012
4013 entity->budget = max_t(unsigned long, bfqq->max_budget,
4014 bfq_serv_to_charge(bfqq->next_rq, bfqq));
4015 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
44e44a1b
PV
4016 return wr_or_deserves_wr;
4017}
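/*
 * Numeric restatement of the budget policy above (illustration only;
 * the helper and its arguments are assumptions): on a "hole recovery"
 * activation, i.e., when non_blocking_wait_rq is set and the new
 * request arrived in time, the queue keeps its remaining budget, capped
 * at max_budget; otherwise the budget is raised to cover at least the
 * charge of the next request.
 */
static inline unsigned long
bfq_example_activation_budget(bool recovering_hole,
			      unsigned long budget_left,
			      unsigned long max_budget,
			      unsigned long next_rq_charge)
{
	if (recovering_hole)
		return min_t(unsigned long, budget_left, max_budget);

	return max_t(unsigned long, max_budget, next_rq_charge);
}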
4018
4019static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
4020{
4021 u64 dur;
4022
4023 if (bfqd->bfq_wr_max_time > 0)
4024 return bfqd->bfq_wr_max_time;
4025
4026 dur = bfqd->RT_prod;
4027 do_div(dur, bfqd->peak_rate);
4028
4029 /*
4030	 * Limit the duration to between 3 and 13 seconds. Tests show that
4031	 * values higher than 13 seconds often yield the opposite of
4032	 * the desired result, i.e., worsen responsiveness by letting
4033	 * non-interactive and non-soft-real-time applications
4034	 * preserve weight raising for too long a time interval.
4035 *
4036	 * On the other end, values lower than 3 seconds make it
4037 * difficult for most interactive tasks to complete their jobs
4038 * before weight-raising finishes.
4039 */
4040 if (dur > msecs_to_jiffies(13000))
4041 dur = msecs_to_jiffies(13000);
4042 else if (dur < msecs_to_jiffies(3000))
4043 dur = msecs_to_jiffies(3000);
4044
4045 return dur;
4046}
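/*
 * Minimal sketch (not used by BFQ) of the clamping performed above: the
 * raising duration scales inversely with the estimated peak rate and is
 * then bounded to the [3 s, 13 s] window discussed in the comment.
 */
static inline u64 bfq_example_clamp_wr_duration(u64 dur)
{
	return min_t(u64, msecs_to_jiffies(13000),
		     max_t(u64, msecs_to_jiffies(3000), dur));
}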
4047
4048static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
4049 struct bfq_queue *bfqq,
4050 unsigned int old_wr_coeff,
4051 bool wr_or_deserves_wr,
77b7dcea
PV
4052 bool interactive,
4053 bool soft_rt)
44e44a1b
PV
4054{
4055 if (old_wr_coeff == 1 && wr_or_deserves_wr) {
4056 /* start a weight-raising period */
77b7dcea
PV
4057 if (interactive) {
4058 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
4059 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
4060 } else {
4061 bfqq->wr_start_at_switch_to_srt = jiffies;
4062 bfqq->wr_coeff = bfqd->bfq_wr_coeff *
4063 BFQ_SOFTRT_WEIGHT_FACTOR;
4064 bfqq->wr_cur_max_time =
4065 bfqd->bfq_wr_rt_max_time;
4066 }
44e44a1b
PV
4067
4068 /*
4069 * If needed, further reduce budget to make sure it is
4070 * close to bfqq's backlog, so as to reduce the
4071		 * scheduling-error component due to too large a
4072		 * budget. Do not care about throughput consequences,
4073		 * but only about latency. Finally, do not assign too
4074		 * small a budget either, to avoid increasing
4075 * latency by causing too frequent expirations.
4076 */
4077 bfqq->entity.budget = min_t(unsigned long,
4078 bfqq->entity.budget,
4079 2 * bfq_min_budget(bfqd));
4080 } else if (old_wr_coeff > 1) {
77b7dcea
PV
4081 if (interactive) { /* update wr coeff and duration */
4082 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
4083 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
4084 } else if (soft_rt) {
4085 /*
4086 * The application is now or still meeting the
4087 * requirements for being deemed soft rt. We
4088 * can then correctly and safely (re)charge
4089 * the weight-raising duration for the
4090 * application with the weight-raising
4091 * duration for soft rt applications.
4092 *
4093 * In particular, doing this recharge now, i.e.,
4094 * before the weight-raising period for the
4095 * application finishes, reduces the probability
4096 * of the following negative scenario:
4097 * 1) the weight of a soft rt application is
4098 * raised at startup (as for any newly
4099 * created application),
4100 * 2) since the application is not interactive,
4101 * at a certain time weight-raising is
4102 * stopped for the application,
4103 * 3) at that time the application happens to
4104 * still have pending requests, and hence
4105 * is destined to not have a chance to be
4106 * deemed soft rt before these requests are
4107 * completed (see the comments to the
4108 * function bfq_bfqq_softrt_next_start()
4109 * for details on soft rt detection),
4110 * 4) these pending requests experience a high
4111 * latency because the application is not
4112 * weight-raised while they are pending.
4113 */
4114 if (bfqq->wr_cur_max_time !=
4115 bfqd->bfq_wr_rt_max_time) {
4116 bfqq->wr_start_at_switch_to_srt =
4117 bfqq->last_wr_start_finish;
4118
4119 bfqq->wr_cur_max_time =
4120 bfqd->bfq_wr_rt_max_time;
4121 bfqq->wr_coeff = bfqd->bfq_wr_coeff *
4122 BFQ_SOFTRT_WEIGHT_FACTOR;
4123 }
4124 bfqq->last_wr_start_finish = jiffies;
4125 }
44e44a1b
PV
4126 }
4127}
4128
4129static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
4130 struct bfq_queue *bfqq)
4131{
4132 return bfqq->dispatched == 0 &&
4133 time_is_before_jiffies(
4134 bfqq->budget_timeout +
4135 bfqd->bfq_wr_min_idle_time);
aee69d78
PV
4136}
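/*
 * Reading aid (illustration only): with the semantics of
 * time_is_before_jiffies(), the test above is equivalent to
 *
 *	bfqq->dispatched == 0 &&
 *	jiffies > bfqq->budget_timeout + bfqd->bfq_wr_min_idle_time
 *
 * i.e., the queue has nothing in flight and its budget timeout elapsed
 * at least bfq_wr_min_idle_time ago.
 */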
4137
4138static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
4139 struct bfq_queue *bfqq,
44e44a1b
PV
4140 int old_wr_coeff,
4141 struct request *rq,
4142 bool *interactive)
aee69d78 4143{
77b7dcea 4144 bool soft_rt, wr_or_deserves_wr, bfqq_wants_to_preempt,
44e44a1b 4145 idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
aee69d78
PV
4146 /*
4147 * See the comments on
4148 * bfq_bfqq_update_budg_for_activation for
4149 * details on the usage of the next variable.
4150 */
4151 arrived_in_time = ktime_get_ns() <=
4152 bfqq->ttime.last_end_request +
4153 bfqd->bfq_slice_idle * 3;
4154
e21b7a0b
AA
4155 bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);
4156
aee69d78 4157 /*
44e44a1b
PV
4158 * bfqq deserves to be weight-raised if:
4159 * - it is sync,
77b7dcea 4160 * - it has been idle for enough time or is soft real-time.
44e44a1b 4161 */
77b7dcea
PV
4162 soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
4163 time_is_before_jiffies(bfqq->soft_rt_next_start);
44e44a1b
PV
4164 *interactive = idle_for_long_time;
4165 wr_or_deserves_wr = bfqd->low_latency &&
4166 (bfqq->wr_coeff > 1 ||
77b7dcea 4167 (bfq_bfqq_sync(bfqq) && (*interactive || soft_rt)));
44e44a1b
PV
4168
4169 /*
4170 * Using the last flag, update budget and check whether bfqq
4171 * may want to preempt the in-service queue.
aee69d78
PV
4172 */
4173 bfqq_wants_to_preempt =
4174 bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
44e44a1b
PV
4175 arrived_in_time,
4176 wr_or_deserves_wr);
aee69d78
PV
4177
4178 if (!bfq_bfqq_IO_bound(bfqq)) {
4179 if (arrived_in_time) {
4180 bfqq->requests_within_timer++;
4181 if (bfqq->requests_within_timer >=
4182 bfqd->bfq_requests_within_timer)
4183 bfq_mark_bfqq_IO_bound(bfqq);
4184 } else
4185 bfqq->requests_within_timer = 0;
4186 }
4187
44e44a1b
PV
4188 if (bfqd->low_latency) {
4189 bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
4190 old_wr_coeff,
4191 wr_or_deserves_wr,
77b7dcea
PV
4192 *interactive,
4193 soft_rt);
44e44a1b
PV
4194
4195 if (old_wr_coeff != bfqq->wr_coeff)
4196 bfqq->entity.prio_changed = 1;
4197 }
4198
77b7dcea
PV
4199 bfqq->last_idle_bklogged = jiffies;
4200 bfqq->service_from_backlogged = 0;
4201 bfq_clear_bfqq_softrt_update(bfqq);
4202
aee69d78
PV
4203 bfq_add_bfqq_busy(bfqd, bfqq);
4204
4205 /*
4206 * Expire in-service queue only if preemption may be needed
4207 * for guarantees. In this respect, the function
4208 * next_queue_may_preempt just checks a simple, necessary
4209 * condition, and not a sufficient condition based on
4210 * timestamps. In fact, for the latter condition to be
4211 * evaluated, timestamps would need first to be updated, and
4212 * this operation is quite costly (see the comments on the
4213 * function bfq_bfqq_update_budg_for_activation).
4214 */
4215 if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
77b7dcea 4216 bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
aee69d78
PV
4217 next_queue_may_preempt(bfqd))
4218 bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
4219 false, BFQQE_PREEMPTED);
4220}
4221
4222static void bfq_add_request(struct request *rq)
4223{
4224 struct bfq_queue *bfqq = RQ_BFQQ(rq);
4225 struct bfq_data *bfqd = bfqq->bfqd;
4226 struct request *next_rq, *prev;
44e44a1b
PV
4227 unsigned int old_wr_coeff = bfqq->wr_coeff;
4228 bool interactive = false;
aee69d78
PV
4229
4230 bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
4231 bfqq->queued[rq_is_sync(rq)]++;
4232 bfqd->queued++;
4233
4234 elv_rb_add(&bfqq->sort_list, rq);
4235
4236 /*
4237 * Check if this request is a better next-serve candidate.
4238 */
4239 prev = bfqq->next_rq;
4240 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
4241 bfqq->next_rq = next_rq;
4242
4243 if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
44e44a1b
PV
4244 bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
4245 rq, &interactive);
4246 else {
4247 if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
4248 time_is_before_jiffies(
4249 bfqq->last_wr_start_finish +
4250 bfqd->bfq_wr_min_inter_arr_async)) {
4251 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
4252 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
4253
cfd69712 4254 bfqd->wr_busy_queues++;
44e44a1b
PV
4255 bfqq->entity.prio_changed = 1;
4256 }
4257 if (prev != bfqq->next_rq)
4258 bfq_updated_next_req(bfqd, bfqq);
4259 }
4260
4261 /*
4262 * Assign jiffies to last_wr_start_finish in the following
4263 * cases:
4264 *
4265 * . if bfqq is not going to be weight-raised, because, for
4266 * non weight-raised queues, last_wr_start_finish stores the
4267 * arrival time of the last request; as of now, this piece
4268 * of information is used only for deciding whether to
4269 * weight-raise async queues
4270 *
4271 * . if bfqq is not weight-raised, because, if bfqq is now
4272 * switching to weight-raised, then last_wr_start_finish
4273 * stores the time when weight-raising starts
4274 *
4275 * . if bfqq is interactive, because, regardless of whether
4276 * bfqq is currently weight-raised, the weight-raising
4277 * period must start or restart (this case is considered
4278 * separately because it is not detected by the above
4279 * conditions, if bfqq is already weight-raised)
77b7dcea
PV
4280 *
4281 * last_wr_start_finish has to be updated also if bfqq is soft
4282 * real-time, because the weight-raising period is constantly
4283 * restarted on idle-to-busy transitions for these queues, but
4284 * this is already done in bfq_bfqq_handle_idle_busy_switch if
4285 * needed.
44e44a1b
PV
4286 */
4287 if (bfqd->low_latency &&
4288 (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
4289 bfqq->last_wr_start_finish = jiffies;
aee69d78
PV
4290}
4291
4292static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
4293 struct bio *bio,
4294 struct request_queue *q)
4295{
4296 struct bfq_queue *bfqq = bfqd->bio_bfqq;
4297
4298
4299 if (bfqq)
4300 return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
4301
4302 return NULL;
4303}
4304
ab0e43e9
PV
4305static sector_t get_sdist(sector_t last_pos, struct request *rq)
4306{
4307 if (last_pos)
4308 return abs(blk_rq_pos(rq) - last_pos);
4309
4310 return 0;
4311}
4312
aee69d78
PV
4313#if 0 /* Still not clear if we can do without next two functions */
4314static void bfq_activate_request(struct request_queue *q, struct request *rq)
4315{
4316 struct bfq_data *bfqd = q->elevator->elevator_data;
4317
4318 bfqd->rq_in_driver++;
aee69d78
PV
4319}
4320
4321static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
4322{
4323 struct bfq_data *bfqd = q->elevator->elevator_data;
4324
4325 bfqd->rq_in_driver--;
4326}
4327#endif
4328
4329static void bfq_remove_request(struct request_queue *q,
4330 struct request *rq)
4331{
4332 struct bfq_queue *bfqq = RQ_BFQQ(rq);
4333 struct bfq_data *bfqd = bfqq->bfqd;
4334 const int sync = rq_is_sync(rq);
4335
4336 if (bfqq->next_rq == rq) {
4337 bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
4338 bfq_updated_next_req(bfqd, bfqq);
4339 }
4340
4341 if (rq->queuelist.prev != &rq->queuelist)
4342 list_del_init(&rq->queuelist);
4343 bfqq->queued[sync]--;
4344 bfqd->queued--;
4345 elv_rb_del(&bfqq->sort_list, rq);
4346
4347 elv_rqhash_del(q, rq);
4348 if (q->last_merge == rq)
4349 q->last_merge = NULL;
4350
4351 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
4352 bfqq->next_rq = NULL;
4353
4354 if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
e21b7a0b 4355 bfq_del_bfqq_busy(bfqd, bfqq, false);
aee69d78
PV
4356 /*
4357 * bfqq emptied. In normal operation, when
4358 * bfqq is empty, bfqq->entity.service and
4359 * bfqq->entity.budget must contain,
4360 * respectively, the service received and the
4361 * budget used last time bfqq emptied. These
4362 * facts do not hold in this case, as at least
4363 * this last removal occurred while bfqq is
4364 * not in service. To avoid inconsistencies,
4365 * reset both bfqq->entity.service and
4366 * bfqq->entity.budget, if bfqq has still a
4367 * process that may issue I/O requests to it.
4368 */
4369 bfqq->entity.budget = bfqq->entity.service = 0;
4370 }
4371 }
4372
4373 if (rq->cmd_flags & REQ_META)
4374 bfqq->meta_pending--;
e21b7a0b
AA
4375
4376 bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
aee69d78
PV
4377}
4378
4379static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
4380{
4381 struct request_queue *q = hctx->queue;
4382 struct bfq_data *bfqd = q->elevator->elevator_data;
4383 struct request *free = NULL;
4384 /*
4385 * bfq_bic_lookup grabs the queue_lock: invoke it now and
4386 * store its return value for later use, to avoid nesting
4387 * queue_lock inside the bfqd->lock. We assume that the bic
4388 * returned by bfq_bic_lookup does not go away before
4389 * bfqd->lock is taken.
4390 */
4391 struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
4392 bool ret;
4393
4394 spin_lock_irq(&bfqd->lock);
4395
4396 if (bic)
4397 bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
4398 else
4399 bfqd->bio_bfqq = NULL;
4400 bfqd->bio_bic = bic;
4401
4402 ret = blk_mq_sched_try_merge(q, bio, &free);
4403
4404 if (free)
4405 blk_mq_free_request(free);
4406 spin_unlock_irq(&bfqd->lock);
4407
4408 return ret;
4409}
4410
4411static int bfq_request_merge(struct request_queue *q, struct request **req,
4412 struct bio *bio)
4413{
4414 struct bfq_data *bfqd = q->elevator->elevator_data;
4415 struct request *__rq;
4416
4417 __rq = bfq_find_rq_fmerge(bfqd, bio, q);
4418 if (__rq && elv_bio_merge_ok(__rq, bio)) {
4419 *req = __rq;
4420 return ELEVATOR_FRONT_MERGE;
4421 }
4422
4423 return ELEVATOR_NO_MERGE;
4424}
4425
4426static void bfq_request_merged(struct request_queue *q, struct request *req,
4427 enum elv_merge type)
4428{
4429 if (type == ELEVATOR_FRONT_MERGE &&
4430 rb_prev(&req->rb_node) &&
4431 blk_rq_pos(req) <
4432 blk_rq_pos(container_of(rb_prev(&req->rb_node),
4433 struct request, rb_node))) {
4434 struct bfq_queue *bfqq = RQ_BFQQ(req);
4435 struct bfq_data *bfqd = bfqq->bfqd;
4436 struct request *prev, *next_rq;
4437
4438 /* Reposition request in its sort_list */
4439 elv_rb_del(&bfqq->sort_list, req);
4440 elv_rb_add(&bfqq->sort_list, req);
4441
4442 /* Choose next request to be served for bfqq */
4443 prev = bfqq->next_rq;
4444 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
4445 bfqd->last_position);
4446 bfqq->next_rq = next_rq;
4447 /*
4448 * If next_rq changes, update the queue's budget to fit
4449 * the new request.
4450 */
4451 if (prev != bfqq->next_rq)
4452 bfq_updated_next_req(bfqd, bfqq);
4453 }
4454}
4455
4456static void bfq_requests_merged(struct request_queue *q, struct request *rq,
4457 struct request *next)
4458{
4459 struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
4460
4461 if (!RB_EMPTY_NODE(&rq->rb_node))
e21b7a0b 4462 goto end;
aee69d78
PV
4463 spin_lock_irq(&bfqq->bfqd->lock);
4464
4465 /*
4466 * If next and rq belong to the same bfq_queue and next is older
4467 * than rq, then reposition rq in the fifo (by substituting next
4468 * with rq). Otherwise, if next and rq belong to different
4469 * bfq_queues, never reposition rq: in fact, we would have to
4470 * reposition it with respect to next's position in its own fifo,
4471 * which would most certainly be too expensive with respect to
4472 * the benefits.
4473 */
4474 if (bfqq == next_bfqq &&
4475 !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
4476 next->fifo_time < rq->fifo_time) {
4477 list_del_init(&rq->queuelist);
4478 list_replace_init(&next->queuelist, &rq->queuelist);
4479 rq->fifo_time = next->fifo_time;
4480 }
4481
4482 if (bfqq->next_rq == next)
4483 bfqq->next_rq = rq;
4484
4485 bfq_remove_request(q, next);
4486
4487 spin_unlock_irq(&bfqq->bfqd->lock);
e21b7a0b
AA
4488end:
4489 bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
aee69d78
PV
4490}
4491
44e44a1b
PV
4492/* Must be called with bfqq != NULL */
4493static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
4494{
cfd69712
PV
4495 if (bfq_bfqq_busy(bfqq))
4496 bfqq->bfqd->wr_busy_queues--;
44e44a1b
PV
4497 bfqq->wr_coeff = 1;
4498 bfqq->wr_cur_max_time = 0;
77b7dcea 4499 bfqq->last_wr_start_finish = jiffies;
44e44a1b
PV
4500 /*
4501 * Trigger a weight change on the next invocation of
4502 * __bfq_entity_update_weight_prio.
4503 */
4504 bfqq->entity.prio_changed = 1;
4505}
4506
4507static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
4508 struct bfq_group *bfqg)
4509{
4510 int i, j;
4511
4512 for (i = 0; i < 2; i++)
4513 for (j = 0; j < IOPRIO_BE_NR; j++)
4514 if (bfqg->async_bfqq[i][j])
4515 bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
4516 if (bfqg->async_idle_bfqq)
4517 bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
4518}
4519
4520static void bfq_end_wr(struct bfq_data *bfqd)
4521{
4522 struct bfq_queue *bfqq;
4523
4524 spin_lock_irq(&bfqd->lock);
4525
4526 list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
4527 bfq_bfqq_end_wr(bfqq);
4528 list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
4529 bfq_bfqq_end_wr(bfqq);
4530 bfq_end_wr_async(bfqd);
4531
4532 spin_unlock_irq(&bfqd->lock);
4533}
4534
aee69d78
PV
4535static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
4536 struct bio *bio)
4537{
4538 struct bfq_data *bfqd = q->elevator->elevator_data;
4539 bool is_sync = op_is_sync(bio->bi_opf);
4540 struct bfq_queue *bfqq = bfqd->bio_bfqq;
4541
4542 /*
4543 * Disallow merge of a sync bio into an async request.
4544 */
4545 if (is_sync && !rq_is_sync(rq))
4546 return false;
4547
4548 /*
4549 * Lookup the bfqq that this bio will be queued with. Allow
4550 * merge only if rq is queued there.
4551 */
4552 if (!bfqq)
4553 return false;
4554
4555 return bfqq == RQ_BFQQ(rq);
4556}
4557
44e44a1b
PV
4558/*
4559 * Set the maximum time for the in-service queue to consume its
4560 * budget. This prevents seeky processes from lowering the throughput.
4561 * In practice, a time-slice service scheme is used with seeky
4562 * processes.
4563 */
4564static void bfq_set_budget_timeout(struct bfq_data *bfqd,
4565 struct bfq_queue *bfqq)
4566{
77b7dcea
PV
4567 unsigned int timeout_coeff;
4568
4569 if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
4570 timeout_coeff = 1;
4571 else
4572 timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
4573
44e44a1b
PV
4574 bfqd->last_budget_start = ktime_get();
4575
4576 bfqq->budget_timeout = jiffies +
77b7dcea 4577 bfqd->bfq_timeout * timeout_coeff;
44e44a1b
PV
4578}
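/*
 * Illustration only (names and figures are assumed): a queue whose
 * weight has been raised from 100 to 300 gets timeout_coeff = 3, i.e.,
 * up to three base budget timeouts, while a queue in its soft real-time
 * raising period keeps a coefficient of 1 to preserve low latency.
 */
static inline unsigned int
bfq_example_timeout_coeff(int weight, int orig_weight, bool soft_rt_wr)
{
	return soft_rt_wr ? 1 : weight / orig_weight;
}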
4579
aee69d78
PV
4580static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
4581 struct bfq_queue *bfqq)
4582{
4583 if (bfqq) {
e21b7a0b 4584 bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
aee69d78
PV
4585 bfq_clear_bfqq_fifo_expire(bfqq);
4586
4587 bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;
4588
77b7dcea
PV
4589 if (time_is_before_jiffies(bfqq->last_wr_start_finish) &&
4590 bfqq->wr_coeff > 1 &&
4591 bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
4592 time_is_before_jiffies(bfqq->budget_timeout)) {
4593 /*
4594 * For soft real-time queues, move the start
4595 * of the weight-raising period forward by the
4596 * time the queue has not received any
4597 * service. Otherwise, a relatively long
4598 * service delay is likely to cause the
4599 * weight-raising period of the queue to end,
4600 * because of the short duration of the
4601 * weight-raising period of a soft real-time
4602 * queue. It is worth noting that this move
4603 * is not so dangerous for the other queues,
4604 * because soft real-time queues are not
4605 * greedy.
4606 *
4607 * To not add a further variable, we use the
4608 * overloaded field budget_timeout to
4609 * determine for how long the queue has not
4610 * received service, i.e., how much time has
4611 * elapsed since the queue expired. However,
4612 * this is a little imprecise, because
4613 * budget_timeout is set to jiffies if bfqq
4614 * not only expires, but also remains with no
4615 * request.
4616 */
4617 if (time_after(bfqq->budget_timeout,
4618 bfqq->last_wr_start_finish))
4619 bfqq->last_wr_start_finish +=
4620 jiffies - bfqq->budget_timeout;
4621 else
4622 bfqq->last_wr_start_finish = jiffies;
4623 }
4624
44e44a1b 4625 bfq_set_budget_timeout(bfqd, bfqq);
aee69d78
PV
4626 bfq_log_bfqq(bfqd, bfqq,
4627 "set_in_service_queue, cur-budget = %d",
4628 bfqq->entity.budget);
4629 }
4630
4631 bfqd->in_service_queue = bfqq;
4632}
4633
4634/*
4635 * Get and set a new queue for service.
4636 */
4637static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
4638{
4639 struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
4640
4641 __bfq_set_in_service_queue(bfqd, bfqq);
4642 return bfqq;
4643}
4644
aee69d78
PV
4645static void bfq_arm_slice_timer(struct bfq_data *bfqd)
4646{
4647 struct bfq_queue *bfqq = bfqd->in_service_queue;
4648 struct bfq_io_cq *bic;
4649 u32 sl;
4650
4651 /* Processes have exited, don't wait. */
4652 bic = bfqd->in_service_bic;
4653 if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
4654 return;
4655
4656 bfq_mark_bfqq_wait_request(bfqq);
4657
4658 /*
4659 * We don't want to idle for seeks, but we do want to allow
4660 * fair distribution of slice time for a process doing back-to-back
4661	 * seeks. So allow a little bit of time for it to submit a new rq.
4662 */
4663 sl = bfqd->bfq_slice_idle;
4664 /*
44e44a1b
PV
4665 * Unless the queue is being weight-raised, grant only minimum
4666 * idle time if the queue is seeky. A long idling is preserved
4667 * for a weight-raised queue, because it is needed for
4668 * guaranteeing to the queue its reserved share of the
4669 * throughput.
aee69d78 4670 */
44e44a1b 4671 if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1)
aee69d78
PV
4672 sl = min_t(u64, sl, BFQ_MIN_TT);
4673
4674 bfqd->last_idling_start = ktime_get();
4675 hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
4676 HRTIMER_MODE_REL);
e21b7a0b 4677 bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
aee69d78
PV
4678}
4679
ab0e43e9
PV
4680/*
4681 * In autotuning mode, max_budget is dynamically recomputed as the
4682 * amount of sectors transferred in timeout at the estimated peak
4683 * rate. This enables BFQ to utilize a full timeslice with a full
4684 * budget, even if the in-service queue is served at peak rate. And
4685 * this maximises throughput with sequential workloads.
4686 */
4687static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
4688{
4689 return (u64)bfqd->peak_rate * USEC_PER_MSEC *
4690 jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
4691}
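/*
 * Sketch of the autotuned budget above (illustration only, not used by
 * BFQ): the result is the number of sectors the device is expected to
 * transfer, at the estimated peak rate, within one full budget timeout.
 * peak_rate is in the internal fixed-point units used throughout this
 * file (sectors per usec, left-shifted by BFQ_RATE_SHIFT).
 */
static inline unsigned long
bfq_example_max_budget(u64 peak_rate, unsigned int timeout_msecs)
{
	return (peak_rate * USEC_PER_MSEC * timeout_msecs) >> BFQ_RATE_SHIFT;
}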
4692
44e44a1b
PV
4693/*
4694 * Update parameters related to throughput and responsiveness, as a
4695 * function of the estimated peak rate. See comments on
4696 * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
4697 */
4698static void update_thr_responsiveness_params(struct bfq_data *bfqd)
4699{
4700 int dev_type = blk_queue_nonrot(bfqd->queue);
4701
4702 if (bfqd->bfq_user_max_budget == 0)
4703 bfqd->bfq_max_budget =
4704 bfq_calc_max_budget(bfqd);
4705
4706 if (bfqd->device_speed == BFQ_BFQD_FAST &&
4707 bfqd->peak_rate < device_speed_thresh[dev_type]) {
4708 bfqd->device_speed = BFQ_BFQD_SLOW;
4709 bfqd->RT_prod = R_slow[dev_type] *
4710 T_slow[dev_type];
4711 } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
4712 bfqd->peak_rate > device_speed_thresh[dev_type]) {
4713 bfqd->device_speed = BFQ_BFQD_FAST;
4714 bfqd->RT_prod = R_fast[dev_type] *
4715 T_fast[dev_type];
4716 }
4717
4718 bfq_log(bfqd,
4719"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec",
4720 dev_type == 0 ? "ROT" : "NONROT",
4721 bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
4722 bfqd->device_speed == BFQ_BFQD_FAST ?
4723 (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
4724 (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
4725 (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
4726 BFQ_RATE_SHIFT);
4727}
4728
ab0e43e9
PV
4729static void bfq_reset_rate_computation(struct bfq_data *bfqd,
4730 struct request *rq)
4731{
4732 if (rq != NULL) { /* new rq dispatch now, reset accordingly */
4733 bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
4734 bfqd->peak_rate_samples = 1;
4735 bfqd->sequential_samples = 0;
4736 bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
4737 blk_rq_sectors(rq);
4738 } else /* no new rq dispatched, just reset the number of samples */
4739 bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
4740
4741 bfq_log(bfqd,
4742 "reset_rate_computation at end, sample %u/%u tot_sects %llu",
4743 bfqd->peak_rate_samples, bfqd->sequential_samples,
4744 bfqd->tot_sectors_dispatched);
4745}
4746
4747static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
4748{
4749 u32 rate, weight, divisor;
4750
4751 /*
4752 * For the convergence property to hold (see comments on
4753 * bfq_update_peak_rate()) and for the assessment to be
4754 * reliable, a minimum number of samples must be present, and
4755 * a minimum amount of time must have elapsed. If not so, do
4756 * not compute new rate. Just reset parameters, to get ready
4757 * for a new evaluation attempt.
4758 */
4759 if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
4760 bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
4761 goto reset_computation;
4762
4763 /*
4764 * If a new request completion has occurred after last
4765 * dispatch, then, to approximate the rate at which requests
4766 * have been served by the device, it is more precise to
4767 * extend the observation interval to the last completion.
4768 */
4769 bfqd->delta_from_first =
4770 max_t(u64, bfqd->delta_from_first,
4771 bfqd->last_completion - bfqd->first_dispatch);
4772
4773 /*
4774 * Rate computed in sects/usec, and not sects/nsec, for
4775 * precision issues.
4776 */
4777 rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
4778 div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
4779
4780 /*
4781 * Peak rate not updated if:
4782 * - the percentage of sequential dispatches is below 3/4 of the
4783 * total, and rate is below the current estimated peak rate
4784 * - rate is unreasonably high (> 20M sectors/sec)
4785 */
4786 if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
4787 rate <= bfqd->peak_rate) ||
4788 rate > 20<<BFQ_RATE_SHIFT)
4789 goto reset_computation;
4790
4791 /*
4792 * We have to update the peak rate, at last! To this purpose,
4793 * we use a low-pass filter. We compute the smoothing constant
4794 * of the filter as a function of the 'weight' of the new
4795 * measured rate.
4796 *
4797	 * As can be seen in the next formulas, we define this weight as a
4798 * quantity proportional to how sequential the workload is,
4799 * and to how long the observation time interval is.
4800 *
4801 * The weight runs from 0 to 8. The maximum value of the
4802 * weight, 8, yields the minimum value for the smoothing
4803 * constant. At this minimum value for the smoothing constant,
4804 * the measured rate contributes for half of the next value of
4805 * the estimated peak rate.
4806 *
4807 * So, the first step is to compute the weight as a function
4808 * of how sequential the workload is. Note that the weight
4809 * cannot reach 9, because bfqd->sequential_samples cannot
4810	 * become equal to bfqd->peak_rate_samples, which, in
4811	 * turn, holds true because bfqd->sequential_samples is not
4812 * incremented for the first sample.
4813 */
4814 weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
4815
4816 /*
4817 * Second step: further refine the weight as a function of the
4818 * duration of the observation interval.
4819 */
4820 weight = min_t(u32, 8,
4821 div_u64(weight * bfqd->delta_from_first,
4822 BFQ_RATE_REF_INTERVAL));
4823
4824 /*
4825 * Divisor ranging from 10, for minimum weight, to 2, for
4826 * maximum weight.
4827 */
4828 divisor = 10 - weight;
4829
4830 /*
4831 * Finally, update peak rate:
4832 *
4833 * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
4834 */
4835 bfqd->peak_rate *= divisor-1;
4836 bfqd->peak_rate /= divisor;
4837 rate /= divisor; /* smoothing constant alpha = 1/divisor */
4838
4839 bfqd->peak_rate += rate;
44e44a1b 4840 update_thr_responsiveness_params(bfqd);
ab0e43e9
PV
4841
4842reset_computation:
4843 bfq_reset_rate_computation(bfqd, rq);
4844}
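/*
 * The update above is an exponentially weighted moving average; modulo
 * integer rounding, it amounts to the one-liner below (illustrative
 * helper, not used by BFQ), with divisor ranging from 2 (trust the new
 * sample heavily) to 10 (trust it very little) depending on how
 * sequential, and how long, the observation interval was.
 */
static inline u32 bfq_example_ewma(u32 old_rate, u32 new_rate, u32 divisor)
{
	return (old_rate * (divisor - 1) + new_rate) / divisor;
}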
4845
4846/*
4847 * Update the read/write peak rate (the main quantity used for
4848 * auto-tuning, see update_thr_responsiveness_params()).
4849 *
4850 * It is not trivial to estimate the peak rate (correctly): because of
4851 * the presence of sw and hw queues between the scheduler and the
4852 * device components that finally serve I/O requests, it is hard to
4853 * say exactly when a given dispatched request is served inside the
4854 * device, and for how long. As a consequence, it is hard to know
4855 * precisely at what rate a given set of requests is actually served
4856 * by the device.
4857 *
4858 * On the opposite end, the dispatch time of any request is trivially
4859 * available, and, from this piece of information, the "dispatch rate"
4860 * of requests can be immediately computed. So, the idea in the next
4861 * function is to use what is known, namely request dispatch times
4862 * (plus, when useful, request completion times), to estimate what is
4863 * unknown, namely in-device request service rate.
4864 *
4865 * The main issue is that, because of the above facts, the rate at
4866 * which a certain set of requests is dispatched over a certain time
4867 * interval can vary greatly with respect to the rate at which the
4868 * same requests are then served. But, since the size of any
4869 * intermediate queue is limited, and the service scheme is lossless
4870 * (no request is silently dropped), the following obvious convergence
4871 * property holds: the number of requests dispatched MUST become
4872 * closer and closer to the number of requests completed as the
4873 * observation interval grows. This is the key property used in
4874 * the next function to estimate the peak service rate as a function
4875 * of the observed dispatch rate. The function is assumed to be invoked
4876 * on every request dispatch.
4877 */
4878static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
4879{
4880 u64 now_ns = ktime_get_ns();
4881
4882 if (bfqd->peak_rate_samples == 0) { /* first dispatch */
4883 bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
4884 bfqd->peak_rate_samples);
4885 bfq_reset_rate_computation(bfqd, rq);
4886 goto update_last_values; /* will add one sample */
4887 }
4888
4889 /*
4890 * Device idle for very long: the observation interval lasting
4891 * up to this dispatch cannot be a valid observation interval
4892 * for computing a new peak rate (similarly to the late-
4893 * completion event in bfq_completed_request()). Go to
4894 * update_rate_and_reset to have the following three steps
4895 * taken:
4896 * - close the observation interval at the last (previous)
4897 * request dispatch or completion
4898 * - compute rate, if possible, for that observation interval
4899 * - start a new observation interval with this dispatch
4900 */
4901 if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
4902 bfqd->rq_in_driver == 0)
4903 goto update_rate_and_reset;
4904
4905 /* Update sampling information */
4906 bfqd->peak_rate_samples++;
4907
4908 if ((bfqd->rq_in_driver > 0 ||
4909 now_ns - bfqd->last_completion < BFQ_MIN_TT)
4910 && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
4911 bfqd->sequential_samples++;
4912
4913 bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
4914
4915 /* Reset max observed rq size every 32 dispatches */
4916 if (likely(bfqd->peak_rate_samples % 32))
4917 bfqd->last_rq_max_size =
4918 max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
4919 else
4920 bfqd->last_rq_max_size = blk_rq_sectors(rq);
4921
4922 bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
4923
4924 /* Target observation interval not yet reached, go on sampling */
4925 if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
4926 goto update_last_values;
4927
4928update_rate_and_reset:
4929 bfq_update_rate_reset(bfqd, rq);
4930update_last_values:
4931 bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
4932 bfqd->last_dispatch = now_ns;
4933}
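
/*
 * Summing up the control flow above: the very first dispatch just
 * (re)starts an observation interval; a dispatch arriving after a
 * long idle period, or one that makes the interval exceed
 * BFQ_RATE_REF_INTERVAL, closes the current interval, possibly
 * updates the peak rate and starts a new interval; any other dispatch
 * simply contributes one more sample, counted as sequential if it is
 * close enough to the last recorded position.
 */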
4934
aee69d78
PV
4935/*
4936 * Remove request from internal lists.
4937 */
4938static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
4939{
4940 struct bfq_queue *bfqq = RQ_BFQQ(rq);
4941
4942 /*
4943 * For consistency, the next instruction should have been
4944 * executed after removing the request from the queue and
4945 * dispatching it. Instead, we execute this instruction before
4946 * bfq_remove_request() (and hence introduce a temporary
4947 * inconsistency), for efficiency. In fact, should this
4948 * dispatch occur for a non-in-service bfqq, this anticipated
4949 * increment prevents two counters related to bfqq->dispatched
4950 * from being first uselessly decremented and then
4951 * incremented again when the (new) value of bfqq->dispatched
4952 * happens to be taken into account.
4953 */
4954 bfqq->dispatched++;
ab0e43e9 4955 bfq_update_peak_rate(q->elevator->elevator_data, rq);
aee69d78
PV
4956
4957 bfq_remove_request(q, rq);
4958}
4959
4960static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
4961{
44e44a1b
PV
4962 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
4963 if (bfqq->dispatched == 0)
4964 /*
4965 * Overloading budget_timeout field to store
4966 * the time at which the queue remains with no
4967 * backlog and no outstanding request; used by
4968 * the weight-raising mechanism.
4969 */
4970 bfqq->budget_timeout = jiffies;
4971
e21b7a0b 4972 bfq_del_bfqq_busy(bfqd, bfqq, true);
44e44a1b 4973 } else
e21b7a0b
AA
4974 bfq_requeue_bfqq(bfqd, bfqq);
4975
4976 /*
4977 * All in-service entities must have been properly deactivated
4978 * or requeued before executing the next function, which
4979 * resets all in-service entities as no longer in service.
4980 */
4981 __bfq_bfqd_reset_in_service(bfqd);
aee69d78
PV
4982}
4983
4984/**
4985 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
4986 * @bfqd: device data.
4987 * @bfqq: queue to update.
4988 * @reason: reason for expiration.
4989 *
4990 * Handle the feedback on @bfqq budget at queue expiration.
4991 * See the body for detailed comments.
4992 */
4993static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
4994 struct bfq_queue *bfqq,
4995 enum bfqq_expiration reason)
4996{
4997 struct request *next_rq;
4998 int budget, min_budget;
4999
aee69d78
PV
5000 min_budget = bfq_min_budget(bfqd);
5001
44e44a1b
PV
5002 if (bfqq->wr_coeff == 1)
5003 budget = bfqq->max_budget;
5004 else /*
5005 * Use a constant, low budget for weight-raised queues,
5006 * to help achieve a low latency. Keep it slightly higher
5007 * than the minimum possible budget, to cause slightly
5008 * fewer expirations.
5009 */
5010 budget = 2 * min_budget;
5011
aee69d78
PV
5012 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
5013 bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
5014 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
5015 budget, bfq_min_budget(bfqd));
5016 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
5017 bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
5018
44e44a1b 5019 if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
aee69d78
PV
5020 switch (reason) {
5021 /*
5022 * Caveat: in all the following cases we trade latency
5023 * for throughput.
5024 */
5025 case BFQQE_TOO_IDLE:
54b60456
PV
5026 /*
5027 * This is the only case where we may reduce
5028 * the budget: if there is no request of the
5029 * process still waiting for completion, then
5030 * we assume (tentatively) that the timer has
5031 * expired because the batch of requests of
5032 * the process could have been served with a
5033 * smaller budget. Hence, betting that the
5034 * process will behave in the same way when it
5035 * becomes backlogged again, we reduce its
5036 * next budget. As long as we guess right,
5037 * this budget cut reduces the latency
5038 * experienced by the process.
5039 *
5040 * However, if there are still outstanding
5041 * requests, then the process may have not yet
5042 * issued its next request just because it is
5043 * still waiting for the completion of some of
5044 * the still outstanding ones. So in this
5045 * subcase we do not reduce its budget, on the
5046 * contrary we increase it to possibly boost
5047 * the throughput, as discussed in the
5048 * comments to the BUDGET_TIMEOUT case.
5049 */
5050 if (bfqq->dispatched > 0) /* still outstanding reqs */
5051 budget = min(budget * 2, bfqd->bfq_max_budget);
5052 else {
5053 if (budget > 5 * min_budget)
5054 budget -= 4 * min_budget;
5055 else
5056 budget = min_budget;
5057 }
aee69d78
PV
5058 break;
5059 case BFQQE_BUDGET_TIMEOUT:
54b60456
PV
5060 /*
5061 * We double the budget here because it gives
5062 * the chance to boost the throughput if this
5063 * is not a seeky process (and has bumped into
5064 * this timeout because of, e.g., ZBR).
5065 */
5066 budget = min(budget * 2, bfqd->bfq_max_budget);
aee69d78
PV
5067 break;
5068 case BFQQE_BUDGET_EXHAUSTED:
5069 /*
5070 * The process still has backlog, and did not
5071 * let either the budget timeout or the disk
5072 * idling timeout expire. Hence it is not
5073 * seeky, has a short thinktime and may be
5074 * happy with a higher budget too. So
5075 * definitely increase the budget of this good
5076 * candidate to boost the disk throughput.
5077 */
54b60456 5078 budget = min(budget * 4, bfqd->bfq_max_budget);
aee69d78
PV
5079 break;
5080 case BFQQE_NO_MORE_REQUESTS:
5081 /*
5082 * For queues that expire for this reason, it
5083 * is particularly important to keep the
5084 * budget close to the actual service they
5085 * need. Doing so reduces the timestamp
5086 * misalignment problem described in the
5087 * comments in the body of
5088 * __bfq_activate_entity. In fact, suppose
5089 * that a queue systematically expires for
5090 * BFQQE_NO_MORE_REQUESTS and presents a
5091 * new request in time to enjoy timestamp
5092 * back-shifting. The larger the budget of the
5093 * queue is with respect to the service the
5094 * queue actually requests in each service
5095 * slot, the more times the queue can be
5096 * reactivated with the same virtual finish
5097 * time. It follows that, even if this finish
5098 * time is pushed to the system virtual time
5099 * to reduce the consequent timestamp
5100 * misalignment, the queue unjustly enjoys for
5101 * many re-activations a lower finish time
5102 * than all newly activated queues.
5103 *
5104 * The service needed by bfqq is measured
5105 * quite precisely by bfqq->entity.service.
5106 * Since bfqq does not enjoy device idling,
5107 * bfqq->entity.service is equal to the number
5108 * of sectors that the process associated with
5109 * bfqq requested to read/write before waiting
5110 * for request completions, or blocking for
5111 * other reasons.
5112 */
5113 budget = max_t(int, bfqq->entity.service, min_budget);
5114 break;
5115 default:
5116 return;
5117 }
44e44a1b 5118 } else if (!bfq_bfqq_sync(bfqq)) {
aee69d78
PV
5119 /*
5120 * Async queues get always the maximum possible
5121 * budget, as for them we do not care about latency
5122 * (in addition, their ability to dispatch is limited
5123 * by the charging factor).
5124 */
5125 budget = bfqd->bfq_max_budget;
5126 }
5127
5128 bfqq->max_budget = budget;
5129
5130 if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
5131 !bfqd->bfq_user_max_budget)
5132 bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
5133
5134 /*
5135 * If there is still backlog, then assign a new budget, making
5136 * sure that it is large enough for the next request. Since
5137 * the finish time of bfqq must be kept in sync with the
5138 * budget, be sure to call __bfq_bfqq_expire() *after* this
5139 * update.
5140 *
5141 * If there is no backlog, then no need to update the budget;
5142 * it will be updated on the arrival of a new request.
5143 */
5144 next_rq = bfqq->next_rq;
5145 if (next_rq)
5146 bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
5147 bfq_serv_to_charge(next_rq, bfqq));
5148
5149 bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
5150 next_rq ? blk_rq_sectors(next_rq) : 0,
5151 bfqq->entity.budget);
5152}
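
/*
 * Condensed view of the feedback above, for sync, non-weight-raised
 * queues (weight-raised queues always get 2 * min_budget, async
 * queues always get bfq_max_budget):
 *   TOO_IDLE, requests still in flight: budget = min(2 * budget, max)
 *   TOO_IDLE, no requests in flight:    budget -= 4 * min_budget,
 *                                       floored at min_budget
 *   BUDGET_TIMEOUT:                     budget = min(2 * budget, max)
 *   BUDGET_EXHAUSTED:                   budget = min(4 * budget, max)
 *   NO_MORE_REQUESTS:                   budget = max(entity.service,
 *                                                    min_budget)
 */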
5153
aee69d78 5154/*
ab0e43e9
PV
5155 * Return true if the process associated with bfqq is "slow". The slow
5156 * flag is used, in addition to the budget timeout, to reduce the
5157 * amount of service provided to seeky processes, and thus reduce
5158 * their chances to lower the throughput. More details in the comments
5159 * on the function bfq_bfqq_expire().
5160 *
5161 * An important observation is in order: as discussed in the comments
5162 * on the function bfq_update_peak_rate(), with devices with internal
5163 * queues, it is hard, if possible at all, to know when and for how long
5164 * an I/O request is processed by the device (apart from the trivial
5165 * I/O pattern where a new request is dispatched only after the
5166 * previous one has been completed). This makes it hard to evaluate
5167 * the real rate at which the I/O requests of each bfq_queue are
5168 * served. In fact, for an I/O scheduler like BFQ, serving a
5169 * bfq_queue means just dispatching its requests during its service
5170 * slot (i.e., until the budget of the queue is exhausted, or the
5171 * queue remains idle, or, finally, a timeout fires). But, during the
5172 * service slot of a bfq_queue, around 100 ms at most, the device may
5173 * well still be processing requests of bfq_queues served in previous
5174 * service slots. On the opposite end, the requests of the in-service
5175 * bfq_queue may be completed after the service slot of the queue
5176 * finishes.
5177 *
5178 * Anyway, unless more sophisticated solutions are used
5179 * (where possible), the sum of the sizes of the requests dispatched
5180 * during the service slot of a bfq_queue is probably the only
5181 * approximation available for the service received by the bfq_queue
5182 * during its service slot. And this sum is the quantity used in this
5183 * function to evaluate the I/O speed of a process.
aee69d78 5184 */
ab0e43e9
PV
5185static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5186 bool compensate, enum bfqq_expiration reason,
5187 unsigned long *delta_ms)
aee69d78 5188{
ab0e43e9
PV
5189 ktime_t delta_ktime;
5190 u32 delta_usecs;
5191 bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */
aee69d78 5192
ab0e43e9 5193 if (!bfq_bfqq_sync(bfqq))
aee69d78
PV
5194 return false;
5195
5196 if (compensate)
ab0e43e9 5197 delta_ktime = bfqd->last_idling_start;
aee69d78 5198 else
ab0e43e9
PV
5199 delta_ktime = ktime_get();
5200 delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
5201 delta_usecs = ktime_to_us(delta_ktime);
aee69d78
PV
5202
5203 /* don't use too short time intervals */
ab0e43e9
PV
5204 if (delta_usecs < 1000) {
5205 if (blk_queue_nonrot(bfqd->queue))
5206 /*
5207 * give same worst-case guarantees as idling
5208 * for seeky
5209 */
5210 *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
5211 else /* charge at least one seek */
5212 *delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
5213
5214 return slow;
5215 }
aee69d78 5216
ab0e43e9 5217 *delta_ms = delta_usecs / USEC_PER_MSEC;
aee69d78
PV
5218
5219 /*
ab0e43e9
PV
5220 * Use only long (> 20ms) intervals to filter out excessive
5221 * spikes in service rate estimation.
aee69d78 5222 */
ab0e43e9
PV
5223 if (delta_usecs > 20000) {
5224 /*
5225 * Caveat for rotational devices: processes doing I/O
5226 * in the slower disk zones tend to be slow(er) even
5227 * if not seeky. In this respect, the estimated peak
5228 * rate is likely to be an average over the disk
5229 * surface. Accordingly, to not be too harsh with
5230 * unlucky processes, a process is deemed slow only if
5231 * its rate has been lower than half of the estimated
5232 * peak rate.
5233 */
5234 slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
aee69d78
PV
5235 }
5236
ab0e43e9 5237 bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
aee69d78 5238
ab0e43e9 5239 return slow;
aee69d78
PV
5240}
5241
77b7dcea
PV
5242/*
5243 * To be deemed as soft real-time, an application must meet two
5244 * requirements. First, the application must not require an average
5245 * bandwidth higher than the approximate bandwidth required to play back or
5246 * record a compressed high-definition video.
5247 * The next function is invoked on the completion of the last request of a
5248 * batch, to compute the next-start time instant, soft_rt_next_start, such
5249 * that, if the next request of the application does not arrive before
5250 * soft_rt_next_start, then the above requirement on the bandwidth is met.
5251 *
5252 * The second requirement is that the request pattern of the application is
5253 * isochronous, i.e., that, after issuing a request or a batch of requests,
5254 * the application stops issuing new requests until all its pending requests
5255 * have been completed. After that, the application may issue a new batch,
5256 * and so on.
5257 * For this reason the next function is invoked to compute
5258 * soft_rt_next_start only for applications that meet this requirement,
5259 * whereas soft_rt_next_start is set to infinity for applications that do
5260 * not.
5261 *
5262 * Unfortunately, even a greedy application may happen to behave in an
5263 * isochronous way if the CPU load is high. In fact, the application may
5264 * stop issuing requests while the CPUs are busy serving other processes,
5265 * then restart, then stop again for a while, and so on. In addition, if
5266 * the disk achieves a low enough throughput with the request pattern
5267 * issued by the application (e.g., because the request pattern is random
5268 * and/or the device is slow), then the application may meet the above
5269 * bandwidth requirement too. To prevent such a greedy application from being
5270 * deemed as soft real-time, a further rule is used in the computation of
5271 * soft_rt_next_start: soft_rt_next_start must be higher than the current
5272 * time plus the maximum time for which the arrival of a request is waited
5273 * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
5274 * This filters out greedy applications, as the latter issue instead their
5275 * next request as soon as possible after the last one has been completed
5276 * (in contrast, when a batch of requests is completed, a soft real-time
5277 * application spends some time processing data).
5278 *
5279 * Unfortunately, the last filter may easily generate false positives if
5280 * only bfqd->bfq_slice_idle is used as a reference time interval and one
5281 * or both the following cases occur:
5282 * 1) HZ is so low that the duration of a jiffy is comparable to or higher
5283 * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
5284 * HZ=100.
5285 * 2) jiffies, instead of increasing at a constant rate, may stop increasing
5286 * for a while, then suddenly 'jump' by several units to recover the lost
5287 * increments. This seems to happen, e.g., inside virtual machines.
5288 * To address this issue, we do not use as a reference time interval just
5289 * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
5290 * particular we add the minimum number of jiffies for which the filter
5291 * seems to be quite precise even on embedded systems and in KVM/QEMU virtual
5292 * machines.
5293 */
5294static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
5295 struct bfq_queue *bfqq)
5296{
5297 return max(bfqq->last_idle_bklogged +
5298 HZ * bfqq->service_from_backlogged /
5299 bfqd->bfq_wr_max_softrt_rate,
5300 jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
5301}
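
/*
 * Worked example for the formula above (figures chosen only for
 * illustration): with HZ=250, service_from_backlogged=2048 sectors
 * and bfq_wr_max_softrt_rate=7000 sectors/sec, the first argument of
 * max() is last_idle_bklogged + 250 * 2048 / 7000, i.e., about 73
 * jiffies (~290 ms) after the queue last became idle and empty. The
 * batch just completed is then compatible with soft real-time
 * operation only if the next request arrives after that instant,
 * i.e., only if the queue consumed bandwidth at no more than roughly
 * 7000 sectors/sec since last_idle_bklogged.
 */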
5302
5303/*
5304 * Return the farthest future time instant according to jiffies
5305 * macros.
5306 */
5307static unsigned long bfq_greatest_from_now(void)
5308{
5309 return jiffies + MAX_JIFFY_OFFSET;
5310}
5311
aee69d78
PV
5312/*
5313 * Return the farthest past time instant according to jiffies
5314 * macros.
5315 */
5316static unsigned long bfq_smallest_from_now(void)
5317{
5318 return jiffies - MAX_JIFFY_OFFSET;
5319}
5320
5321/**
5322 * bfq_bfqq_expire - expire a queue.
5323 * @bfqd: device owning the queue.
5324 * @bfqq: the queue to expire.
5325 * @compensate: if true, compensate for the time spent idling.
5326 * @reason: the reason causing the expiration.
5327 *
c074170e
PV
5328 * If the process associated with bfqq does slow I/O (e.g., because it
5329 * issues random requests), we charge bfqq with the time it has been
5330 * in service instead of the service it has received (see
5331 * bfq_bfqq_charge_time for details on how this goal is achieved). As
5332 * a consequence, bfqq will typically get higher timestamps upon
5333 * reactivation, and hence it will be rescheduled as if it had
5334 * received more service than what it has actually received. In the
5335 * end, bfqq receives less service in proportion to how slowly its
5336 * associated process consumes its budgets (and hence how seriously it
5337 * tends to lower the throughput). In addition, this time-charging
5338 * strategy guarantees time fairness among slow processes. In
5339 * contrast, if the process associated with bfqq is not slow, we
5340 * charge bfqq exactly with the service it has received.
aee69d78 5341 *
c074170e
PV
5342 * Charging time to the first type of queues and the exact service to
5343 * the other has the effect of using the WF2Q+ policy to schedule the
5344 * former on a timeslice basis, without violating service domain
5345 * guarantees among the latter.
aee69d78
PV
5346 */
5347static void bfq_bfqq_expire(struct bfq_data *bfqd,
5348 struct bfq_queue *bfqq,
5349 bool compensate,
5350 enum bfqq_expiration reason)
5351{
5352 bool slow;
ab0e43e9
PV
5353 unsigned long delta = 0;
5354 struct bfq_entity *entity = &bfqq->entity;
aee69d78
PV
5355 int ref;
5356
5357 /*
ab0e43e9 5358 * Check whether the process is slow (see bfq_bfqq_is_slow).
aee69d78 5359 */
ab0e43e9 5360 slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
aee69d78 5361
77b7dcea
PV
5362 /*
5363 * Increase service_from_backlogged before next statement,
5364 * because the possible next invocation of
5365 * bfq_bfqq_charge_time would likely inflate
5366 * entity->service. In contrast, service_from_backlogged must
5367 * contain real service, to enable the soft real-time
5368 * heuristic to correctly compute the bandwidth consumed by
5369 * bfqq.
5370 */
5371 bfqq->service_from_backlogged += entity->service;
5372
aee69d78 5373 /*
c074170e
PV
5374 * As above explained, charge slow (typically seeky) and
5375 * timed-out queues with the time and not the service
5376 * received, to favor sequential workloads.
5377 *
5378 * Processes doing I/O in the slower disk zones will tend to
5379 * be slow(er) even if not seeky. Therefore, since the
5380 * estimated peak rate is actually an average over the disk
5381 * surface, these processes may time out just for bad luck. To
5382 * avoid punishing them, do not charge time to processes that
5383 * succeeded in consuming at least 2/3 of their budget. This
5384 * allows BFQ to preserve enough elasticity to still perform
5385 * bandwidth, and not time, distribution with processes that are
5386 * only slightly unlucky or quasi-sequential.
aee69d78 5387 */
44e44a1b
PV
5388 if (bfqq->wr_coeff == 1 &&
5389 (slow ||
5390 (reason == BFQQE_BUDGET_TIMEOUT &&
5391 bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
c074170e 5392 bfq_bfqq_charge_time(bfqd, bfqq, delta);
aee69d78
PV
5393
5394 if (reason == BFQQE_TOO_IDLE &&
ab0e43e9 5395 entity->service <= 2 * entity->budget / 10)
aee69d78
PV
5396 bfq_clear_bfqq_IO_bound(bfqq);
5397
44e44a1b
PV
5398 if (bfqd->low_latency && bfqq->wr_coeff == 1)
5399 bfqq->last_wr_start_finish = jiffies;
5400
77b7dcea
PV
5401 if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
5402 RB_EMPTY_ROOT(&bfqq->sort_list)) {
5403 /*
5404 * If we get here, and there are no outstanding
5405 * requests, then the request pattern is isochronous
5406 * (see the comments on the function
5407 * bfq_bfqq_softrt_next_start()). Thus we can compute
5408 * soft_rt_next_start. If, instead, the queue still
5409 * has outstanding requests, then we have to wait for
5410 * the completion of all the outstanding requests to
5411 * discover whether the request pattern is actually
5412 * isochronous.
5413 */
5414 if (bfqq->dispatched == 0)
5415 bfqq->soft_rt_next_start =
5416 bfq_bfqq_softrt_next_start(bfqd, bfqq);
5417 else {
5418 /*
5419 * The application is still waiting for the
5420 * completion of one or more requests:
5421 * prevent it from possibly being incorrectly
5422 * deemed as soft real-time by setting its
5423 * soft_rt_next_start to infinity. In fact,
5424 * without this assignment, the application
5425 * would be incorrectly deemed as soft
5426 * real-time if:
5427 * 1) it issued a new request before the
5428 * completion of all its in-flight
5429 * requests, and
5430 * 2) at that time, its soft_rt_next_start
5431 * happened to be in the past.
5432 */
5433 bfqq->soft_rt_next_start =
5434 bfq_greatest_from_now();
5435 /*
5436 * Schedule an update of soft_rt_next_start to when
5437 * the task may be discovered to be isochronous.
5438 */
5439 bfq_mark_bfqq_softrt_update(bfqq);
5440 }
5441 }
5442
aee69d78
PV
5443 bfq_log_bfqq(bfqd, bfqq,
5444 "expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
5445 slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
5446
5447 /*
5448 * Increase, decrease or leave budget unchanged according to
5449 * reason.
5450 */
5451 __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
5452 ref = bfqq->ref;
5453 __bfq_bfqq_expire(bfqd, bfqq);
5454
5455 /* mark bfqq as waiting a request only if a bic still points to it */
5456 if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
5457 reason != BFQQE_BUDGET_TIMEOUT &&
5458 reason != BFQQE_BUDGET_EXHAUSTED)
5459 bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
5460}
5461
5462/*
5463 * Budget timeout is not implemented through a dedicated timer, but
5464 * just checked on request arrivals and completions, as well as on
5465 * idle timer expirations.
5466 */
5467static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
5468{
44e44a1b 5469 return time_is_before_eq_jiffies(bfqq->budget_timeout);
aee69d78
PV
5470}
5471
5472/*
5473 * If we expire a queue that is actively waiting (i.e., with the
5474 * device idled) for the arrival of a new request, then we may incur
5475 * the timestamp misalignment problem described in the body of the
5476 * function __bfq_activate_entity. Hence we return true only if this
5477 * condition does not hold, or if the queue is slow enough to deserve
5478 * only to be kicked off for preserving a high throughput.
5479 */
5480static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
5481{
5482 bfq_log_bfqq(bfqq->bfqd, bfqq,
5483 "may_budget_timeout: wait_request %d left %d timeout %d",
5484 bfq_bfqq_wait_request(bfqq),
5485 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
5486 bfq_bfqq_budget_timeout(bfqq));
5487
5488 return (!bfq_bfqq_wait_request(bfqq) ||
5489 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
5490 &&
5491 bfq_bfqq_budget_timeout(bfqq);
5492}
5493
5494/*
5495 * For a queue that becomes empty, device idling is allowed only if
44e44a1b
PV
5496 * this function returns true for the queue. As a consequence, since
5497 * device idling plays a critical role in both throughput boosting and
5498 * service guarantees, the return value of this function plays a
5499 * critical role in both these aspects as well.
5500 *
5501 * In a nutshell, this function returns true only if idling is
5502 * beneficial for throughput or, even if detrimental for throughput,
5503 * is nevertheless necessary to preserve service guarantees (low
5504 * latency, desired throughput distribution, ...). In particular, on
5505 * NCQ-capable devices, this function tries to return false, so as to
5506 * help keep the drives' internal queues full, whenever this helps the
5507 * device boost the throughput without causing any service-guarantee
5508 * issue.
5509 *
5510 * In more detail, the return value of this function is obtained by,
5511 * first, computing a number of boolean variables that take into
5512 * account throughput and service-guarantee issues, and, then,
5513 * combining these variables in a logical expression. Most of the
5514 * issues taken into account are not trivial. We discuss these issues
5515 * individually while introducing the variables.
aee69d78
PV
5516 */
5517static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
5518{
5519 struct bfq_data *bfqd = bfqq->bfqd;
cfd69712
PV
5520 bool idling_boosts_thr, idling_boosts_thr_without_issues,
5521 asymmetric_scenario;
aee69d78
PV
5522
5523 if (bfqd->strict_guarantees)
5524 return true;
5525
5526 /*
44e44a1b
PV
5527 * The next variable takes into account the cases where idling
5528 * boosts the throughput.
5529 *
5530 * The value of the variable is computed considering that
aee69d78
PV
5531 * idling is usually beneficial for the throughput if:
5532 * (a) the device is not NCQ-capable, or
5533 * (b) regardless of the presence of NCQ, the request pattern
5534 * for bfqq is I/O-bound (possible throughput losses
5535 * caused by granting idling to seeky queues are mitigated
5536 * by the fact that, in all scenarios where boosting
5537 * throughput is the best thing to do, i.e., in all
5538 * symmetric scenarios, only a minimal idle time is
5539 * allowed to seeky queues).
5540 */
5541 idling_boosts_thr = !bfqd->hw_tag || bfq_bfqq_IO_bound(bfqq);
5542
cfd69712
PV
5543 /*
5544 * The value of the next variable,
5545 * idling_boosts_thr_without_issues, is equal to that of
5546 * idling_boosts_thr, unless a special case holds. In this
5547 * special case, described below, idling may cause problems to
5548 * weight-raised queues.
5549 *
5550 * When the request pool is saturated (e.g., in the presence
5551 * of write hogs), if the processes associated with
5552 * non-weight-raised queues ask for requests at a lower rate,
5553 * then processes associated with weight-raised queues have a
5554 * higher probability to get a request from the pool
5555 * immediately (or at least soon) when they need one. Thus
5556 * they have a higher probability to actually get a fraction
5557 * of the device throughput proportional to their high
5558 * weight. This is especially true with NCQ-capable drives,
5559 * which enqueue several requests in advance, and further
5560 * reorder internally-queued requests.
5561 *
5562 * For this reason, we force to false the value of
5563 * idling_boosts_thr_without_issues if there are weight-raised
5564 * busy queues. In this case, and if bfqq is not weight-raised,
5565 * this guarantees that the device is not idled for bfqq (if,
5566 * instead, bfqq is weight-raised, then idling will be
5567 * guaranteed by another variable, see below). Combined with
5568 * the timestamping rules of BFQ (see [1] for details), this
5569 * behavior causes bfqq, and hence any sync non-weight-raised
5570 * queue, to get a lower number of requests served, and thus
5571 * to ask for a lower number of requests from the request
5572 * pool, before the busy weight-raised queues get served
5573 * again. This often mitigates starvation problems in the
5574 * presence of heavy write workloads and NCQ, thereby
5575 * guaranteeing a higher application and system responsiveness
5576 * in these hostile scenarios.
5577 */
5578 idling_boosts_thr_without_issues = idling_boosts_thr &&
5579 bfqd->wr_busy_queues == 0;
5580
aee69d78 5581 /*
44e44a1b
PV
5582 * There is then a case where idling must be performed not for
5583 * throughput concerns, but to preserve service guarantees. To
5584 * introduce it, we can note that allowing the drive to
5585 * enqueue more than one request at a time, and hence
5586 * delegating de facto final scheduling decisions to the
5587 * drive's internal scheduler, causes loss of control on the
5588 * actual request service order. In particular, the critical
5589 * situation is when requests from different processes happen
5590 * to be present, at the same time, in the internal queue(s)
5591 * of the drive. In such a situation, the drive, by deciding
5592 * the service order of the internally-queued requests, also
5593 * determines the actual throughput distribution among
5594 * these processes. But the drive typically has no notion or
5595 * concern about per-process throughput distribution, and
5596 * makes its decisions only on a per-request basis. Therefore,
5597 * the service distribution enforced by the drive's internal
5598 * scheduler is likely to coincide with the desired
5599 * device-throughput distribution only in a completely
5600 * symmetric scenario where: (i) each of these processes must
5601 * get the same throughput as the others; (ii) all these
5602 * processes have the same I/O pattern (either sequential or
5603 * random). In fact, in such a scenario, the drive will tend
5604 * to treat the requests of each of these processes in about
5605 * the same way as the requests of the others, and thus to
5606 * provide each of these processes with about the same
5607 * throughput (which is exactly the desired throughput
5608 * distribution). In contrast, in any asymmetric scenario,
5609 * device idling is certainly needed to guarantee that bfqq
5610 * receives its assigned fraction of the device throughput
5611 * (see [1] for details).
5612 *
5613 * As for sub-condition (i), actually we check only whether
5614 * bfqq is being weight-raised. In fact, if bfqq is not being
5615 * weight-raised, we have that:
5616 * - if the process associated with bfqq is not I/O-bound, then
5617 * it is neither latency- nor throughput-critical; therefore
5618 * idling is not needed for bfqq;
5619 * - if the process associated with bfqq is I/O-bound, then
5620 * idling is already granted to bfqq (see the comments on
5621 * idling_boosts_thr).
5622 *
5623 * We do not check sub-condition (ii) at all, i.e., the next
5624 * variable is true if and only if bfqq is being
5625 * weight-raised. We do not need to control sub-condition (ii)
5626 * for the following reason:
5627 * - if bfqq is being weight-raised, then idling is already
5628 * guaranteed to bfqq by sub-condition (i);
5629 * - if bfqq is not being weight-raised, then idling is
5630 * already guaranteed to bfqq (only) if it matters, i.e., if
5631 * bfqq is associated with a currently I/O-bound process (see
5632 * the above comment on sub-condition (i)).
5633 *
5634 * As a side note, it is worth considering that the above
5635 * device-idling countermeasures may however fail in the
5636 * following unlucky scenario: if idling is (correctly)
5637 * disabled in a time period during which the symmetry
5638 * sub-condition holds, and hence the device is allowed to
5639 * enqueue many requests, but at some later point in time some
5640 * sub-condition ceases to hold, then it may become impossible
5641 * to let requests be served in the desired order until all
5642 * the requests already queued in the device have been served.
5643 */
5644 asymmetric_scenario = bfqq->wr_coeff > 1;
5645
5646 /*
5647 * We have now all the components we need to compute the return
5648 * value of the function, which is true only if both the following
5649 * conditions hold:
aee69d78 5650 * 1) bfqq is sync, because idling makes sense only for sync queues;
44e44a1b
PV
5651 * 2) idling either boosts the throughput (without issues), or
5652 * is necessary to preserve service guarantees.
aee69d78 5653 */
44e44a1b 5654 return bfq_bfqq_sync(bfqq) &&
cfd69712 5655 (idling_boosts_thr_without_issues || asymmetric_scenario);
aee69d78
PV
5656}
5657
5658/*
5659 * If the in-service queue is empty but the function bfq_bfqq_may_idle
5660 * returns true, then:
5661 * 1) the queue must remain in service and cannot be expired, and
5662 * 2) the device must be idled to wait for the possible arrival of a new
5663 * request for the queue.
5664 * See the comments on the function bfq_bfqq_may_idle for the reasons
5665 * why performing device idling is the best choice to boost the throughput
5666 * and preserve service guarantees when bfq_bfqq_may_idle itself
5667 * returns true.
5668 */
5669static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
5670{
5671 struct bfq_data *bfqd = bfqq->bfqd;
5672
5673 return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
5674 bfq_bfqq_may_idle(bfqq);
5675}
5676
5677/*
5678 * Select a queue for service. If we have a current queue in service,
5679 * check whether to continue servicing it, or retrieve and set a new one.
5680 */
5681static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
5682{
5683 struct bfq_queue *bfqq;
5684 struct request *next_rq;
5685 enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;
5686
5687 bfqq = bfqd->in_service_queue;
5688 if (!bfqq)
5689 goto new_queue;
5690
5691 bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
5692
5693 if (bfq_may_expire_for_budg_timeout(bfqq) &&
5694 !bfq_bfqq_wait_request(bfqq) &&
5695 !bfq_bfqq_must_idle(bfqq))
5696 goto expire;
5697
5698check_queue:
5699 /*
5700 * This loop is rarely executed more than once. Even when it
5701 * happens, it is much more convenient to re-execute this loop
5702 * than to return NULL and trigger a new dispatch to get a
5703 * request served.
5704 */
5705 next_rq = bfqq->next_rq;
5706 /*
5707 * If bfqq has requests queued and it has enough budget left to
5708 * serve them, keep the queue, otherwise expire it.
5709 */
5710 if (next_rq) {
5711 if (bfq_serv_to_charge(next_rq, bfqq) >
5712 bfq_bfqq_budget_left(bfqq)) {
5713 /*
5714 * Expire the queue for budget exhaustion,
5715 * which makes sure that the next budget is
5716 * enough to serve the next request, even if
5717 * it comes from the fifo expired path.
5718 */
5719 reason = BFQQE_BUDGET_EXHAUSTED;
5720 goto expire;
5721 } else {
5722 /*
5723 * The idle timer may be pending because we may
5724 * not disable disk idling even when a new request
5725 * arrives.
5726 */
5727 if (bfq_bfqq_wait_request(bfqq)) {
5728 /*
5729 * If we get here: 1) at least a new request
5730 * has arrived but we have not disabled the
5731 * timer because the request was too small,
5732 * 2) then the block layer has unplugged
5733 * the device, causing the dispatch to be
5734 * invoked.
5735 *
5736 * Since the device is unplugged, now the
5737 * requests are probably large enough to
5738 * provide a reasonable throughput.
5739 * So we disable idling.
5740 */
5741 bfq_clear_bfqq_wait_request(bfqq);
5742 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
e21b7a0b 5743 bfqg_stats_update_idle_time(bfqq_group(bfqq));
aee69d78
PV
5744 }
5745 goto keep_queue;
5746 }
5747 }
5748
5749 /*
5750 * No requests pending. However, if the in-service queue is idling
5751 * for a new request, or has requests waiting for a completion and
5752 * may idle after their completion, then keep it anyway.
5753 */
5754 if (bfq_bfqq_wait_request(bfqq) ||
5755 (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
5756 bfqq = NULL;
5757 goto keep_queue;
5758 }
5759
5760 reason = BFQQE_NO_MORE_REQUESTS;
5761expire:
5762 bfq_bfqq_expire(bfqd, bfqq, false, reason);
5763new_queue:
5764 bfqq = bfq_set_in_service_queue(bfqd);
5765 if (bfqq) {
5766 bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
5767 goto check_queue;
5768 }
5769keep_queue:
5770 if (bfqq)
5771 bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
5772 else
5773 bfq_log(bfqd, "select_queue: no queue returned");
5774
5775 return bfqq;
5776}
5777
44e44a1b
PV
5778static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5779{
5780 struct bfq_entity *entity = &bfqq->entity;
5781
5782 if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
5783 bfq_log_bfqq(bfqd, bfqq,
5784 "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
5785 jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
5786 jiffies_to_msecs(bfqq->wr_cur_max_time),
5787 bfqq->wr_coeff,
5788 bfqq->entity.weight, bfqq->entity.orig_weight);
5789
5790 if (entity->prio_changed)
5791 bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
5792
5793 /*
5794 * If too much time has elapsed from the beginning of
5795 * this weight-raising period, then end weight
5796 * raising.
5797 */
5798 if (time_is_before_jiffies(bfqq->last_wr_start_finish +
5799 bfqq->wr_cur_max_time)) {
77b7dcea
PV
5800 if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
5801 time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
5802 bfq_wr_duration(bfqd)))
5803 bfq_bfqq_end_wr(bfqq);
5804 else {
5805 /* switch back to interactive wr */
5806 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
5807 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
5808 bfqq->last_wr_start_finish =
5809 bfqq->wr_start_at_switch_to_srt;
5810 bfqq->entity.prio_changed = 1;
5811 }
44e44a1b
PV
5812 }
5813 }
5814 /* Update weight both if it must be raised and if it must be lowered */
5815 if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
5816 __bfq_entity_update_weight_prio(
5817 bfq_entity_service_tree(entity),
5818 entity);
5819}
5820
aee69d78
PV
5821/*
5822 * Dispatch next request from bfqq.
5823 */
5824static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
5825 struct bfq_queue *bfqq)
5826{
5827 struct request *rq = bfqq->next_rq;
5828 unsigned long service_to_charge;
5829
5830 service_to_charge = bfq_serv_to_charge(rq, bfqq);
5831
5832 bfq_bfqq_served(bfqq, service_to_charge);
5833
5834 bfq_dispatch_remove(bfqd->queue, rq);
5835
44e44a1b
PV
5836 /*
5837 * If weight raising has to terminate for bfqq, then the next
5838 * function causes an immediate update of bfqq's weight,
5839 * without waiting for the next activation. As a consequence, on
5840 * expiration, bfqq will be timestamped as if it had never been
5841 * weight-raised during this service slot, even if it has
5842 * received part or even most of the service as a
5843 * weight-raised queue. This inflates bfqq's timestamps, which
5844 * is beneficial, as bfqq is then more willing to leave the
5845 * device immediately to possible other weight-raised queues.
5846 */
5847 bfq_update_wr_data(bfqd, bfqq);
5848
aee69d78
PV
5849 if (!bfqd->in_service_bic) {
5850 atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
5851 bfqd->in_service_bic = RQ_BIC(rq);
5852 }
5853
5854 /*
5855 * Expire bfqq, pretending that its budget expired, if bfqq
5856 * belongs to CLASS_IDLE and other queues are waiting for
5857 * service.
5858 */
5859 if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
5860 goto expire;
5861
5862 return rq;
5863
5864expire:
5865 bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
5866 return rq;
5867}
5868
5869static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
5870{
5871 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
5872
5873 /*
5874 * Avoiding lock: a race on bfqd->busy_queues should cause at
5875 * most a call to dispatch for nothing
5876 */
5877 return !list_empty_careful(&bfqd->dispatch) ||
5878 bfqd->busy_queues > 0;
5879}
5880
5881static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
5882{
5883 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
5884 struct request *rq = NULL;
5885 struct bfq_queue *bfqq = NULL;
5886
5887 if (!list_empty(&bfqd->dispatch)) {
5888 rq = list_first_entry(&bfqd->dispatch, struct request,
5889 queuelist);
5890 list_del_init(&rq->queuelist);
5891
5892 bfqq = RQ_BFQQ(rq);
5893
5894 if (bfqq) {
5895 /*
5896 * Increment counters here, because this
5897 * dispatch does not follow the standard
5898 * dispatch flow (where counters are
5899 * incremented)
5900 */
5901 bfqq->dispatched++;
5902
5903 goto inc_in_driver_start_rq;
5904 }
5905
5906 /*
5907 * We exploit the put_rq_private hook to decrement
5908 * rq_in_driver, but put_rq_private will not be
5909 * invoked on this request. So, to avoid an imbalance,
5910 * just start this request, without incrementing
5911 * rq_in_driver. As a negative consequence,
5912 * rq_in_driver is deceptively lower than it should be
5913 * while this request is in service. This may cause
5914 * bfq_schedule_dispatch to be invoked uselessly.
5915 *
5916 * As for implementing an exact solution, the
5917 * put_request hook, if defined, is probably invoked
5918 * also on this request. So, by exploiting this hook,
5919 * we could 1) increment rq_in_driver here, and 2)
5920 * decrement it in put_request. Such a solution would
5921 * let the value of the counter be always accurate,
5922 * but it would entail using an extra interface
5923 * function. This cost seems higher than the benefit,
5924 * given that non-elevator-private requests
5925 * are very infrequent.
5926 */
5927 goto start_rq;
5928 }
5929
5930 bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
5931
5932 if (bfqd->busy_queues == 0)
5933 goto exit;
5934
5935 /*
5936 * Force device to serve one request at a time if
5937 * strict_guarantees is true. Forcing this service scheme is
5938 * currently the ONLY way to guarantee that the request
5939 * service order enforced by the scheduler is respected by a
5940 * queueing device. Otherwise the device is free even to make
5941 * some unlucky request wait for as long as the device
5942 * wishes.
5943 *
5944 * Of course, serving one request at a time may cause loss of
5945 * throughput.
5946 */
5947 if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
5948 goto exit;
5949
5950 bfqq = bfq_select_queue(bfqd);
5951 if (!bfqq)
5952 goto exit;
5953
5954 rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);
5955
5956 if (rq) {
5957inc_in_driver_start_rq:
5958 bfqd->rq_in_driver++;
5959start_rq:
5960 rq->rq_flags |= RQF_STARTED;
5961 }
5962exit:
5963 return rq;
5964}
5965
5966static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
5967{
5968 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
5969 struct request *rq;
5970
5971 spin_lock_irq(&bfqd->lock);
5972 rq = __bfq_dispatch_request(hctx);
5973 spin_unlock_irq(&bfqd->lock);
5974
5975 return rq;
5976}
5977
5978/*
5979 * Task holds one reference to the queue, dropped when task exits. Each rq
5980 * in-flight on this queue also holds a reference, dropped when rq is freed.
5981 *
5982 * Scheduler lock must be held here. Recall not to use bfqq after calling
5983 * this function on it.
5984 */
5985static void bfq_put_queue(struct bfq_queue *bfqq)
5986{
e21b7a0b
AA
5987#ifdef CONFIG_BFQ_GROUP_IOSCHED
5988 struct bfq_group *bfqg = bfqq_group(bfqq);
5989#endif
5990
aee69d78
PV
5991 if (bfqq->bfqd)
5992 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
5993 bfqq, bfqq->ref);
5994
5995 bfqq->ref--;
5996 if (bfqq->ref)
5997 return;
5998
e21b7a0b
AA
5999 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
6000
aee69d78 6001 kmem_cache_free(bfq_pool, bfqq);
e21b7a0b
AA
6002#ifdef CONFIG_BFQ_GROUP_IOSCHED
6003 bfqg_put(bfqg);
6004#endif
aee69d78
PV
6005}
6006
6007static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
6008{
6009 if (bfqq == bfqd->in_service_queue) {
6010 __bfq_bfqq_expire(bfqd, bfqq);
6011 bfq_schedule_dispatch(bfqd);
6012 }
6013
6014 bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
6015
6016 bfq_put_queue(bfqq); /* release process reference */
6017}
6018
6019static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
6020{
6021 struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
6022 struct bfq_data *bfqd;
6023
6024 if (bfqq)
6025 bfqd = bfqq->bfqd; /* NULL if scheduler already exited */
6026
6027 if (bfqq && bfqd) {
6028 unsigned long flags;
6029
6030 spin_lock_irqsave(&bfqd->lock, flags);
6031 bfq_exit_bfqq(bfqd, bfqq);
6032 bic_set_bfqq(bic, NULL, is_sync);
6033 spin_unlock_irq(&bfqd->lock);
6034 }
6035}
6036
6037static void bfq_exit_icq(struct io_cq *icq)
6038{
6039 struct bfq_io_cq *bic = icq_to_bic(icq);
6040
6041 bfq_exit_icq_bfqq(bic, true);
6042 bfq_exit_icq_bfqq(bic, false);
6043}
6044
6045/*
6046 * Update the entity prio values; note that the new values will not
6047 * be used until the next (re)activation.
6048 */
6049static void
6050bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
6051{
6052 struct task_struct *tsk = current;
6053 int ioprio_class;
6054 struct bfq_data *bfqd = bfqq->bfqd;
6055
6056 if (!bfqd)
6057 return;
6058
6059 ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
6060 switch (ioprio_class) {
6061 default:
6062 dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
6063 "bfq: bad prio class %d\n", ioprio_class);
6064 case IOPRIO_CLASS_NONE:
6065 /*
6066 * No prio set, inherit CPU scheduling settings.
6067 */
6068 bfqq->new_ioprio = task_nice_ioprio(tsk);
6069 bfqq->new_ioprio_class = task_nice_ioclass(tsk);
6070 break;
6071 case IOPRIO_CLASS_RT:
6072 bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
6073 bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
6074 break;
6075 case IOPRIO_CLASS_BE:
6076 bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
6077 bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
6078 break;
6079 case IOPRIO_CLASS_IDLE:
6080 bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
6081 bfqq->new_ioprio = 7;
6082 bfq_clear_bfqq_idle_window(bfqq);
6083 break;
6084 }
6085
6086 if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
6087 pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
6088 bfqq->new_ioprio);
6089 bfqq->new_ioprio = IOPRIO_BE_NR;
6090 }
6091
6092 bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
6093 bfqq->entity.prio_changed = 1;
6094}
6095
6096static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
6097{
6098 struct bfq_data *bfqd = bic_to_bfqd(bic);
6099 struct bfq_queue *bfqq;
6100 int ioprio = bic->icq.ioc->ioprio;
6101
6102 /*
6103 * This condition may trigger on a newly created bic, be sure to
6104 * drop the lock before returning.
6105 */
6106 if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
6107 return;
6108
6109 bic->ioprio = ioprio;
6110
6111 bfqq = bic_to_bfqq(bic, false);
6112 if (bfqq) {
6113 /* release process reference on this queue */
6114 bfq_put_queue(bfqq);
6115 bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
6116 bic_set_bfqq(bic, bfqq, false);
6117 }
6118
6119 bfqq = bic_to_bfqq(bic, true);
6120 if (bfqq)
6121 bfq_set_next_ioprio_data(bfqq, bic);
6122}
6123
6124static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6125 struct bfq_io_cq *bic, pid_t pid, int is_sync)
6126{
6127 RB_CLEAR_NODE(&bfqq->entity.rb_node);
6128 INIT_LIST_HEAD(&bfqq->fifo);
6129
6130 bfqq->ref = 0;
6131 bfqq->bfqd = bfqd;
6132
6133 if (bic)
6134 bfq_set_next_ioprio_data(bfqq, bic);
6135
6136 if (is_sync) {
6137 if (!bfq_class_idle(bfqq))
6138 bfq_mark_bfqq_idle_window(bfqq);
6139 bfq_mark_bfqq_sync(bfqq);
6140 } else
6141 bfq_clear_bfqq_sync(bfqq);
6142
6143 /* set end request to minus infinity from now */
6144 bfqq->ttime.last_end_request = ktime_get_ns() + 1;
6145
6146 bfq_mark_bfqq_IO_bound(bfqq);
6147
6148 bfqq->pid = pid;
6149
6150 /* Tentative initial value to trade off between thr and lat */
54b60456 6151 bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
aee69d78 6152 bfqq->budget_timeout = bfq_smallest_from_now();
aee69d78 6153
44e44a1b
PV
6154 bfqq->wr_coeff = 1;
6155 bfqq->last_wr_start_finish = bfq_smallest_from_now();
77b7dcea
PV
6156 bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
6157
6158 /*
6159 * Set to the value for which bfqq will not be deemed as
6160 * soft rt when it becomes backlogged.
6161 */
6162 bfqq->soft_rt_next_start = bfq_greatest_from_now();
44e44a1b 6163
aee69d78
PV
6164 /* first request is almost certainly seeky */
6165 bfqq->seek_history = 1;
6166}
6167
6168static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
e21b7a0b 6169 struct bfq_group *bfqg,
aee69d78
PV
6170 int ioprio_class, int ioprio)
6171{
6172 switch (ioprio_class) {
6173 case IOPRIO_CLASS_RT:
e21b7a0b 6174 return &bfqg->async_bfqq[0][ioprio];
aee69d78
PV
6175 case IOPRIO_CLASS_NONE:
6176 ioprio = IOPRIO_NORM;
6177 /* fall through */
6178 case IOPRIO_CLASS_BE:
e21b7a0b 6179 return &bfqg->async_bfqq[1][ioprio];
aee69d78 6180 case IOPRIO_CLASS_IDLE:
e21b7a0b 6181 return &bfqg->async_idle_bfqq;
aee69d78
PV
6182 default:
6183 return NULL;
6184 }
6185}
6186
6187static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
6188 struct bio *bio, bool is_sync,
6189 struct bfq_io_cq *bic)
6190{
6191 const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
6192 const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
6193 struct bfq_queue **async_bfqq = NULL;
6194 struct bfq_queue *bfqq;
e21b7a0b 6195 struct bfq_group *bfqg;
aee69d78
PV
6196
6197 rcu_read_lock();
6198
e21b7a0b
AA
6199 bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
6200 if (!bfqg) {
6201 bfqq = &bfqd->oom_bfqq;
6202 goto out;
6203 }
6204
aee69d78 6205 if (!is_sync) {
e21b7a0b 6206 async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
aee69d78
PV
6207 ioprio);
6208 bfqq = *async_bfqq;
6209 if (bfqq)
6210 goto out;
6211 }
6212
6213 bfqq = kmem_cache_alloc_node(bfq_pool,
6214 GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
6215 bfqd->queue->node);
6216
6217 if (bfqq) {
6218 bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
6219 is_sync);
e21b7a0b 6220 bfq_init_entity(&bfqq->entity, bfqg);
aee69d78
PV
6221 bfq_log_bfqq(bfqd, bfqq, "allocated");
6222 } else {
6223 bfqq = &bfqd->oom_bfqq;
6224 bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
6225 goto out;
6226 }
6227
6228 /*
6229 * Pin the queue now that it's allocated, scheduler exit will
6230 * prune it.
6231 */
6232 if (async_bfqq) {
e21b7a0b
AA
6233 bfqq->ref++; /*
6234 * Extra group reference, w.r.t. sync
6235 * queue. This extra reference is removed
6236 * only if bfqq->bfqg disappears, to
6237 * guarantee that this queue is not freed
6238 * until its group goes away.
6239 */
6240 bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
aee69d78
PV
6241 bfqq, bfqq->ref);
6242 *async_bfqq = bfqq;
6243 }
6244
6245out:
6246 bfqq->ref++; /* get a process reference to this queue */
6247 bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
6248 rcu_read_unlock();
6249 return bfqq;
6250}
6251
6252static void bfq_update_io_thinktime(struct bfq_data *bfqd,
6253 struct bfq_queue *bfqq)
6254{
6255 struct bfq_ttime *ttime = &bfqq->ttime;
6256 u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
6257
6258 elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
6259
6260 ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
6261 ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
6262 ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
6263 ttime->ttime_samples);
6264}
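
/*
 * The three updates above implement a fixed-point exponential decay
 * with weight 7/8 and scale factor 256: ttime_samples saturates
 * towards 256, ttime_total converges towards 256 times the recent
 * average of 'elapsed', and ttime_mean therefore tracks the recent
 * mean think time in nanoseconds (the +128 term just rounds the final
 * division).
 */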
6265
6266static void
6267bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6268 struct request *rq)
6269{
aee69d78 6270 bfqq->seek_history <<= 1;
ab0e43e9
PV
6271 bfqq->seek_history |=
6272 get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
aee69d78
PV
6273 (!blk_queue_nonrot(bfqd->queue) ||
6274 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
6275}
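
/*
 * seek_history is thus a small sliding window with one bit per
 * request: a set bit means that the request was more than
 * BFQQ_SEEK_THR sectors away from the previous position (large
 * requests on non-rotational devices are not counted as seeky).
 * BFQQ_SEEKY(), defined earlier in this file, then classifies bfqq as
 * seeky according to how many bits of this window are set.
 */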
6276
6277/*
6278 * Disable idle window if the process thinks too long or seeks so much that
6279 * it doesn't matter.
6280 */
6281static void bfq_update_idle_window(struct bfq_data *bfqd,
6282 struct bfq_queue *bfqq,
6283 struct bfq_io_cq *bic)
6284{
6285 int enable_idle;
6286
6287 /* Don't idle for async or idle io prio class. */
6288 if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
6289 return;
6290
6291 enable_idle = bfq_bfqq_idle_window(bfqq);
6292
6293 if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
6294 bfqd->bfq_slice_idle == 0 ||
bcd56426
PV
6295 (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
6296 bfqq->wr_coeff == 1))
aee69d78
PV
6297 enable_idle = 0;
6298 else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
44e44a1b
PV
6299 if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle &&
6300 bfqq->wr_coeff == 1)
aee69d78
PV
6301 enable_idle = 0;
6302 else
6303 enable_idle = 1;
6304 }
6305 bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
6306 enable_idle);
6307
6308 if (enable_idle)
6309 bfq_mark_bfqq_idle_window(bfqq);
6310 else
6311 bfq_clear_bfqq_idle_window(bfqq);
6312}
6313
6314/*
6315 * Called when a new fs request (rq) is added to bfqq. Check if there's
6316 * something we should do about it.
6317 */
6318static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6319 struct request *rq)
6320{
6321 struct bfq_io_cq *bic = RQ_BIC(rq);
6322
6323 if (rq->cmd_flags & REQ_META)
6324 bfqq->meta_pending++;
6325
6326 bfq_update_io_thinktime(bfqd, bfqq);
6327 bfq_update_io_seektime(bfqd, bfqq, rq);
6328 if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
6329 !BFQQ_SEEKY(bfqq))
6330 bfq_update_idle_window(bfqd, bfqq, bic);
6331
6332 bfq_log_bfqq(bfqd, bfqq,
6333 "rq_enqueued: idle_window=%d (seeky %d)",
6334 bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
6335
6336 bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
6337
6338 if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
6339 bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
6340 blk_rq_sectors(rq) < 32;
6341 bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
6342
6343 /*
6344 * There is just this request queued: if the request
6345 * is small and the queue is not to be expired, then
6346 * just exit.
6347 *
6348 * In this way, if the device is being idled to wait
6349 * for a new request from the in-service queue, we
6350 * avoid unplugging the device and committing the
6351 * device to serve just a small request. On the
6352 * contrary, we wait for the block layer to decide
6353 * when to unplug the device: hopefully, new requests
6354 * will be merged to this one quickly, then the device
6355 * will be unplugged and larger requests will be
6356 * dispatched.
6357 */
6358 if (small_req && !budget_timeout)
6359 return;
6360
6361 /*
6362 * A large enough request arrived, or the queue is to
6363 * be expired: in both cases disk idling is to be
6364 * stopped, so clear wait_request flag and reset
6365 * timer.
6366 */
6367 bfq_clear_bfqq_wait_request(bfqq);
6368 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
e21b7a0b 6369 bfqg_stats_update_idle_time(bfqq_group(bfqq));
aee69d78
PV
6370
6371 /*
6372 * The queue is not empty, because a new request just
6373 * arrived. Hence we can safely expire the queue, in
6374 * case of budget timeout, without risking that the
6375 * timestamps of the queue are not updated correctly.
6376 * See [1] for more details.
6377 */
6378 if (budget_timeout)
6379 bfq_bfqq_expire(bfqd, bfqq, false,
6380 BFQQE_BUDGET_TIMEOUT);
6381 }
6382}
6383
6384static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
6385{
6386 struct bfq_queue *bfqq = RQ_BFQQ(rq);
6387
6388 bfq_add_request(rq);
6389
6390 rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
6391 list_add_tail(&rq->queuelist, &bfqq->fifo);
6392
6393 bfq_rq_enqueued(bfqd, bfqq, rq);
6394}
6395
6396static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
6397 bool at_head)
6398{
6399 struct request_queue *q = hctx->queue;
6400 struct bfq_data *bfqd = q->elevator->elevator_data;
6401
6402 spin_lock_irq(&bfqd->lock);
6403 if (blk_mq_sched_try_insert_merge(q, rq)) {
6404 spin_unlock_irq(&bfqd->lock);
6405 return;
6406 }
6407
6408 spin_unlock_irq(&bfqd->lock);
6409
6410 blk_mq_sched_request_inserted(rq);
6411
6412 spin_lock_irq(&bfqd->lock);
6413 if (at_head || blk_rq_is_passthrough(rq)) {
6414 if (at_head)
6415 list_add(&rq->queuelist, &bfqd->dispatch);
6416 else
6417 list_add_tail(&rq->queuelist, &bfqd->dispatch);
6418 } else {
6419 __bfq_insert_request(bfqd, rq);
6420
6421 if (rq_mergeable(rq)) {
6422 elv_rqhash_add(q, rq);
6423 if (!q->last_merge)
6424 q->last_merge = rq;
6425 }
6426 }
6427
6428 spin_unlock_irq(&bfqd->lock);
6429}
6430
6431static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
6432 struct list_head *list, bool at_head)
6433{
6434 while (!list_empty(list)) {
6435 struct request *rq;
6436
6437 rq = list_first_entry(list, struct request, queuelist);
6438 list_del_init(&rq->queuelist);
6439 bfq_insert_request(hctx, rq, at_head);
6440 }
6441}
6442
6443static void bfq_update_hw_tag(struct bfq_data *bfqd)
6444{
6445 bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
6446 bfqd->rq_in_driver);
6447
6448 if (bfqd->hw_tag == 1)
6449 return;
6450
6451 /*
6452 * This sample is valid if the number of outstanding requests
6453 * is large enough to allow a queueing behavior. Note that the
6454 * sum is not exact, as it's not taking into account deactivated
6455 * requests.
6456 */
6457 if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
6458 return;
6459
6460 if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
6461 return;
6462
6463 bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
6464 bfqd->max_rq_in_driver = 0;
6465 bfqd->hw_tag_samples = 0;
6466}
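/*
 * A note on the detection above: hw_tag starts at -1 ("unknown", see
 * bfq_init_queue()) and, thanks to the early return on hw_tag == 1, sticks
 * at 1 once enough samples have shown the drive keeping more than
 * BFQ_HW_QUEUE_THRESHOLD requests in flight at the same time.
 */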
6467
6468static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
6469{
6470 u64 now_ns;
6471 u32 delta_us;
6472
6473 bfq_update_hw_tag(bfqd);
6474
6475 bfqd->rq_in_driver--;
6476 bfqq->dispatched--;
6477
6478 if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
6479 /*
6480 * Set budget_timeout (which we overload to store the
6481 * time at which the queue remains with no backlog and
6482 * no outstanding request; used by the weight-raising
6483 * mechanism).
6484 */
6485 bfqq->budget_timeout = jiffies;
6486 }
6487
6488 now_ns = ktime_get_ns();
6489
6490 bfqq->ttime.last_end_request = now_ns;
6491
6492 /*
6493 * Using us instead of ns, to get a reasonable precision in
6494 * computing rate in next check.
6495 */
6496 delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
6497
6498 /*
6499 * If the request took rather long to complete, and, according
6500 * to the maximum request size recorded, this completion latency
6501 * implies that the request was certainly served at a very low
6502 * rate (less than 1M sectors/sec), then the whole observation
6503 * interval that lasts up to this time instant cannot be a
6504 * valid time interval for computing a new peak rate. Invoke
6505 * bfq_update_rate_reset to have the following three steps
6506 * taken:
6507 * - close the observation interval at the last (previous)
6508 * request dispatch or completion
6509 * - compute rate, if possible, for that observation interval
6510 * - reset to zero samples, which will trigger a proper
6511 * re-initialization of the observation interval on next
6512 * dispatch
6513 */
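 /*
  * Worked example of the check below, assuming 512-byte sectors:
  * 1UL << (BFQ_RATE_SHIFT - 10) encodes a rate of 1/1024 sectors per
  * usec, i.e., roughly 1M sectors/s (~500 MB/s). So, with a recorded
  * maximum request size of 2048 sectors (1 MiB), the observation
  * interval gets invalidated only if this completion arrives more
  * than about 2.1 s after the previous one.
  */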
6514 if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
6515 (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
6516 1UL<<(BFQ_RATE_SHIFT - 10))
6517 bfq_update_rate_reset(bfqd, NULL);
6518 bfqd->last_completion = now_ns;
6519
6520 /*
6521 * If we are waiting to discover whether the request pattern
6522 * of the task associated with the queue is actually
6523 * isochronous, and both requisites for this condition to hold
6524 * are now satisfied, then compute soft_rt_next_start (see the
6525 * comments on the function bfq_bfqq_softrt_next_start()). We
6526 * schedule this delayed check when bfqq expires, if it still
6527 * has in-flight requests.
6528 */
6529 if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
6530 RB_EMPTY_ROOT(&bfqq->sort_list))
6531 bfqq->soft_rt_next_start =
6532 bfq_bfqq_softrt_next_start(bfqd, bfqq);
6533
6534 /*
6535 * If this is the in-service queue, check if it needs to be expired,
6536 * or if we want to idle in case it has no pending requests.
6537 */
6538 if (bfqd->in_service_queue == bfqq) {
6539 if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
6540 bfq_arm_slice_timer(bfqd);
6541 return;
6542 } else if (bfq_may_expire_for_budg_timeout(bfqq))
6543 bfq_bfqq_expire(bfqd, bfqq, false,
6544 BFQQE_BUDGET_TIMEOUT);
6545 else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
6546 (bfqq->dispatched == 0 ||
6547 !bfq_bfqq_may_idle(bfqq)))
6548 bfq_bfqq_expire(bfqd, bfqq, false,
6549 BFQQE_NO_MORE_REQUESTS);
6550 }
6551}
6552
6553static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
6554{
6555 bfqq->allocated--;
6556
6557 bfq_put_queue(bfqq);
6558}
6559
6560static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
6561{
6562 struct bfq_queue *bfqq = RQ_BFQQ(rq);
6563 struct bfq_data *bfqd = bfqq->bfqd;
6564
6565 if (rq->rq_flags & RQF_STARTED)
6566 bfqg_stats_update_completion(bfqq_group(bfqq),
6567 rq_start_time_ns(rq),
6568 rq_io_start_time_ns(rq),
6569 rq->cmd_flags);
6570
6571 if (likely(rq->rq_flags & RQF_STARTED)) {
6572 unsigned long flags;
6573
6574 spin_lock_irqsave(&bfqd->lock, flags);
6575
6576 bfq_completed_request(bfqq, bfqd);
6577 bfq_put_rq_priv_body(bfqq);
6578
6579 spin_unlock_irqrestore(&bfqd->lock, flags);
6580 } else {
6581 /*
6582 * Request rq may be still/already in the scheduler,
6583 * in which case we need to remove it. And we cannot
6584 * defer such a check and removal, to avoid
6585 * inconsistencies in the time interval from the end
6586 * of this function to the start of the deferred work.
6587 * This situation seems to occur only in process
6588 * context, as a consequence of a merge. In the
6589 * current version of the code, this implies that the
6590 * lock is held.
6591 */
6592
6593 if (!RB_EMPTY_NODE(&rq->rb_node))
6594 bfq_remove_request(q, rq);
6595 bfq_put_rq_priv_body(bfqq);
6596 }
6597
6598 rq->elv.priv[0] = NULL;
6599 rq->elv.priv[1] = NULL;
6600}
6601
6602/*
6603 * Allocate bfq data structures associated with this request.
6604 */
6605static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
6606 struct bio *bio)
6607{
6608 struct bfq_data *bfqd = q->elevator->elevator_data;
6609 struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
6610 const int is_sync = rq_is_sync(rq);
6611 struct bfq_queue *bfqq;
6612
6613 spin_lock_irq(&bfqd->lock);
6614
6615 bfq_check_ioprio_change(bic, bio);
6616
6617 if (!bic)
6618 goto queue_fail;
6619
6620 bfq_bic_update_cgroup(bic, bio);
6621
6622 bfqq = bic_to_bfqq(bic, is_sync);
6623 if (!bfqq || bfqq == &bfqd->oom_bfqq) {
6624 if (bfqq)
6625 bfq_put_queue(bfqq);
6626 bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
6627 bic_set_bfqq(bic, bfqq, is_sync);
6628 }
6629
6630 bfqq->allocated++;
6631 bfqq->ref++;
6632 bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
6633 rq, bfqq, bfqq->ref);
6634
6635 rq->elv.priv[0] = bic;
6636 rq->elv.priv[1] = bfqq;
6637
6638 spin_unlock_irq(&bfqd->lock);
6639
6640 return 0;
6641
6642queue_fail:
6643 spin_unlock_irq(&bfqd->lock);
6644
6645 return 1;
6646}
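/*
 * The two elv.priv slots set above hand the bfq_io_cq and the
 * (reference-counted) bfq_queue over to the request; both pointers are
 * cleared, and the bfqq reference dropped, in bfq_put_rq_private().
 */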
6647
6648static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
6649{
6650 struct bfq_data *bfqd = bfqq->bfqd;
6651 enum bfqq_expiration reason;
6652 unsigned long flags;
6653
6654 spin_lock_irqsave(&bfqd->lock, flags);
6655 bfq_clear_bfqq_wait_request(bfqq);
6656
6657 if (bfqq != bfqd->in_service_queue) {
6658 spin_unlock_irqrestore(&bfqd->lock, flags);
6659 return;
6660 }
6661
6662 if (bfq_bfqq_budget_timeout(bfqq))
6663 /*
6664 * Here, too, the queue can be safely expired
6665 * for budget timeout, without wasting any
6666 * service guarantees
6667 */
6668 reason = BFQQE_BUDGET_TIMEOUT;
6669 else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
6670 /*
6671 * The queue may not be empty upon timer expiration,
6672 * because we may not disable the timer when the
6673 * first request of the in-service queue arrives
6674 * during disk idling.
6675 */
6676 reason = BFQQE_TOO_IDLE;
6677 else
6678 goto schedule_dispatch;
6679
6680 bfq_bfqq_expire(bfqd, bfqq, true, reason);
6681
6682schedule_dispatch:
6683 spin_unlock_irqrestore(&bfqd->lock, flags);
6684 bfq_schedule_dispatch(bfqd);
6685}
6686
6687/*
6688 * Handler of the expiration of the timer running if the in-service queue
6689 * is idling inside its time slice.
6690 */
6691static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
6692{
6693 struct bfq_data *bfqd = container_of(timer, struct bfq_data,
6694 idle_slice_timer);
6695 struct bfq_queue *bfqq = bfqd->in_service_queue;
6696
6697 /*
6698 * Theoretical race here: the in-service queue can be NULL or
6699 * different from the queue that was idling if a new request
6700 * arrives for the current queue and there is a full dispatch
6701 * cycle that changes the in-service queue. This can hardly
6702 * happen, but in the worst case we just expire a queue too
6703 * early.
6704 */
6705 if (bfqq)
6706 bfq_idle_slice_timer_body(bfqq);
6707
6708 return HRTIMER_NORESTART;
6709}
6710
6711static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
6712 struct bfq_queue **bfqq_ptr)
6713{
6714 struct bfq_queue *bfqq = *bfqq_ptr;
6715
6716 bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
6717 if (bfqq) {
6718 bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
6719
6720 bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
6721 bfqq, bfqq->ref);
6722 bfq_put_queue(bfqq);
6723 *bfqq_ptr = NULL;
6724 }
6725}
6726
6727/*
6728 * Release all the bfqg references to its async queues. If we are
6729 * deallocating the group, these queues may still contain requests, so
6730 * we reparent them to the root cgroup (i.e., the only one that will
6731 * exist for sure until all the requests on a device are gone).
6732 */
6733 static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
6734{
6735 int i, j;
6736
6737 for (i = 0; i < 2; i++)
6738 for (j = 0; j < IOPRIO_BE_NR; j++)
6739 __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
6740
6741 __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
6742}
6743
6744static void bfq_exit_queue(struct elevator_queue *e)
6745{
6746 struct bfq_data *bfqd = e->elevator_data;
6747 struct bfq_queue *bfqq, *n;
6748
6749 hrtimer_cancel(&bfqd->idle_slice_timer);
6750
6751 spin_lock_irq(&bfqd->lock);
6752 list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
6753 bfq_deactivate_bfqq(bfqd, bfqq, false, false);
6754 spin_unlock_irq(&bfqd->lock);
6755
6756 hrtimer_cancel(&bfqd->idle_slice_timer);
6757
6758#ifdef CONFIG_BFQ_GROUP_IOSCHED
6759 blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
6760#else
6761 spin_lock_irq(&bfqd->lock);
6762 bfq_put_async_queues(bfqd, bfqd->root_group);
6763 kfree(bfqd->root_group);
6764 spin_unlock_irq(&bfqd->lock);
6765#endif
6766
6767 kfree(bfqd);
6768}
6769
6770static void bfq_init_root_group(struct bfq_group *root_group,
6771 struct bfq_data *bfqd)
6772{
6773 int i;
6774
6775#ifdef CONFIG_BFQ_GROUP_IOSCHED
6776 root_group->entity.parent = NULL;
6777 root_group->my_entity = NULL;
6778 root_group->bfqd = bfqd;
6779#endif
6780 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
6781 root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
6782 root_group->sched_data.bfq_class_idle_last_service = jiffies;
6783}
6784
6785static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
6786{
6787 struct bfq_data *bfqd;
6788 struct elevator_queue *eq;
6789
6790 eq = elevator_alloc(q, e);
6791 if (!eq)
6792 return -ENOMEM;
6793
6794 bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
6795 if (!bfqd) {
6796 kobject_put(&eq->kobj);
6797 return -ENOMEM;
6798 }
6799 eq->elevator_data = bfqd;
6800
6801 spin_lock_irq(q->queue_lock);
6802 q->elevator = eq;
6803 spin_unlock_irq(q->queue_lock);
6804
6805 /*
6806 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
6807 * Grab a permanent reference to it, so that the normal code flow
6808 * will not attempt to free it.
6809 */
6810 bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
6811 bfqd->oom_bfqq.ref++;
6812 bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
6813 bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
6814 bfqd->oom_bfqq.entity.new_weight =
6815 bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
6816 /*
6817 * Trigger weight initialization, according to ioprio, at the
6818 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
6819 * class won't be changed any more.
6820 */
6821 bfqd->oom_bfqq.entity.prio_changed = 1;
6822
6823 bfqd->queue = q;
6824
6825 INIT_LIST_HEAD(&bfqd->dispatch);
6826
6827 hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
6828 HRTIMER_MODE_REL);
6829 bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
6830
6831 INIT_LIST_HEAD(&bfqd->active_list);
6832 INIT_LIST_HEAD(&bfqd->idle_list);
6833
6834 bfqd->hw_tag = -1;
6835
6836 bfqd->bfq_max_budget = bfq_default_max_budget;
6837
6838 bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
6839 bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
6840 bfqd->bfq_back_max = bfq_back_max;
6841 bfqd->bfq_back_penalty = bfq_back_penalty;
6842 bfqd->bfq_slice_idle = bfq_slice_idle;
6843 bfqd->bfq_timeout = bfq_timeout;
6844
6845 bfqd->bfq_requests_within_timer = 120;
6846
6847 bfqd->low_latency = true;
6848
6849 /*
6850 * Trade-off between responsiveness and fairness.
6851 */
6852 bfqd->bfq_wr_coeff = 30;
6853 bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
6854 bfqd->bfq_wr_max_time = 0;
6855 bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
6856 bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
6857 bfqd->bfq_wr_max_softrt_rate = 7000; /*
6858 * Approximate rate required
6859 * to playback or record a
6860 * high-definition compressed
6861 * video.
6862 */
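 /* With 512-byte sectors, 7000 sectors/s is roughly 3.5 MB/s. */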
6863 bfqd->wr_busy_queues = 0;
6864
6865 /*
6866 * Begin by assuming, optimistically, that the device is a
6867 * high-speed one, and that its peak rate is equal to 2/3 of
6868 * the highest reference rate.
6869 */
6870 bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
6871 T_fast[blk_queue_nonrot(bfqd->queue)];
6872 bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
6873 bfqd->device_speed = BFQ_BFQD_FAST;
6874
6875 spin_lock_init(&bfqd->lock);
6876
6877 /*
6878 * The invocation of the next bfq_create_group_hierarchy
6879 * function is the head of a chain of function calls
6880 * (bfq_create_group_hierarchy->blkcg_activate_policy->
6881 * blk_mq_freeze_queue) that may lead to the invocation of the
6882 * has_work hook function. For this reason,
6883 * bfq_create_group_hierarchy is invoked only after all
6884 * scheduler data has been initialized, apart from the fields
6885 * that can be initialized only after invoking
6886 * bfq_create_group_hierarchy. This, in particular, enables
6887 * has_work to correctly return false. Of course, to avoid
6888 * other inconsistencies, the blk-mq stack must then refrain
6889 * from invoking further scheduler hooks before this init
6890 * function is finished.
6891 */
6892 bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
6893 if (!bfqd->root_group)
6894 goto out_free;
6895 bfq_init_root_group(bfqd->root_group, bfqd);
6896 bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
6897
6898
6899 return 0;
6900
6901out_free:
6902 kfree(bfqd);
6903 kobject_put(&eq->kobj);
6904 return -ENOMEM;
6905}
6906
6907static void bfq_slab_kill(void)
6908{
6909 kmem_cache_destroy(bfq_pool);
6910}
6911
6912static int __init bfq_slab_setup(void)
6913{
6914 bfq_pool = KMEM_CACHE(bfq_queue, 0);
6915 if (!bfq_pool)
6916 return -ENOMEM;
6917 return 0;
6918}
6919
6920static ssize_t bfq_var_show(unsigned int var, char *page)
6921{
6922 return sprintf(page, "%u\n", var);
6923}
6924
6925static ssize_t bfq_var_store(unsigned long *var, const char *page,
6926 size_t count)
6927{
6928 unsigned long new_val;
6929 int ret = kstrtoul(page, 10, &new_val);
6930
6931 if (ret == 0)
6932 *var = new_val;
6933
6934 return count;
6935}
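/*
 * Note: if kstrtoul() fails, *var is left untouched but count is still
 * returned, so a malformed write to one of the sysfs attributes below is
 * silently ignored rather than reported as an error.
 */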
6936
6937#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
6938static ssize_t __FUNC(struct elevator_queue *e, char *page) \
6939{ \
6940 struct bfq_data *bfqd = e->elevator_data; \
6941 u64 __data = __VAR; \
6942 if (__CONV == 1) \
6943 __data = jiffies_to_msecs(__data); \
6944 else if (__CONV == 2) \
6945 __data = div_u64(__data, NSEC_PER_MSEC); \
6946 return bfq_var_show(__data, (page)); \
6947}
6948SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
6949SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
6950SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
6951SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
6952SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
6953SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
6954SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
6955SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
6956 SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
6957#undef SHOW_FUNCTION
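/*
 * For instance, SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2)
 * above expands to a handler that prints bfq_slice_idle converted from
 * nanoseconds to milliseconds, which is what a read of the slice_idle
 * attribute reports.
 */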
6958
6959#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
6960static ssize_t __FUNC(struct elevator_queue *e, char *page) \
6961{ \
6962 struct bfq_data *bfqd = e->elevator_data; \
6963 u64 __data = __VAR; \
6964 __data = div_u64(__data, NSEC_PER_USEC); \
6965 return bfq_var_show(__data, (page)); \
6966}
6967USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
6968#undef USEC_SHOW_FUNCTION
6969
6970#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
6971static ssize_t \
6972__FUNC(struct elevator_queue *e, const char *page, size_t count) \
6973{ \
6974 struct bfq_data *bfqd = e->elevator_data; \
6975 unsigned long uninitialized_var(__data); \
6976 int ret = bfq_var_store(&__data, (page), count); \
6977 if (__data < (MIN)) \
6978 __data = (MIN); \
6979 else if (__data > (MAX)) \
6980 __data = (MAX); \
6981 if (__CONV == 1) \
6982 *(__PTR) = msecs_to_jiffies(__data); \
6983 else if (__CONV == 2) \
6984 *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
6985 else \
6986 *(__PTR) = __data; \
6987 return ret; \
6988}
6989STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
6990 INT_MAX, 2);
6991STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
6992 INT_MAX, 2);
6993STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
6994STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
6995 INT_MAX, 0);
6996STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
6997#undef STORE_FUNCTION
6998
6999#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
7000static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
7001{ \
7002 struct bfq_data *bfqd = e->elevator_data; \
7003 unsigned long uninitialized_var(__data); \
7004 int ret = bfq_var_store(&__data, (page), count); \
7005 if (__data < (MIN)) \
7006 __data = (MIN); \
7007 else if (__data > (MAX)) \
7008 __data = (MAX); \
7009 *(__PTR) = (u64)__data * NSEC_PER_USEC; \
7010 return ret; \
7011}
7012USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
7013 UINT_MAX);
7014#undef USEC_STORE_FUNCTION
7015
7016static ssize_t bfq_max_budget_store(struct elevator_queue *e,
7017 const char *page, size_t count)
7018{
7019 struct bfq_data *bfqd = e->elevator_data;
7020 unsigned long uninitialized_var(__data);
7021 int ret = bfq_var_store(&__data, (page), count);
7022
7023 if (__data == 0)
7024 bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
7025 else {
7026 if (__data > INT_MAX)
7027 __data = INT_MAX;
7028 bfqd->bfq_max_budget = __data;
7029 }
7030
7031 bfqd->bfq_user_max_budget = __data;
7032
7033 return ret;
7034}
7035
7036/*
7037 * The name is kept for compatibility with cfq's parameter names,
7038 * but this timeout is used for both sync and async requests.
7039 */
7040static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
7041 const char *page, size_t count)
7042{
7043 struct bfq_data *bfqd = e->elevator_data;
7044 unsigned long uninitialized_var(__data);
7045 int ret = bfq_var_store(&__data, (page), count);
7046
7047 if (__data < 1)
7048 __data = 1;
7049 else if (__data > INT_MAX)
7050 __data = INT_MAX;
7051
7052 bfqd->bfq_timeout = msecs_to_jiffies(__data);
7053 if (bfqd->bfq_user_max_budget == 0)
7054 bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
7055
7056 return ret;
7057}
7058
7059static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
7060 const char *page, size_t count)
7061{
7062 struct bfq_data *bfqd = e->elevator_data;
7063 unsigned long uninitialized_var(__data);
7064 int ret = bfq_var_store(&__data, (page), count);
7065
7066 if (__data > 1)
7067 __data = 1;
7068 if (!bfqd->strict_guarantees && __data == 1
7069 && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
7070 bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
7071
7072 bfqd->strict_guarantees = __data;
7073
7074 return ret;
7075}
7076
7077static ssize_t bfq_low_latency_store(struct elevator_queue *e,
7078 const char *page, size_t count)
7079{
7080 struct bfq_data *bfqd = e->elevator_data;
7081 unsigned long uninitialized_var(__data);
7082 int ret = bfq_var_store(&__data, (page), count);
7083
7084 if (__data > 1)
7085 __data = 1;
7086 if (__data == 0 && bfqd->low_latency != 0)
7087 bfq_end_wr(bfqd);
7088 bfqd->low_latency = __data;
7089
7090 return ret;
7091}
7092
7093#define BFQ_ATTR(name) \
7094 __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
7095
7096static struct elv_fs_entry bfq_attrs[] = {
7097 BFQ_ATTR(fifo_expire_sync),
7098 BFQ_ATTR(fifo_expire_async),
7099 BFQ_ATTR(back_seek_max),
7100 BFQ_ATTR(back_seek_penalty),
7101 BFQ_ATTR(slice_idle),
7102 BFQ_ATTR(slice_idle_us),
7103 BFQ_ATTR(max_budget),
7104 BFQ_ATTR(timeout_sync),
7105 BFQ_ATTR(strict_guarantees),
7106 BFQ_ATTR(low_latency),
7107 __ATTR_NULL
7108};
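/*
 * Once bfq is the active scheduler for a request queue, the attributes
 * above are typically exposed under /sys/block/<dev>/queue/iosched/,
 * e.g. (device name purely illustrative):
 *
 *   echo bfq > /sys/block/sda/queue/scheduler
 *   cat /sys/block/sda/queue/iosched/low_latency
 */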
7109
7110static struct elevator_type iosched_bfq_mq = {
7111 .ops.mq = {
7112 .get_rq_priv = bfq_get_rq_private,
7113 .put_rq_priv = bfq_put_rq_private,
7114 .exit_icq = bfq_exit_icq,
7115 .insert_requests = bfq_insert_requests,
7116 .dispatch_request = bfq_dispatch_request,
7117 .next_request = elv_rb_latter_request,
7118 .former_request = elv_rb_former_request,
7119 .allow_merge = bfq_allow_bio_merge,
7120 .bio_merge = bfq_bio_merge,
7121 .request_merge = bfq_request_merge,
7122 .requests_merged = bfq_requests_merged,
7123 .request_merged = bfq_request_merged,
7124 .has_work = bfq_has_work,
7125 .init_sched = bfq_init_queue,
7126 .exit_sched = bfq_exit_queue,
7127 },
7128
7129 .uses_mq = true,
7130 .icq_size = sizeof(struct bfq_io_cq),
7131 .icq_align = __alignof__(struct bfq_io_cq),
7132 .elevator_attrs = bfq_attrs,
7133 .elevator_name = "bfq",
7134 .elevator_owner = THIS_MODULE,
7135};
7136
7137#ifdef CONFIG_BFQ_GROUP_IOSCHED
7138static struct blkcg_policy blkcg_policy_bfq = {
7139 .dfl_cftypes = bfq_blkg_files,
7140 .legacy_cftypes = bfq_blkcg_legacy_files,
7141
7142 .cpd_alloc_fn = bfq_cpd_alloc,
7143 .cpd_init_fn = bfq_cpd_init,
7144 .cpd_bind_fn = bfq_cpd_init,
7145 .cpd_free_fn = bfq_cpd_free,
7146
7147 .pd_alloc_fn = bfq_pd_alloc,
7148 .pd_init_fn = bfq_pd_init,
7149 .pd_offline_fn = bfq_pd_offline,
7150 .pd_free_fn = bfq_pd_free,
7151 .pd_reset_stats_fn = bfq_pd_reset_stats,
7152};
7153#endif
7154
7155static int __init bfq_init(void)
7156{
7157 int ret;
7158
7159#ifdef CONFIG_BFQ_GROUP_IOSCHED
7160 ret = blkcg_policy_register(&blkcg_policy_bfq);
7161 if (ret)
7162 return ret;
7163#endif
7164
7165 ret = -ENOMEM;
7166 if (bfq_slab_setup())
7167 goto err_pol_unreg;
7168
7169 /*
7170 * Times to load large popular applications for the typical
7171 * systems installed on the reference devices (see the
7172 * comments before the definitions of the next two
7173 * arrays). Actually, we use slightly slower values, as the
7174 * estimated peak rate tends to be smaller than the actual
7175 * peak rate. The reason for this last fact is that estimates
7176 * are computed over much shorter time intervals than the long
7177 * intervals typically used for benchmarking. Why? First, to
7178 * adapt more quickly to variations. Second, because an I/O
7179 * scheduler cannot rely on a peak-rate-evaluation workload to
7180 * be run for a long time.
7181 */
7182 T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
7183 T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
7184 T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
7185 T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
7186
7187 /*
7188 * Thresholds that determine the switch between speed classes
7189 * (see the comments before the definition of the array
7190 * device_speed_thresh). These thresholds are biased towards
7191 * transitions to the fast class. This is safer than the
7192 * opposite bias. In fact, a wrong transition to the slow
7193 * class results in short weight-raising periods, because the
7194 * speed of the device then tends to be higher than the
7195 * reference peak rate. On the opposite end, a wrong
7196 * transition to the fast class tends to increase
7197 * weight-raising periods, because of the opposite reason.
7198 */
7199 device_speed_thresh[0] = (4 * R_slow[0]) / 3;
7200 device_speed_thresh[1] = (4 * R_slow[1]) / 3;
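 /*
  * In other words, the device is treated as fast as soon as its
  * estimated peak rate exceeds the corresponding slow-class reference
  * rate R_slow by more than one third.
  */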
7201
7202 ret = elv_register(&iosched_bfq_mq);
7203 if (ret)
7204 goto err_pol_unreg;
7205
7206 return 0;
7207
7208err_pol_unreg:
7209#ifdef CONFIG_BFQ_GROUP_IOSCHED
7210 blkcg_policy_unregister(&blkcg_policy_bfq);
7211#endif
7212 return ret;
7213}
7214
7215static void __exit bfq_exit(void)
7216{
7217 elv_unregister(&iosched_bfq_mq);
7218#ifdef CONFIG_BFQ_GROUP_IOSCHED
7219 blkcg_policy_unregister(&blkcg_policy_bfq);
7220#endif
7221 bfq_slab_kill();
7222}
7223
7224module_init(bfq_init);
7225module_exit(bfq_exit);
7226
7227MODULE_AUTHOR("Paolo Valente");
7228MODULE_LICENSE("GPL");
7229MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");