1/*
2 * Budget Fair Queueing (BFQ) I/O scheduler.
3 *
4 * Based on ideas and code from CFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 * Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
11 * Arianna Avanzini <avanzini@google.com>
12 *
13 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
14 *
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License as
17 * published by the Free Software Foundation; either version 2 of the
18 * License, or (at your option) any later version.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 * General Public License for more details.
24 *
25 * BFQ is a proportional-share I/O scheduler, with some extra
26 * low-latency capabilities. BFQ also supports full hierarchical
27 * scheduling through cgroups. The next paragraphs provide an
28 * introduction to BFQ's inner workings. Details on BFQ benefits, usage and
29 * limitations can be found in Documentation/block/bfq-iosched.txt.
30 *
31 * BFQ is a proportional-share storage-I/O scheduling algorithm based
32 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
33 * budgets, measured in number of sectors, to processes instead of
34 * time slices. The device is not granted to the in-service process
35 * for a given time slice, but until it has exhausted its assigned
36 * budget. This change from the time to the service domain enables BFQ
37 * to distribute the device throughput among processes as desired,
38 * without any distortion due to throughput fluctuations, or to device
39 * internal queueing. BFQ uses an ad hoc internal scheduler, called
40 * B-WF2Q+, to schedule processes according to their budgets. More
41 * precisely, BFQ schedules queues associated with processes. Each
42 * process/queue is assigned a user-configurable weight, and B-WF2Q+
43 * guarantees that each queue receives a fraction of the throughput
44 * proportional to its weight. Thanks to the accurate policy of
45 * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
46 * processes issuing sequential requests (to boost the throughput),
47 * and yet guarantee a low latency to interactive and soft real-time
48 * applications.
49 *
50 * In particular, to provide these low-latency guarantees, BFQ
51 * explicitly privileges the I/O of two classes of time-sensitive
52 * applications: interactive and soft real-time. This feature enables
53 * BFQ to provide applications in these classes with a very low
54 * latency. Finally, BFQ also features additional heuristics for
55 * preserving both a low latency and a high throughput on NCQ-capable,
56 * rotational or flash-based devices, and to get the job done quickly
57 * for applications consisting of many I/O-bound processes.
58 *
59 * BFQ is described in [1], which also contains a reference to the
60 * initial, more theoretical paper on BFQ. In the latter paper, the
61 * interested reader can find full details on the main algorithm, as well as
62 * formulas of the guarantees and formal proofs of all the properties.
63 * With respect to the version of BFQ presented in these papers, this
64 * implementation adds a few more heuristics, such as the one that
65 * guarantees a low latency to soft real-time applications, and a
66 * hierarchical extension based on H-WF2Q+.
67 *
68 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
69 * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
70 * with O(log N) complexity derives from the one introduced with EEVDF
71 * in [3].
72 *
73 * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
74 * Scheduler", Proceedings of the First Workshop on Mobile System
75 * Technologies (MST-2015), May 2015.
76 * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
77 *
78 * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
79 * Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
80 * Oct 1997.
81 *
82 * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
83 *
84 * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
85 * First: A Flexible and Accurate Mechanism for Proportional Share
86 * Resource Allocation", technical report.
87 *
88 * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
89 */
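/*
 * Editorial sketch, not part of the original source: the practical
 * meaning of working in the service domain is that, over a long enough
 * interval, each continuously backlogged queue i receives a fraction
 * weight_i / wsum of the sectors transferred by the device, whatever
 * the current device speed is. For two always-backlogged queues with
 * weights 100 and 300 (wsum == 400), a toy model of the split is:
 *
 *	static unsigned long long
 *	expected_sectors(unsigned long long tot_sectors,
 *			 unsigned int weight, unsigned int wsum)
 *	{
 *		return tot_sectors * weight / wsum;
 *	}
 *
 *	expected_sectors(4000, 100, 400) == 1000
 *	expected_sectors(4000, 300, 400) == 3000
 *
 * B-WF2Q+ enforces this split by stamping queues with virtual start and
 * finish times derived from their budgets and weights, as described in
 * the structures and helpers below.
 */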
90#include <linux/module.h>
91#include <linux/slab.h>
92#include <linux/blkdev.h>
93#include <linux/cgroup.h>
94#include <linux/elevator.h>
95#include <linux/ktime.h>
96#include <linux/rbtree.h>
97#include <linux/ioprio.h>
98#include <linux/sbitmap.h>
99#include <linux/delay.h>
100
101#include "blk.h"
102#include "blk-mq.h"
103#include "blk-mq-tag.h"
104#include "blk-mq-sched.h"
105#include <linux/blktrace_api.h>
106#include <linux/hrtimer.h>
107#include <linux/blk-cgroup.h>
108
109#define BFQ_IOPRIO_CLASSES 3
110#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
111
112#define BFQ_MIN_WEIGHT 1
113#define BFQ_MAX_WEIGHT 1000
114#define BFQ_WEIGHT_CONVERSION_COEFF 10
115
116#define BFQ_DEFAULT_QUEUE_IOPRIO 4
117
118#define BFQ_WEIGHT_LEGACY_DFL 100
119#define BFQ_DEFAULT_GRP_IOPRIO 0
120#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
121
122struct bfq_entity;
123
124/**
125 * struct bfq_service_tree - per ioprio_class service tree.
126 *
127 * Each service tree represents a B-WF2Q+ scheduler on its own. Each
128 * ioprio_class has its own independent scheduler, and so its own
129 * bfq_service_tree. All the fields are protected by the queue lock
130 * of the containing bfqd.
131 */
132struct bfq_service_tree {
133 /* tree for active entities (i.e., those backlogged) */
134 struct rb_root active;
135 /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
136 struct rb_root idle;
137
138 /* idle entity with minimum F_i */
139 struct bfq_entity *first_idle;
140 /* idle entity with maximum F_i */
141 struct bfq_entity *last_idle;
142
143 /* scheduler virtual time */
144 u64 vtime;
145 /* scheduler weight sum; active and idle entities contribute to it */
146 unsigned long wsum;
147};
148
149/**
150 * struct bfq_sched_data - multi-class scheduler.
151 *
152 * bfq_sched_data is the basic scheduler queue. It supports three
153 * ioprio_classes, and can be used either as a toplevel queue or as an
154 * intermediate queue on a hierarchical setup. @next_in_service
155 * points to the active entity of the sched_data service trees that
156 * will be scheduled next. It is used to reduce the number of steps
157 * needed for each hierarchical-schedule update.
158 *
159 * The supported ioprio_classes are the same as in CFQ, in descending
160 * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
161 * Requests from higher priority queues are served before all the
162 * requests from lower priority queues; among requests of the same
163 * queue, requests are served according to B-WF2Q+.
164 * All the fields are protected by the queue lock of the containing bfqd.
165 */
166struct bfq_sched_data {
167 /* entity in service */
168 struct bfq_entity *in_service_entity;
169 /* head-of-line entity (see comments above) */
170 struct bfq_entity *next_in_service;
171 /* array of service trees, one per ioprio_class */
172 struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
173 /* last time CLASS_IDLE was served */
174 unsigned long bfq_class_idle_last_service;
175
176};
177
178/**
179 * struct bfq_entity - schedulable entity.
180 *
181 * A bfq_entity is used to represent either a bfq_queue (leaf node in the
182 * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
183 * entity belongs to the sched_data of the parent group in the cgroup
184 * hierarchy. Non-leaf entities have also their own sched_data, stored
185 * in @my_sched_data.
186 *
187 * Each entity stores independently its priority values; this would
188 * allow different weights on different devices, but this
189 * functionality is not exported to userspace by now. Priorities and
190 * weights are updated lazily, first storing the new values into the
191 * new_* fields, then setting the @prio_changed flag. As soon as
192 * there is a transition in the entity state that allows the priority
193 * update to take place the effective and the requested priority
194 * values are synchronized.
195 *
196 * Unless cgroups are used, the weight value is calculated from the
197 * ioprio to export the same interface as CFQ. When dealing with
198 * ``well-behaved'' queues (i.e., queues that do not spend too much
199 * time to consume their budget and have true sequential behavior, and
200 * when there are no external factors breaking anticipation) the
201 * relative weights at each level of the cgroups hierarchy should be
202 * guaranteed. All the fields are protected by the queue lock of the
203 * containing bfqd.
204 */
205struct bfq_entity {
206 /* service_tree member */
207 struct rb_node rb_node;
208
209 /*
210 * Flag, true if the entity is on a tree (either the active or
211 * the idle one of its service_tree) or is in service.
212 */
213 bool on_st;
214
215 /* B-WF2Q+ start and finish timestamps [sectors/weight] */
216 u64 start, finish;
217
218 /* tree the entity is enqueued into; %NULL if not on a tree */
219 struct rb_root *tree;
220
221 /*
222 * minimum start time of the (active) subtree rooted at this
223 * entity; used for O(log N) lookups into active trees
224 */
225 u64 min_start;
226
227 /* amount of service received during the last service slot */
228 int service;
229
230 /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
231 int budget;
232
233 /* weight of the queue */
234 int weight;
235 /* next weight if a change is in progress */
236 int new_weight;
237
238 /* original weight, used to implement weight boosting */
239 int orig_weight;
240
241 /* parent entity, for hierarchical scheduling */
242 struct bfq_entity *parent;
243
244 /*
245 * For non-leaf nodes in the hierarchy, the associated
246 * scheduler queue, %NULL on leaf nodes.
247 */
248 struct bfq_sched_data *my_sched_data;
249 /* the scheduler queue this entity belongs to */
250 struct bfq_sched_data *sched_data;
251
252 /* flag, set to request a weight, ioprio or ioprio_class change */
253 int prio_changed;
254};
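/*
 * Editorial sketch, not part of the original source: the lazy update
 * protocol described above means a priority/weight change is only
 * staged on the entity and applied later, e.g.:
 *
 *	entity->new_weight = 300;	// stage the new value
 *	entity->prio_changed = 1;	// mark the change as pending
 *
 * The pending change is then folded in by
 * __bfq_entity_update_weight_prio() (further below), which copies
 * new_weight into orig_weight/weight, moves the entity to the service
 * tree of its (possibly new) ioprio_class and restarts it at that
 * tree's virtual time.
 */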
255
256struct bfq_group;
257
258/**
259 * struct bfq_ttime - per process thinktime stats.
260 */
261struct bfq_ttime {
262 /* completion time of the last request */
263 u64 last_end_request;
264
265 /* total process thinktime */
266 u64 ttime_total;
267 /* number of thinktime samples */
268 unsigned long ttime_samples;
269 /* average process thinktime */
270 u64 ttime_mean;
271};
272
273/**
274 * struct bfq_queue - leaf schedulable entity.
275 *
276 * A bfq_queue is a leaf request queue; it can be associated with an
277 * io_context or more, if it is async. @cgroup holds a reference to
278 * the cgroup, to be sure that it does not disappear while a bfqq
279 * still references it (mostly to avoid races between request issuing
280 * and task migration followed by cgroup destruction). All the fields
281 * are protected by the queue lock of the containing bfqd.
282 */
283struct bfq_queue {
284 /* reference counter */
285 int ref;
286 /* parent bfq_data */
287 struct bfq_data *bfqd;
288
289 /* current ioprio and ioprio class */
290 unsigned short ioprio, ioprio_class;
291 /* next ioprio and ioprio class if a change is in progress */
292 unsigned short new_ioprio, new_ioprio_class;
293
294 /* sorted list of pending requests */
295 struct rb_root sort_list;
296 /* if fifo isn't expired, next request to serve */
297 struct request *next_rq;
298 /* number of sync and async requests queued */
299 int queued[2];
300 /* number of requests currently allocated */
301 int allocated;
302 /* number of pending metadata requests */
303 int meta_pending;
304 /* fifo list of requests in sort_list */
305 struct list_head fifo;
306
307 /* entity representing this queue in the scheduler */
308 struct bfq_entity entity;
309
310 /* maximum budget allowed from the feedback mechanism */
311 int max_budget;
312 /* budget expiration (in jiffies) */
313 unsigned long budget_timeout;
314
315 /* number of requests on the dispatch list or inside driver */
316 int dispatched;
317
318 /* status flags */
319 unsigned long flags;
320
321 /* node for active/idle bfqq list inside parent bfqd */
322 struct list_head bfqq_list;
323
324 /* associated @bfq_ttime struct */
325 struct bfq_ttime ttime;
326
327 /* bit vector: a 1 for each seeky request in history */
328 u32 seek_history;
329 /* position of the last request enqueued */
330 sector_t last_request_pos;
331
332 /* Number of consecutive pairs of request completion and
333 * arrival, such that the queue becomes idle after the
334 * completion, but the next request arrives within an idle
335 * time slice; used only if the queue's IO_bound flag has been
336 * cleared.
337 */
338 unsigned int requests_within_timer;
339
340 /* pid of the process owning the queue, used for logging purposes */
341 pid_t pid;
342
343 /* current maximum weight-raising time for this queue */
344 unsigned long wr_cur_max_time;
345 /*
346 * Start time of the current weight-raising period if
347 * the @bfq-queue is being weight-raised, otherwise
348 * finish time of the last weight-raising period.
349 */
350 unsigned long last_wr_start_finish;
351 /* factor by which the weight of this queue is multiplied */
352 unsigned int wr_coeff;
353};
354
355/**
356 * struct bfq_io_cq - per (request_queue, io_context) structure.
357 */
358struct bfq_io_cq {
359 /* associated io_cq structure */
360 struct io_cq icq; /* must be the first member */
361 /* array of two process queues, the sync and the async */
362 struct bfq_queue *bfqq[2];
363 /* per (request_queue, blkcg) ioprio */
364 int ioprio;
365#ifdef CONFIG_BFQ_GROUP_IOSCHED
366 uint64_t blkcg_serial_nr; /* the current blkcg serial */
367#endif
368};
369
370enum bfq_device_speed {
371 BFQ_BFQD_FAST,
372 BFQ_BFQD_SLOW,
373};
374
375/**
376 * struct bfq_data - per-device data structure.
377 *
378 * All the fields are protected by @lock.
379 */
380struct bfq_data {
381 /* device request queue */
382 struct request_queue *queue;
383 /* dispatch queue */
384 struct list_head dispatch;
385
386 /* root bfq_group for the device */
387 struct bfq_group *root_group;
388
389 /*
390 * Number of bfq_queues containing requests (including the
391 * queue in service, even if it is idling).
392 */
393 int busy_queues;
394 /* number of queued requests */
395 int queued;
396 /* number of requests dispatched and waiting for completion */
397 int rq_in_driver;
398
399 /*
400 * Maximum number of requests in driver in the last
401 * @hw_tag_samples completed requests.
402 */
403 int max_rq_in_driver;
404 /* number of samples used to calculate hw_tag */
405 int hw_tag_samples;
406 /* flag set to one if the driver is showing a queueing behavior */
407 int hw_tag;
408
409 /* number of budgets assigned */
410 int budgets_assigned;
411
412 /*
413 * Timer set when idling (waiting) for the next request from
414 * the queue in service.
415 */
416 struct hrtimer idle_slice_timer;
417
418 /* bfq_queue in service */
419 struct bfq_queue *in_service_queue;
420 /* bfq_io_cq (bic) associated with the @in_service_queue */
421 struct bfq_io_cq *in_service_bic;
422
423 /* on-disk position of the last served request */
424 sector_t last_position;
425
426 /* time of last request completion (ns) */
427 u64 last_completion;
428
429 /* time of first rq dispatch in current observation interval (ns) */
430 u64 first_dispatch;
431 /* time of last rq dispatch in current observation interval (ns) */
432 u64 last_dispatch;
433
434 /* beginning of the last budget */
435 ktime_t last_budget_start;
436 /* beginning of the last idle slice */
437 ktime_t last_idling_start;
438
439 /* number of samples in current observation interval */
440 int peak_rate_samples;
441 /* num of samples of seq dispatches in current observation interval */
442 u32 sequential_samples;
443 /* total num of sectors transferred in current observation interval */
444 u64 tot_sectors_dispatched;
445 /* max rq size seen during current observation interval (sectors) */
446 u32 last_rq_max_size;
447 /* time elapsed from first dispatch in current observ. interval (us) */
448 u64 delta_from_first;
449 /*
450 * Current estimate of the device peak rate, measured in
451 * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
452 * BFQ_RATE_SHIFT is performed to increase precision in
453 * fixed-point calculations.
454 */
455 u32 peak_rate;
456
457 /* maximum budget allotted to a bfq_queue before rescheduling */
458 int bfq_max_budget;
459
460 /* list of all the bfq_queues active on the device */
461 struct list_head active_list;
462 /* list of all the bfq_queues idle on the device */
463 struct list_head idle_list;
464
465 /*
466 * Timeout for async/sync requests; when it fires, requests
467 * are served in fifo order.
468 */
469 u64 bfq_fifo_expire[2];
470 /* weight of backward seeks wrt forward ones */
471 unsigned int bfq_back_penalty;
472 /* maximum allowed backward seek */
473 unsigned int bfq_back_max;
474 /* maximum idling time */
475 u32 bfq_slice_idle;
476
477 /* user-configured max budget value (0 for auto-tuning) */
478 int bfq_user_max_budget;
479 /*
480 * Timeout for bfq_queues to consume their budget; used to
481 * prevent seeky queues from imposing long latencies to
482 * sequential or quasi-sequential ones (this also implies that
483 * seeky queues cannot receive guarantees in the service
484 * domain; after a timeout they are charged for the time they
485 * have been in service, to preserve fairness among them, but
486 * without service-domain guarantees).
487 */
488 unsigned int bfq_timeout;
489
490 /*
491 * Number of consecutive requests that must be issued within
492 * the idle time slice to set again idling to a queue which
493 * was marked as non-I/O-bound (see the definition of the
494 * IO_bound flag for further details).
495 */
496 unsigned int bfq_requests_within_timer;
497
498 /*
499 * Force device idling whenever needed to provide accurate
500 * service guarantees, without caring about throughput
501 * issues. CAVEAT: this may even increase latencies, in case
502 * of useless idling for processes that did stop doing I/O.
503 */
504 bool strict_guarantees;
505
506 /* if set to true, low-latency heuristics are enabled */
507 bool low_latency;
508 /*
509 * Maximum factor by which the weight of a weight-raised queue
510 * is multiplied.
511 */
512 unsigned int bfq_wr_coeff;
513 /* maximum duration of a weight-raising period (jiffies) */
514 unsigned int bfq_wr_max_time;
515 /*
516 * Minimum idle period after which weight-raising may be
517 * reactivated for a queue (in jiffies).
518 */
519 unsigned int bfq_wr_min_idle_time;
520 /*
521 * Minimum period between request arrivals after which
522 * weight-raising may be reactivated for an already busy async
523 * queue (in jiffies).
524 */
525 unsigned long bfq_wr_min_inter_arr_async;
526 /*
527 * Cached value of the product R*T, used for computing the
528 * maximum duration of weight raising automatically.
529 */
530 u64 RT_prod;
531 /* device-speed class for the low-latency heuristic */
532 enum bfq_device_speed device_speed;
533
534 /* fallback dummy bfqq for extreme OOM conditions */
535 struct bfq_queue oom_bfqq;
536
537 spinlock_t lock;
538
539 /*
540 * bic associated with the task issuing current bio for
541 * merging. This and the next field are used as a support to
542 * be able to perform the bic lookup, needed by bio-merge
543 * functions, before the scheduler lock is taken, and thus
544 * avoid taking the request-queue lock while the scheduler
545 * lock is being held.
546 */
547 struct bfq_io_cq *bio_bic;
548 /* bfqq associated with the task issuing current bio for merging */
549 struct bfq_queue *bio_bfqq;
550};
551
552enum bfqq_state_flags {
553 BFQQF_busy = 0, /* has requests or is in service */
554 BFQQF_wait_request, /* waiting for a request */
555 BFQQF_non_blocking_wait_rq, /*
556 * waiting for a request
557 * without idling the device
558 */
559 BFQQF_fifo_expire, /* FIFO checked in this slice */
560 BFQQF_idle_window, /* slice idling enabled */
561 BFQQF_sync, /* synchronous queue */
562 BFQQF_IO_bound, /*
563 * bfqq has timed-out at least once
564 * having consumed at most 2/10 of
565 * its budget
566 */
567};
568
569#define BFQ_BFQQ_FNS(name) \
570static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
571{ \
572 __set_bit(BFQQF_##name, &(bfqq)->flags); \
573} \
574static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
575{ \
576 __clear_bit(BFQQF_##name, &(bfqq)->flags); \
577} \
578static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
579{ \
580 return test_bit(BFQQF_##name, &(bfqq)->flags); \
581}
582
583BFQ_BFQQ_FNS(busy);
584BFQ_BFQQ_FNS(wait_request);
585BFQ_BFQQ_FNS(non_blocking_wait_rq);
586BFQ_BFQQ_FNS(fifo_expire);
587BFQ_BFQQ_FNS(idle_window);
588BFQ_BFQQ_FNS(sync);
589BFQ_BFQQ_FNS(IO_bound);
590#undef BFQ_BFQQ_FNS
591
592/* Logging facilities. */
593#ifdef CONFIG_BFQ_GROUP_IOSCHED
594static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
595static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
596
597#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
598 char __pbuf[128]; \
599 \
600 blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
601 blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
602 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
603 __pbuf, ##args); \
604} while (0)
605
606#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
607 char __pbuf[128]; \
608 \
609 blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
610 blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \
611} while (0)
612
613#else /* CONFIG_BFQ_GROUP_IOSCHED */
614
615#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
616 blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \
617 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
618 ##args)
619#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
620
621#endif /* CONFIG_BFQ_GROUP_IOSCHED */
622
623#define bfq_log(bfqd, fmt, args...) \
624 blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
625
626/* Expiration reasons. */
627enum bfqq_expiration {
628 BFQQE_TOO_IDLE = 0, /*
629 * queue has been idling for
630 * too long
631 */
632 BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */
633 BFQQE_BUDGET_EXHAUSTED, /* budget consumed */
634 BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */
635 BFQQE_PREEMPTED /* preemption in progress */
636};
637
638struct bfqg_stats {
639#ifdef CONFIG_BFQ_GROUP_IOSCHED
640 /* number of ios merged */
641 struct blkg_rwstat merged;
642 /* total time spent on device in ns, may not be accurate w/ queueing */
643 struct blkg_rwstat service_time;
644 /* total time spent waiting in scheduler queue in ns */
645 struct blkg_rwstat wait_time;
646 /* number of IOs queued up */
647 struct blkg_rwstat queued;
648 /* total disk time and nr sectors dispatched by this group */
649 struct blkg_stat time;
650 /* sum of number of ios queued across all samples */
651 struct blkg_stat avg_queue_size_sum;
652 /* count of samples taken for average */
653 struct blkg_stat avg_queue_size_samples;
654 /* how many times this group has been removed from service tree */
655 struct blkg_stat dequeue;
656 /* total time spent waiting for it to be assigned a timeslice. */
657 struct blkg_stat group_wait_time;
658 /* time spent idling for this blkcg_gq */
659 struct blkg_stat idle_time;
660 /* total time with empty current active q with other requests queued */
661 struct blkg_stat empty_time;
662 /* fields after this shouldn't be cleared on stat reset */
663 uint64_t start_group_wait_time;
664 uint64_t start_idle_time;
665 uint64_t start_empty_time;
666 uint16_t flags;
667#endif /* CONFIG_BFQ_GROUP_IOSCHED */
668};
669
670#ifdef CONFIG_BFQ_GROUP_IOSCHED
671
672/*
673 * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
674 *
675 * @ps: @blkcg_policy_storage that this structure inherits
676 * @weight: weight of the bfq_group
677 */
678struct bfq_group_data {
679 /* must be the first member */
680 struct blkcg_policy_data pd;
681
682 unsigned int weight;
683};
684
685/**
686 * struct bfq_group - per (device, cgroup) data structure.
687 * @entity: schedulable entity to insert into the parent group sched_data.
688 * @sched_data: own sched_data, to contain child entities (they may be
689 * both bfq_queues and bfq_groups).
690 * @bfqd: the bfq_data for the device this group acts upon.
691 * @async_bfqq: array of async queues for all the tasks belonging to
692 * the group, one queue per ioprio value per ioprio_class,
693 * except for the idle class that has only one queue.
694 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
695 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
696 * to avoid too many special cases during group creation/
697 * migration.
698 * @stats: stats for this bfqg.
699 *
700 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
701 * there is a set of bfq_groups, each one collecting the lower-level
702 * entities belonging to the group that are acting on the same device.
703 *
704 * Locking works as follows:
705 * o @bfqd is protected by the queue lock, RCU is used to access it
706 * from the readers.
707 * o All the other fields are protected by the @bfqd queue lock.
708 */
709struct bfq_group {
710 /* must be the first member */
711 struct blkg_policy_data pd;
712
713 struct bfq_entity entity;
714 struct bfq_sched_data sched_data;
715
716 void *bfqd;
717
718 struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
719 struct bfq_queue *async_idle_bfqq;
720
721 struct bfq_entity *my_entity;
722
723 struct bfqg_stats stats;
724};
725
726#else
727struct bfq_group {
728 struct bfq_sched_data sched_data;
729
730 struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
731 struct bfq_queue *async_idle_bfqq;
732
733 struct rb_root rq_pos_tree;
734};
735#endif
736
737static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
738
739static unsigned int bfq_class_idx(struct bfq_entity *entity)
740{
741 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
742
743 return bfqq ? bfqq->ioprio_class - 1 :
744 BFQ_DEFAULT_GRP_CLASS - 1;
745}
746
747static struct bfq_service_tree *
748bfq_entity_service_tree(struct bfq_entity *entity)
749{
750 struct bfq_sched_data *sched_data = entity->sched_data;
751 unsigned int idx = bfq_class_idx(entity);
752
753 return sched_data->service_tree + idx;
754}
755
756static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
757{
758 return bic->bfqq[is_sync];
759}
760
761static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
762 bool is_sync)
763{
764 bic->bfqq[is_sync] = bfqq;
765}
766
767static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
768{
769 return bic->icq.q->elevator->elevator_data;
770}
771
772static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
773static void bfq_put_queue(struct bfq_queue *bfqq);
774static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
775 struct bio *bio, bool is_sync,
776 struct bfq_io_cq *bic);
777static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
778 struct bfq_group *bfqg);
779static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
780static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
781
782/* Expiration time of sync (0) and async (1) requests, in ns. */
783static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
784
785/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
786static const int bfq_back_max = 16 * 1024;
787
788/* Penalty of a backwards seek, in number of sectors. */
789static const int bfq_back_penalty = 2;
790
791/* Idling period duration, in ns. */
792static u64 bfq_slice_idle = NSEC_PER_SEC / 125;
793
794/* Minimum number of assigned budgets for which stats are safe to compute. */
795static const int bfq_stats_min_budgets = 194;
796
797/* Default maximum budget values, in sectors and number of requests. */
798static const int bfq_default_max_budget = 16 * 1024;
799
800/*
801 * Async to sync throughput distribution is controlled as follows:
802 * when an async request is served, the entity is charged the number
803 * of sectors of the request, multiplied by the factor below
804 */
805static const int bfq_async_charge_factor = 10;
806
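/*
 * Editorial example, not part of the original source: with the factor
 * above equal to 10, serving a 64-sector async request charges the
 * owning entity 64 * 10 == 640 sectors of budget, so async queues
 * drain their budgets about ten times faster than sync queues doing
 * the same amount of I/O, which biases the bandwidth split toward
 * sync requests.
 */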
807/* Default timeout values, in jiffies, approximating CFQ defaults. */
808static const int bfq_timeout = HZ / 8;
809
810static struct kmem_cache *bfq_pool;
811
812/* Below this threshold (in ns), we consider thinktime immediate. */
813#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
814
815/* hw_tag detection: parallel requests threshold and min samples needed. */
816#define BFQ_HW_QUEUE_THRESHOLD 4
817#define BFQ_HW_QUEUE_SAMPLES 32
818
819#define BFQQ_SEEK_THR (sector_t)(8 * 100)
820#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
821#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
822#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
823
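/*
 * Editorial example, not part of the original source: seek_history is
 * used as a 32-bit shift register, one bit per past request, with a 1
 * shifted in for each request classified as seeky. BFQQ_SEEKY() then
 * reports a queue as seeky when more than 32/8 == 4 of its last 32
 * requests were seeky, e.g.:
 *
 *	bfqq->seek_history = 0x1f;	// 5 of the last 32 requests seeky
 *	BFQQ_SEEKY(bfqq);		// hweight32(0x1f) == 5 > 4 -> true
 */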
824/* Min number of samples required to perform peak-rate update */
825#define BFQ_RATE_MIN_SAMPLES 32
826/* Min observation time interval required to perform a peak-rate update (ns) */
827#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC)
828/* Target observation time interval for a peak-rate update (ns) */
829#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC
830
831/* Shift used for peak rate fixed precision calculations. */
832#define BFQ_RATE_SHIFT 16
833
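/*
 * Editorial example, not part of the original source: with
 * BFQ_RATE_SHIFT == 16, a rate of 100 sectors/usec is stored as
 * 100 << 16 == 6553600, and a raw measurement can be converted with
 * integer arithmetic only, e.g. (assuming u64 operands):
 *
 *	rate = div64_u64(sectors_dispatched << BFQ_RATE_SHIFT,
 *			 interval_us);
 *
 * The 16 fractional bits preserve precision for devices whose peak
 * rate is well below one sector per microsecond.
 */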
834/*
835 * By default, BFQ computes the duration of the weight raising for
836 * interactive applications automatically, using the following formula:
837 * duration = (R / r) * T, where r is the peak rate of the device, and
838 * R and T are two reference parameters.
839 * In particular, R is the peak rate of the reference device (see below),
840 * and T is a reference time: given the systems that are likely to be
841 * installed on the reference device according to its speed class, T is
842 * about the maximum time needed, under BFQ and while reading two files in
843 * parallel, to load typical large applications on these systems.
844 * In practice, the slower/faster the device at hand is, the more/less it
845 * takes to load applications with respect to the reference device.
846 * Accordingly, the longer/shorter BFQ grants weight raising to interactive
847 * applications.
848 *
849 * BFQ uses four different reference pairs (R, T), depending on:
850 * . whether the device is rotational or non-rotational;
851 * . whether the device is slow, such as old or portable HDDs, as well as
852 * SD cards, or fast, such as newer HDDs and SSDs.
853 *
854 * The device's speed class is dynamically (re)detected in
855 * bfq_update_peak_rate() every time the estimated peak rate is updated.
856 *
857 * In the following definitions, R_slow[0]/R_fast[0] and
858 * T_slow[0]/T_fast[0] are the reference values for a slow/fast
859 * rotational device, whereas R_slow[1]/R_fast[1] and
860 * T_slow[1]/T_fast[1] are the reference values for a slow/fast
861 * non-rotational device. Finally, device_speed_thresh are the
862 * thresholds used to switch between speed classes. The reference
863 * rates are not the actual peak rates of the devices used as a
864 * reference, but slightly lower values. The reason for using these
865 * slightly lower values is that the peak-rate estimator tends to
866 * yield slightly lower values than the actual peak rate (it can yield
867 * the actual peak rate only if there is only one process doing I/O,
868 * and the process does sequential I/O).
869 *
870 * Both the reference peak rates and the thresholds are measured in
871 * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
872 */
873static int R_slow[2] = {1000, 10700};
874static int R_fast[2] = {14000, 33000};
875/*
876 * To improve readability, a conversion function is used to initialize the
877 * following arrays, which entails that they can be initialized only in a
878 * function.
879 */
880static int T_slow[2];
881static int T_fast[2];
882static int device_speed_thresh[2];
883
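/*
 * Editorial sketch, not part of the original source: given the cached
 * product RT_prod == R * T for the current speed class, the formula
 * above reduces, for a device with estimated peak rate r, to
 * something like
 *
 *	duration = div_u64(bfqd->RT_prod, bfqd->peak_rate);
 *
 * so, roughly, a device twice as fast as the reference one grants
 * interactive queues half as much weight-raising time.
 */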
884#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
885 { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
886
887#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
888#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
889
890/**
891 * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
892 * @icq: the iocontext queue.
893 */
894static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
895{
896 /* bic->icq is the first member, %NULL will convert to %NULL */
897 return container_of(icq, struct bfq_io_cq, icq);
898}
899
900/**
901 * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
902 * @bfqd: the lookup key.
903 * @ioc: the io_context of the process doing I/O.
904 * @q: the request queue.
905 */
906static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
907 struct io_context *ioc,
908 struct request_queue *q)
909{
910 if (ioc) {
911 unsigned long flags;
912 struct bfq_io_cq *icq;
913
914 spin_lock_irqsave(q->queue_lock, flags);
915 icq = icq_to_bic(ioc_lookup_icq(ioc, q));
916 spin_unlock_irqrestore(q->queue_lock, flags);
917
918 return icq;
919 }
920
921 return NULL;
922}
923
924/*
925 * Scheduler run of queue, if there are requests pending and no one in the
926 * driver that will restart queueing.
927 */
928static void bfq_schedule_dispatch(struct bfq_data *bfqd)
929{
930 if (bfqd->queued != 0) {
931 bfq_log(bfqd, "schedule dispatch");
932 blk_mq_run_hw_queues(bfqd->queue, true);
933 }
934}
935
936/**
937 * bfq_gt - compare two timestamps.
938 * @a: first ts.
939 * @b: second ts.
940 *
941 * Return @a > @b, dealing with wrapping correctly.
942 */
943static int bfq_gt(u64 a, u64 b)
944{
945 return (s64)(a - b) > 0;
946}
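/*
 * Editorial example, not part of the original source: the signed cast
 * above keeps the comparison correct across wraparound of the 64-bit
 * virtual clock, e.g.:
 *
 *	u64 b = ULLONG_MAX - 5, a = 2;
 *	bfq_gt(a, b);	// (s64)(a - b) == 8 > 0, so a is "later" than b
 *
 * whereas a plain a > b would wrongly report a as earlier.
 */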
947
948static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
949{
950 struct rb_node *node = tree->rb_node;
951
952 return rb_entry(node, struct bfq_entity, rb_node);
953}
954
955static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
956
957static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
958
959/**
960 * bfq_update_next_in_service - update sd->next_in_service
961 * @sd: sched_data for which to perform the update.
962 * @new_entity: if not NULL, pointer to the entity whose activation,
963 * requeueing or repositioning triggered the invocation of
964 * this function.
965 *
966 * This function is called to update sd->next_in_service, which, in
967 * its turn, may change as a consequence of the insertion or
968 * extraction of an entity into/from one of the active trees of
969 * sd. These insertions/extractions occur as a consequence of
970 * activations/deactivations of entities, with some activations being
971 * 'true' activations, and other activations being requeueings (i.e.,
972 * implementing the second, requeueing phase of the mechanism used to
973 * reposition an entity in its active tree; see comments on
974 * __bfq_activate_entity and __bfq_requeue_entity for details). In
975 * both the last two activation sub-cases, new_entity points to the
976 * just activated or requeued entity.
977 *
978 * Returns true if sd->next_in_service changes in such a way that
979 * entity->parent may become the next_in_service for its parent
980 * entity.
981 */
982static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
983 struct bfq_entity *new_entity)
984{
985 struct bfq_entity *next_in_service = sd->next_in_service;
986 bool parent_sched_may_change = false;
987
988 /*
989 * If this update is triggered by the activation, requeueing
990 * or repositioning of an entity that does not coincide with
991 * sd->next_in_service, then a full lookup in the active tree
992 * can be avoided. In fact, it is enough to check whether the
993 * just-modified entity has a higher priority than
994 * sd->next_in_service, or, even if it has the same priority
995 * as sd->next_in_service, is eligible and has a lower virtual
996 * finish time than sd->next_in_service. If this compound
997 * condition holds, then the new entity becomes the new
998 * next_in_service. Otherwise no change is needed.
999 */
1000 if (new_entity && new_entity != sd->next_in_service) {
1001 /*
1002 * Flag used to decide whether to replace
1003 * sd->next_in_service with new_entity. Tentatively
1004 * set to true, and left as true if
1005 * sd->next_in_service is NULL.
1006 */
1007 bool replace_next = true;
1008
1009 /*
1010 * If there is already a next_in_service candidate
1011 * entity, then compare class priorities or timestamps
1012 * to decide whether to replace sd->next_in_service with
1013 * new_entity.
1014 */
1015 if (next_in_service) {
1016 unsigned int new_entity_class_idx =
1017 bfq_class_idx(new_entity);
1018 struct bfq_service_tree *st =
1019 sd->service_tree + new_entity_class_idx;
1020
1021 /*
1022 * For efficiency, evaluate the most likely
1023 * sub-condition first.
1024 */
1025 replace_next =
1026 (new_entity_class_idx ==
1027 bfq_class_idx(next_in_service)
1028 &&
1029 !bfq_gt(new_entity->start, st->vtime)
1030 &&
1031 bfq_gt(next_in_service->finish,
1032 new_entity->finish))
1033 ||
1034 new_entity_class_idx <
1035 bfq_class_idx(next_in_service);
1036 }
1037
1038 if (replace_next)
1039 next_in_service = new_entity;
1040 } else /* invoked because of a deactivation: lookup needed */
1041 next_in_service = bfq_lookup_next_entity(sd);
1042
1043 if (next_in_service) {
1044 parent_sched_may_change = !sd->next_in_service ||
1045 bfq_update_parent_budget(next_in_service);
1046 }
1047
1048 sd->next_in_service = next_in_service;
1049
1050 if (!next_in_service)
1051 return parent_sched_may_change;
1052
1053 return parent_sched_may_change;
1054}
1055
1056#ifdef CONFIG_BFQ_GROUP_IOSCHED
1057/* both next loops stop at one of the child entities of the root group */
1058#define for_each_entity(entity) \
1059 for (; entity ; entity = entity->parent)
1060
1061/*
1062 * For each iteration, compute parent in advance, so as to be safe if
1063 * entity is deallocated during the iteration. Such a deallocation may
1064 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
1065 * containing entity.
1066 */
1067#define for_each_entity_safe(entity, parent) \
1068 for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
1069
1070/*
1071 * Returns true if this budget changes may let next_in_service->parent
1072 * become the next_in_service entity for its parent entity.
1073 */
1074static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
1075{
1076 struct bfq_entity *bfqg_entity;
1077 struct bfq_group *bfqg;
1078 struct bfq_sched_data *group_sd;
1079 bool ret = false;
1080
1081 group_sd = next_in_service->sched_data;
1082
1083 bfqg = container_of(group_sd, struct bfq_group, sched_data);
1084 /*
1085 * bfq_group's my_entity field is not NULL only if the group
1086 * is not the root group. We must not touch the root entity
1087 * as it must never become an in-service entity.
1088 */
1089 bfqg_entity = bfqg->my_entity;
1090 if (bfqg_entity) {
1091 if (bfqg_entity->budget > next_in_service->budget)
1092 ret = true;
1093 bfqg_entity->budget = next_in_service->budget;
1094 }
1095
1096 return ret;
1097}
1098
1099/*
1100 * This function tells whether entity stops being a candidate for next
1101 * service, according to the following logic.
1102 *
1103 * This function is invoked for an entity that is about to be set in
1104 * service. If such an entity is a queue, then the entity is no longer
1105 * a candidate for next service (i.e, a candidate entity to serve
1106 * after the in-service entity is expired). The function then returns
1107 * true.
1108 */
1109static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
1110{
1111 if (bfq_entity_to_bfqq(entity))
1112 return true;
1113
1114 return false;
1115}
1116
1117#else /* CONFIG_BFQ_GROUP_IOSCHED */
1118/*
1119 * Next two macros are fake loops when cgroups support is not
1120 * enabled. In fact, in such a case, there is only one level to go up
1121 * (to reach the root group).
1122 */
1123#define for_each_entity(entity) \
1124 for (; entity ; entity = NULL)
1125
1126#define for_each_entity_safe(entity, parent) \
1127 for (parent = NULL; entity ; entity = parent)
1128
1129static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
1130{
1131 return false;
1132}
1133
1134static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
1135{
1136 return true;
1137}
1138
1139#endif /* CONFIG_BFQ_GROUP_IOSCHED */
1140
1141/*
1142 * Shift for timestamp calculations. This actually limits the maximum
1143 * service allowed in one timestamp delta (small shift values increase it),
1144 * the maximum total weight that can be used for the queues in the system
1145 * (big shift values increase it), and the period of virtual time
1146 * wraparounds.
1147 */
1148#define WFQ_SERVICE_SHIFT 22
1149
1150static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
1151{
1152 struct bfq_queue *bfqq = NULL;
1153
1154 if (!entity->my_sched_data)
1155 bfqq = container_of(entity, struct bfq_queue, entity);
1156
1157 return bfqq;
1158}
1159
1160
1161/**
1162 * bfq_delta - map service into the virtual time domain.
1163 * @service: amount of service.
1164 * @weight: scale factor (weight of an entity or weight sum).
1165 */
1166static u64 bfq_delta(unsigned long service, unsigned long weight)
1167{
1168 u64 d = (u64)service << WFQ_SERVICE_SHIFT;
1169
1170 do_div(d, weight);
1171 return d;
1172}
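/*
 * Editorial example, not part of the original source: with
 * WFQ_SERVICE_SHIFT == 22, charging 1000 sectors to an entity of
 * weight 100 yields
 *
 *	bfq_delta(1000, 100) == (1000ULL << 22) / 100 == 41943040
 *
 * virtual-time units, ten times more than for an entity of weight
 * 1000; heavier entities thus accumulate virtual time more slowly and
 * get scheduled proportionally more often.
 */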
1173
1174/**
1175 * bfq_calc_finish - assign the finish time to an entity.
1176 * @entity: the entity to act upon.
1177 * @service: the service to be charged to the entity.
1178 */
1179static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
1180{
1181 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1182
1183 entity->finish = entity->start +
1184 bfq_delta(service, entity->weight);
1185
1186 if (bfqq) {
1187 bfq_log_bfqq(bfqq->bfqd, bfqq,
1188 "calc_finish: serv %lu, w %d",
1189 service, entity->weight);
1190 bfq_log_bfqq(bfqq->bfqd, bfqq,
1191 "calc_finish: start %llu, finish %llu, delta %llu",
1192 entity->start, entity->finish,
1193 bfq_delta(service, entity->weight));
1194 }
1195}
1196
1197/**
1198 * bfq_entity_of - get an entity from a node.
1199 * @node: the node field of the entity.
1200 *
1201 * Convert a node pointer to the relative entity. This is used only
1202 * to simplify the logic of some functions and not as the generic
1203 * conversion mechanism because, e.g., in the tree walking functions,
1204 * the check for a %NULL value would be redundant.
1205 */
1206static struct bfq_entity *bfq_entity_of(struct rb_node *node)
1207{
1208 struct bfq_entity *entity = NULL;
1209
1210 if (node)
1211 entity = rb_entry(node, struct bfq_entity, rb_node);
1212
1213 return entity;
1214}
1215
1216/**
1217 * bfq_extract - remove an entity from a tree.
1218 * @root: the tree root.
1219 * @entity: the entity to remove.
1220 */
1221static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
1222{
1223 entity->tree = NULL;
1224 rb_erase(&entity->rb_node, root);
1225}
1226
1227/**
1228 * bfq_idle_extract - extract an entity from the idle tree.
1229 * @st: the service tree of the owning @entity.
1230 * @entity: the entity being removed.
1231 */
1232static void bfq_idle_extract(struct bfq_service_tree *st,
1233 struct bfq_entity *entity)
1234{
1235 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1236 struct rb_node *next;
1237
1238 if (entity == st->first_idle) {
1239 next = rb_next(&entity->rb_node);
1240 st->first_idle = bfq_entity_of(next);
1241 }
1242
1243 if (entity == st->last_idle) {
1244 next = rb_prev(&entity->rb_node);
1245 st->last_idle = bfq_entity_of(next);
1246 }
1247
1248 bfq_extract(&st->idle, entity);
1249
1250 if (bfqq)
1251 list_del(&bfqq->bfqq_list);
1252}
1253
1254/**
1255 * bfq_insert - generic tree insertion.
1256 * @root: tree root.
1257 * @entity: entity to insert.
1258 *
1259 * This is used for the idle and the active tree, since they are both
1260 * ordered by finish time.
1261 */
1262static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
1263{
1264 struct bfq_entity *entry;
1265 struct rb_node **node = &root->rb_node;
1266 struct rb_node *parent = NULL;
1267
1268 while (*node) {
1269 parent = *node;
1270 entry = rb_entry(parent, struct bfq_entity, rb_node);
1271
1272 if (bfq_gt(entry->finish, entity->finish))
1273 node = &parent->rb_left;
1274 else
1275 node = &parent->rb_right;
1276 }
1277
1278 rb_link_node(&entity->rb_node, parent, node);
1279 rb_insert_color(&entity->rb_node, root);
1280
1281 entity->tree = root;
1282}
1283
1284/**
1285 * bfq_update_min - update the min_start field of an entity.
1286 * @entity: the entity to update.
1287 * @node: one of its children.
1288 *
1289 * This function is called when @entity may store an invalid value for
1290 * min_start due to updates to the active tree. The function assumes
1291 * that the subtree rooted at @node (which may be its left or its right
1292 * child) has a valid min_start value.
1293 */
1294static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
1295{
1296 struct bfq_entity *child;
1297
1298 if (node) {
1299 child = rb_entry(node, struct bfq_entity, rb_node);
1300 if (bfq_gt(entity->min_start, child->min_start))
1301 entity->min_start = child->min_start;
1302 }
1303}
1304
1305/**
1306 * bfq_update_active_node - recalculate min_start.
1307 * @node: the node to update.
1308 *
1309 * @node may have changed position or one of its children may have moved,
1310 * this function updates its min_start value. The left and right subtrees
1311 * are assumed to hold a correct min_start value.
1312 */
1313static void bfq_update_active_node(struct rb_node *node)
1314{
1315 struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
1316
1317 entity->min_start = entity->start;
1318 bfq_update_min(entity, node->rb_right);
1319 bfq_update_min(entity, node->rb_left);
1320}
1321
1322/**
1323 * bfq_update_active_tree - update min_start for the whole active tree.
1324 * @node: the starting node.
1325 *
1326 * @node must be the deepest modified node after an update. This function
1327 * updates its min_start using the values held by its children, assuming
1328 * that they did not change, and then updates all the nodes that may have
1329 * changed in the path to the root. The only nodes that may have changed
1330 * are the ones in the path or their siblings.
1331 */
1332static void bfq_update_active_tree(struct rb_node *node)
1333{
1334 struct rb_node *parent;
1335
1336up:
1337 bfq_update_active_node(node);
1338
1339 parent = rb_parent(node);
1340 if (!parent)
1341 return;
1342
1343 if (node == parent->rb_left && parent->rb_right)
1344 bfq_update_active_node(parent->rb_right);
1345 else if (parent->rb_left)
1346 bfq_update_active_node(parent->rb_left);
1347
1348 node = parent;
1349 goto up;
1350}
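/*
 * Editorial note, not part of the original source: after the updates
 * above, every node of the active tree satisfies the augmentation
 * invariant
 *
 *	node->min_start == min(node->start,
 *			       min_start of the left subtree,
 *			       min_start of the right subtree)
 *
 * which is what allows the O(log N) search for an eligible entity
 * (one with start <= vtime) with the smallest finish time, as hinted
 * at in the comment on @min_start in struct bfq_entity.
 */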
1351
1352/**
1353 * bfq_active_insert - insert an entity in the active tree of its
1354 * group/device.
1355 * @st: the service tree of the entity.
1356 * @entity: the entity being inserted.
1357 *
1358 * The active tree is ordered by finish time, but an extra key is kept
1359 * per each node, containing the minimum value for the start times of
1360 * its children (and the node itself), so it's possible to search for
1361 * the eligible node with the lowest finish time in logarithmic time.
1362 */
1363static void bfq_active_insert(struct bfq_service_tree *st,
1364 struct bfq_entity *entity)
1365{
1366 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1367 struct rb_node *node = &entity->rb_node;
1368#ifdef CONFIG_BFQ_GROUP_IOSCHED
1369 struct bfq_sched_data *sd = NULL;
1370 struct bfq_group *bfqg = NULL;
1371 struct bfq_data *bfqd = NULL;
1372#endif
1373
1374 bfq_insert(&st->active, entity);
1375
1376 if (node->rb_left)
1377 node = node->rb_left;
1378 else if (node->rb_right)
1379 node = node->rb_right;
1380
1381 bfq_update_active_tree(node);
1382
1383#ifdef CONFIG_BFQ_GROUP_IOSCHED
1384 sd = entity->sched_data;
1385 bfqg = container_of(sd, struct bfq_group, sched_data);
1386 bfqd = (struct bfq_data *)bfqg->bfqd;
1387#endif
1388 if (bfqq)
1389 list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
1390}
1391
1392/**
1393 * bfq_ioprio_to_weight - calc a weight from an ioprio.
1394 * @ioprio: the ioprio value to convert.
1395 */
1396static unsigned short bfq_ioprio_to_weight(int ioprio)
1397{
1398 return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
1399}
1400
1401/**
1402 * bfq_weight_to_ioprio - calc an ioprio from a weight.
1403 * @weight: the weight value to convert.
1404 *
1405 * To preserve as much as possible the old only-ioprio user interface,
1406 * 0 is used as an escape ioprio value for weights (numerically) equal or
1407 * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
1408 */
1409static unsigned short bfq_weight_to_ioprio(int weight)
1410{
1411 return max_t(int, 0, IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF -
1412 weight) / BFQ_WEIGHT_CONVERSION_COEFF;
1413}
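/*
 * Editorial example, assuming the conversion helpers above: with
 * IOPRIO_BE_NR == 8 and BFQ_WEIGHT_CONVERSION_COEFF == 10,
 *
 *	bfq_ioprio_to_weight(4) == (8 - 4) * 10 == 40
 *	bfq_weight_to_ioprio(40) == (80 - 40) / 10 == 4
 *
 * so the two helpers invert each other for the weights that
 * bfq_ioprio_to_weight() can produce, while any weight >= 80 maps
 * back to the escape ioprio 0.
 */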
1414
1415static void bfq_get_entity(struct bfq_entity *entity)
1416{
1417 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1418
1419 if (bfqq) {
1420 bfqq->ref++;
1421 bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
1422 bfqq, bfqq->ref);
1423 }
1424}
1425
1426/**
1427 * bfq_find_deepest - find the deepest node that an extraction can modify.
1428 * @node: the node being removed.
1429 *
1430 * Do the first step of an extraction in an rb tree, looking for the
1431 * node that will replace @node, and returning the deepest node that
1432 * the following modifications to the tree can touch. If @node is the
1433 * last node in the tree return %NULL.
1434 */
1435static struct rb_node *bfq_find_deepest(struct rb_node *node)
1436{
1437 struct rb_node *deepest;
1438
1439 if (!node->rb_right && !node->rb_left)
1440 deepest = rb_parent(node);
1441 else if (!node->rb_right)
1442 deepest = node->rb_left;
1443 else if (!node->rb_left)
1444 deepest = node->rb_right;
1445 else {
1446 deepest = rb_next(node);
1447 if (deepest->rb_right)
1448 deepest = deepest->rb_right;
1449 else if (rb_parent(deepest) != node)
1450 deepest = rb_parent(deepest);
1451 }
1452
1453 return deepest;
1454}
1455
1456/**
1457 * bfq_active_extract - remove an entity from the active tree.
1458 * @st: the service_tree containing the tree.
1459 * @entity: the entity being removed.
1460 */
1461static void bfq_active_extract(struct bfq_service_tree *st,
1462 struct bfq_entity *entity)
1463{
1464 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1465 struct rb_node *node;
1466#ifdef CONFIG_BFQ_GROUP_IOSCHED
1467 struct bfq_sched_data *sd = NULL;
1468 struct bfq_group *bfqg = NULL;
1469 struct bfq_data *bfqd = NULL;
1470#endif
1471
1472 node = bfq_find_deepest(&entity->rb_node);
1473 bfq_extract(&st->active, entity);
1474
1475 if (node)
1476 bfq_update_active_tree(node);
1477
1478#ifdef CONFIG_BFQ_GROUP_IOSCHED
1479 sd = entity->sched_data;
1480 bfqg = container_of(sd, struct bfq_group, sched_data);
1481 bfqd = (struct bfq_data *)bfqg->bfqd;
1482#endif
1483 if (bfqq)
1484 list_del(&bfqq->bfqq_list);
1485}
1486
1487/**
1488 * bfq_idle_insert - insert an entity into the idle tree.
1489 * @st: the service tree containing the tree.
1490 * @entity: the entity to insert.
1491 */
1492static void bfq_idle_insert(struct bfq_service_tree *st,
1493 struct bfq_entity *entity)
1494{
1495 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1496 struct bfq_entity *first_idle = st->first_idle;
1497 struct bfq_entity *last_idle = st->last_idle;
1498
1499 if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
1500 st->first_idle = entity;
1501 if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
1502 st->last_idle = entity;
1503
1504 bfq_insert(&st->idle, entity);
1505
1506 if (bfqq)
1507 list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
1508}
1509
1510/**
1511 * bfq_forget_entity - do not consider entity any longer for scheduling
1512 * @st: the service tree.
1513 * @entity: the entity being removed.
1514 * @is_in_service: true if entity is currently the in-service entity.
1515 *
1516 * Forget everything about @entity. In addition, if entity represents
1517 * a queue, and the latter is not in service, then release the service
1518 * reference to the queue (the one taken through bfq_get_entity). In
1519 * fact, in this case, there is really no more service reference to
1520 * the queue, as the latter is also outside any service tree. If,
1521 * instead, the queue is in service, then __bfq_bfqd_reset_in_service
1522 * will take care of putting the reference when the queue finally
1523 * stops being served.
1524 */
1525static void bfq_forget_entity(struct bfq_service_tree *st,
1526 struct bfq_entity *entity,
1527 bool is_in_service)
1528{
1529 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1530
1531 entity->on_st = false;
1532 st->wsum -= entity->weight;
1533 if (bfqq && !is_in_service)
1534 bfq_put_queue(bfqq);
1535}
1536
1537/**
1538 * bfq_put_idle_entity - release the idle tree ref of an entity.
1539 * @st: service tree for the entity.
1540 * @entity: the entity being released.
1541 */
1542static void bfq_put_idle_entity(struct bfq_service_tree *st,
1543 struct bfq_entity *entity)
1544{
1545 bfq_idle_extract(st, entity);
1546 bfq_forget_entity(st, entity,
1547 entity == entity->sched_data->in_service_entity);
1548}
1549
1550/**
1551 * bfq_forget_idle - update the idle tree if necessary.
1552 * @st: the service tree to act upon.
1553 *
1554 * To preserve the global O(log N) complexity we only remove one entry here;
1555 * as the idle tree will not grow indefinitely this can be done safely.
1556 */
1557static void bfq_forget_idle(struct bfq_service_tree *st)
1558{
1559 struct bfq_entity *first_idle = st->first_idle;
1560 struct bfq_entity *last_idle = st->last_idle;
1561
1562 if (RB_EMPTY_ROOT(&st->active) && last_idle &&
1563 !bfq_gt(last_idle->finish, st->vtime)) {
1564 /*
1565 * Forget the whole idle tree, increasing the vtime past
1566 * the last finish time of idle entities.
1567 */
1568 st->vtime = last_idle->finish;
1569 }
1570
1571 if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
1572 bfq_put_idle_entity(st, first_idle);
1573}
1574
1575static struct bfq_service_tree *
1576__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
1577 struct bfq_entity *entity)
1578{
1579 struct bfq_service_tree *new_st = old_st;
1580
1581 if (entity->prio_changed) {
1582 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1583 unsigned int prev_weight, new_weight;
1584 struct bfq_data *bfqd = NULL;
1585#ifdef CONFIG_BFQ_GROUP_IOSCHED
1586 struct bfq_sched_data *sd;
1587 struct bfq_group *bfqg;
1588#endif
1589
1590 if (bfqq)
1591 bfqd = bfqq->bfqd;
1592#ifdef CONFIG_BFQ_GROUP_IOSCHED
1593 else {
1594 sd = entity->my_sched_data;
1595 bfqg = container_of(sd, struct bfq_group, sched_data);
1596 bfqd = (struct bfq_data *)bfqg->bfqd;
1597 }
1598#endif
1599
1600 old_st->wsum -= entity->weight;
1601
1602 if (entity->new_weight != entity->orig_weight) {
1603 if (entity->new_weight < BFQ_MIN_WEIGHT ||
1604 entity->new_weight > BFQ_MAX_WEIGHT) {
1605 pr_crit("update_weight_prio: new_weight %d\n",
1606 entity->new_weight);
1607 if (entity->new_weight < BFQ_MIN_WEIGHT)
1608 entity->new_weight = BFQ_MIN_WEIGHT;
1609 else
1610 entity->new_weight = BFQ_MAX_WEIGHT;
1611 }
1612 entity->orig_weight = entity->new_weight;
1613 if (bfqq)
1614 bfqq->ioprio =
1615 bfq_weight_to_ioprio(entity->orig_weight);
1616 }
1617
1618 if (bfqq)
1619 bfqq->ioprio_class = bfqq->new_ioprio_class;
1620 entity->prio_changed = 0;
1621
1622 /*
1623 * NOTE: here we may be changing the weight too early,
1624 * this will cause unfairness. The correct approach
1625 * would have required additional complexity to defer
1626 * weight changes to the proper time instants (i.e.,
1627 * when entity->finish <= old_st->vtime).
1628 */
1629 new_st = bfq_entity_service_tree(entity);
1630
1631 prev_weight = entity->weight;
44e44a1b
PV
1632 new_weight = entity->orig_weight *
1633 (bfqq ? bfqq->wr_coeff : 1);
aee69d78
PV
1634 entity->weight = new_weight;
1635
1636 new_st->wsum += entity->weight;
1637
1638 if (new_st != old_st)
1639 entity->start = new_st->vtime;
1640 }
1641
1642 return new_st;
1643}
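/*
 * An illustrative sketch with assumed values: a request to set
 * new_weight outside [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT] is first
 * saturated to the nearest bound and only then copied into
 * orig_weight. The weight actually used for scheduling is
 * orig_weight * wr_coeff, so a queue with orig_weight 100 that is
 * weight-raised with, say, wr_coeff = 30 contributes 3000 to
 * new_st->wsum, while the same queue outside a weight-raising period
 * contributes just 100.
 */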
1644
e21b7a0b
AA
1645static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
1646static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
1647
aee69d78
PV
1648/**
1649 * bfq_bfqq_served - update the scheduler status after selection for
1650 * service.
1651 * @bfqq: the queue being served.
1652 * @served: bytes to transfer.
1653 *
1654 * NOTE: this can be optimized, as the timestamps of upper level entities
1655 * are synchronized every time a new bfqq is selected for service. For now,
1656 * we keep it to better check consistency.
1657 */
1658static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
1659{
1660 struct bfq_entity *entity = &bfqq->entity;
1661 struct bfq_service_tree *st;
1662
1663 for_each_entity(entity) {
1664 st = bfq_entity_service_tree(entity);
1665
1666 entity->service += served;
1667
1668 st->vtime += bfq_delta(served, st->wsum);
1669 bfq_forget_idle(st);
1670 }
e21b7a0b 1671 bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
aee69d78
PV
1672 bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
1673}
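/*
 * A worked example with assumed numbers: if a service tree has
 * wsum = 300 and the queue is charged served = 60 sectors, st->vtime
 * advances by bfq_delta(60, 300), an amount proportional to the
 * service charged and inversely proportional to the aggregate weight
 * on the tree; the same 60 sectors charged on a tree with wsum = 600
 * would move the virtual time forward only half as much.
 */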
1674
1675/**
c074170e
PV
1676 * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
1677 * of the time interval during which bfqq has been in
1678 * service.
1679 * @bfqd: the device
aee69d78 1680 * @bfqq: the queue that needs a service update.
c074170e 1681 * @time_ms: the amount of time during which the queue has received service
aee69d78 1682 *
c074170e
PV
1683 * If a queue does not consume its budget fast enough, then providing
1684 * the queue with service fairness may impair throughput, more or less
1685 * severely. For this reason, queues that consume their budget slowly
1686 * are provided with time fairness instead of service fairness. This
1687 * goal is achieved through the BFQ scheduling engine, even if such an
1688 * engine works in the service, and not in the time domain. The trick
1689 * is charging these queues with an inflated amount of service, equal
1690 * to the amount of service that they would have received during their
1691 * service slot if they had been fast, i.e., if their requests had
1692 * been dispatched at a rate equal to the estimated peak rate.
1693 *
1694 * It is worth noting that time fairness can cause important
1695 * distortions in terms of bandwidth distribution, on devices with
1696 * internal queueing. The reason is that I/O requests dispatched
1697 * during the service slot of a queue may be served after that service
1698 * slot is finished, and may have a total processing time loosely
1699 * correlated with the duration of the service slot. This is
1700 * especially true for short service slots.
aee69d78 1701 */
c074170e
PV
1702static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1703 unsigned long time_ms)
aee69d78
PV
1704{
1705 struct bfq_entity *entity = &bfqq->entity;
c074170e
PV
1706 int tot_serv_to_charge = entity->service;
1707 unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
1708
1709 if (time_ms > 0 && time_ms < timeout_ms)
1710 tot_serv_to_charge =
1711 (bfqd->bfq_max_budget * time_ms) / timeout_ms;
aee69d78 1712
c074170e
PV
1713 if (tot_serv_to_charge < entity->service)
1714 tot_serv_to_charge = entity->service;
aee69d78 1715
c074170e
PV
1716 /* Increase budget to avoid inconsistencies */
1717 if (tot_serv_to_charge > entity->budget)
1718 entity->budget = tot_serv_to_charge;
1719
1720 bfq_bfqq_served(bfqq,
1721 max_t(int, 0, tot_serv_to_charge - entity->service));
aee69d78
PV
1722}
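/*
 * A worked example with assumed parameters: suppose
 * bfqd->bfq_max_budget = 16384 sectors and bfq_timeout corresponds to
 * timeout_ms = 125. A slow queue that spent time_ms = 25 in service
 * is charged tot_serv_to_charge = (16384 * 25) / 125 = 3276 sectors,
 * i.e., roughly the service a fast queue would have consumed in the
 * same interval at the estimated peak rate, regardless of how few
 * sectors the slow queue actually transferred.
 */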
1723
e21b7a0b
AA
1724static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
1725 struct bfq_service_tree *st,
1726 bool backshifted)
aee69d78 1727{
44e44a1b
PV
1728 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1729
aee69d78
PV
1730 st = __bfq_entity_update_weight_prio(st, entity);
1731 bfq_calc_finish(entity, entity->budget);
1732
1733 /*
1734 * If some queues enjoy backshifting for a while, then their
1735 * (virtual) finish timestamps may happen to become lower and
1736 * lower than the system virtual time. In particular, if
1737 * these queues often happen to be idle for short time
1738 * periods, and during such time periods other queues with
1739 * higher timestamps happen to be busy, then the backshifted
1740 * timestamps of the former queues can become much lower than
1741 * the system virtual time. In fact, to serve the queues with
1742 * higher timestamps while the ones with lower timestamps are
1743 * idle, the system virtual time may be pushed-up to much
1744 * higher values than the finish timestamps of the idle
1745 * queues. As a consequence, the finish timestamps of all new
1746 * or newly activated queues may end up being much larger than
1747 * those of lucky queues with backshifted timestamps. The
1748 * latter queues may then monopolize the device for a lot of
1749 * time. This would simply break service guarantees.
1750 *
1751 * To reduce this problem, push up a little bit the
1752 * backshifted timestamps of the queue associated with this
1753 * entity (only a queue can happen to have the backshifted
1754 * flag set): just enough to let the finish timestamp of the
1755 * queue be equal to the current value of the system virtual
1756 * time. This may introduce a little unfairness among queues
1757 * with backshifted timestamps, but it does not break
1758 * worst-case fairness guarantees.
44e44a1b
PV
1759 *
1760 * As a special case, if bfqq is weight-raised, push up
1761 * timestamps much less, to keep very low the probability that
1762 * this push up causes the backshifted finish timestamps of
1763 * weight-raised queues to become higher than the backshifted
1764 * finish timestamps of non weight-raised queues.
aee69d78
PV
1765 */
1766 if (backshifted && bfq_gt(st->vtime, entity->finish)) {
1767 unsigned long delta = st->vtime - entity->finish;
1768
44e44a1b
PV
1769 if (bfqq)
1770 delta /= bfqq->wr_coeff;
1771
aee69d78
PV
1772 entity->start += delta;
1773 entity->finish += delta;
1774 }
1775
1776 bfq_active_insert(st, entity);
1777}
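/*
 * Illustrative numbers (assumed): if st->vtime = 1000 while a
 * backshifted queue has finish = 400, the raw push-up would be
 * delta = 600. For a weight-raised queue with wr_coeff = 30 the
 * applied push-up shrinks to 600 / 30 = 20, so its timestamps remain
 * well below those of non weight-raised backshifted queues, which
 * receive the full 600.
 */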
1778
1779/**
e21b7a0b
AA
1780 * __bfq_activate_entity - handle activation of entity.
1781 * @entity: the entity being activated.
1782 * @non_blocking_wait_rq: true if entity was waiting for a request
1783 *
1784 * Called for a 'true' activation, i.e., if entity is not active and
1785 * one of its children receives a new request.
1786 *
1787 * Basically, this function updates the timestamps of entity and
1788 * inserts entity into its active tree, after possibly extracting it
1789 * from its idle tree.
1790 */
1791static void __bfq_activate_entity(struct bfq_entity *entity,
1792 bool non_blocking_wait_rq)
1793{
1794 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1795 bool backshifted = false;
1796 unsigned long long min_vstart;
1797
1798 /* See comments on bfq_bfqq_update_budg_for_activation */
1799 if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
1800 backshifted = true;
1801 min_vstart = entity->finish;
1802 } else
1803 min_vstart = st->vtime;
1804
1805 if (entity->tree == &st->idle) {
1806 /*
1807 * Must be on the idle tree, bfq_idle_extract() will
1808 * check for that.
1809 */
1810 bfq_idle_extract(st, entity);
1811 entity->start = bfq_gt(min_vstart, entity->finish) ?
1812 min_vstart : entity->finish;
1813 } else {
1814 /*
1815 * The finish time of the entity may be invalid, and
1816 * it is in the past for sure, otherwise the queue
1817 * would have been on the idle tree.
1818 */
1819 entity->start = min_vstart;
1820 st->wsum += entity->weight;
1821 /*
1822 * entity is about to be inserted into a service tree,
1823 * and then set in service: get a reference to make
1824 * sure entity does not disappear until it is no
1825 * longer in service or scheduled for service.
1826 */
1827 bfq_get_entity(entity);
1828
1829 entity->on_st = true;
1830 }
1831
1832 bfq_update_fin_time_enqueue(entity, st, backshifted);
1833}
1834
1835/**
1836 * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
1837 * @entity: the entity being requeued or repositioned.
1838 *
1839 * Requeueing is needed if this entity stops being served, which
1840 * happens if a leaf descendant entity has expired. On the other hand,
1841 * repositioning is needed if the next_in_service entity for the child
1842 * entity has changed. See the comments inside the function for
1843 * details.
1844 *
1845 * Basically, this function: 1) removes entity from its active tree if
1846 * present there, 2) updates the timestamps of entity and 3) inserts
1847 * entity back into its active tree (in the new, right position for
1848 * the new values of the timestamps).
1849 */
1850static void __bfq_requeue_entity(struct bfq_entity *entity)
1851{
1852 struct bfq_sched_data *sd = entity->sched_data;
1853 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1854
1855 if (entity == sd->in_service_entity) {
1856 /*
1857 * We are requeueing the current in-service entity,
1858 * which may have to be done for one of the following
1859 * reasons:
1860 * - entity represents the in-service queue, and the
1861 * in-service queue is being requeued after an
1862 * expiration;
1863 * - entity represents a group, and its budget has
1864 * changed because one of its child entities has
1865 * just been either activated or requeued for some
1866 * reason; the timestamps of the entity need then to
1867 * be updated, and the entity needs to be enqueued
1868 * or repositioned accordingly.
1869 *
1870 * In particular, before requeueing, the start time of
1871 * the entity must be moved forward to account for the
1872 * service that the entity has received while in
1873 * service. This is done by the next instructions. The
1874 * finish time will then be updated according to this
1875 * new value of the start time, and to the budget of
1876 * the entity.
1877 */
1878 bfq_calc_finish(entity, entity->service);
1879 entity->start = entity->finish;
1880 /*
1881 * In addition, if the entity had more than one child
1882 * when set in service, then was not extracted from
1883 * the active tree. This implies that the position of
1884 * the entity in the active tree may need to be
1885 * changed now, because we have just updated the start
1886 * time of the entity, and we will update its finish
1887 * time in a moment (the requeueing is then, more
1888 * precisely, a repositioning in this case). To
1889 * implement this repositioning, we: 1) dequeue the
1890 * entity here, 2) update the finish time and
1891 * requeue the entity according to the new
1892 * timestamps below.
1893 */
1894 if (entity->tree)
1895 bfq_active_extract(st, entity);
1896 } else { /* The entity is already active, and not in service */
1897 /*
1898 * In this case, this function gets called only if the
1899 * next_in_service entity below this entity has
1900 * changed, and this change has caused the budget of
1901 * this entity to change, which, finally implies that
1902 * the finish time of this entity must be
1903 * updated. Such an update may cause the scheduling,
1904 * i.e., the position in the active tree, of this
1905 * entity to change. We handle this change by: 1)
1906 * dequeueing the entity here, 2) updating the finish
1907 * time and requeueing the entity according to the new
1908 * timestamps below. This is the same approach as the
1909 * non-extracted-entity sub-case above.
1910 */
1911 bfq_active_extract(st, entity);
1912 }
1913
1914 bfq_update_fin_time_enqueue(entity, st, false);
1915}
1916
1917static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
1918 struct bfq_sched_data *sd,
1919 bool non_blocking_wait_rq)
1920{
1921 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1922
1923 if (sd->in_service_entity == entity || entity->tree == &st->active)
1924 /*
1925 * in service or already queued on the active tree,
1926 * requeue or reposition
1927 */
1928 __bfq_requeue_entity(entity);
1929 else
1930 /*
1931 * Not in service and not queued on its active tree:
1932 * the entity is idle and this is a true activation.
1933 */
1934 __bfq_activate_entity(entity, non_blocking_wait_rq);
1935}
1936
1937
1938/**
1939 * bfq_activate_entity - activate or requeue an entity representing a bfq_queue,
1940 * and activate, requeue or reposition all ancestors
1941 * for which such an update becomes necessary.
aee69d78
PV
1942 * @entity: the entity to activate.
1943 * @non_blocking_wait_rq: true if this entity was waiting for a request
e21b7a0b
AA
1944 * @requeue: true if this is a requeue, which implies that bfqq is
1945 * being expired; thus ALL its ancestors stop being served and must
1946 * therefore be requeued
aee69d78 1947 */
e21b7a0b
AA
1948static void bfq_activate_requeue_entity(struct bfq_entity *entity,
1949 bool non_blocking_wait_rq,
1950 bool requeue)
aee69d78
PV
1951{
1952 struct bfq_sched_data *sd;
1953
1954 for_each_entity(entity) {
aee69d78 1955 sd = entity->sched_data;
e21b7a0b
AA
1956 __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
1957
1958 if (!bfq_update_next_in_service(sd, entity) && !requeue)
aee69d78
PV
1959 break;
1960 }
1961}
1962
1963/**
1964 * __bfq_deactivate_entity - deactivate an entity from its service tree.
1965 * @entity: the entity to deactivate.
e21b7a0b
AA
1966 * @ins_into_idle_tree: if false, the entity will not be put into the
1967 * idle tree.
aee69d78 1968 *
e21b7a0b
AA
1969 * Deactivates an entity, independently from its previous state. Must
1970 * be invoked only if entity is on a service tree. Extracts the entity
1971 * from that tree, and if necessary and allowed, puts it on the idle
1972 * tree.
aee69d78 1973 */
e21b7a0b
AA
1974static bool __bfq_deactivate_entity(struct bfq_entity *entity,
1975 bool ins_into_idle_tree)
aee69d78
PV
1976{
1977 struct bfq_sched_data *sd = entity->sched_data;
1978 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1979 int is_in_service = entity == sd->in_service_entity;
aee69d78 1980
e21b7a0b
AA
1981 if (!entity->on_st) /* entity never activated, or already inactive */
1982 return false;
aee69d78 1983
e21b7a0b 1984 if (is_in_service)
aee69d78 1985 bfq_calc_finish(entity, entity->service);
e21b7a0b
AA
1986
1987 if (entity->tree == &st->active)
aee69d78 1988 bfq_active_extract(st, entity);
e21b7a0b 1989 else if (!is_in_service && entity->tree == &st->idle)
aee69d78
PV
1990 bfq_idle_extract(st, entity);
1991
e21b7a0b 1992 if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
aee69d78
PV
1993 bfq_forget_entity(st, entity, is_in_service);
1994 else
1995 bfq_idle_insert(st, entity);
1996
e21b7a0b 1997 return true;
aee69d78
PV
1998}
1999
2000/**
e21b7a0b 2001 * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
aee69d78 2002 * @entity: the entity to deactivate.
e21b7a0b 2003 * @ins_into_idle_tree: true if the entity can be put on the idle tree
aee69d78 2004 */
e21b7a0b
AA
2005static void bfq_deactivate_entity(struct bfq_entity *entity,
2006 bool ins_into_idle_tree,
2007 bool expiration)
aee69d78
PV
2008{
2009 struct bfq_sched_data *sd;
2010 struct bfq_entity *parent = NULL;
2011
2012 for_each_entity_safe(entity, parent) {
2013 sd = entity->sched_data;
2014
e21b7a0b 2015 if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
aee69d78 2016 /*
e21b7a0b
AA
2017 * entity is not in any tree any more, so
2018 * this deactivation is a no-op, and there is
2019 * nothing to change for upper-level entities
2020 * (in case of expiration, this can never
2021 * happen).
aee69d78 2022 */
e21b7a0b
AA
2023 return;
2024 }
2025
2026 if (sd->next_in_service == entity)
2027 /*
2028 * entity was the next_in_service entity,
2029 * then, since entity has just been
2030 * deactivated, a new one must be found.
2031 */
2032 bfq_update_next_in_service(sd, NULL);
aee69d78
PV
2033
2034 if (sd->next_in_service)
2035 /*
e21b7a0b
AA
2036 * The parent entity is still backlogged,
2037 * because next_in_service is not NULL. So, no
2038 * further upwards deactivation must be
2039 * performed. Yet, next_in_service has
2040 * changed. Then the schedule does need to be
2041 * updated upwards.
aee69d78 2042 */
e21b7a0b 2043 break;
aee69d78
PV
2044
2045 /*
e21b7a0b
AA
2046 * If we get here, then the parent is no more
2047 * backlogged and we need to propagate the
2048 * deactivation upwards. Thus let the loop go on.
aee69d78 2049 */
aee69d78 2050
e21b7a0b
AA
2051 /*
2052 * Also let parent be queued into the idle tree on
2053 * deactivation, to preserve service guarantees, and
2054 * assuming that who invoked this function does not
2055 * need parent entities too to be removed completely.
2056 */
2057 ins_into_idle_tree = true;
2058 }
aee69d78 2059
e21b7a0b
AA
2060 /*
2061 * If the deactivation loop is fully executed, then there are
2062 * no more entities to touch and next loop is not executed at
2063 * all. Otherwise, requeue remaining entities if they are
2064 * about to stop receiving service, or reposition them if this
2065 * is not the case.
2066 */
aee69d78
PV
2067 entity = parent;
2068 for_each_entity(entity) {
e21b7a0b
AA
2069 /*
2070 * Invoke __bfq_requeue_entity on entity, even if
2071 * already active, to requeue/reposition it in the
2072 * active tree (because sd->next_in_service has
2073 * changed)
2074 */
2075 __bfq_requeue_entity(entity);
aee69d78
PV
2076
2077 sd = entity->sched_data;
e21b7a0b
AA
2078 if (!bfq_update_next_in_service(sd, entity) &&
2079 !expiration)
2080 /*
2081 * next_in_service unchanged or not causing
2082 * any change in entity->parent->sd, and no
2083 * requeueing needed for expiration: stop
2084 * here.
2085 */
aee69d78
PV
2086 break;
2087 }
2088}
2089
2090/**
e21b7a0b
AA
2091 * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
2092 * if needed, to have at least one entity eligible.
aee69d78
PV
2093 * @st: the service tree to act upon.
2094 *
e21b7a0b 2095 * Assumes that st is not empty.
aee69d78 2096 */
e21b7a0b 2097static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
aee69d78 2098{
e21b7a0b
AA
2099 struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
2100
2101 if (bfq_gt(root_entity->min_start, st->vtime))
2102 return root_entity->min_start;
2103
2104 return st->vtime;
2105}
aee69d78 2106
e21b7a0b
AA
2107static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
2108{
2109 if (new_value > st->vtime) {
2110 st->vtime = new_value;
aee69d78
PV
2111 bfq_forget_idle(st);
2112 }
2113}
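/*
 * A small sketch with assumed timestamps: if st->vtime = 50 but the
 * root of the active tree reports min_start = 80, no entity is
 * eligible yet, so bfq_calc_vtime_jump() returns 80 and
 * bfq_update_vtime() jumps the virtual time forward to that value,
 * also giving bfq_forget_idle() a chance to prune the idle tree. If
 * min_start were 40 instead, bfq_calc_vtime_jump() would return the
 * current vtime and nothing would change.
 */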
2114
2115/**
2116 * bfq_first_active_entity - find the eligible entity with
2117 * the smallest finish time
2118 * @st: the service tree to select from.
e21b7a0b 2119 * @vtime: the system virtual time to use as a reference for eligibility
aee69d78
PV
2120 *
2121 * This function searches for the first schedulable entity, starting from the
2122 * root of the tree and going on the left every time on this side there is
2123 * a subtree with at least one eligible (start >= vtime) entity. The path on
2124 * the right is followed only if a) the left subtree contains no eligible
2125 * entities and b) no eligible entity has been found yet.
2126 */
e21b7a0b
AA
2127static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
2128 u64 vtime)
aee69d78
PV
2129{
2130 struct bfq_entity *entry, *first = NULL;
2131 struct rb_node *node = st->active.rb_node;
2132
2133 while (node) {
2134 entry = rb_entry(node, struct bfq_entity, rb_node);
2135left:
e21b7a0b 2136 if (!bfq_gt(entry->start, vtime))
aee69d78
PV
2137 first = entry;
2138
2139 if (node->rb_left) {
2140 entry = rb_entry(node->rb_left,
2141 struct bfq_entity, rb_node);
e21b7a0b 2142 if (!bfq_gt(entry->min_start, vtime)) {
aee69d78
PV
2143 node = node->rb_left;
2144 goto left;
2145 }
2146 }
2147 if (first)
2148 break;
2149 node = node->rb_right;
2150 }
2151
e21b7a0b
AA
2152 return first;
2153}
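/*
 * An example of the search, with assumed tree contents: the active
 * tree is ordered by finish time, smaller finish times to the left,
 * and every node caches the minimum start time of its subtree. With
 * vtime = 100, a node whose start <= 100 is recorded as a candidate,
 * and the walk keeps descending left as long as the left subtree's
 * min_start <= 100, because that subtree is then guaranteed to hold
 * an eligible entity with an even smaller finish time.
 */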
2154
2155/**
2156 * __bfq_lookup_next_entity - return the first eligible entity in @st.
2157 * @st: the service tree.
2158 *
2159 * If there is no in-service entity for the sched_data st belongs to,
2160 * then return the entity that will be set in service if:
2161 * 1) the parent entity this st belongs to is set in service;
2162 * 2) no entity belonging to such parent entity undergoes a state change
2163 * that would influence the timestamps of the entity (e.g., becomes idle,
2164 * becomes backlogged, changes its budget, ...).
2165 *
2166 * In this first case, update the virtual time in @st too (see the
2167 * comments on this update inside the function).
2168 *
2169 * In contrast, if there is an in-service entity, then return the
2170 * entity that would be set in service if not only the above
2171 * conditions, but also the next one held true: the currently
2172 * in-service entity, on expiration,
2173 * 1) gets a finish time equal to the current one, or
2174 * 2) is not eligible any more, or
2175 * 3) is idle.
2176 */
2177static struct bfq_entity *
2178__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
2179{
2180 struct bfq_entity *entity;
2181 u64 new_vtime;
2182
2183 if (RB_EMPTY_ROOT(&st->active))
2184 return NULL;
2185
2186 /*
2187 * Get the value of the system virtual time for which at
2188 * least one entity is eligible.
2189 */
2190 new_vtime = bfq_calc_vtime_jump(st);
2191
2192 /*
2193 * If there is no in-service entity for the sched_data this
2194 * active tree belongs to, then push the system virtual time
2195 * up to the value that guarantees that at least one entity is
2196 * eligible. If, instead, there is an in-service entity, then
2197 * do not make any such update, because there is already an
2198 * eligible entity, namely the in-service one (even if the
2199 * entity is not on st, because it was extracted when set in
2200 * service).
2201 */
2202 if (!in_service)
2203 bfq_update_vtime(st, new_vtime);
2204
2205 entity = bfq_first_active_entity(st, new_vtime);
2206
2207 return entity;
2208}
2209
2210/**
2211 * bfq_lookup_next_entity - return the first eligible entity in @sd.
2212 * @sd: the sched_data.
2213 *
2214 * This function is invoked when there has been a change in the trees
2215 * for sd, and we need to know what is the new next entity after this
2216 * change.
2217 */
2218static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
2219{
2220 struct bfq_service_tree *st = sd->service_tree;
2221 struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
2222 struct bfq_entity *entity = NULL;
2223 int class_idx = 0;
2224
2225 /*
2226 * Choose from idle class, if needed to guarantee a minimum
2227 * bandwidth to this class (and if there is some active entity
2228 * in idle class). This should also mitigate
2229 * priority-inversion problems in case a low priority task is
2230 * holding file system resources.
2231 */
2232 if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
2233 BFQ_CL_IDLE_TIMEOUT)) {
2234 if (!RB_EMPTY_ROOT(&idle_class_st->active))
2235 class_idx = BFQ_IOPRIO_CLASSES - 1;
2236 /* About to be served if backlogged, or not yet backlogged */
2237 sd->bfq_class_idle_last_service = jiffies;
2238 }
2239
2240 /*
2241 * Find the next entity to serve for the highest-priority
2242 * class, unless the idle class needs to be served.
2243 */
2244 for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
2245 entity = __bfq_lookup_next_entity(st + class_idx,
2246 sd->in_service_entity);
2247
2248 if (entity)
2249 break;
2250 }
2251
2252 if (!entity)
2253 return NULL;
2254
2255 return entity;
2256}
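/*
 * An example of the class scan, with assumed timing: class_idx
 * normally starts at 0, so higher-priority service trees are examined
 * first. If, however, more than BFQ_CL_IDLE_TIMEOUT has elapsed since
 * bfq_class_idle_last_service and the idle-class tree is backlogged,
 * the scan starts directly at the last (idle) class, giving it one
 * dispatch opportunity so that it is not starved forever by the other
 * classes.
 */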
2257
2258static bool next_queue_may_preempt(struct bfq_data *bfqd)
2259{
2260 struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
2261
2262 return sd->next_in_service != sd->in_service_entity;
2263}
2264
2265/*
2266 * Get next queue for service.
2267 */
2268static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
2269{
2270 struct bfq_entity *entity = NULL;
2271 struct bfq_sched_data *sd;
2272 struct bfq_queue *bfqq;
2273
2274 if (bfqd->busy_queues == 0)
2275 return NULL;
2276
2277 /*
2278 * Traverse the path from the root to the leaf entity to
2279 * serve. Set in service all the entities visited along the
2280 * way.
2281 */
2282 sd = &bfqd->root_group->sched_data;
2283 for (; sd ; sd = entity->my_sched_data) {
2284 /*
2285 * WARNING. We are about to set the in-service entity
2286 * to sd->next_in_service, i.e., to the (cached) value
2287 * returned by bfq_lookup_next_entity(sd) the last
2288 * time it was invoked, i.e., the last time when the
2289 * service order in sd changed as a consequence of the
2290 * activation or deactivation of an entity. In this
2291 * respect, if we execute bfq_lookup_next_entity(sd)
2292 * in this very moment, it may, although with low
2293 * probability, yield a different entity than that
2294 * pointed to by sd->next_in_service. This rare event
2295 * happens in case there was no CLASS_IDLE entity to
2296 * serve for sd when bfq_lookup_next_entity(sd) was
2297 * invoked for the last time, while there is now one
2298 * such entity.
2299 *
2300 * If the above event happens, then the scheduling of
2301 * such entity in CLASS_IDLE is postponed until the
2302 * service of the sd->next_in_service entity
2303 * finishes. In fact, when the latter is expired,
2304 * bfq_lookup_next_entity(sd) gets called again,
2305 * exactly to update sd->next_in_service.
2306 */
2307
2308 /* Make next_in_service entity become in_service_entity */
2309 entity = sd->next_in_service;
2310 sd->in_service_entity = entity;
2311
2312 /*
2313 * Reset the accumulator of the amount of service that
2314 * the entity is about to receive.
2315 */
2316 entity->service = 0;
2317
2318 /*
2319 * If entity is no longer a candidate for next
2320 * service, then we extract it from its active tree,
2321 * for the following reason. To further boost the
2322 * throughput in some special case, BFQ needs to know
2323 * which is the next candidate entity to serve, while
2324 * there is already an entity in service. In this
2325 * respect, to make it easy to compute/update the next
2326 * candidate entity to serve after the current
2327 * candidate has been set in service, there is a case
2328 * where it is necessary to extract the current
2329 * candidate from its service tree. Such a case is
2330 * when the entity just set in service cannot be also
2331 * a candidate for next service. Details about when
2332 * this condition holds are reported in the comments
2333 * on the function bfq_no_longer_next_in_service()
2334 * invoked below.
2335 */
2336 if (bfq_no_longer_next_in_service(entity))
2337 bfq_active_extract(bfq_entity_service_tree(entity),
2338 entity);
2339
2340 /*
2341 * For the same reason why we may have just extracted
2342 * entity from its active tree, we may need to update
2343 * next_in_service for the sched_data of entity too,
2344 * regardless of whether entity has been extracted.
2345 * In fact, even if entity has not been extracted, a
2346 * descendant entity may get extracted. Such an event
2347 * would cause a change in next_in_service for the
2348 * level of the descendant entity, and thus possibly
2349 * back to upper levels.
2350 *
2351 * We cannot perform the resulting needed update
2352 * before the end of this loop, because, to know which
2353 * is the correct next-to-serve candidate entity for
2354 * each level, we need first to find the leaf entity
2355 * to set in service. In fact, only after we know
2356 * which is the next-to-serve leaf entity, we can
2357 * discover whether the parent entity of the leaf
2358 * entity becomes the next-to-serve, and so on.
2359 */
2360
2361 }
2362
2363 bfqq = bfq_entity_to_bfqq(entity);
2364
2365 /*
2366 * We can finally update all next-to-serve entities along the
2367 * path from the leaf entity just set in service to the root.
2368 */
2369 for_each_entity(entity) {
2370 struct bfq_sched_data *sd = entity->sched_data;
2371
2372 if (!bfq_update_next_in_service(sd, NULL))
2373 break;
2374 }
2375
2376 return bfqq;
2377}
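/*
 * A sketch of the traversal above, assuming a two-level hierarchy
 * root_group -> group A -> queue q along the cached next_in_service
 * chain: the first loop sets A in service at the root level and then
 * q in service inside A, zeroing the service counter of each entity
 * it visits. The final for_each_entity() walk then recomputes
 * next_in_service bottom-up, starting from q's sched_data and
 * stopping as soon as a level reports no change.
 */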
2378
2379static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
2380{
2381 struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
2382 struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
2383 struct bfq_entity *entity = in_serv_entity;
2384
2385 if (bfqd->in_service_bic) {
2386 put_io_context(bfqd->in_service_bic->icq.ioc);
2387 bfqd->in_service_bic = NULL;
2388 }
2389
2390 bfq_clear_bfqq_wait_request(in_serv_bfqq);
2391 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
2392 bfqd->in_service_queue = NULL;
2393
2394 /*
2395 * When this function is called, all in-service entities have
2396 * been properly deactivated or requeued, so we can safely
2397 * execute the final step: reset in_service_entity along the
2398 * path from entity to the root.
2399 */
2400 for_each_entity(entity)
2401 entity->sched_data->in_service_entity = NULL;
2402
2403 /*
2404 * in_serv_entity is no longer in service, so, if it is in no
2405 * service tree either, then release the service reference to
2406 * the queue it represents (taken with bfq_get_entity).
2407 */
2408 if (!in_serv_entity->on_st)
2409 bfq_put_queue(in_serv_bfqq);
2410}
2411
2412static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2413 bool ins_into_idle_tree, bool expiration)
2414{
2415 struct bfq_entity *entity = &bfqq->entity;
2416
2417 bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
2418}
2419
2420static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2421{
2422 struct bfq_entity *entity = &bfqq->entity;
2423
2424 bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
2425 false);
2426 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
2427}
2428
2429static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2430{
2431 struct bfq_entity *entity = &bfqq->entity;
2432
2433 bfq_activate_requeue_entity(entity, false,
2434 bfqq == bfqd->in_service_queue);
2435}
2436
2437static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
2438
2439/*
2440 * Called when the bfqq no longer has requests pending, remove it from
2441 * the service tree. As a special case, it can be invoked during an
2442 * expiration.
2443 */
2444static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2445 bool expiration)
2446{
2447 bfq_log_bfqq(bfqd, bfqq, "del from busy");
2448
2449 bfq_clear_bfqq_busy(bfqq);
2450
2451 bfqd->busy_queues--;
2452
2453 bfqg_stats_update_dequeue(bfqq_group(bfqq));
2454
2455 bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
2456}
2457
2458/*
2459 * Called when an inactive queue receives a new request.
2460 */
2461static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2462{
2463 bfq_log_bfqq(bfqd, bfqq, "add to busy");
2464
2465 bfq_activate_bfqq(bfqd, bfqq);
2466
2467 bfq_mark_bfqq_busy(bfqq);
2468 bfqd->busy_queues++;
2469}
2470
2471#ifdef CONFIG_BFQ_GROUP_IOSCHED
2472
2473/* bfqg stats flags */
2474enum bfqg_stats_flags {
2475 BFQG_stats_waiting = 0,
2476 BFQG_stats_idling,
2477 BFQG_stats_empty,
2478};
2479
2480#define BFQG_FLAG_FNS(name) \
2481static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
2482{ \
2483 stats->flags |= (1 << BFQG_stats_##name); \
2484} \
2485static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
2486{ \
2487 stats->flags &= ~(1 << BFQG_stats_##name); \
2488} \
2489static int bfqg_stats_##name(struct bfqg_stats *stats) \
2490{ \
2491 return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
2492} \
2493
2494BFQG_FLAG_FNS(waiting)
2495BFQG_FLAG_FNS(idling)
2496BFQG_FLAG_FNS(empty)
2497#undef BFQG_FLAG_FNS
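/*
 * For reference, BFQG_FLAG_FNS(waiting) above expands to three
 * helpers, bfqg_stats_mark_waiting(), bfqg_stats_clear_waiting() and
 * bfqg_stats_waiting(), which respectively set, clear and test bit
 * BFQG_stats_waiting in stats->flags; the idling and empty variants
 * are generated in the same way.
 */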
2498
2499/* This should be called with the queue_lock held. */
2500static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
2501{
2502 unsigned long long now;
2503
2504 if (!bfqg_stats_waiting(stats))
2505 return;
2506
2507 now = sched_clock();
2508 if (time_after64(now, stats->start_group_wait_time))
2509 blkg_stat_add(&stats->group_wait_time,
2510 now - stats->start_group_wait_time);
2511 bfqg_stats_clear_waiting(stats);
2512}
2513
2514/* This should be called with the queue_lock held. */
2515static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
2516 struct bfq_group *curr_bfqg)
2517{
2518 struct bfqg_stats *stats = &bfqg->stats;
2519
2520 if (bfqg_stats_waiting(stats))
2521 return;
2522 if (bfqg == curr_bfqg)
2523 return;
2524 stats->start_group_wait_time = sched_clock();
2525 bfqg_stats_mark_waiting(stats);
2526}
2527
2528/* This should be called with the queue_lock held. */
2529static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
2530{
2531 unsigned long long now;
2532
2533 if (!bfqg_stats_empty(stats))
2534 return;
2535
2536 now = sched_clock();
2537 if (time_after64(now, stats->start_empty_time))
2538 blkg_stat_add(&stats->empty_time,
2539 now - stats->start_empty_time);
2540 bfqg_stats_clear_empty(stats);
2541}
2542
2543static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
2544{
2545 blkg_stat_add(&bfqg->stats.dequeue, 1);
2546}
2547
2548static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
2549{
2550 struct bfqg_stats *stats = &bfqg->stats;
2551
2552 if (blkg_rwstat_total(&stats->queued))
2553 return;
2554
2555 /*
2556 * The group is already marked empty. This can happen if bfqq got a new
2557 * request in its parent group and moved to this group while being added
2558 * to the service tree. Just ignore the event and move on.
2559 */
2560 if (bfqg_stats_empty(stats))
2561 return;
2562
2563 stats->start_empty_time = sched_clock();
2564 bfqg_stats_mark_empty(stats);
2565}
2566
2567static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
2568{
2569 struct bfqg_stats *stats = &bfqg->stats;
2570
2571 if (bfqg_stats_idling(stats)) {
2572 unsigned long long now = sched_clock();
2573
2574 if (time_after64(now, stats->start_idle_time))
2575 blkg_stat_add(&stats->idle_time,
2576 now - stats->start_idle_time);
2577 bfqg_stats_clear_idling(stats);
2578 }
2579}
2580
2581static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
2582{
2583 struct bfqg_stats *stats = &bfqg->stats;
2584
2585 stats->start_idle_time = sched_clock();
2586 bfqg_stats_mark_idling(stats);
2587}
2588
2589static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
2590{
2591 struct bfqg_stats *stats = &bfqg->stats;
2592
2593 blkg_stat_add(&stats->avg_queue_size_sum,
2594 blkg_rwstat_total(&stats->queued));
2595 blkg_stat_add(&stats->avg_queue_size_samples, 1);
2596 bfqg_stats_update_group_wait_time(stats);
2597}
2598
2599/*
2600 * blk-cgroup policy-related handlers
2601 * The following functions help in converting between blk-cgroup
2602 * internal structures and BFQ-specific structures.
2603 */
2604
2605static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
2606{
2607 return pd ? container_of(pd, struct bfq_group, pd) : NULL;
2608}
2609
2610static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
2611{
2612 return pd_to_blkg(&bfqg->pd);
2613}
2614
2615static struct blkcg_policy blkcg_policy_bfq;
2616
2617static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
2618{
2619 return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
2620}
2621
2622/*
2623 * bfq_group handlers
2624 * The following functions help in navigating the bfq_group hierarchy
2625 * by allowing to find the parent of a bfq_group or the bfq_group
2626 * associated to a bfq_queue.
2627 */
2628
2629static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
2630{
2631 struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
2632
2633 return pblkg ? blkg_to_bfqg(pblkg) : NULL;
2634}
2635
2636static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
2637{
2638 struct bfq_entity *group_entity = bfqq->entity.parent;
2639
2640 return group_entity ? container_of(group_entity, struct bfq_group,
2641 entity) :
2642 bfqq->bfqd->root_group;
2643}
2644
2645/*
2646 * The following two functions handle get and put of a bfq_group by
2647 * wrapping the related blk-cgroup hooks.
2648 */
2649
2650static void bfqg_get(struct bfq_group *bfqg)
2651{
2652 return blkg_get(bfqg_to_blkg(bfqg));
2653}
2654
2655static void bfqg_put(struct bfq_group *bfqg)
2656{
2657 return blkg_put(bfqg_to_blkg(bfqg));
2658}
2659
2660static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
2661 struct bfq_queue *bfqq,
2662 unsigned int op)
2663{
2664 blkg_rwstat_add(&bfqg->stats.queued, op, 1);
2665 bfqg_stats_end_empty_time(&bfqg->stats);
2666 if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
2667 bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
2668}
2669
2670static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
2671{
2672 blkg_rwstat_add(&bfqg->stats.queued, op, -1);
2673}
2674
2675static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
2676{
2677 blkg_rwstat_add(&bfqg->stats.merged, op, 1);
2678}
2679
2680static void bfqg_stats_update_completion(struct bfq_group *bfqg,
2681 uint64_t start_time, uint64_t io_start_time,
2682 unsigned int op)
2683{
2684 struct bfqg_stats *stats = &bfqg->stats;
2685 unsigned long long now = sched_clock();
2686
2687 if (time_after64(now, io_start_time))
2688 blkg_rwstat_add(&stats->service_time, op,
2689 now - io_start_time);
2690 if (time_after64(io_start_time, start_time))
2691 blkg_rwstat_add(&stats->wait_time, op,
2692 io_start_time - start_time);
2693}
2694
2695/* @stats = 0 */
2696static void bfqg_stats_reset(struct bfqg_stats *stats)
2697{
2698 /* queued stats shouldn't be cleared */
2699 blkg_rwstat_reset(&stats->merged);
2700 blkg_rwstat_reset(&stats->service_time);
2701 blkg_rwstat_reset(&stats->wait_time);
2702 blkg_stat_reset(&stats->time);
2703 blkg_stat_reset(&stats->avg_queue_size_sum);
2704 blkg_stat_reset(&stats->avg_queue_size_samples);
2705 blkg_stat_reset(&stats->dequeue);
2706 blkg_stat_reset(&stats->group_wait_time);
2707 blkg_stat_reset(&stats->idle_time);
2708 blkg_stat_reset(&stats->empty_time);
2709}
2710
2711/* @to += @from */
2712static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
2713{
2714 if (!to || !from)
2715 return;
2716
2717 /* queued stats shouldn't be cleared */
2718 blkg_rwstat_add_aux(&to->merged, &from->merged);
2719 blkg_rwstat_add_aux(&to->service_time, &from->service_time);
2720 blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
2721 blkg_stat_add_aux(&to->time, &from->time);
2722 blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
2723 blkg_stat_add_aux(&to->avg_queue_size_samples,
2724 &from->avg_queue_size_samples);
2725 blkg_stat_add_aux(&to->dequeue, &from->dequeue);
2726 blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
2727 blkg_stat_add_aux(&to->idle_time, &from->idle_time);
2728 blkg_stat_add_aux(&to->empty_time, &from->empty_time);
2729}
2730
2731/*
2732 * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
2733 * recursive stats can still account for the amount used by this bfqg after
2734 * it's gone.
2735 */
2736static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
2737{
2738 struct bfq_group *parent;
2739
2740 if (!bfqg) /* root_group */
2741 return;
2742
2743 parent = bfqg_parent(bfqg);
2744
2745 lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
2746
2747 if (unlikely(!parent))
2748 return;
2749
2750 bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
2751 bfqg_stats_reset(&bfqg->stats);
2752}
2753
2754static void bfq_init_entity(struct bfq_entity *entity,
2755 struct bfq_group *bfqg)
2756{
2757 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
2758
2759 entity->weight = entity->new_weight;
2760 entity->orig_weight = entity->new_weight;
2761 if (bfqq) {
2762 bfqq->ioprio = bfqq->new_ioprio;
2763 bfqq->ioprio_class = bfqq->new_ioprio_class;
2764 bfqg_get(bfqg);
2765 }
2766 entity->parent = bfqg->my_entity; /* NULL for root group */
2767 entity->sched_data = &bfqg->sched_data;
2768}
2769
2770static void bfqg_stats_exit(struct bfqg_stats *stats)
2771{
2772 blkg_rwstat_exit(&stats->merged);
2773 blkg_rwstat_exit(&stats->service_time);
2774 blkg_rwstat_exit(&stats->wait_time);
2775 blkg_rwstat_exit(&stats->queued);
2776 blkg_stat_exit(&stats->time);
2777 blkg_stat_exit(&stats->avg_queue_size_sum);
2778 blkg_stat_exit(&stats->avg_queue_size_samples);
2779 blkg_stat_exit(&stats->dequeue);
2780 blkg_stat_exit(&stats->group_wait_time);
2781 blkg_stat_exit(&stats->idle_time);
2782 blkg_stat_exit(&stats->empty_time);
2783}
2784
2785static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
2786{
2787 if (blkg_rwstat_init(&stats->merged, gfp) ||
2788 blkg_rwstat_init(&stats->service_time, gfp) ||
2789 blkg_rwstat_init(&stats->wait_time, gfp) ||
2790 blkg_rwstat_init(&stats->queued, gfp) ||
2791 blkg_stat_init(&stats->time, gfp) ||
2792 blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
2793 blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
2794 blkg_stat_init(&stats->dequeue, gfp) ||
2795 blkg_stat_init(&stats->group_wait_time, gfp) ||
2796 blkg_stat_init(&stats->idle_time, gfp) ||
2797 blkg_stat_init(&stats->empty_time, gfp)) {
2798 bfqg_stats_exit(stats);
2799 return -ENOMEM;
2800 }
2801
2802 return 0;
2803}
2804
2805static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
2806{
2807 return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
2808}
2809
2810static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
2811{
2812 return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
2813}
2814
2815static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
2816{
2817 struct bfq_group_data *bgd;
2818
2819 bgd = kzalloc(sizeof(*bgd), gfp);
2820 if (!bgd)
2821 return NULL;
2822 return &bgd->pd;
2823}
2824
2825static void bfq_cpd_init(struct blkcg_policy_data *cpd)
2826{
2827 struct bfq_group_data *d = cpd_to_bfqgd(cpd);
2828
2829 d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
2830 CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
2831}
2832
2833static void bfq_cpd_free(struct blkcg_policy_data *cpd)
2834{
2835 kfree(cpd_to_bfqgd(cpd));
2836}
2837
2838static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
2839{
2840 struct bfq_group *bfqg;
2841
2842 bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
2843 if (!bfqg)
2844 return NULL;
2845
2846 if (bfqg_stats_init(&bfqg->stats, gfp)) {
2847 kfree(bfqg);
2848 return NULL;
2849 }
2850
2851 return &bfqg->pd;
2852}
2853
2854static void bfq_pd_init(struct blkg_policy_data *pd)
2855{
2856 struct blkcg_gq *blkg = pd_to_blkg(pd);
2857 struct bfq_group *bfqg = blkg_to_bfqg(blkg);
2858 struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
2859 struct bfq_entity *entity = &bfqg->entity;
2860 struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
2861
2862 entity->orig_weight = entity->weight = entity->new_weight = d->weight;
2863 entity->my_sched_data = &bfqg->sched_data;
2864 bfqg->my_entity = entity; /*
2865 * the root_group's will be set to NULL
2866 * in bfq_init_queue()
2867 */
2868 bfqg->bfqd = bfqd;
2869}
2870
2871static void bfq_pd_free(struct blkg_policy_data *pd)
2872{
2873 struct bfq_group *bfqg = pd_to_bfqg(pd);
2874
2875 bfqg_stats_exit(&bfqg->stats);
2876 return kfree(bfqg);
2877}
2878
2879static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
2880{
2881 struct bfq_group *bfqg = pd_to_bfqg(pd);
2882
2883 bfqg_stats_reset(&bfqg->stats);
2884}
2885
2886static void bfq_group_set_parent(struct bfq_group *bfqg,
2887 struct bfq_group *parent)
2888{
2889 struct bfq_entity *entity;
2890
2891 entity = &bfqg->entity;
2892 entity->parent = parent->my_entity;
2893 entity->sched_data = &parent->sched_data;
2894}
2895
2896static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
2897 struct blkcg *blkcg)
2898{
2899 struct blkcg_gq *blkg;
2900
2901 blkg = blkg_lookup(blkcg, bfqd->queue);
2902 if (likely(blkg))
2903 return blkg_to_bfqg(blkg);
2904 return NULL;
2905}
2906
2907static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
2908 struct blkcg *blkcg)
2909{
2910 struct bfq_group *bfqg, *parent;
2911 struct bfq_entity *entity;
2912
2913 bfqg = bfq_lookup_bfqg(bfqd, blkcg);
2914
2915 if (unlikely(!bfqg))
2916 return NULL;
2917
2918 /*
2919 * Update chain of bfq_groups as we might be handling a leaf group
2920 * which, along with some of its relatives, has not been hooked yet
2921 * to the private hierarchy of BFQ.
2922 */
2923 entity = &bfqg->entity;
2924 for_each_entity(entity) {
2925 bfqg = container_of(entity, struct bfq_group, entity);
2926 if (bfqg != bfqd->root_group) {
2927 parent = bfqg_parent(bfqg);
2928 if (!parent)
2929 parent = bfqd->root_group;
2930 bfq_group_set_parent(bfqg, parent);
2931 }
2932 }
2933
2934 return bfqg;
2935}
2936
2937static void bfq_bfqq_expire(struct bfq_data *bfqd,
2938 struct bfq_queue *bfqq,
2939 bool compensate,
2940 enum bfqq_expiration reason);
2941
2942/**
2943 * bfq_bfqq_move - migrate @bfqq to @bfqg.
2944 * @bfqd: queue descriptor.
2945 * @bfqq: the queue to move.
2946 * @bfqg: the group to move to.
2947 *
2948 * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
2949 * it on the new one. Avoid putting the entity on the old group idle tree.
2950 *
2951 * Must be called under the queue lock; the cgroup owning @bfqg must
2952 * not disappear (by now this just means that we are called under
2953 * rcu_read_lock()).
2954 */
2955static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2956 struct bfq_group *bfqg)
2957{
2958 struct bfq_entity *entity = &bfqq->entity;
2959
2960 /* If bfqq is empty, then bfq_bfqq_expire also invokes
2961 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
2962 * from data structures related to current group. Otherwise we
2963 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
2964 * we do below.
2965 */
2966 if (bfqq == bfqd->in_service_queue)
2967 bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
2968 false, BFQQE_PREEMPTED);
2969
2970 if (bfq_bfqq_busy(bfqq))
2971 bfq_deactivate_bfqq(bfqd, bfqq, false, false);
2972 else if (entity->on_st)
2973 bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
2974 bfqg_put(bfqq_group(bfqq));
2975
2976 /*
2977 * Here we use a reference to bfqg. We don't need a refcounter
2978 * as the cgroup reference will not be dropped, so that its
2979 * destroy() callback will not be invoked.
2980 */
2981 entity->parent = bfqg->my_entity;
2982 entity->sched_data = &bfqg->sched_data;
2983 bfqg_get(bfqg);
2984
2985 if (bfq_bfqq_busy(bfqq))
2986 bfq_activate_bfqq(bfqd, bfqq);
2987
2988 if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
2989 bfq_schedule_dispatch(bfqd);
2990}
2991
2992/**
2993 * __bfq_bic_change_cgroup - move @bic to @cgroup.
2994 * @bfqd: the queue descriptor.
2995 * @bic: the bic to move.
2996 * @blkcg: the blk-cgroup to move to.
2997 *
2998 * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
2999 * has to make sure that the reference to cgroup is valid across the call.
3000 *
3001 * NOTE: an alternative approach might have been to store the current
3002 * cgroup in bfqq and getting a reference to it, reducing the lookup
3003 * time here, at the price of slightly more complex code.
3004 */
3005static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
3006 struct bfq_io_cq *bic,
3007 struct blkcg *blkcg)
3008{
3009 struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
3010 struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
3011 struct bfq_group *bfqg;
3012 struct bfq_entity *entity;
3013
3014 bfqg = bfq_find_set_group(bfqd, blkcg);
3015
3016 if (unlikely(!bfqg))
3017 bfqg = bfqd->root_group;
3018
3019 if (async_bfqq) {
3020 entity = &async_bfqq->entity;
3021
3022 if (entity->sched_data != &bfqg->sched_data) {
3023 bic_set_bfqq(bic, NULL, 0);
3024 bfq_log_bfqq(bfqd, async_bfqq,
3025 "bic_change_group: %p %d",
3026 async_bfqq,
3027 async_bfqq->ref);
3028 bfq_put_queue(async_bfqq);
3029 }
3030 }
3031
3032 if (sync_bfqq) {
3033 entity = &sync_bfqq->entity;
3034 if (entity->sched_data != &bfqg->sched_data)
3035 bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
3036 }
3037
3038 return bfqg;
3039}
3040
3041static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
3042{
3043 struct bfq_data *bfqd = bic_to_bfqd(bic);
3044 struct bfq_group *bfqg = NULL;
3045 uint64_t serial_nr;
3046
3047 rcu_read_lock();
3048 serial_nr = bio_blkcg(bio)->css.serial_nr;
3049
3050 /*
3051 * Check whether blkcg has changed. The condition may trigger
3052 * spuriously on a newly created cic but there's no harm.
3053 */
3054 if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
3055 goto out;
3056
3057 bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
3058 bic->blkcg_serial_nr = serial_nr;
3059out:
3060 rcu_read_unlock();
3061}
3062
3063/**
3064 * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
3065 * @st: the service tree being flushed.
3066 */
3067static void bfq_flush_idle_tree(struct bfq_service_tree *st)
3068{
3069 struct bfq_entity *entity = st->first_idle;
3070
3071 for (; entity ; entity = st->first_idle)
3072 __bfq_deactivate_entity(entity, false);
3073}
3074
3075/**
3076 * bfq_reparent_leaf_entity - move leaf entity to the root_group.
3077 * @bfqd: the device data structure with the root group.
3078 * @entity: the entity to move.
3079 */
3080static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
3081 struct bfq_entity *entity)
3082{
3083 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
3084
3085 bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
aee69d78
PV
3086}
3087
3088/**
e21b7a0b
AA
3089 * bfq_reparent_active_entities - move to the root group all active
3090 * entities.
3091 * @bfqd: the device data structure with the root group.
3092 * @bfqg: the group to move from.
3093 * @st: the service tree with the entities.
aee69d78 3094 *
e21b7a0b 3095 * Needs queue_lock to be taken and reference to be valid over the call.
aee69d78 3096 */
e21b7a0b
AA
3097static void bfq_reparent_active_entities(struct bfq_data *bfqd,
3098 struct bfq_group *bfqg,
3099 struct bfq_service_tree *st)
aee69d78 3100{
e21b7a0b
AA
3101 struct rb_root *active = &st->active;
3102 struct bfq_entity *entity = NULL;
aee69d78 3103
e21b7a0b
AA
3104 if (!RB_EMPTY_ROOT(&st->active))
3105 entity = bfq_entity_of(rb_first(active));
aee69d78 3106
e21b7a0b
AA
3107 for (; entity ; entity = bfq_entity_of(rb_first(active)))
3108 bfq_reparent_leaf_entity(bfqd, entity);
aee69d78 3109
e21b7a0b
AA
3110 if (bfqg->sched_data.in_service_entity)
3111 bfq_reparent_leaf_entity(bfqd,
3112 bfqg->sched_data.in_service_entity);
aee69d78
PV
3113}
3114
3115/**
e21b7a0b
AA
3116 * bfq_pd_offline - deactivate the entity associated with @pd,
3117 * and reparent its children entities.
3118 * @pd: descriptor of the policy going offline.
aee69d78 3119 *
e21b7a0b
AA
3120 * blkio already grabs the queue_lock for us, so no need to use
3121 * RCU-based magic
aee69d78 3122 */
e21b7a0b 3123static void bfq_pd_offline(struct blkg_policy_data *pd)
aee69d78 3124{
e21b7a0b
AA
3125 struct bfq_service_tree *st;
3126 struct bfq_group *bfqg = pd_to_bfqg(pd);
3127 struct bfq_data *bfqd = bfqg->bfqd;
3128 struct bfq_entity *entity = bfqg->my_entity;
3129 unsigned long flags;
3130 int i;
aee69d78 3131
e21b7a0b
AA
3132 if (!entity) /* root group */
3133 return;
3134
3135 spin_lock_irqsave(&bfqd->lock, flags);
aee69d78 3136 /*
e21b7a0b
AA
3137 * Empty all service_trees belonging to this group before
3138 * deactivating the group itself.
aee69d78 3139 */
e21b7a0b
AA
3140 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
3141 st = bfqg->sched_data.service_tree + i;
3142
3143 /*
3144 * The idle tree may still contain bfq_queues belonging
3145 * to exited tasks because they never migrated to a different
3146 * cgroup from the one being destroyed now. No one else
3147 * can access them so it's safe to act without any lock.
3148 */
3149 bfq_flush_idle_tree(st);
3150
3151 /*
3152 * It may happen that some queues are still active
3153 * (busy) upon group destruction (if the corresponding
3154 * processes have been forced to terminate). We move
3155 * all the leaf entities corresponding to these queues
3156 * to the root_group.
3157 * Also, it may happen that the group has an entity
3158 * in service, which is disconnected from the active
3159 * tree: it must be moved, too.
3160 * There is no need to put the sync queues, as the
3161 * scheduler has taken no reference.
3162 */
3163 bfq_reparent_active_entities(bfqd, bfqg, st);
aee69d78
PV
3164 }
3165
e21b7a0b
AA
3166 __bfq_deactivate_entity(entity, false);
3167 bfq_put_async_queues(bfqd, bfqg);
3168
3169 spin_unlock_irqrestore(&bfqd->lock, flags);
3170 /*
3171 * @blkg is going offline and will be ignored by
3172 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
3173 * that they don't get lost. If IOs complete after this point, the
3174 * stats for them will be lost. Oh well...
3175 */
3176 bfqg_stats_xfer_dead(bfqg);
aee69d78
PV
3177}
3178
44e44a1b
PV
3179static void bfq_end_wr_async(struct bfq_data *bfqd)
3180{
3181 struct blkcg_gq *blkg;
3182
3183 list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
3184 struct bfq_group *bfqg = blkg_to_bfqg(blkg);
3185
3186 bfq_end_wr_async_queues(bfqd, bfqg);
3187 }
3188 bfq_end_wr_async_queues(bfqd, bfqd->root_group);
3189}
3190
e21b7a0b 3191static int bfq_io_show_weight(struct seq_file *sf, void *v)
aee69d78 3192{
e21b7a0b
AA
3193 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3194 struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
3195 unsigned int val = 0;
aee69d78 3196
e21b7a0b
AA
3197 if (bfqgd)
3198 val = bfqgd->weight;
aee69d78 3199
e21b7a0b 3200 seq_printf(sf, "%u\n", val);
aee69d78 3201
e21b7a0b
AA
3202 return 0;
3203}
3204
3205static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
3206 struct cftype *cftype,
3207 u64 val)
aee69d78 3208{
e21b7a0b
AA
3209 struct blkcg *blkcg = css_to_blkcg(css);
3210 struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
3211 struct blkcg_gq *blkg;
3212 int ret = -ERANGE;
aee69d78 3213
e21b7a0b
AA
3214 if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
3215 return ret;
aee69d78 3216
e21b7a0b
AA
3217 ret = 0;
3218 spin_lock_irq(&blkcg->lock);
3219 bfqgd->weight = (unsigned short)val;
3220 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
3221 struct bfq_group *bfqg = blkg_to_bfqg(blkg);
3222
3223 if (!bfqg)
3224 continue;
3225 /*
3226 * Setting the prio_changed flag of the entity
3227 * to 1 with new_weight == weight would re-set
3228 * the value of the weight to its ioprio mapping.
3229 * Set the flag only if necessary.
3230 */
3231 if ((unsigned short)val != bfqg->entity.new_weight) {
3232 bfqg->entity.new_weight = (unsigned short)val;
3233 /*
3234 * Make sure that the above new value has been
3235 * stored in bfqg->entity.new_weight before
3236 * setting the prio_changed flag. In fact,
3237 * this flag may be read asynchronously (in
3238 * critical sections protected by a different
3239 * lock than that held here), and finding this
3240 * flag set may cause the execution of the code
3241 * for updating parameters whose value may
3242 * depend also on bfqg->entity.new_weight (in
3243 * __bfq_entity_update_weight_prio).
3244 * This barrier makes sure that the new value
3245 * of bfqg->entity.new_weight is correctly
3246 * seen in that code.
3247 */
3248 smp_wmb();
3249 bfqg->entity.prio_changed = 1;
3250 }
aee69d78 3251 }
e21b7a0b 3252 spin_unlock_irq(&blkcg->lock);
aee69d78 3253
e21b7a0b
AA
3254 return ret;
3255}
aee69d78 3256
e21b7a0b
AA
3257static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
3258 char *buf, size_t nbytes,
3259 loff_t off)
3260{
3261 u64 weight;
3262 /* First unsigned long found in the file is used */
3263 int ret = kstrtoull(strim(buf), 0, &weight);
3264
3265 if (ret)
3266 return ret;
3267
3268 return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
aee69d78
PV
3269}
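/*
 * An end-to-end sketch of a weight change (the exact cgroup file
 * names are left out on purpose): a written value such as "500" lands
 * in bfq_io_set_weight(), kstrtoull() parses it, and
 * bfq_io_set_weight_legacy() range-checks it against
 * BFQ_MIN_WEIGHT/BFQ_MAX_WEIGHT, stores it in bfqgd->weight and, for
 * every group of the blkcg whose weight actually differs, sets
 * entity.new_weight and prio_changed. The new weight then takes
 * effect in __bfq_entity_update_weight_prio() the next time the
 * entity is (re)enqueued.
 */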
3270
e21b7a0b 3271static int bfqg_print_stat(struct seq_file *sf, void *v)
aee69d78 3272{
e21b7a0b
AA
3273 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
3274 &blkcg_policy_bfq, seq_cft(sf)->private, false);
3275 return 0;
3276}
aee69d78 3277
e21b7a0b
AA
3278static int bfqg_print_rwstat(struct seq_file *sf, void *v)
3279{
3280 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
3281 &blkcg_policy_bfq, seq_cft(sf)->private, true);
3282 return 0;
3283}
aee69d78 3284
e21b7a0b
AA
3285static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
3286 struct blkg_policy_data *pd, int off)
3287{
3288 u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
3289 &blkcg_policy_bfq, off);
3290 return __blkg_prfill_u64(sf, pd, sum);
3291}
aee69d78 3292
e21b7a0b
AA
3293static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
3294 struct blkg_policy_data *pd, int off)
3295{
3296 struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
3297 &blkcg_policy_bfq,
3298 off);
3299 return __blkg_prfill_rwstat(sf, pd, &sum);
aee69d78
PV
3300}
3301
e21b7a0b 3302static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
aee69d78 3303{
e21b7a0b
AA
3304 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3305 bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
3306 seq_cft(sf)->private, false);
3307 return 0;
3308}
aee69d78 3309
e21b7a0b
AA
3310static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
3311{
3312 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3313 bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
3314 seq_cft(sf)->private, true);
3315 return 0;
aee69d78
PV
3316}
3317
e21b7a0b
AA
3318static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
3319 int off)
aee69d78 3320{
e21b7a0b 3321 u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
aee69d78 3322
e21b7a0b 3323 return __blkg_prfill_u64(sf, pd, sum >> 9);
aee69d78
PV
3324}
3325
e21b7a0b 3326static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
aee69d78 3327{
e21b7a0b
AA
3328 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3329 bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
3330 return 0;
3331}
aee69d78 3332
e21b7a0b
AA
3333static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
3334 struct blkg_policy_data *pd, int off)
3335{
3336 struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
3337 offsetof(struct blkcg_gq, stat_bytes));
3338 u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
3339 atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
aee69d78 3340
e21b7a0b
AA
3341 return __blkg_prfill_u64(sf, pd, sum >> 9);
3342}
aee69d78 3343
e21b7a0b
AA
3344static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
3345{
3346 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3347 bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
3348 false);
3349 return 0;
aee69d78
PV
3350}
3351
e21b7a0b
AA
3352static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
3353 struct blkg_policy_data *pd, int off)
aee69d78 3354{
e21b7a0b
AA
3355 struct bfq_group *bfqg = pd_to_bfqg(pd);
3356 u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
3357 u64 v = 0;
aee69d78 3358
e21b7a0b
AA
3359 if (samples) {
3360 v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
3361 v = div64_u64(v, samples);
3362 }
3363 __blkg_prfill_u64(sf, pd, v);
3364 return 0;
3365}
aee69d78 3366
e21b7a0b
AA
3367/* print avg_queue_size */
3368static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
3369{
3370 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
3371 bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
3372 0, false);
3373 return 0;
3374}
3375
3376static struct bfq_group *
3377bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
3378{
3379 int ret;
3380
3381 ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
3382 if (ret)
3383 return NULL;
3384
3385 return blkg_to_bfqg(bfqd->queue->root_blkg);
aee69d78
PV
3386}
3387
e21b7a0b
AA
3388static struct cftype bfq_blkcg_legacy_files[] = {
3389 {
3390 .name = "bfq.weight",
3391 .flags = CFTYPE_NOT_ON_ROOT,
3392 .seq_show = bfq_io_show_weight,
3393 .write_u64 = bfq_io_set_weight_legacy,
3394 },
3395
3396 /* statistics, covers only the tasks in the bfqg */
3397 {
3398 .name = "bfq.time",
3399 .private = offsetof(struct bfq_group, stats.time),
3400 .seq_show = bfqg_print_stat,
3401 },
3402 {
3403 .name = "bfq.sectors",
3404 .seq_show = bfqg_print_stat_sectors,
3405 },
3406 {
3407 .name = "bfq.io_service_bytes",
3408 .private = (unsigned long)&blkcg_policy_bfq,
3409 .seq_show = blkg_print_stat_bytes,
3410 },
3411 {
3412 .name = "bfq.io_serviced",
3413 .private = (unsigned long)&blkcg_policy_bfq,
3414 .seq_show = blkg_print_stat_ios,
3415 },
3416 {
3417 .name = "bfq.io_service_time",
3418 .private = offsetof(struct bfq_group, stats.service_time),
3419 .seq_show = bfqg_print_rwstat,
3420 },
3421 {
3422 .name = "bfq.io_wait_time",
3423 .private = offsetof(struct bfq_group, stats.wait_time),
3424 .seq_show = bfqg_print_rwstat,
3425 },
3426 {
3427 .name = "bfq.io_merged",
3428 .private = offsetof(struct bfq_group, stats.merged),
3429 .seq_show = bfqg_print_rwstat,
3430 },
3431 {
3432 .name = "bfq.io_queued",
3433 .private = offsetof(struct bfq_group, stats.queued),
3434 .seq_show = bfqg_print_rwstat,
3435 },
3436
	3437	/* the same statistics which cover the bfqg and its descendants */
3438 {
3439 .name = "bfq.time_recursive",
3440 .private = offsetof(struct bfq_group, stats.time),
3441 .seq_show = bfqg_print_stat_recursive,
3442 },
3443 {
3444 .name = "bfq.sectors_recursive",
3445 .seq_show = bfqg_print_stat_sectors_recursive,
3446 },
3447 {
3448 .name = "bfq.io_service_bytes_recursive",
3449 .private = (unsigned long)&blkcg_policy_bfq,
3450 .seq_show = blkg_print_stat_bytes_recursive,
3451 },
3452 {
3453 .name = "bfq.io_serviced_recursive",
3454 .private = (unsigned long)&blkcg_policy_bfq,
3455 .seq_show = blkg_print_stat_ios_recursive,
3456 },
3457 {
3458 .name = "bfq.io_service_time_recursive",
3459 .private = offsetof(struct bfq_group, stats.service_time),
3460 .seq_show = bfqg_print_rwstat_recursive,
3461 },
3462 {
3463 .name = "bfq.io_wait_time_recursive",
3464 .private = offsetof(struct bfq_group, stats.wait_time),
3465 .seq_show = bfqg_print_rwstat_recursive,
3466 },
3467 {
3468 .name = "bfq.io_merged_recursive",
3469 .private = offsetof(struct bfq_group, stats.merged),
3470 .seq_show = bfqg_print_rwstat_recursive,
3471 },
3472 {
3473 .name = "bfq.io_queued_recursive",
3474 .private = offsetof(struct bfq_group, stats.queued),
3475 .seq_show = bfqg_print_rwstat_recursive,
3476 },
3477 {
3478 .name = "bfq.avg_queue_size",
3479 .seq_show = bfqg_print_avg_queue_size,
3480 },
3481 {
3482 .name = "bfq.group_wait_time",
3483 .private = offsetof(struct bfq_group, stats.group_wait_time),
3484 .seq_show = bfqg_print_stat,
3485 },
3486 {
3487 .name = "bfq.idle_time",
3488 .private = offsetof(struct bfq_group, stats.idle_time),
3489 .seq_show = bfqg_print_stat,
3490 },
3491 {
3492 .name = "bfq.empty_time",
3493 .private = offsetof(struct bfq_group, stats.empty_time),
3494 .seq_show = bfqg_print_stat,
3495 },
3496 {
3497 .name = "bfq.dequeue",
3498 .private = offsetof(struct bfq_group, stats.dequeue),
3499 .seq_show = bfqg_print_stat,
3500 },
3501 { } /* terminate */
3502};
3503
3504static struct cftype bfq_blkg_files[] = {
3505 {
3506 .name = "bfq.weight",
3507 .flags = CFTYPE_NOT_ON_ROOT,
3508 .seq_show = bfq_io_show_weight,
3509 .write = bfq_io_set_weight,
3510 },
3511 {} /* terminate */
3512};
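/*
 * Usage note (illustrative; paths depend on how cgroups are mounted):
 * with the legacy hierarchy the weight file above is exposed by the
 * blkio controller, with the unified hierarchy by the io controller, so
 * a per-group weight would typically be set with something like
 *
 *	echo 300 > /sys/fs/cgroup/blkio/<group>/blkio.bfq.weight	(v1)
 *	echo 300 > /sys/fs/cgroup/<group>/io.bfq.weight			(v2)
 *
 * The value must lie in [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT], as enforced by
 * bfq_io_set_weight_legacy() above.
 */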
3513
3514#else /* CONFIG_BFQ_GROUP_IOSCHED */
3515
3516static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
3517 struct bfq_queue *bfqq, unsigned int op) { }
3518static inline void
3519bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
3520static inline void
3521bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
3522static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
3523 uint64_t start_time, uint64_t io_start_time,
3524 unsigned int op) { }
3525static inline void
3526bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
3527 struct bfq_group *curr_bfqg) { }
3528static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
3529static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
3530static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
3531static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
3532static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
3533static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
3534
3535static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3536 struct bfq_group *bfqg) {}
3537
3538static void bfq_init_entity(struct bfq_entity *entity,
3539 struct bfq_group *bfqg)
aee69d78
PV
3540{
3541 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
3542
3543 entity->weight = entity->new_weight;
3544 entity->orig_weight = entity->new_weight;
e21b7a0b
AA
3545 if (bfqq) {
3546 bfqq->ioprio = bfqq->new_ioprio;
3547 bfqq->ioprio_class = bfqq->new_ioprio_class;
3548 }
3549 entity->sched_data = &bfqg->sched_data;
3550}
3551
3552static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}
3553
44e44a1b
PV
3554static void bfq_end_wr_async(struct bfq_data *bfqd)
3555{
3556 bfq_end_wr_async_queues(bfqd, bfqd->root_group);
3557}
3558
e21b7a0b
AA
3559static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
3560 struct blkcg *blkcg)
3561{
3562 return bfqd->root_group;
3563}
3564
3565static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
3566{
3567 return bfqq->bfqd->root_group;
3568}
aee69d78 3569
e21b7a0b
AA
3570static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd,
3571 int node)
3572{
3573 struct bfq_group *bfqg;
3574 int i;
3575
3576 bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
3577 if (!bfqg)
3578 return NULL;
aee69d78 3579
e21b7a0b
AA
3580 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
3581 bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
3582
3583 return bfqg;
aee69d78 3584}
e21b7a0b 3585#endif /* CONFIG_BFQ_GROUP_IOSCHED */
aee69d78
PV
3586
3587#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
3588#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
3589
3590#define bfq_sample_valid(samples) ((samples) > 80)
3591
aee69d78
PV
3592/*
3593 * Lifted from AS - choose which of rq1 and rq2 is best served now.
3594 * We choose the request that is closest to the head right now. Distance
3595 * behind the head is penalized and only allowed to a certain extent.
3596 */
3597static struct request *bfq_choose_req(struct bfq_data *bfqd,
3598 struct request *rq1,
3599 struct request *rq2,
3600 sector_t last)
3601{
3602 sector_t s1, s2, d1 = 0, d2 = 0;
3603 unsigned long back_max;
3604#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
3605#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
3606 unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
3607
3608 if (!rq1 || rq1 == rq2)
3609 return rq2;
3610 if (!rq2)
3611 return rq1;
3612
3613 if (rq_is_sync(rq1) && !rq_is_sync(rq2))
3614 return rq1;
3615 else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
3616 return rq2;
3617 if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
3618 return rq1;
3619 else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
3620 return rq2;
3621
3622 s1 = blk_rq_pos(rq1);
3623 s2 = blk_rq_pos(rq2);
3624
3625 /*
3626 * By definition, 1KiB is 2 sectors.
3627 */
3628 back_max = bfqd->bfq_back_max * 2;
3629
3630 /*
3631 * Strict one way elevator _except_ in the case where we allow
3632 * short backward seeks which are biased as twice the cost of a
3633 * similar forward seek.
3634 */
3635 if (s1 >= last)
3636 d1 = s1 - last;
3637 else if (s1 + back_max >= last)
3638 d1 = (last - s1) * bfqd->bfq_back_penalty;
3639 else
3640 wrap |= BFQ_RQ1_WRAP;
3641
3642 if (s2 >= last)
3643 d2 = s2 - last;
3644 else if (s2 + back_max >= last)
3645 d2 = (last - s2) * bfqd->bfq_back_penalty;
3646 else
3647 wrap |= BFQ_RQ2_WRAP;
3648
3649 /* Found required data */
3650
3651 /*
3652 * By doing switch() on the bit mask "wrap" we avoid having to
3653 * check two variables for all permutations: --> faster!
3654 */
3655 switch (wrap) {
3656 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
3657 if (d1 < d2)
3658 return rq1;
3659 else if (d2 < d1)
3660 return rq2;
3661
3662 if (s1 >= s2)
3663 return rq1;
3664 else
3665 return rq2;
3666
3667 case BFQ_RQ2_WRAP:
3668 return rq1;
3669 case BFQ_RQ1_WRAP:
3670 return rq2;
3671 case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */
3672 default:
3673 /*
3674 * Since both rqs are wrapped,
3675 * start with the one that's further behind head
3676 * (--> only *one* back seek required),
3677 * since back seek takes more time than forward.
3678 */
3679 if (s1 <= s2)
3680 return rq1;
3681 else
3682 return rq2;
3683 }
3684}
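/*
 * Worked example (numbers are made up): suppose last = 1000,
 * back_max = 200 sectors and bfq_back_penalty = 2. For rq1 at sector
 * 1100, d1 = 100 (plain forward distance). For rq2 at sector 950, the
 * request is behind the head but within back_max, so its distance is
 * penalized: d2 = (1000 - 950) * 2 = 100. With d1 == d2 neither distance
 * comparison fires and the tie is broken in favour of the higher sector,
 * i.e. rq1. A request at sector 500 would instead lie more than back_max
 * behind the head and be marked as wrapped.
 */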
3685
3686/*
3687 * Return expired entry, or NULL to just start from scratch in rbtree.
3688 */
3689static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
3690 struct request *last)
3691{
3692 struct request *rq;
3693
3694 if (bfq_bfqq_fifo_expire(bfqq))
3695 return NULL;
3696
3697 bfq_mark_bfqq_fifo_expire(bfqq);
3698
3699 rq = rq_entry_fifo(bfqq->fifo.next);
3700
3701 if (rq == last || ktime_get_ns() < rq->fifo_time)
3702 return NULL;
3703
3704 bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
3705 return rq;
3706}
3707
3708static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
3709 struct bfq_queue *bfqq,
3710 struct request *last)
3711{
3712 struct rb_node *rbnext = rb_next(&last->rb_node);
3713 struct rb_node *rbprev = rb_prev(&last->rb_node);
3714 struct request *next, *prev = NULL;
3715
3716 /* Follow expired path, else get first next available. */
3717 next = bfq_check_fifo(bfqq, last);
3718 if (next)
3719 return next;
3720
3721 if (rbprev)
3722 prev = rb_entry_rq(rbprev);
3723
3724 if (rbnext)
3725 next = rb_entry_rq(rbnext);
3726 else {
3727 rbnext = rb_first(&bfqq->sort_list);
3728 if (rbnext && rbnext != &last->rb_node)
3729 next = rb_entry_rq(rbnext);
3730 }
3731
3732 return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
3733}
3734
c074170e 3735/* see the definition of bfq_async_charge_factor for details */
aee69d78
PV
3736static unsigned long bfq_serv_to_charge(struct request *rq,
3737 struct bfq_queue *bfqq)
3738{
44e44a1b 3739 if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
c074170e
PV
3740 return blk_rq_sectors(rq);
3741
3742 return blk_rq_sectors(rq) * bfq_async_charge_factor;
aee69d78
PV
3743}
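/*
 * Example of the asymmetric charging above: a 1024-sector request from a
 * sync or weight-raised queue consumes exactly 1024 sectors of the
 * queue's budget, while the same request from a plain async queue is
 * charged 1024 * bfq_async_charge_factor sectors, which makes async
 * queues drain their budget proportionally faster.
 */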
3744
3745/**
3746 * bfq_updated_next_req - update the queue after a new next_rq selection.
3747 * @bfqd: the device data the queue belongs to.
3748 * @bfqq: the queue to update.
3749 *
3750 * If the first request of a queue changes we make sure that the queue
3751 * has enough budget to serve at least its first request (if the
3752 * request has grown). We do this because if the queue does not have
3753 * enough budget for its first request, it has to go through two dispatch
3754 * rounds to actually get it dispatched.
3755 */
3756static void bfq_updated_next_req(struct bfq_data *bfqd,
3757 struct bfq_queue *bfqq)
3758{
3759 struct bfq_entity *entity = &bfqq->entity;
3760 struct request *next_rq = bfqq->next_rq;
3761 unsigned long new_budget;
3762
3763 if (!next_rq)
3764 return;
3765
3766 if (bfqq == bfqd->in_service_queue)
3767 /*
3768 * In order not to break guarantees, budgets cannot be
3769 * changed after an entity has been selected.
3770 */
3771 return;
3772
3773 new_budget = max_t(unsigned long, bfqq->max_budget,
3774 bfq_serv_to_charge(next_rq, bfqq));
3775 if (entity->budget != new_budget) {
3776 entity->budget = new_budget;
3777 bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
3778 new_budget);
e21b7a0b 3779 bfq_requeue_bfqq(bfqd, bfqq);
aee69d78
PV
3780 }
3781}
3782
3783static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
3784{
3785 struct bfq_entity *entity = &bfqq->entity;
3786
3787 return entity->budget - entity->service;
3788}
3789
3790/*
3791 * If enough samples have been computed, return the current max budget
3792 * stored in bfqd, which is dynamically updated according to the
3793 * estimated disk peak rate; otherwise return the default max budget
3794 */
3795static int bfq_max_budget(struct bfq_data *bfqd)
3796{
3797 if (bfqd->budgets_assigned < bfq_stats_min_budgets)
3798 return bfq_default_max_budget;
3799 else
3800 return bfqd->bfq_max_budget;
3801}
3802
3803/*
3804 * Return min budget, which is a fraction of the current or default
3805 * max budget (trying with 1/32)
3806 */
3807static int bfq_min_budget(struct bfq_data *bfqd)
3808{
3809 if (bfqd->budgets_assigned < bfq_stats_min_budgets)
3810 return bfq_default_max_budget / 32;
3811 else
3812 return bfqd->bfq_max_budget / 32;
3813}
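/*
 * For instance (illustrative figures): if the estimated peak rate leads
 * to bfq_max_budget = 16384 sectors, then bfq_min_budget() returns
 * 16384 / 32 = 512 sectors. Before bfq_stats_min_budgets budgets have
 * been assigned, the default max budget is used in both computations
 * instead.
 */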
3814
3815static void bfq_bfqq_expire(struct bfq_data *bfqd,
3816 struct bfq_queue *bfqq,
3817 bool compensate,
3818 enum bfqq_expiration reason);
3819
3820/*
3821 * The next function, invoked after the input queue bfqq switches from
3822 * idle to busy, updates the budget of bfqq. The function also tells
3823 * whether the in-service queue should be expired, by returning
3824 * true. The purpose of expiring the in-service queue is to give bfqq
3825 * the chance to possibly preempt the in-service queue, and the reason
44e44a1b
PV
3826 * for preempting the in-service queue is to achieve one of the two
3827 * goals below.
aee69d78 3828 *
44e44a1b
PV
3829 * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
3830 * expired because it has remained idle. In particular, bfqq may have
3831 * expired for one of the following two reasons:
aee69d78
PV
3832 *
3833 * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
3834 * and did not make it to issue a new request before its last
3835 * request was served;
3836 *
3837 * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
3838 * a new request before the expiration of the idling-time.
3839 *
3840 * Even if bfqq has expired for one of the above reasons, the process
3841 * associated with the queue may be however issuing requests greedily,
3842 * and thus be sensitive to the bandwidth it receives (bfqq may have
3843 * remained idle for other reasons: CPU high load, bfqq not enjoying
3844 * idling, I/O throttling somewhere in the path from the process to
3845 * the I/O scheduler, ...). But if, after every expiration for one of
3846 * the above two reasons, bfqq has to wait for the service of at least
3847 * one full budget of another queue before being served again, then
3848 * bfqq is likely to get a much lower bandwidth or resource time than
3849 * its reserved ones. To address this issue, two countermeasures need
3850 * to be taken.
3851 *
3852 * First, the budget and the timestamps of bfqq need to be updated in
3853 * a special way on bfqq reactivation: they need to be updated as if
3854 * bfqq did not remain idle and did not expire. In fact, if they are
3855 * computed as if bfqq expired and remained idle until reactivation,
3856 * then the process associated with bfqq is treated as if, instead of
3857 * being greedy, it stopped issuing requests when bfqq remained idle,
3858 * and restarts issuing requests only on this reactivation. In other
3859 * words, the scheduler does not help the process recover the "service
3860 * hole" between bfqq expiration and reactivation. As a consequence,
3861 * the process receives a lower bandwidth than its reserved one. In
3862 * contrast, to recover this hole, the budget must be updated as if
3863 * bfqq was not expired at all before this reactivation, i.e., it must
3864 * be set to the value of the remaining budget when bfqq was
3865 * expired. Along the same line, timestamps need to be assigned the
3866 * value they had the last time bfqq was selected for service, i.e.,
3867 * before last expiration. Thus timestamps need to be back-shifted
3868 * with respect to their normal computation (see [1] for more details
3869 * on this tricky aspect).
3870 *
3871 * Secondly, to allow the process to recover the hole, the in-service
3872 * queue must be expired too, to give bfqq the chance to preempt it
3873 * immediately. In fact, if bfqq has to wait for a full budget of the
3874 * in-service queue to be completed, then it may become impossible to
3875 * let the process recover the hole, even if the back-shifted
3876 * timestamps of bfqq are lower than those of the in-service queue. If
3877 * this happens for most or all of the holes, then the process may not
3878 * receive its reserved bandwidth. In this respect, it is worth noting
3879 * that, since the service of outstanding requests cannot be
3880 * preempted, a small fraction of the holes may however be
3881 * unrecoverable, thereby causing a small loss of bandwidth.
3882 *
3883 * The last important point is detecting whether bfqq does need this
3884 * bandwidth recovery. In this respect, the next function deems the
3885 * process associated with bfqq greedy, and thus allows it to recover
3886 * the hole, if: 1) the process is waiting for the arrival of a new
3887 * request (which implies that bfqq expired for one of the above two
3888 * reasons), and 2) such a request has arrived soon. The first
3889 * condition is controlled through the flag non_blocking_wait_rq,
3890 * while the second through the flag arrived_in_time. If both
3891 * conditions hold, then the function computes the budget in the
3892 * above-described special way, and signals that the in-service queue
3893 * should be expired. Timestamp back-shifting is done later in
3894 * __bfq_activate_entity.
44e44a1b
PV
3895 *
3896 * 2. Reduce latency. Even if timestamps are not backshifted to let
3897 * the process associated with bfqq recover a service hole, bfqq may
3898 * however happen to have, after being (re)activated, a lower finish
3899 * timestamp than the in-service queue. That is, the next budget of
3900 * bfqq may have to be completed before the one of the in-service
3901 * queue. If this is the case, then preempting the in-service queue
3902 * allows this goal to be achieved, apart from the unpreemptible,
3903 * outstanding requests mentioned above.
3904 *
3905 * Unfortunately, regardless of which of the above two goals one wants
3906 * to achieve, service trees need first to be updated to know whether
3907 * the in-service queue must be preempted. To have service trees
3908 * correctly updated, the in-service queue must be expired and
3909 * rescheduled, and bfqq must be scheduled too. This is one of the
3910 * most costly operations (in future versions, the scheduling
3911 * mechanism may be re-designed in such a way to make it possible to
3912 * know whether preemption is needed without needing to update service
3913 * trees). In addition, queue preemptions almost always cause random
3914 * I/O, and thus loss of throughput. Because of these facts, the next
3915 * function adopts the following simple scheme to avoid both costly
3916 * operations and too frequent preemptions: it requests the expiration
3917 * of the in-service queue (unconditionally) only for queues that need
3918 * to recover a hole, or that either are weight-raised or deserve to
3919 * be weight-raised.
aee69d78
PV
3920 */
3921static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
3922 struct bfq_queue *bfqq,
44e44a1b
PV
3923 bool arrived_in_time,
3924 bool wr_or_deserves_wr)
aee69d78
PV
3925{
3926 struct bfq_entity *entity = &bfqq->entity;
3927
3928 if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
3929 /*
3930 * We do not clear the flag non_blocking_wait_rq here, as
3931 * the latter is used in bfq_activate_bfqq to signal
3932 * that timestamps need to be back-shifted (and is
3933 * cleared right after).
3934 */
3935
3936 /*
3937 * In the next assignment we rely on the fact that
3938 * neither entity->service nor entity->budget is
3939 * updated on expiration if bfqq is empty (see
3940 * __bfq_bfqq_recalc_budget). Thus both quantities
3941 * remain unchanged after such an expiration, and the
3942 * following statement therefore assigns to
3943 * entity->budget the remaining budget on such an
3944 * expiration. For clarity, entity->service is not
3945 * updated on expiration in any case, and, in normal
3946 * operation, is reset only when bfqq is selected for
3947 * service (see bfq_get_next_queue).
3948 */
3949 entity->budget = min_t(unsigned long,
3950 bfq_bfqq_budget_left(bfqq),
3951 bfqq->max_budget);
3952
3953 return true;
3954 }
3955
3956 entity->budget = max_t(unsigned long, bfqq->max_budget,
3957 bfq_serv_to_charge(bfqq->next_rq, bfqq));
3958 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
44e44a1b
PV
3959 return wr_or_deserves_wr;
3960}
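/*
 * Hypothetical numbers to make the two branches above concrete: assume
 * bfqq expired with entity->budget = 1000 sectors after receiving
 * entity->service = 400 sectors, so 600 sectors of budget are left. If a
 * new request arrives "in time" while non_blocking_wait_rq is set, the
 * first branch keeps entity->budget = min(600, bfqq->max_budget), i.e.
 * the queue resumes from its remaining budget, and the caller is told
 * that the in-service queue may have to be preempted. Otherwise the
 * second branch recomputes the budget from scratch as
 * max(bfqq->max_budget, charge of next_rq), and preemption is requested
 * only if the queue is (or deserves to be) weight-raised.
 */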
3961
3962static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
3963{
3964 u64 dur;
3965
3966 if (bfqd->bfq_wr_max_time > 0)
3967 return bfqd->bfq_wr_max_time;
3968
3969 dur = bfqd->RT_prod;
3970 do_div(dur, bfqd->peak_rate);
3971
3972 /*
3973 * Limit duration between 3 and 13 seconds. Tests show that
3974 * values higher than 13 seconds often yield the opposite of
3975 * the desired result, i.e., worsen responsiveness by letting
3976 * non-interactive and non-soft-real-time applications
3977 * preserve weight raising for too long a time interval.
3978 *
3979 * On the other hand, values lower than 3 seconds make it
3980 * difficult for most interactive tasks to complete their jobs
3981 * before weight-raising finishes.
3982 */
3983 if (dur > msecs_to_jiffies(13000))
3984 dur = msecs_to_jiffies(13000);
3985 else if (dur < msecs_to_jiffies(3000))
3986 dur = msecs_to_jiffies(3000);
3987
3988 return dur;
3989}
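/*
 * Example with made-up figures: if bfqd->RT_prod / bfqd->peak_rate
 * evaluates to the equivalent of 20 seconds, the duration is clamped
 * down to 13 seconds; if it evaluates to 1 second, it is raised to
 * 3 seconds; anything in between (say 8 seconds) is used as is. A
 * non-zero bfq_wr_max_time set by the user bypasses the computation
 * entirely.
 */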
3990
3991static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
3992 struct bfq_queue *bfqq,
3993 unsigned int old_wr_coeff,
3994 bool wr_or_deserves_wr,
3995 bool interactive)
3996{
3997 if (old_wr_coeff == 1 && wr_or_deserves_wr) {
3998 /* start a weight-raising period */
3999 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
4000 /* update wr duration */
4001 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
4002
4003 /*
4004 * If needed, further reduce budget to make sure it is
4005 * close to bfqq's backlog, so as to reduce the
4006 * scheduling-error component due to a too large
4007 * budget. Do not care about throughput consequences,
4008 * but only about latency. Finally, do not assign a
4009 * too small budget either, to avoid increasing
4010 * latency by causing too frequent expirations.
4011 */
4012 bfqq->entity.budget = min_t(unsigned long,
4013 bfqq->entity.budget,
4014 2 * bfq_min_budget(bfqd));
4015 } else if (old_wr_coeff > 1) {
4016 /* update wr duration */
4017 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
4018 }
4019}
4020
4021static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
4022 struct bfq_queue *bfqq)
4023{
4024 return bfqq->dispatched == 0 &&
4025 time_is_before_jiffies(
4026 bfqq->budget_timeout +
4027 bfqd->bfq_wr_min_idle_time);
aee69d78
PV
4028}
4029
4030static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
4031 struct bfq_queue *bfqq,
44e44a1b
PV
4032 int old_wr_coeff,
4033 struct request *rq,
4034 bool *interactive)
aee69d78 4035{
44e44a1b
PV
4036 bool wr_or_deserves_wr, bfqq_wants_to_preempt,
4037 idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
aee69d78
PV
4038 /*
4039 * See the comments on
4040 * bfq_bfqq_update_budg_for_activation for
4041 * details on the usage of the next variable.
4042 */
4043 arrived_in_time = ktime_get_ns() <=
4044 bfqq->ttime.last_end_request +
4045 bfqd->bfq_slice_idle * 3;
4046
e21b7a0b
AA
4047 bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);
4048
aee69d78 4049 /*
44e44a1b
PV
4050 * bfqq deserves to be weight-raised if:
4051 * - it is sync,
4052 * - it has been idle for enough time.
4053 */
4054 *interactive = idle_for_long_time;
4055 wr_or_deserves_wr = bfqd->low_latency &&
4056 (bfqq->wr_coeff > 1 ||
4057 (bfq_bfqq_sync(bfqq) && *interactive));
4058
4059 /*
4060 * Using the last flag, update budget and check whether bfqq
4061 * may want to preempt the in-service queue.
aee69d78
PV
4062 */
4063 bfqq_wants_to_preempt =
4064 bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
44e44a1b
PV
4065 arrived_in_time,
4066 wr_or_deserves_wr);
aee69d78
PV
4067
4068 if (!bfq_bfqq_IO_bound(bfqq)) {
4069 if (arrived_in_time) {
4070 bfqq->requests_within_timer++;
4071 if (bfqq->requests_within_timer >=
4072 bfqd->bfq_requests_within_timer)
4073 bfq_mark_bfqq_IO_bound(bfqq);
4074 } else
4075 bfqq->requests_within_timer = 0;
4076 }
4077
44e44a1b
PV
4078 if (bfqd->low_latency) {
4079 bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
4080 old_wr_coeff,
4081 wr_or_deserves_wr,
4082 *interactive);
4083
4084 if (old_wr_coeff != bfqq->wr_coeff)
4085 bfqq->entity.prio_changed = 1;
4086 }
4087
aee69d78
PV
4088 bfq_add_bfqq_busy(bfqd, bfqq);
4089
4090 /*
4091 * Expire in-service queue only if preemption may be needed
4092 * for guarantees. In this respect, the function
4093 * next_queue_may_preempt just checks a simple, necessary
4094 * condition, and not a sufficient condition based on
4095 * timestamps. In fact, for the latter condition to be
4096 * evaluated, timestamps would need first to be updated, and
4097 * this operation is quite costly (see the comments on the
4098 * function bfq_bfqq_update_budg_for_activation).
4099 */
4100 if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
44e44a1b 4101 bfqd->in_service_queue->wr_coeff == 1 &&
aee69d78
PV
4102 next_queue_may_preempt(bfqd))
4103 bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
4104 false, BFQQE_PREEMPTED);
4105}
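/*
 * Putting the pieces above together with a hypothetical scenario: a sync
 * queue that has been idle, with no dispatched requests, for longer than
 * bfqd->bfq_wr_min_idle_time is flagged as interactive; with low_latency
 * enabled it is then weight-raised on this activation. If, in addition,
 * bfq_bfqq_update_budg_for_activation() reported that preemption may be
 * needed, the in-service queue is not itself weight-raised and
 * next_queue_may_preempt() agrees, the in-service queue is expired with
 * reason BFQQE_PREEMPTED, so that the newly busy queue does not have to
 * wait for a full foreign budget to be consumed.
 */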
4106
4107static void bfq_add_request(struct request *rq)
4108{
4109 struct bfq_queue *bfqq = RQ_BFQQ(rq);
4110 struct bfq_data *bfqd = bfqq->bfqd;
4111 struct request *next_rq, *prev;
44e44a1b
PV
4112 unsigned int old_wr_coeff = bfqq->wr_coeff;
4113 bool interactive = false;
aee69d78
PV
4114
4115 bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
4116 bfqq->queued[rq_is_sync(rq)]++;
4117 bfqd->queued++;
4118
4119 elv_rb_add(&bfqq->sort_list, rq);
4120
4121 /*
4122 * Check if this request is a better next-serve candidate.
4123 */
4124 prev = bfqq->next_rq;
4125 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
4126 bfqq->next_rq = next_rq;
4127
4128 if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
44e44a1b
PV
4129 bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
4130 rq, &interactive);
4131 else {
4132 if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
4133 time_is_before_jiffies(
4134 bfqq->last_wr_start_finish +
4135 bfqd->bfq_wr_min_inter_arr_async)) {
4136 bfqq->wr_coeff = bfqd->bfq_wr_coeff;
4137 bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
4138
4139 bfqq->entity.prio_changed = 1;
4140 }
4141 if (prev != bfqq->next_rq)
4142 bfq_updated_next_req(bfqd, bfqq);
4143 }
4144
4145 /*
4146 * Assign jiffies to last_wr_start_finish in the following
4147 * cases:
4148 *
4149 * . if bfqq is not going to be weight-raised, because, for
4150 * non weight-raised queues, last_wr_start_finish stores the
4151 * arrival time of the last request; as of now, this piece
4152 * of information is used only for deciding whether to
4153 * weight-raise async queues
4154 *
4155 * . if bfqq is not weight-raised, because, if bfqq is now
4156 * switching to weight-raised, then last_wr_start_finish
4157 * stores the time when weight-raising starts
4158 *
4159 * . if bfqq is interactive, because, regardless of whether
4160 * bfqq is currently weight-raised, the weight-raising
4161 * period must start or restart (this case is considered
4162 * separately because it is not detected by the above
4163 * conditions, if bfqq is already weight-raised)
4164 */
4165 if (bfqd->low_latency &&
4166 (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
4167 bfqq->last_wr_start_finish = jiffies;
aee69d78
PV
4168}
4169
4170static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
4171 struct bio *bio,
4172 struct request_queue *q)
4173{
4174 struct bfq_queue *bfqq = bfqd->bio_bfqq;
4175
4176
4177 if (bfqq)
4178 return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
4179
4180 return NULL;
4181}
4182
ab0e43e9
PV
4183static sector_t get_sdist(sector_t last_pos, struct request *rq)
4184{
4185 if (last_pos)
4186 return abs(blk_rq_pos(rq) - last_pos);
4187
4188 return 0;
4189}
4190
aee69d78
PV
4191#if 0 /* Still not clear if we can do without next two functions */
4192static void bfq_activate_request(struct request_queue *q, struct request *rq)
4193{
4194 struct bfq_data *bfqd = q->elevator->elevator_data;
4195
4196 bfqd->rq_in_driver++;
aee69d78
PV
4197}
4198
4199static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
4200{
4201 struct bfq_data *bfqd = q->elevator->elevator_data;
4202
4203 bfqd->rq_in_driver--;
4204}
4205#endif
4206
4207static void bfq_remove_request(struct request_queue *q,
4208 struct request *rq)
4209{
4210 struct bfq_queue *bfqq = RQ_BFQQ(rq);
4211 struct bfq_data *bfqd = bfqq->bfqd;
4212 const int sync = rq_is_sync(rq);
4213
4214 if (bfqq->next_rq == rq) {
4215 bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
4216 bfq_updated_next_req(bfqd, bfqq);
4217 }
4218
4219 if (rq->queuelist.prev != &rq->queuelist)
4220 list_del_init(&rq->queuelist);
4221 bfqq->queued[sync]--;
4222 bfqd->queued--;
4223 elv_rb_del(&bfqq->sort_list, rq);
4224
4225 elv_rqhash_del(q, rq);
4226 if (q->last_merge == rq)
4227 q->last_merge = NULL;
4228
4229 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
4230 bfqq->next_rq = NULL;
4231
4232 if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
e21b7a0b 4233 bfq_del_bfqq_busy(bfqd, bfqq, false);
aee69d78
PV
4234 /*
4235 * bfqq emptied. In normal operation, when
4236 * bfqq is empty, bfqq->entity.service and
4237 * bfqq->entity.budget must contain,
4238 * respectively, the service received and the
4239 * budget used last time bfqq emptied. These
4240 * facts do not hold in this case, as at least
4241 * this last removal occurred while bfqq is
4242 * not in service. To avoid inconsistencies,
4243 * reset both bfqq->entity.service and
4244 * bfqq->entity.budget, if bfqq has still a
4245 * process that may issue I/O requests to it.
4246 */
4247 bfqq->entity.budget = bfqq->entity.service = 0;
4248 }
4249 }
4250
4251 if (rq->cmd_flags & REQ_META)
4252 bfqq->meta_pending--;
e21b7a0b
AA
4253
4254 bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
aee69d78
PV
4255}
4256
4257static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
4258{
4259 struct request_queue *q = hctx->queue;
4260 struct bfq_data *bfqd = q->elevator->elevator_data;
4261 struct request *free = NULL;
4262 /*
4263 * bfq_bic_lookup grabs the queue_lock: invoke it now and
4264 * store its return value for later use, to avoid nesting
4265 * queue_lock inside the bfqd->lock. We assume that the bic
4266 * returned by bfq_bic_lookup does not go away before
4267 * bfqd->lock is taken.
4268 */
4269 struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
4270 bool ret;
4271
4272 spin_lock_irq(&bfqd->lock);
4273
4274 if (bic)
4275 bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
4276 else
4277 bfqd->bio_bfqq = NULL;
4278 bfqd->bio_bic = bic;
4279
4280 ret = blk_mq_sched_try_merge(q, bio, &free);
4281
4282 if (free)
4283 blk_mq_free_request(free);
4284 spin_unlock_irq(&bfqd->lock);
4285
4286 return ret;
4287}
4288
4289static int bfq_request_merge(struct request_queue *q, struct request **req,
4290 struct bio *bio)
4291{
4292 struct bfq_data *bfqd = q->elevator->elevator_data;
4293 struct request *__rq;
4294
4295 __rq = bfq_find_rq_fmerge(bfqd, bio, q);
4296 if (__rq && elv_bio_merge_ok(__rq, bio)) {
4297 *req = __rq;
4298 return ELEVATOR_FRONT_MERGE;
4299 }
4300
4301 return ELEVATOR_NO_MERGE;
4302}
4303
4304static void bfq_request_merged(struct request_queue *q, struct request *req,
4305 enum elv_merge type)
4306{
4307 if (type == ELEVATOR_FRONT_MERGE &&
4308 rb_prev(&req->rb_node) &&
4309 blk_rq_pos(req) <
4310 blk_rq_pos(container_of(rb_prev(&req->rb_node),
4311 struct request, rb_node))) {
4312 struct bfq_queue *bfqq = RQ_BFQQ(req);
4313 struct bfq_data *bfqd = bfqq->bfqd;
4314 struct request *prev, *next_rq;
4315
4316 /* Reposition request in its sort_list */
4317 elv_rb_del(&bfqq->sort_list, req);
4318 elv_rb_add(&bfqq->sort_list, req);
4319
4320 /* Choose next request to be served for bfqq */
4321 prev = bfqq->next_rq;
4322 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
4323 bfqd->last_position);
4324 bfqq->next_rq = next_rq;
4325 /*
4326 * If next_rq changes, update the queue's budget to fit
4327 * the new request.
4328 */
4329 if (prev != bfqq->next_rq)
4330 bfq_updated_next_req(bfqd, bfqq);
4331 }
4332}
4333
4334static void bfq_requests_merged(struct request_queue *q, struct request *rq,
4335 struct request *next)
4336{
4337 struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
4338
4339 if (!RB_EMPTY_NODE(&rq->rb_node))
e21b7a0b 4340 goto end;
aee69d78
PV
4341 spin_lock_irq(&bfqq->bfqd->lock);
4342
4343 /*
4344 * If next and rq belong to the same bfq_queue and next is older
4345 * than rq, then reposition rq in the fifo (by substituting next
4346 * with rq). Otherwise, if next and rq belong to different
4347 * bfq_queues, never reposition rq: in fact, we would have to
4348 * reposition it with respect to next's position in its own fifo,
4349 * which would most certainly be too expensive with respect to
4350 * the benefits.
4351 */
4352 if (bfqq == next_bfqq &&
4353 !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
4354 next->fifo_time < rq->fifo_time) {
4355 list_del_init(&rq->queuelist);
4356 list_replace_init(&next->queuelist, &rq->queuelist);
4357 rq->fifo_time = next->fifo_time;
4358 }
4359
4360 if (bfqq->next_rq == next)
4361 bfqq->next_rq = rq;
4362
4363 bfq_remove_request(q, next);
4364
4365 spin_unlock_irq(&bfqq->bfqd->lock);
e21b7a0b
AA
4366end:
4367 bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
aee69d78
PV
4368}
4369
44e44a1b
PV
4370/* Must be called with bfqq != NULL */
4371static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
4372{
4373 bfqq->wr_coeff = 1;
4374 bfqq->wr_cur_max_time = 0;
4375 /*
4376 * Trigger a weight change on the next invocation of
4377 * __bfq_entity_update_weight_prio.
4378 */
4379 bfqq->entity.prio_changed = 1;
4380}
4381
4382static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
4383 struct bfq_group *bfqg)
4384{
4385 int i, j;
4386
4387 for (i = 0; i < 2; i++)
4388 for (j = 0; j < IOPRIO_BE_NR; j++)
4389 if (bfqg->async_bfqq[i][j])
4390 bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
4391 if (bfqg->async_idle_bfqq)
4392 bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
4393}
4394
4395static void bfq_end_wr(struct bfq_data *bfqd)
4396{
4397 struct bfq_queue *bfqq;
4398
4399 spin_lock_irq(&bfqd->lock);
4400
4401 list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
4402 bfq_bfqq_end_wr(bfqq);
4403 list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
4404 bfq_bfqq_end_wr(bfqq);
4405 bfq_end_wr_async(bfqd);
4406
4407 spin_unlock_irq(&bfqd->lock);
4408}
4409
aee69d78
PV
4410static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
4411 struct bio *bio)
4412{
4413 struct bfq_data *bfqd = q->elevator->elevator_data;
4414 bool is_sync = op_is_sync(bio->bi_opf);
4415 struct bfq_queue *bfqq = bfqd->bio_bfqq;
4416
4417 /*
4418 * Disallow merge of a sync bio into an async request.
4419 */
4420 if (is_sync && !rq_is_sync(rq))
4421 return false;
4422
4423 /*
4424 * Lookup the bfqq that this bio will be queued with. Allow
4425 * merge only if rq is queued there.
4426 */
4427 if (!bfqq)
4428 return false;
4429
4430 return bfqq == RQ_BFQQ(rq);
4431}
4432
44e44a1b
PV
4433/*
4434 * Set the maximum time for the in-service queue to consume its
4435 * budget. This prevents seeky processes from lowering the throughput.
4436 * In practice, a time-slice service scheme is used with seeky
4437 * processes.
4438 */
4439static void bfq_set_budget_timeout(struct bfq_data *bfqd,
4440 struct bfq_queue *bfqq)
4441{
4442 bfqd->last_budget_start = ktime_get();
4443
4444 bfqq->budget_timeout = jiffies +
4445 bfqd->bfq_timeout *
4446 (bfqq->entity.weight / bfqq->entity.orig_weight);
4447}
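/*
 * Example (figures are illustrative): with bfqd->bfq_timeout equal to
 * 125 ms worth of jiffies, a queue whose weight equals its original
 * weight gets a 125 ms budget timeout, while a queue being weight-raised
 * with entity.weight = 30 * entity.orig_weight gets 30 * 125 ms = 3.75 s,
 * i.e. the allowed slice scales with the integer ratio
 * weight / orig_weight.
 */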
4448
aee69d78
PV
4449static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
4450 struct bfq_queue *bfqq)
4451{
4452 if (bfqq) {
e21b7a0b 4453 bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
aee69d78
PV
4454 bfq_clear_bfqq_fifo_expire(bfqq);
4455
4456 bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;
4457
44e44a1b 4458 bfq_set_budget_timeout(bfqd, bfqq);
aee69d78
PV
4459 bfq_log_bfqq(bfqd, bfqq,
4460 "set_in_service_queue, cur-budget = %d",
4461 bfqq->entity.budget);
4462 }
4463
4464 bfqd->in_service_queue = bfqq;
4465}
4466
4467/*
4468 * Get and set a new queue for service.
4469 */
4470static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
4471{
4472 struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
4473
4474 __bfq_set_in_service_queue(bfqd, bfqq);
4475 return bfqq;
4476}
4477
aee69d78
PV
4478static void bfq_arm_slice_timer(struct bfq_data *bfqd)
4479{
4480 struct bfq_queue *bfqq = bfqd->in_service_queue;
4481 struct bfq_io_cq *bic;
4482 u32 sl;
4483
4484 /* Processes have exited, don't wait. */
4485 bic = bfqd->in_service_bic;
4486 if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
4487 return;
4488
4489 bfq_mark_bfqq_wait_request(bfqq);
4490
4491 /*
4492 * We don't want to idle for seeks, but we do want to allow
4493 * fair distribution of slice time for a process doing back-to-back
4494 * seeks. So allow a little bit of time for it to submit a new rq.
4495 */
4496 sl = bfqd->bfq_slice_idle;
4497 /*
44e44a1b
PV
4498 * Unless the queue is being weight-raised, grant only minimum
4499 * idle time if the queue is seeky. A long idling is preserved
4500 * for a weight-raised queue, because it is needed for
4501 * guaranteeing to the queue its reserved share of the
4502 * throughput.
aee69d78 4503 */
44e44a1b 4504 if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1)
aee69d78
PV
4505 sl = min_t(u64, sl, BFQ_MIN_TT);
4506
4507 bfqd->last_idling_start = ktime_get();
4508 hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
4509 HRTIMER_MODE_REL);
e21b7a0b 4510 bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
aee69d78
PV
4511}
4512
ab0e43e9
PV
4513/*
4514 * In autotuning mode, max_budget is dynamically recomputed as the
4515 * amount of sectors transferred in timeout at the estimated peak
4516 * rate. This enables BFQ to utilize a full timeslice with a full
4517 * budget, even if the in-service queue is served at peak rate. And
4518 * this maximises throughput with sequential workloads.
4519 */
4520static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
4521{
4522 return (u64)bfqd->peak_rate * USEC_PER_MSEC *
4523 jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
4524}
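/*
 * Back-of-the-envelope example (all figures hypothetical): for an
 * estimated peak rate equivalent to about 100 MB/s, i.e. roughly
 * 0.2 sectors per microsecond once the BFQ_RATE_SHIFT scaling is undone,
 * and a 125 ms timeout, the formula above yields about
 * 0.2 * 125000 = 25000 sectors (~12 MB) of budget, enough to keep a
 * sequential reader busy for a full timeslice at peak rate.
 */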
4525
44e44a1b
PV
4526/*
4527 * Update parameters related to throughput and responsiveness, as a
4528 * function of the estimated peak rate. See comments on
4529 * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
4530 */
4531static void update_thr_responsiveness_params(struct bfq_data *bfqd)
4532{
4533 int dev_type = blk_queue_nonrot(bfqd->queue);
4534
4535 if (bfqd->bfq_user_max_budget == 0)
4536 bfqd->bfq_max_budget =
4537 bfq_calc_max_budget(bfqd);
4538
4539 if (bfqd->device_speed == BFQ_BFQD_FAST &&
4540 bfqd->peak_rate < device_speed_thresh[dev_type]) {
4541 bfqd->device_speed = BFQ_BFQD_SLOW;
4542 bfqd->RT_prod = R_slow[dev_type] *
4543 T_slow[dev_type];
4544 } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
4545 bfqd->peak_rate > device_speed_thresh[dev_type]) {
4546 bfqd->device_speed = BFQ_BFQD_FAST;
4547 bfqd->RT_prod = R_fast[dev_type] *
4548 T_fast[dev_type];
4549 }
4550
4551 bfq_log(bfqd,
4552"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu sects/sec",
4553 dev_type == 0 ? "ROT" : "NONROT",
4554 bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
4555 bfqd->device_speed == BFQ_BFQD_FAST ?
4556 (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
4557 (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
4558 (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
4559 BFQ_RATE_SHIFT);
4560}
4561
ab0e43e9
PV
4562static void bfq_reset_rate_computation(struct bfq_data *bfqd,
4563 struct request *rq)
4564{
4565 if (rq != NULL) { /* new rq dispatch now, reset accordingly */
4566 bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
4567 bfqd->peak_rate_samples = 1;
4568 bfqd->sequential_samples = 0;
4569 bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
4570 blk_rq_sectors(rq);
4571 } else /* no new rq dispatched, just reset the number of samples */
4572 bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
4573
4574 bfq_log(bfqd,
4575 "reset_rate_computation at end, sample %u/%u tot_sects %llu",
4576 bfqd->peak_rate_samples, bfqd->sequential_samples,
4577 bfqd->tot_sectors_dispatched);
4578}
4579
4580static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
4581{
4582 u32 rate, weight, divisor;
4583
4584 /*
4585 * For the convergence property to hold (see comments on
4586 * bfq_update_peak_rate()) and for the assessment to be
4587 * reliable, a minimum number of samples must be present, and
4588 * a minimum amount of time must have elapsed. If not so, do
4589 * not compute new rate. Just reset parameters, to get ready
4590 * for a new evaluation attempt.
4591 */
4592 if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
4593 bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
4594 goto reset_computation;
4595
4596 /*
4597 * If a new request completion has occurred after last
4598 * dispatch, then, to approximate the rate at which requests
4599 * have been served by the device, it is more precise to
4600 * extend the observation interval to the last completion.
4601 */
4602 bfqd->delta_from_first =
4603 max_t(u64, bfqd->delta_from_first,
4604 bfqd->last_completion - bfqd->first_dispatch);
4605
4606 /*
4607 * Rate computed in sects/usec, and not sects/nsec, to avoid
4608 * precision issues.
4609 */
4610 rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
4611 div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
4612
4613 /*
4614 * Peak rate not updated if:
4615 * - the percentage of sequential dispatches is below 3/4 of the
4616 * total, and rate is below the current estimated peak rate
4617 * - rate is unreasonably high (> 20M sectors/sec)
4618 */
4619 if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
4620 rate <= bfqd->peak_rate) ||
4621 rate > 20<<BFQ_RATE_SHIFT)
4622 goto reset_computation;
4623
4624 /*
4625 * We have to update the peak rate, at last! To this purpose,
4626 * we use a low-pass filter. We compute the smoothing constant
4627 * of the filter as a function of the 'weight' of the new
4628 * measured rate.
4629 *
4630 * As can be seen in next formulas, we define this weight as a
4631 * quantity proportional to how sequential the workload is,
4632 * and to how long the observation time interval is.
4633 *
4634 * The weight runs from 0 to 8. The maximum value of the
4635 * weight, 8, yields the minimum value for the smoothing
4636 * constant. At this minimum value for the smoothing constant,
4637 * the measured rate contributes for half of the next value of
4638 * the estimated peak rate.
4639 *
4640 * So, the first step is to compute the weight as a function
4641 * of how sequential the workload is. Note that the weight
4642 * cannot reach 9, because bfqd->sequential_samples cannot
4643 * become equal to bfqd->peak_rate_samples, which, in its
4644 * turn, holds true because bfqd->sequential_samples is not
4645 * incremented for the first sample.
4646 */
4647 weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
4648
4649 /*
4650 * Second step: further refine the weight as a function of the
4651 * duration of the observation interval.
4652 */
4653 weight = min_t(u32, 8,
4654 div_u64(weight * bfqd->delta_from_first,
4655 BFQ_RATE_REF_INTERVAL));
4656
4657 /*
4658 * Divisor ranging from 10, for minimum weight, to 2, for
4659 * maximum weight.
4660 */
4661 divisor = 10 - weight;
4662
4663 /*
4664 * Finally, update peak rate:
4665 *
4666 * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
4667 */
4668 bfqd->peak_rate *= divisor-1;
4669 bfqd->peak_rate /= divisor;
4670 rate /= divisor; /* smoothing constant alpha = 1/divisor */
4671
4672 bfqd->peak_rate += rate;
44e44a1b 4673 update_thr_responsiveness_params(bfqd);
ab0e43e9
PV
4674
4675reset_computation:
4676 bfq_reset_rate_computation(bfqd, rq);
4677}
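/*
 * Worked example of the filter above (numbers invented for clarity):
 * suppose 8 out of 10 samples in a full-length observation interval were
 * sequential. Then weight = (9 * 8) / 10 = 7, it is not reduced further
 * because delta_from_first >= BFQ_RATE_REF_INTERVAL, and
 * divisor = 10 - 7 = 3. The new estimate therefore becomes
 * peak_rate = old_peak_rate * 2/3 + measured_rate * 1/3; a strongly
 * sequential, long observation window thus lets the new measurement
 * weigh as much as possible (up to one half when the weight reaches 8).
 */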
4678
4679/*
4680 * Update the read/write peak rate (the main quantity used for
4681 * auto-tuning, see update_thr_responsiveness_params()).
4682 *
4683 * It is not trivial to estimate the peak rate (correctly): because of
4684 * the presence of sw and hw queues between the scheduler and the
4685 * device components that finally serve I/O requests, it is hard to
4686 * say exactly when a given dispatched request is served inside the
4687 * device, and for how long. As a consequence, it is hard to know
4688 * precisely at what rate a given set of requests is actually served
4689 * by the device.
4690 *
4691 * On the opposite end, the dispatch time of any request is trivially
4692 * available, and, from this piece of information, the "dispatch rate"
4693 * of requests can be immediately computed. So, the idea in the next
4694 * function is to use what is known, namely request dispatch times
4695 * (plus, when useful, request completion times), to estimate what is
4696 * unknown, namely in-device request service rate.
4697 *
4698 * The main issue is that, because of the above facts, the rate at
4699 * which a certain set of requests is dispatched over a certain time
4700 * interval can vary greatly with respect to the rate at which the
4701 * same requests are then served. But, since the size of any
4702 * intermediate queue is limited, and the service scheme is lossless
4703 * (no request is silently dropped), the following obvious convergence
4704 * property holds: the number of requests dispatched MUST become
4705 * closer and closer to the number of requests completed as the
4706 * observation interval grows. This is the key property used in
4707 * the next function to estimate the peak service rate as a function
4708 * of the observed dispatch rate. The function assumes to be invoked
4709 * on every request dispatch.
4710 */
4711static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
4712{
4713 u64 now_ns = ktime_get_ns();
4714
4715 if (bfqd->peak_rate_samples == 0) { /* first dispatch */
4716 bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
4717 bfqd->peak_rate_samples);
4718 bfq_reset_rate_computation(bfqd, rq);
4719 goto update_last_values; /* will add one sample */
4720 }
4721
4722 /*
4723 * Device idle for very long: the observation interval lasting
4724 * up to this dispatch cannot be a valid observation interval
4725 * for computing a new peak rate (similarly to the late-
4726 * completion event in bfq_completed_request()). Go to
4727 * update_rate_and_reset to have the following three steps
4728 * taken:
4729 * - close the observation interval at the last (previous)
4730 * request dispatch or completion
4731 * - compute rate, if possible, for that observation interval
4732 * - start a new observation interval with this dispatch
4733 */
4734 if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
4735 bfqd->rq_in_driver == 0)
4736 goto update_rate_and_reset;
4737
4738 /* Update sampling information */
4739 bfqd->peak_rate_samples++;
4740
4741 if ((bfqd->rq_in_driver > 0 ||
4742 now_ns - bfqd->last_completion < BFQ_MIN_TT)
4743 && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
4744 bfqd->sequential_samples++;
4745
4746 bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
4747
4748 /* Reset max observed rq size every 32 dispatches */
4749 if (likely(bfqd->peak_rate_samples % 32))
4750 bfqd->last_rq_max_size =
4751 max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
4752 else
4753 bfqd->last_rq_max_size = blk_rq_sectors(rq);
4754
4755 bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
4756
4757 /* Target observation interval not yet reached, go on sampling */
4758 if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
4759 goto update_last_values;
4760
4761update_rate_and_reset:
4762 bfq_update_rate_reset(bfqd, rq);
4763update_last_values:
4764 bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
4765 bfqd->last_dispatch = now_ns;
4766}
4767
aee69d78
PV
4768/*
4769 * Remove request from internal lists.
4770 */
4771static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
4772{
4773 struct bfq_queue *bfqq = RQ_BFQQ(rq);
4774
4775 /*
4776 * For consistency, the next instruction should have been
4777 * executed after removing the request from the queue and
4778 * dispatching it. We execute instead this instruction before
4779 * bfq_remove_request() (and hence introduce a temporary
4780 * inconsistency), for efficiency. In fact, should this
4781 * dispatch occur for a non in-service bfqq, this anticipated
4782 * increment prevents two counters related to bfqq->dispatched
4783 * from being, first, uselessly decremented, and then
4784 * incremented again when the (new) value of bfqq->dispatched
4785 * happens to be taken into account.
4786 */
4787 bfqq->dispatched++;
ab0e43e9 4788 bfq_update_peak_rate(q->elevator->elevator_data, rq);
aee69d78
PV
4789
4790 bfq_remove_request(q, rq);
4791}
4792
4793static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
4794{
44e44a1b
PV
4795 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
4796 if (bfqq->dispatched == 0)
4797 /*
4798 * Overloading budget_timeout field to store
4799 * the time at which the queue remains with no
4800 * backlog and no outstanding request; used by
4801 * the weight-raising mechanism.
4802 */
4803 bfqq->budget_timeout = jiffies;
4804
e21b7a0b 4805 bfq_del_bfqq_busy(bfqd, bfqq, true);
44e44a1b 4806 } else
e21b7a0b
AA
4807 bfq_requeue_bfqq(bfqd, bfqq);
4808
4809 /*
4810 * All in-service entities must have been properly deactivated
4811 * or requeued before executing the next function, which
4812 * resets all in-service entities as no more in service.
4813 */
4814 __bfq_bfqd_reset_in_service(bfqd);
aee69d78
PV
4815}
4816
4817/**
4818 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
4819 * @bfqd: device data.
4820 * @bfqq: queue to update.
4821 * @reason: reason for expiration.
4822 *
4823 * Handle the feedback on @bfqq budget at queue expiration.
4824 * See the body for detailed comments.
4825 */
4826static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
4827 struct bfq_queue *bfqq,
4828 enum bfqq_expiration reason)
4829{
4830 struct request *next_rq;
4831 int budget, min_budget;
4832
aee69d78
PV
4833 min_budget = bfq_min_budget(bfqd);
4834
44e44a1b
PV
4835 if (bfqq->wr_coeff == 1)
4836 budget = bfqq->max_budget;
4837 else /*
4838 * Use a constant, low budget for weight-raised queues,
4839 * to help achieve a low latency. Keep it slightly higher
4840 * than the minimum possible budget, to cause a little
4841 * bit fewer expirations.
4842 */
4843 budget = 2 * min_budget;
4844
aee69d78
PV
4845 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
4846 bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
4847 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
4848 budget, bfq_min_budget(bfqd));
4849 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
4850 bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
4851
44e44a1b 4852 if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
aee69d78
PV
4853 switch (reason) {
4854 /*
4855 * Caveat: in all the following cases we trade latency
4856 * for throughput.
4857 */
4858 case BFQQE_TOO_IDLE:
54b60456
PV
4859 /*
4860 * This is the only case where we may reduce
4861 * the budget: if there is no request of the
4862 * process still waiting for completion, then
4863 * we assume (tentatively) that the timer has
4864 * expired because the batch of requests of
4865 * the process could have been served with a
4866 * smaller budget. Hence, betting that the
4867 * process will behave in the same way when it
4868 * becomes backlogged again, we reduce its
4869 * next budget. As long as we guess right,
4870 * this budget cut reduces the latency
4871 * experienced by the process.
4872 *
4873 * However, if there are still outstanding
4874 * requests, then the process may have not yet
4875 * issued its next request just because it is
4876 * still waiting for the completion of some of
4877 * the still outstanding ones. So in this
4878 * subcase we do not reduce its budget, on the
4879 * contrary we increase it to possibly boost
4880 * the throughput, as discussed in the
4881 * comments to the BUDGET_TIMEOUT case.
4882 */
4883 if (bfqq->dispatched > 0) /* still outstanding reqs */
4884 budget = min(budget * 2, bfqd->bfq_max_budget);
4885 else {
4886 if (budget > 5 * min_budget)
4887 budget -= 4 * min_budget;
4888 else
4889 budget = min_budget;
4890 }
aee69d78
PV
4891 break;
4892 case BFQQE_BUDGET_TIMEOUT:
54b60456
PV
4893 /*
4894 * We double the budget here because it gives
4895 * the chance to boost the throughput if this
4896 * is not a seeky process (and has bumped into
4897 * this timeout because of, e.g., ZBR).
4898 */
4899 budget = min(budget * 2, bfqd->bfq_max_budget);
aee69d78
PV
4900 break;
4901 case BFQQE_BUDGET_EXHAUSTED:
4902 /*
4903 * The process still has backlog, and did not
4904 * let either the budget timeout or the disk
4905 * idling timeout expire. Hence it is not
4906 * seeky, has a short thinktime and may be
4907 * happy with a higher budget too. So
4908 * definitely increase the budget of this good
4909 * candidate to boost the disk throughput.
4910 */
54b60456 4911 budget = min(budget * 4, bfqd->bfq_max_budget);
aee69d78
PV
4912 break;
4913 case BFQQE_NO_MORE_REQUESTS:
4914 /*
4915 * For queues that expire for this reason, it
4916 * is particularly important to keep the
4917 * budget close to the actual service they
4918 * need. Doing so reduces the timestamp
4919 * misalignment problem described in the
4920 * comments in the body of
4921 * __bfq_activate_entity. In fact, suppose
4922 * that a queue systematically expires for
4923 * BFQQE_NO_MORE_REQUESTS and presents a
4924 * new request in time to enjoy timestamp
4925 * back-shifting. The larger the budget of the
4926 * queue is with respect to the service the
4927 * queue actually requests in each service
4928 * slot, the more times the queue can be
4929 * reactivated with the same virtual finish
4930 * time. It follows that, even if this finish
4931 * time is pushed to the system virtual time
4932 * to reduce the consequent timestamp
4933 * misalignment, the queue unjustly enjoys for
4934 * many re-activations a lower finish time
4935 * than all newly activated queues.
4936 *
4937 * The service needed by bfqq is measured
4938 * quite precisely by bfqq->entity.service.
4939 * Since bfqq does not enjoy device idling,
4940 * bfqq->entity.service is equal to the number
4941 * of sectors that the process associated with
4942 * bfqq requested to read/write before waiting
4943 * for request completions, or blocking for
4944 * other reasons.
4945 */
4946 budget = max_t(int, bfqq->entity.service, min_budget);
4947 break;
4948 default:
4949 return;
4950 }
44e44a1b 4951 } else if (!bfq_bfqq_sync(bfqq)) {
aee69d78
PV
4952 /*
4953 * Async queues get always the maximum possible
4954 * budget, as for them we do not care about latency
4955 * (in addition, their ability to dispatch is limited
4956 * by the charging factor).
4957 */
4958 budget = bfqd->bfq_max_budget;
4959 }
4960
4961 bfqq->max_budget = budget;
4962
4963 if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
4964 !bfqd->bfq_user_max_budget)
4965 bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
4966
4967 /*
4968 * If there is still backlog, then assign a new budget, making
4969 * sure that it is large enough for the next request. Since
4970 * the finish time of bfqq must be kept in sync with the
4971 * budget, be sure to call __bfq_bfqq_expire() *after* this
4972 * update.
4973 *
4974 * If there is no backlog, then no need to update the budget;
4975 * it will be updated on the arrival of a new request.
4976 */
4977 next_rq = bfqq->next_rq;
4978 if (next_rq)
4979 bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
4980 bfq_serv_to_charge(next_rq, bfqq));
4981
4982 bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
4983 next_rq ? blk_rq_sectors(next_rq) : 0,
4984 bfqq->entity.budget);
4985}
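
/*
 * Illustrative sketch, not part of BFQ: the feedback applied above can
 * be summarized as a pure function of the expiration reason. All names
 * below (sketch_*, the plain reason constants, the min/max parameters)
 * are hypothetical and only make the doubling/halving rules explicit.
 */
enum sketch_reason { SK_TOO_IDLE, SK_BUDGET_TIMEOUT,
		     SK_BUDGET_EXHAUSTED, SK_NO_MORE_REQUESTS };

static inline int sketch_budget_feedback(int budget, int min_budget,
					 int max_budget, enum sketch_reason r,
					 int dispatched, int service)
{
	switch (r) {
	case SK_TOO_IDLE:
		if (dispatched > 0)		/* still outstanding reqs: grow */
			return min(budget * 2, max_budget);
		if (budget > 5 * min_budget)	/* otherwise shrink */
			return budget - 4 * min_budget;
		return min_budget;
	case SK_BUDGET_TIMEOUT:		/* bet on a non-seeky process */
		return min(budget * 2, max_budget);
	case SK_BUDGET_EXHAUSTED:	/* good candidate: grow aggressively */
		return min(budget * 4, max_budget);
	case SK_NO_MORE_REQUESTS:	/* track the service actually used */
		return max(service, min_budget);
	}
	return budget;
}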
4986
aee69d78 4987/*
ab0e43e9
PV
4988 * Return true if the process associated with bfqq is "slow". The slow
4989 * flag is used, in addition to the budget timeout, to reduce the
4990 * amount of service provided to seeky processes, and thus reduce
4991 * their chances to lower the throughput. More details in the comments
4992 * on the function bfq_bfqq_expire().
4993 *
4994 * An important observation is in order: as discussed in the comments
4995 * on the function bfq_update_peak_rate(), with devices with internal
4996 * queues, it is hard, if possible at all, to know when and for how long
4997 * an I/O request is processed by the device (apart from the trivial
4998 * I/O pattern where a new request is dispatched only after the
4999 * previous one has been completed). This makes it hard to evaluate
5000 * the real rate at which the I/O requests of each bfq_queue are
5001 * served. In fact, for an I/O scheduler like BFQ, serving a
5002 * bfq_queue means just dispatching its requests during its service
5003 * slot (i.e., until the budget of the queue is exhausted, or the
5004 * queue remains idle, or, finally, a timeout fires). But, during the
5005 * service slot of a bfq_queue, around 100 ms at most, the device may
5006 * still be processing requests of bfq_queues served in previous
5007 * service slots. On the opposite end, the requests of the in-service
5008 * bfq_queue may be completed after the service slot of the queue
5009 * finishes.
5010 *
5011 * Anyway, unless more sophisticated solutions are used
5012 * (where possible), the sum of the sizes of the requests dispatched
5013 * during the service slot of a bfq_queue is probably the only
5014 * approximation available for the service received by the bfq_queue
5015 * during its service slot. And this sum is the quantity used in this
5016 * function to evaluate the I/O speed of a process.
aee69d78 5017 */
ab0e43e9
PV
5018static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5019 bool compensate, enum bfqq_expiration reason,
5020 unsigned long *delta_ms)
aee69d78 5021{
ab0e43e9
PV
5022 ktime_t delta_ktime;
5023 u32 delta_usecs;
5024 bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekiness */
aee69d78 5025
ab0e43e9 5026 if (!bfq_bfqq_sync(bfqq))
aee69d78
PV
5027 return false;
5028
5029 if (compensate)
ab0e43e9 5030 delta_ktime = bfqd->last_idling_start;
aee69d78 5031 else
ab0e43e9
PV
5032 delta_ktime = ktime_get();
5033 delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
5034 delta_usecs = ktime_to_us(delta_ktime);
aee69d78
PV
5035
5036 /* don't use too short time intervals */
ab0e43e9
PV
5037 if (delta_usecs < 1000) {
5038 if (blk_queue_nonrot(bfqd->queue))
5039 /*
5040 * give same worst-case guarantees as idling
5041 * for seeky
5042 */
5043 *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
5044 else /* charge at least one seek */
5045 *delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
5046
5047 return slow;
5048 }
aee69d78 5049
ab0e43e9 5050 *delta_ms = delta_usecs / USEC_PER_MSEC;
aee69d78
PV
5051
5052 /*
ab0e43e9
PV
5053 * Use only long (> 20ms) intervals to filter out excessive
5054 * spikes in service rate estimation.
aee69d78 5055 */
ab0e43e9
PV
5056 if (delta_usecs > 20000) {
5057 /*
5058 * Caveat for rotational devices: processes doing I/O
5059 * in the slower disk zones tend to be slow(er) even
5060 * if not seeky. In this respect, the estimated peak
5061 * rate is likely to be an average over the disk
5062 * surface. Accordingly, to not be too harsh with
5063 * unlucky processes, a process is deemed slow only if
5064 * its rate has been lower than half of the estimated
5065 * peak rate.
5066 */
5067 slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
aee69d78
PV
5068 }
5069
ab0e43e9 5070 bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
aee69d78 5071
ab0e43e9 5072 return slow;
aee69d78
PV
5073}
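
/*
 * Illustrative sketch, not part of BFQ: the decision taken above, with
 * hypothetical names. Intervals shorter than 1 ms are too noisy and
 * fall back to the seekiness flag; for intervals longer than 20 ms the
 * queue is deemed slow only if it consumed less than half of the
 * maximum budget (i.e., of the estimated peak rate) in that interval.
 */
static inline bool sketch_queue_is_slow(unsigned int delta_usecs, bool seeky,
					int service, int max_budget)
{
	if (delta_usecs < 1000)
		return seeky;		/* too short: trust seekiness only */
	if (delta_usecs > 20000)
		return service < max_budget / 2;
	return seeky;			/* in-between: keep the default */
}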
5074
5075/*
5076 * Return the farthest past time instant according to jiffies
5077 * macros.
5078 */
5079static unsigned long bfq_smallest_from_now(void)
5080{
5081 return jiffies - MAX_JIFFY_OFFSET;
5082}
5083
5084/**
5085 * bfq_bfqq_expire - expire a queue.
5086 * @bfqd: device owning the queue.
5087 * @bfqq: the queue to expire.
5088 * @compensate: if true, compensate for the time spent idling.
5089 * @reason: the reason causing the expiration.
5090 *
c074170e
PV
5091 * If the process associated with bfqq does slow I/O (e.g., because it
5092 * issues random requests), we charge bfqq with the time it has been
5093 * in service instead of the service it has received (see
5094 * bfq_bfqq_charge_time for details on how this goal is achieved). As
5095 * a consequence, bfqq will typically get higher timestamps upon
5096 * reactivation, and hence it will be rescheduled as if it had
5097 * received more service than what it has actually received. In the
5098 * end, bfqq receives less service in proportion to how slowly its
5099 * associated process consumes its budgets (and hence how seriously it
5100 * tends to lower the throughput). In addition, this time-charging
5101 * strategy guarantees time fairness among slow processes. In
5102 * contrast, if the process associated with bfqq is not slow, we
5103 * charge bfqq exactly with the service it has received.
aee69d78 5104 *
c074170e
PV
5105 * Charging time to the first type of queues and the exact service to
5106 * the other has the effect of using the WF2Q+ policy to schedule the
5107 * former on a timeslice basis, without violating service domain
5108 * guarantees among the latter.
aee69d78
PV
5109 */
5110static void bfq_bfqq_expire(struct bfq_data *bfqd,
5111 struct bfq_queue *bfqq,
5112 bool compensate,
5113 enum bfqq_expiration reason)
5114{
5115 bool slow;
ab0e43e9
PV
5116 unsigned long delta = 0;
5117 struct bfq_entity *entity = &bfqq->entity;
aee69d78
PV
5118 int ref;
5119
5120 /*
ab0e43e9 5121 * Check whether the process is slow (see bfq_bfqq_is_slow).
aee69d78 5122 */
ab0e43e9 5123 slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
aee69d78
PV
5124
5125 /*
c074170e
PV
5126 * As above explained, charge slow (typically seeky) and
5127 * timed-out queues with the time and not the service
5128 * received, to favor sequential workloads.
5129 *
5130 * Processes doing I/O in the slower disk zones will tend to
5131 * be slow(er) even if not seeky. Therefore, since the
5132 * estimated peak rate is actually an average over the disk
5133 * surface, these processes may timeout just for bad luck. To
5134 * avoid punishing them, do not charge time to processes that
5135 * succeeded in consuming at least 2/3 of their budget. This
5136 * allows BFQ to preserve enough elasticity to still perform
5137 * bandwidth, and not time, distribution with processes that
5138 * are only slightly unlucky or quasi-sequential.
aee69d78 5139 */
44e44a1b
PV
5140 if (bfqq->wr_coeff == 1 &&
5141 (slow ||
5142 (reason == BFQQE_BUDGET_TIMEOUT &&
5143 bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
c074170e 5144 bfq_bfqq_charge_time(bfqd, bfqq, delta);
aee69d78
PV
5145
5146 if (reason == BFQQE_TOO_IDLE &&
ab0e43e9 5147 entity->service <= 2 * entity->budget / 10)
aee69d78
PV
5148 bfq_clear_bfqq_IO_bound(bfqq);
5149
44e44a1b
PV
5150 if (bfqd->low_latency && bfqq->wr_coeff == 1)
5151 bfqq->last_wr_start_finish = jiffies;
5152
aee69d78
PV
5153 bfq_log_bfqq(bfqd, bfqq,
5154 "expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
5155 slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
5156
5157 /*
5158 * Increase, decrease or leave budget unchanged according to
5159 * reason.
5160 */
5161 __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
5162 ref = bfqq->ref;
5163 __bfq_bfqq_expire(bfqd, bfqq);
5164
5165 /* mark bfqq as waiting a request only if a bic still points to it */
5166 if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
5167 reason != BFQQE_BUDGET_TIMEOUT &&
5168 reason != BFQQE_BUDGET_EXHAUSTED)
5169 bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
5170}
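
/*
 * Illustrative sketch, not part of BFQ: the condition, used above, that
 * selects time charging instead of service charging. Parameter names
 * are hypothetical; wr_coeff == 1 means the queue is not weight-raised.
 */
static inline bool sketch_charge_time(unsigned int wr_coeff, bool slow,
				      bool budget_timeout, int budget_left,
				      int budget)
{
	return wr_coeff == 1 &&
		(slow || (budget_timeout && budget_left >= budget / 3));
}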
5171
5172/*
5173 * Budget timeout is not implemented through a dedicated timer, but
5174 * just checked on request arrivals and completions, as well as on
5175 * idle timer expirations.
5176 */
5177static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
5178{
44e44a1b 5179 return time_is_before_eq_jiffies(bfqq->budget_timeout);
aee69d78
PV
5180}
5181
5182/*
5183 * If we expire a queue that is actively waiting (i.e., with the
5184 * device idled) for the arrival of a new request, then we may incur
5185 * the timestamp misalignment problem described in the body of the
5186 * function __bfq_activate_entity. Hence we return true only if this
5187 * condition does not hold, or if the queue is slow enough to deserve
5188 * only to be kicked off for preserving a high throughput.
5189 */
5190static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
5191{
5192 bfq_log_bfqq(bfqq->bfqd, bfqq,
5193 "may_budget_timeout: wait_request %d left %d timeout %d",
5194 bfq_bfqq_wait_request(bfqq),
5195 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
5196 bfq_bfqq_budget_timeout(bfqq));
5197
5198 return (!bfq_bfqq_wait_request(bfqq) ||
5199 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
5200 &&
5201 bfq_bfqq_budget_timeout(bfqq);
5202}
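
/*
 * Illustrative sketch, not part of BFQ: the boolean returned above,
 * with hypothetical names. A queue that timed out is expired unless it
 * is actively waiting for a request while still holding at least a
 * third of its budget.
 */
static inline bool sketch_expire_for_budget_timeout(bool waiting_for_rq,
						    int budget_left,
						    int budget,
						    bool timed_out)
{
	return (!waiting_for_rq || budget_left >= budget / 3) && timed_out;
}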
5203
5204/*
5205 * For a queue that becomes empty, device idling is allowed only if
44e44a1b
PV
5206 * this function returns true for the queue. As a consequence, since
5207 * device idling plays a critical role in both throughput boosting and
5208 * service guarantees, the return value of this function plays a
5209 * critical role in both these aspects as well.
5210 *
5211 * In a nutshell, this function returns true only if idling is
5212 * beneficial for throughput or, even if detrimental for throughput,
5213 * idling is however necessary to preserve service guarantees (low
5214 * latency, desired throughput distribution, ...). In particular, on
5215 * NCQ-capable devices, this function tries to return false, so as to
5216 * help keep the drives' internal queues full, whenever this helps the
5217 * device boost the throughput without causing any service-guarantee
5218 * issue.
5219 *
5220 * In more detail, the return value of this function is obtained by,
5221 * first, computing a number of boolean variables that take into
5222 * account throughput and service-guarantee issues, and, then,
5223 * combining these variables in a logical expression. Most of the
5224 * issues taken into account are not trivial. We discuss these issues
5225 * individually while introducing the variables.
aee69d78
PV
5226 */
5227static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
5228{
5229 struct bfq_data *bfqd = bfqq->bfqd;
44e44a1b 5230 bool idling_boosts_thr, asymmetric_scenario;
aee69d78
PV
5231
5232 if (bfqd->strict_guarantees)
5233 return true;
5234
5235 /*
44e44a1b
PV
5236 * The next variable takes into account the cases where idling
5237 * boosts the throughput.
5238 *
5239 * The value of the variable is computed considering that
aee69d78
PV
5240 * idling is usually beneficial for the throughput if:
5241 * (a) the device is not NCQ-capable, or
5242 * (b) regardless of the presence of NCQ, the request pattern
5243 * for bfqq is I/O-bound (possible throughput losses
5244 * caused by granting idling to seeky queues are mitigated
5245 * by the fact that, in all scenarios where boosting
5246 * throughput is the best thing to do, i.e., in all
5247 * symmetric scenarios, only a minimal idle time is
5248 * allowed to seeky queues).
5249 */
5250 idling_boosts_thr = !bfqd->hw_tag || bfq_bfqq_IO_bound(bfqq);
5251
5252 /*
44e44a1b
PV
5253 * There is then a case where idling must be performed not for
5254 * throughput concerns, but to preserve service guarantees. To
5255 * introduce it, we can note that allowing the drive to
5256 * enqueue more than one request at a time, and hence
5257 * delegating de facto final scheduling decisions to the
5258 * drive's internal scheduler, causes loss of control on the
5259 * actual request service order. In particular, the critical
5260 * situation is when requests from different processes happen
5261 * to be present, at the same time, in the internal queue(s)
5262 * of the drive. In such a situation, the drive, by deciding
5263 * the service order of the internally-queued requests, does
5264 * determine also the actual throughput distribution among
5265 * these processes. But the drive typically has no notion or
5266 * concern about per-process throughput distribution, and
5267 * makes its decisions only on a per-request basis. Therefore,
5268 * the service distribution enforced by the drive's internal
5269 * scheduler is likely to coincide with the desired
5270 * device-throughput distribution only in a completely
5271 * symmetric scenario where: (i) each of these processes must
5272 * get the same throughput as the others; (ii) all these
5273 * processes have the same I/O pattern (either sequential or
5274 * random). In fact, in such a scenario, the drive will tend
5275 * to treat the requests of each of these processes in about
5276 * the same way as the requests of the others, and thus to
5277 * provide each of these processes with about the same
5278 * throughput (which is exactly the desired throughput
5279 * distribution). In contrast, in any asymmetric scenario,
5280 * device idling is certainly needed to guarantee that bfqq
5281 * receives its assigned fraction of the device throughput
5282 * (see [1] for details).
5283 *
5284 * As for sub-condition (i), actually we check only whether
5285 * bfqq is being weight-raised. In fact, if bfqq is not being
5286 * weight-raised, we have that:
5287 * - if the process associated with bfqq is not I/O-bound, then
5288 * it is not either latency- or throughput-critical; therefore
5289 * idling is not needed for bfqq;
5290 * - if the process associated with bfqq is I/O-bound, then
5291 * idling is already granted with bfqq (see the comments on
5292 * idling_boosts_thr).
5293 *
5294 * We do not check sub-condition (ii) at all, i.e., the next
5295 * variable is true if and only if bfqq is being
5296 * weight-raised. We do not need to control sub-condition (ii)
5297 * for the following reason:
5298 * - if bfqq is being weight-raised, then idling is already
5299 * guaranteed to bfqq by sub-condition (i);
5300 * - if bfqq is not being weight-raised, then idling is
5301 * already guaranteed to bfqq (only) if it matters, i.e., if
5302 * bfqq is associated to a currently I/O-bound process (see
5303 * the above comment on sub-condition (i)).
5304 *
5305 * As a side note, it is worth considering that the above
5306 * device-idling countermeasures may however fail in the
5307 * following unlucky scenario: if idling is (correctly)
5308 * disabled in a time period during which the symmetry
5309 * sub-condition holds, and hence the device is allowed to
5310 * enqueue many requests, but at some later point in time some
5311 * sub-condition ceases to hold, then it may become impossible
5312 * to let requests be served in the desired order until all
5313 * the requests already queued in the device have been served.
5314 */
5315 asymmetric_scenario = bfqq->wr_coeff > 1;
5316
5317 /*
5318 * We have now all the components we need to compute the return
5319 * value of the function, which is true only if both the following
5320 * conditions hold:
aee69d78 5321 * 1) bfqq is sync, because idling makes sense only for sync queues;
44e44a1b
PV
5322 * 2) idling either boosts the throughput (without issues), or
5323 * is necessary to preserve service guarantees.
aee69d78 5324 */
44e44a1b
PV
5325 return bfq_bfqq_sync(bfqq) &&
5326 (idling_boosts_thr || asymmetric_scenario);
aee69d78
PV
5327}
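
/*
 * Illustrative sketch, not part of BFQ: the combination of conditions
 * computed above, with hypothetical names. strict_guarantees forces
 * idling; otherwise idling is granted only to sync queues for which it
 * either boosts the throughput or is needed because the scenario is
 * asymmetric (approximated here by weight raising, as in the comments
 * above).
 */
static inline bool sketch_may_idle(bool strict_guarantees, bool hw_tag,
				   bool io_bound, bool sync,
				   unsigned int wr_coeff)
{
	bool idling_boosts_thr = !hw_tag || io_bound;
	bool asymmetric_scenario = wr_coeff > 1;

	if (strict_guarantees)
		return true;
	return sync && (idling_boosts_thr || asymmetric_scenario);
}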
5328
5329/*
5330 * If the in-service queue is empty but the function bfq_bfqq_may_idle
5331 * returns true, then:
5332 * 1) the queue must remain in service and cannot be expired, and
5333 * 2) the device must be idled to wait for the possible arrival of a new
5334 * request for the queue.
5335 * See the comments on the function bfq_bfqq_may_idle for the reasons
5336 * why performing device idling is the best choice to boost the throughput
5337 * and preserve service guarantees when bfq_bfqq_may_idle itself
5338 * returns true.
5339 */
5340static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
5341{
5342 struct bfq_data *bfqd = bfqq->bfqd;
5343
5344 return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
5345 bfq_bfqq_may_idle(bfqq);
5346}
5347
5348/*
5349 * Select a queue for service. If we have a current queue in service,
5350 * check whether to continue servicing it, or retrieve and set a new one.
5351 */
5352static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
5353{
5354 struct bfq_queue *bfqq;
5355 struct request *next_rq;
5356 enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;
5357
5358 bfqq = bfqd->in_service_queue;
5359 if (!bfqq)
5360 goto new_queue;
5361
5362 bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
5363
5364 if (bfq_may_expire_for_budg_timeout(bfqq) &&
5365 !bfq_bfqq_wait_request(bfqq) &&
5366 !bfq_bfqq_must_idle(bfqq))
5367 goto expire;
5368
5369check_queue:
5370 /*
5371 * This loop is rarely executed more than once. Even when it
5372 * happens, it is much more convenient to re-execute this loop
5373 * than to return NULL and trigger a new dispatch to get a
5374 * request served.
5375 */
5376 next_rq = bfqq->next_rq;
5377 /*
5378 * If bfqq has requests queued and it has enough budget left to
5379 * serve them, keep the queue, otherwise expire it.
5380 */
5381 if (next_rq) {
5382 if (bfq_serv_to_charge(next_rq, bfqq) >
5383 bfq_bfqq_budget_left(bfqq)) {
5384 /*
5385 * Expire the queue for budget exhaustion,
5386 * which makes sure that the next budget is
5387 * enough to serve the next request, even if
5388 * it comes from the fifo expired path.
5389 */
5390 reason = BFQQE_BUDGET_EXHAUSTED;
5391 goto expire;
5392 } else {
5393 /*
5394 * The idle timer may be pending because we may
5395 * not disable disk idling even when a new request
5396 * arrives.
5397 */
5398 if (bfq_bfqq_wait_request(bfqq)) {
5399 /*
5400 * If we get here: 1) at least a new request
5401 * has arrived but we have not disabled the
5402 * timer because the request was too small,
5403 * 2) then the block layer has unplugged
5404 * the device, causing the dispatch to be
5405 * invoked.
5406 *
5407 * Since the device is unplugged, now the
5408 * requests are probably large enough to
5409 * provide a reasonable throughput.
5410 * So we disable idling.
5411 */
5412 bfq_clear_bfqq_wait_request(bfqq);
5413 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
e21b7a0b 5414 bfqg_stats_update_idle_time(bfqq_group(bfqq));
aee69d78
PV
5415 }
5416 goto keep_queue;
5417 }
5418 }
5419
5420 /*
5421 * No requests pending. However, if the in-service queue is idling
5422 * for a new request, or has requests waiting for a completion and
5423 * may idle after their completion, then keep it anyway.
5424 */
5425 if (bfq_bfqq_wait_request(bfqq) ||
5426 (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
5427 bfqq = NULL;
5428 goto keep_queue;
5429 }
5430
5431 reason = BFQQE_NO_MORE_REQUESTS;
5432expire:
5433 bfq_bfqq_expire(bfqd, bfqq, false, reason);
5434new_queue:
5435 bfqq = bfq_set_in_service_queue(bfqd);
5436 if (bfqq) {
5437 bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
5438 goto check_queue;
5439 }
5440keep_queue:
5441 if (bfqq)
5442 bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
5443 else
5444 bfq_log(bfqd, "select_queue: no queue returned");
5445
5446 return bfqq;
5447}
5448
44e44a1b
PV
5449static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5450{
5451 struct bfq_entity *entity = &bfqq->entity;
5452
5453 if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
5454 bfq_log_bfqq(bfqd, bfqq,
5455 "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
5456 jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
5457 jiffies_to_msecs(bfqq->wr_cur_max_time),
5458 bfqq->wr_coeff,
5459 bfqq->entity.weight, bfqq->entity.orig_weight);
5460
5461 if (entity->prio_changed)
5462 bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
5463
5464 /*
5465 * If too much time has elapsed from the beginning of
5466 * this weight-raising period, then end weight
5467 * raising.
5468 */
5469 if (time_is_before_jiffies(bfqq->last_wr_start_finish +
5470 bfqq->wr_cur_max_time)) {
5471 bfqq->last_wr_start_finish = jiffies;
5472 bfq_log_bfqq(bfqd, bfqq,
5473 "wrais ending at %lu, rais_max_time %u",
5474 bfqq->last_wr_start_finish,
5475 jiffies_to_msecs(bfqq->wr_cur_max_time));
5476 bfq_bfqq_end_wr(bfqq);
5477 }
5478 }
5479 /* Update weight both if it must be raised and if it must be lowered */
5480 if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
5481 __bfq_entity_update_weight_prio(
5482 bfq_entity_service_tree(entity),
5483 entity);
5484}
5485
aee69d78
PV
5486/*
5487 * Dispatch next request from bfqq.
5488 */
5489static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
5490 struct bfq_queue *bfqq)
5491{
5492 struct request *rq = bfqq->next_rq;
5493 unsigned long service_to_charge;
5494
5495 service_to_charge = bfq_serv_to_charge(rq, bfqq);
5496
5497 bfq_bfqq_served(bfqq, service_to_charge);
5498
5499 bfq_dispatch_remove(bfqd->queue, rq);
5500
44e44a1b
PV
5501 /*
5502 * If weight raising has to terminate for bfqq, then next
5503 * function causes an immediate update of bfqq's weight,
5504 * without waiting for next activation. As a consequence, on
5505 * expiration, bfqq will be timestamped as if it had never been
5506 * weight-raised during this service slot, even if it has
5507 * received part or even most of the service as a
5508 * weight-raised queue. This inflates bfqq's timestamps, which
5509 * is beneficial, as bfqq is then more willing to leave the
5510 * device immediately to possible other weight-raised queues.
5511 */
5512 bfq_update_wr_data(bfqd, bfqq);
5513
aee69d78
PV
5514 if (!bfqd->in_service_bic) {
5515 atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
5516 bfqd->in_service_bic = RQ_BIC(rq);
5517 }
5518
5519 /*
5520 * Expire bfqq, pretending that its budget expired, if bfqq
5521 * belongs to CLASS_IDLE and other queues are waiting for
5522 * service.
5523 */
5524 if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
5525 goto expire;
5526
5527 return rq;
5528
5529expire:
5530 bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
5531 return rq;
5532}
5533
5534static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
5535{
5536 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
5537
5538 /*
5539 * Avoiding lock: a race on bfqd->busy_queues should cause at
5540 * most a call to dispatch for nothing
5541 */
5542 return !list_empty_careful(&bfqd->dispatch) ||
5543 bfqd->busy_queues > 0;
5544}
5545
5546static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
5547{
5548 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
5549 struct request *rq = NULL;
5550 struct bfq_queue *bfqq = NULL;
5551
5552 if (!list_empty(&bfqd->dispatch)) {
5553 rq = list_first_entry(&bfqd->dispatch, struct request,
5554 queuelist);
5555 list_del_init(&rq->queuelist);
5556
5557 bfqq = RQ_BFQQ(rq);
5558
5559 if (bfqq) {
5560 /*
5561 * Increment counters here, because this
5562 * dispatch does not follow the standard
5563 * dispatch flow (where counters are
5564 * incremented)
5565 */
5566 bfqq->dispatched++;
5567
5568 goto inc_in_driver_start_rq;
5569 }
5570
5571 /*
5572 * We exploit the put_rq_private hook to decrement
5573 * rq_in_driver, but put_rq_private will not be
5574 * invoked on this request. So, to avoid unbalance,
5575 * just start this request, without incrementing
5576 * rq_in_driver. As a negative consequence,
5577 * rq_in_driver is deceptively lower than it should be
5578 * while this request is in service. This may cause
5579 * bfq_schedule_dispatch to be invoked uselessly.
5580 *
5581 * As for implementing an exact solution, the
5582 * put_request hook, if defined, is probably invoked
5583 * also on this request. So, by exploiting this hook,
5584 * we could 1) increment rq_in_driver here, and 2)
5585 * decrement it in put_request. Such a solution would
5586 * let the value of the counter be always accurate,
5587 * but it would entail using an extra interface
5588 * function. This cost seems higher than the benefit,
5589 * given that non-elevator-private requests are
5590 * very infrequent.
5591 */
5592 goto start_rq;
5593 }
5594
5595 bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
5596
5597 if (bfqd->busy_queues == 0)
5598 goto exit;
5599
5600 /*
5601 * Force device to serve one request at a time if
5602 * strict_guarantees is true. Forcing this service scheme is
5603 * currently the ONLY way to guarantee that the request
5604 * service order enforced by the scheduler is respected by a
5605 * queueing device. Otherwise the device is free even to make
5606 * some unlucky request wait for as long as the device
5607 * wishes.
5608 *
5609 * Of course, serving one request at a time may cause loss of
5610 * throughput.
5611 */
5612 if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
5613 goto exit;
5614
5615 bfqq = bfq_select_queue(bfqd);
5616 if (!bfqq)
5617 goto exit;
5618
5619 rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);
5620
5621 if (rq) {
5622inc_in_driver_start_rq:
5623 bfqd->rq_in_driver++;
5624start_rq:
5625 rq->rq_flags |= RQF_STARTED;
5626 }
5627exit:
5628 return rq;
5629}
5630
5631static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
5632{
5633 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
5634 struct request *rq;
5635
5636 spin_lock_irq(&bfqd->lock);
5637 rq = __bfq_dispatch_request(hctx);
5638 spin_unlock_irq(&bfqd->lock);
5639
5640 return rq;
5641}
5642
5643/*
5644 * Task holds one reference to the queue, dropped when task exits. Each rq
5645 * in-flight on this queue also holds a reference, dropped when rq is freed.
5646 *
5647 * Scheduler lock must be held here. Recall not to use bfqq after calling
5648 * this function on it.
5649 */
5650static void bfq_put_queue(struct bfq_queue *bfqq)
5651{
e21b7a0b
AA
5652#ifdef CONFIG_BFQ_GROUP_IOSCHED
5653 struct bfq_group *bfqg = bfqq_group(bfqq);
5654#endif
5655
aee69d78
PV
5656 if (bfqq->bfqd)
5657 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
5658 bfqq, bfqq->ref);
5659
5660 bfqq->ref--;
5661 if (bfqq->ref)
5662 return;
5663
e21b7a0b
AA
5664 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
5665
aee69d78 5666 kmem_cache_free(bfq_pool, bfqq);
e21b7a0b
AA
5667#ifdef CONFIG_BFQ_GROUP_IOSCHED
5668 bfqg_put(bfqg);
5669#endif
aee69d78
PV
5670}
5671
5672static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5673{
5674 if (bfqq == bfqd->in_service_queue) {
5675 __bfq_bfqq_expire(bfqd, bfqq);
5676 bfq_schedule_dispatch(bfqd);
5677 }
5678
5679 bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
5680
5681 bfq_put_queue(bfqq); /* release process reference */
5682}
5683
5684static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
5685{
5686 struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
5687 struct bfq_data *bfqd;
5688
5689 if (bfqq)
5690 bfqd = bfqq->bfqd; /* NULL if scheduler already exited */
5691
5692 if (bfqq && bfqd) {
5693 unsigned long flags;
5694
5695 spin_lock_irqsave(&bfqd->lock, flags);
5696 bfq_exit_bfqq(bfqd, bfqq);
5697 bic_set_bfqq(bic, NULL, is_sync);
5698 spin_unlock_irq(&bfqd->lock);
5699 }
5700}
5701
5702static void bfq_exit_icq(struct io_cq *icq)
5703{
5704 struct bfq_io_cq *bic = icq_to_bic(icq);
5705
5706 bfq_exit_icq_bfqq(bic, true);
5707 bfq_exit_icq_bfqq(bic, false);
5708}
5709
5710/*
5711 * Update the entity prio values; note that the new values will not
5712 * be used until the next (re)activation.
5713 */
5714static void
5715bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
5716{
5717 struct task_struct *tsk = current;
5718 int ioprio_class;
5719 struct bfq_data *bfqd = bfqq->bfqd;
5720
5721 if (!bfqd)
5722 return;
5723
5724 ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
5725 switch (ioprio_class) {
5726 default:
5727 dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
5728 "bfq: bad prio class %d\n", ioprio_class);
5729 case IOPRIO_CLASS_NONE:
5730 /*
5731 * No prio set, inherit CPU scheduling settings.
5732 */
5733 bfqq->new_ioprio = task_nice_ioprio(tsk);
5734 bfqq->new_ioprio_class = task_nice_ioclass(tsk);
5735 break;
5736 case IOPRIO_CLASS_RT:
5737 bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
5738 bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
5739 break;
5740 case IOPRIO_CLASS_BE:
5741 bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
5742 bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
5743 break;
5744 case IOPRIO_CLASS_IDLE:
5745 bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
5746 bfqq->new_ioprio = 7;
5747 bfq_clear_bfqq_idle_window(bfqq);
5748 break;
5749 }
5750
5751 if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
5752 pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
5753 bfqq->new_ioprio);
5754 bfqq->new_ioprio = IOPRIO_BE_NR;
5755 }
5756
5757 bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
5758 bfqq->entity.prio_changed = 1;
5759}
5760
5761static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
5762{
5763 struct bfq_data *bfqd = bic_to_bfqd(bic);
5764 struct bfq_queue *bfqq;
5765 int ioprio = bic->icq.ioc->ioprio;
5766
5767 /*
5768 * This condition may trigger on a newly created bic, be sure to
5769 * drop the lock before returning.
5770 */
5771 if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
5772 return;
5773
5774 bic->ioprio = ioprio;
5775
5776 bfqq = bic_to_bfqq(bic, false);
5777 if (bfqq) {
5778 /* release process reference on this queue */
5779 bfq_put_queue(bfqq);
5780 bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
5781 bic_set_bfqq(bic, bfqq, false);
5782 }
5783
5784 bfqq = bic_to_bfqq(bic, true);
5785 if (bfqq)
5786 bfq_set_next_ioprio_data(bfqq, bic);
5787}
5788
5789static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5790 struct bfq_io_cq *bic, pid_t pid, int is_sync)
5791{
5792 RB_CLEAR_NODE(&bfqq->entity.rb_node);
5793 INIT_LIST_HEAD(&bfqq->fifo);
5794
5795 bfqq->ref = 0;
5796 bfqq->bfqd = bfqd;
5797
5798 if (bic)
5799 bfq_set_next_ioprio_data(bfqq, bic);
5800
5801 if (is_sync) {
5802 if (!bfq_class_idle(bfqq))
5803 bfq_mark_bfqq_idle_window(bfqq);
5804 bfq_mark_bfqq_sync(bfqq);
5805 } else
5806 bfq_clear_bfqq_sync(bfqq);
5807
5808 /* set end request to minus infinity from now */
5809 bfqq->ttime.last_end_request = ktime_get_ns() + 1;
5810
5811 bfq_mark_bfqq_IO_bound(bfqq);
5812
5813 bfqq->pid = pid;
5814
5815 /* Tentative initial value to trade off between thr and lat */
54b60456 5816 bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
aee69d78 5817 bfqq->budget_timeout = bfq_smallest_from_now();
aee69d78 5818
44e44a1b
PV
5819 bfqq->wr_coeff = 1;
5820 bfqq->last_wr_start_finish = bfq_smallest_from_now();
5821
aee69d78
PV
5822 /* first request is almost certainly seeky */
5823 bfqq->seek_history = 1;
5824}
5825
5826static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
e21b7a0b 5827 struct bfq_group *bfqg,
aee69d78
PV
5828 int ioprio_class, int ioprio)
5829{
5830 switch (ioprio_class) {
5831 case IOPRIO_CLASS_RT:
e21b7a0b 5832 return &bfqg->async_bfqq[0][ioprio];
aee69d78
PV
5833 case IOPRIO_CLASS_NONE:
5834 ioprio = IOPRIO_NORM;
5835 /* fall through */
5836 case IOPRIO_CLASS_BE:
e21b7a0b 5837 return &bfqg->async_bfqq[1][ioprio];
aee69d78 5838 case IOPRIO_CLASS_IDLE:
e21b7a0b 5839 return &bfqg->async_idle_bfqq;
aee69d78
PV
5840 default:
5841 return NULL;
5842 }
5843}
5844
5845static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
5846 struct bio *bio, bool is_sync,
5847 struct bfq_io_cq *bic)
5848{
5849 const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
5850 const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
5851 struct bfq_queue **async_bfqq = NULL;
5852 struct bfq_queue *bfqq;
e21b7a0b 5853 struct bfq_group *bfqg;
aee69d78
PV
5854
5855 rcu_read_lock();
5856
e21b7a0b
AA
5857 bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
5858 if (!bfqg) {
5859 bfqq = &bfqd->oom_bfqq;
5860 goto out;
5861 }
5862
aee69d78 5863 if (!is_sync) {
e21b7a0b 5864 async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
aee69d78
PV
5865 ioprio);
5866 bfqq = *async_bfqq;
5867 if (bfqq)
5868 goto out;
5869 }
5870
5871 bfqq = kmem_cache_alloc_node(bfq_pool,
5872 GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
5873 bfqd->queue->node);
5874
5875 if (bfqq) {
5876 bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
5877 is_sync);
e21b7a0b 5878 bfq_init_entity(&bfqq->entity, bfqg);
aee69d78
PV
5879 bfq_log_bfqq(bfqd, bfqq, "allocated");
5880 } else {
5881 bfqq = &bfqd->oom_bfqq;
5882 bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
5883 goto out;
5884 }
5885
5886 /*
5887 * Pin the queue now that it's allocated; scheduler exit will
5888 * prune it.
5889 */
5890 if (async_bfqq) {
e21b7a0b
AA
5891 bfqq->ref++; /*
5892 * Extra group reference, w.r.t. sync
5893 * queue. This extra reference is removed
5894 * only if bfqq->bfqg disappears, to
5895 * guarantee that this queue is not freed
5896 * until its group goes away.
5897 */
5898 bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
aee69d78
PV
5899 bfqq, bfqq->ref);
5900 *async_bfqq = bfqq;
5901 }
5902
5903out:
5904 bfqq->ref++; /* get a process reference to this queue */
5905 bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
5906 rcu_read_unlock();
5907 return bfqq;
5908}
5909
5910static void bfq_update_io_thinktime(struct bfq_data *bfqd,
5911 struct bfq_queue *bfqq)
5912{
5913 struct bfq_ttime *ttime = &bfqq->ttime;
5914 u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
5915
5916 elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
5917
5918 ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
5919 ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
5920 ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
5921 ttime->ttime_samples);
5922}
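
/*
 * Illustrative sketch, not part of BFQ: the statistics updated above
 * form an exponentially weighted moving average with decay factor 7/8,
 * kept in fixed point (each new sample weighs 256, and the 128 term
 * provides rounding). Names are hypothetical, and the plain 64-bit
 * divisions stand in for the div_u64()/div64_ul() helpers used above.
 */
struct sketch_ttime {
	unsigned long long samples;	/* scaled by 256 */
	unsigned long long total_ns;	/* scaled by 256 */
	unsigned long long mean_ns;
};

static inline void sketch_ttime_update(struct sketch_ttime *t,
				       unsigned long long elapsed_ns)
{
	t->samples = (7 * t->samples + 256) / 8;
	t->total_ns = (7 * t->total_ns + 256 * elapsed_ns) / 8;
	t->mean_ns = (t->total_ns + 128) / t->samples;
}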
5923
5924static void
5925bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5926 struct request *rq)
5927{
aee69d78 5928 bfqq->seek_history <<= 1;
ab0e43e9
PV
5929 bfqq->seek_history |=
5930 get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
aee69d78
PV
5931 (!blk_queue_nonrot(bfqd->queue) ||
5932 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
5933}
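
/*
 * Illustrative sketch, not part of BFQ: the seek history updated above
 * is a plain shift register of recent samples, a set bit meaning that
 * the corresponding request looked seeky. Names are hypothetical.
 */
static inline unsigned int sketch_push_seek_sample(unsigned int history,
						   bool seeky_sample)
{
	return (history << 1) | (seeky_sample ? 1 : 0);
}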
5934
5935/*
5936 * Disable idle window if the process thinks too long or seeks so much that
5937 * it doesn't matter.
5938 */
5939static void bfq_update_idle_window(struct bfq_data *bfqd,
5940 struct bfq_queue *bfqq,
5941 struct bfq_io_cq *bic)
5942{
5943 int enable_idle;
5944
5945 /* Don't idle for async or idle io prio class. */
5946 if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
5947 return;
5948
5949 enable_idle = bfq_bfqq_idle_window(bfqq);
5950
5951 if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
5952 bfqd->bfq_slice_idle == 0 ||
5953 (bfqd->hw_tag && BFQQ_SEEKY(bfqq)))
5954 enable_idle = 0;
5955 else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
44e44a1b
PV
5956 if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle &&
5957 bfqq->wr_coeff == 1)
aee69d78
PV
5958 enable_idle = 0;
5959 else
5960 enable_idle = 1;
5961 }
5962 bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
5963 enable_idle);
5964
5965 if (enable_idle)
5966 bfq_mark_bfqq_idle_window(bfqq);
5967 else
5968 bfq_clear_bfqq_idle_window(bfqq);
5969}
5970
5971/*
5972 * Called when a new fs request (rq) is added to bfqq. Check if there's
5973 * something we should do about it.
5974 */
5975static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5976 struct request *rq)
5977{
5978 struct bfq_io_cq *bic = RQ_BIC(rq);
5979
5980 if (rq->cmd_flags & REQ_META)
5981 bfqq->meta_pending++;
5982
5983 bfq_update_io_thinktime(bfqd, bfqq);
5984 bfq_update_io_seektime(bfqd, bfqq, rq);
5985 if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
5986 !BFQQ_SEEKY(bfqq))
5987 bfq_update_idle_window(bfqd, bfqq, bic);
5988
5989 bfq_log_bfqq(bfqd, bfqq,
5990 "rq_enqueued: idle_window=%d (seeky %d)",
5991 bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
5992
5993 bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
5994
5995 if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
5996 bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
5997 blk_rq_sectors(rq) < 32;
5998 bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
5999
6000 /*
6001 * There is just this request queued: if the request
6002 * is small and the queue is not to be expired, then
6003 * just exit.
6004 *
6005 * In this way, if the device is being idled to wait
6006 * for a new request from the in-service queue, we
6007 * avoid unplugging the device and committing the
6008 * device to serve just a small request. On the
6009 * contrary, we wait for the block layer to decide
6010 * when to unplug the device: hopefully, new requests
6011 * will be merged to this one quickly, then the device
6012 * will be unplugged and larger requests will be
6013 * dispatched.
6014 */
6015 if (small_req && !budget_timeout)
6016 return;
6017
6018 /*
6019 * A large enough request arrived, or the queue is to
6020 * be expired: in both cases disk idling is to be
6021 * stopped, so clear wait_request flag and reset
6022 * timer.
6023 */
6024 bfq_clear_bfqq_wait_request(bfqq);
6025 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
e21b7a0b 6026 bfqg_stats_update_idle_time(bfqq_group(bfqq));
aee69d78
PV
6027
6028 /*
6029 * The queue is not empty, because a new request just
6030 * arrived. Hence we can safely expire the queue, in
6031 * case of budget timeout, without risking that the
6032 * timestamps of the queue are not updated correctly.
6033 * See [1] for more details.
6034 */
6035 if (budget_timeout)
6036 bfq_bfqq_expire(bfqd, bfqq, false,
6037 BFQQE_BUDGET_TIMEOUT);
6038 }
6039}
6040
6041static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
6042{
6043 struct bfq_queue *bfqq = RQ_BFQQ(rq);
6044
6045 bfq_add_request(rq);
6046
6047 rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
6048 list_add_tail(&rq->queuelist, &bfqq->fifo);
6049
6050 bfq_rq_enqueued(bfqd, bfqq, rq);
6051}
6052
6053static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
6054 bool at_head)
6055{
6056 struct request_queue *q = hctx->queue;
6057 struct bfq_data *bfqd = q->elevator->elevator_data;
6058
6059 spin_lock_irq(&bfqd->lock);
6060 if (blk_mq_sched_try_insert_merge(q, rq)) {
6061 spin_unlock_irq(&bfqd->lock);
6062 return;
6063 }
6064
6065 spin_unlock_irq(&bfqd->lock);
6066
6067 blk_mq_sched_request_inserted(rq);
6068
6069 spin_lock_irq(&bfqd->lock);
6070 if (at_head || blk_rq_is_passthrough(rq)) {
6071 if (at_head)
6072 list_add(&rq->queuelist, &bfqd->dispatch);
6073 else
6074 list_add_tail(&rq->queuelist, &bfqd->dispatch);
6075 } else {
6076 __bfq_insert_request(bfqd, rq);
6077
6078 if (rq_mergeable(rq)) {
6079 elv_rqhash_add(q, rq);
6080 if (!q->last_merge)
6081 q->last_merge = rq;
6082 }
6083 }
6084
6085 spin_unlock_irq(&bfqd->lock);
6086}
6087
6088static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
6089 struct list_head *list, bool at_head)
6090{
6091 while (!list_empty(list)) {
6092 struct request *rq;
6093
6094 rq = list_first_entry(list, struct request, queuelist);
6095 list_del_init(&rq->queuelist);
6096 bfq_insert_request(hctx, rq, at_head);
6097 }
6098}
6099
6100static void bfq_update_hw_tag(struct bfq_data *bfqd)
6101{
6102 bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
6103 bfqd->rq_in_driver);
6104
6105 if (bfqd->hw_tag == 1)
6106 return;
6107
6108 /*
6109 * This sample is valid if the number of outstanding requests
6110 * is large enough to allow a queueing behavior. Note that the
6111 * sum is not exact, as it's not taking into account deactivated
6112 * requests.
6113 */
6114 if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
6115 return;
6116
6117 if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
6118 return;
6119
6120 bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
6121 bfqd->max_rq_in_driver = 0;
6122 bfqd->hw_tag_samples = 0;
6123}
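
/*
 * Illustrative sketch, not part of BFQ: the NCQ detection performed
 * above, with hypothetical names. Only samples taken with enough
 * outstanding requests count; once enough of them have been collected,
 * the device is flagged as queueing if it ever kept more than
 * `threshold` requests in flight at the same time.
 */
static inline int sketch_detect_hw_tag(int max_rq_in_driver, int threshold,
				       int valid_samples, int needed_samples)
{
	if (valid_samples < needed_samples)
		return -1;			/* undecided, keep sampling */
	return max_rq_in_driver > threshold;	/* 1: queueing device */
}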
6124
6125static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
6126{
ab0e43e9
PV
6127 u64 now_ns;
6128 u32 delta_us;
6129
aee69d78
PV
6130 bfq_update_hw_tag(bfqd);
6131
6132 bfqd->rq_in_driver--;
6133 bfqq->dispatched--;
6134
44e44a1b
PV
6135 if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
6136 /*
6137 * Set budget_timeout (which we overload to store the
6138 * time at which the queue remains with no backlog and
6139 * no outstanding request; used by the weight-raising
6140 * mechanism).
6141 */
6142 bfqq->budget_timeout = jiffies;
6143 }
6144
ab0e43e9
PV
6145 now_ns = ktime_get_ns();
6146
6147 bfqq->ttime.last_end_request = now_ns;
6148
6149 /*
6150 * Using us instead of ns, to get a reasonable precision in
6151 * computing rate in next check.
6152 */
6153 delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
6154
6155 /*
6156 * If the request took rather long to complete, and, according
6157 * to the maximum request size recorded, this completion latency
6158 * implies that the request was certainly served at a very low
6159 * rate (less than 1M sectors/sec), then the whole observation
6160 * interval that lasts up to this time instant cannot be a
6161 * valid time interval for computing a new peak rate. Invoke
6162 * bfq_update_rate_reset to have the following three steps
6163 * taken:
6164 * - close the observation interval at the last (previous)
6165 * request dispatch or completion
6166 * - compute rate, if possible, for that observation interval
6167 * - reset to zero samples, which will trigger a proper
6168 * re-initialization of the observation interval on next
6169 * dispatch
6170 */
6171 if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
6172 (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
6173 1UL<<(BFQ_RATE_SHIFT - 10))
6174 bfq_update_rate_reset(bfqd, NULL);
6175 bfqd->last_completion = now_ns;
aee69d78
PV
6176
6177 /*
6178 * If this is the in-service queue, check if it needs to be expired,
6179 * or if we want to idle in case it has no pending requests.
6180 */
6181 if (bfqd->in_service_queue == bfqq) {
44e44a1b 6182 if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
aee69d78
PV
6183 bfq_arm_slice_timer(bfqd);
6184 return;
6185 } else if (bfq_may_expire_for_budg_timeout(bfqq))
6186 bfq_bfqq_expire(bfqd, bfqq, false,
6187 BFQQE_BUDGET_TIMEOUT);
6188 else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
6189 (bfqq->dispatched == 0 ||
6190 !bfq_bfqq_may_idle(bfqq)))
6191 bfq_bfqq_expire(bfqd, bfqq, false,
6192 BFQQE_NO_MORE_REQUESTS);
6193 }
6194}
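
/*
 * Illustrative sketch, not part of BFQ: the test applied above to the
 * last completion, with hypothetical names. The observation interval
 * is closed and the samples reset when the completion took long and
 * the rate implied by the largest request size recorded stays below
 * roughly 1M sectors/sec (the 1 << (shift - 10) term in the scaled
 * comparison).
 */
static inline bool sketch_rate_sample_invalid(unsigned long delta_us,
					      unsigned long max_rq_sectors,
					      unsigned long min_interval_us,
					      unsigned int rate_shift)
{
	return delta_us > min_interval_us &&
		(max_rq_sectors << rate_shift) / delta_us <
			(1UL << (rate_shift - 10));
}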
6195
6196static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
6197{
6198 bfqq->allocated--;
6199
6200 bfq_put_queue(bfqq);
6201}
6202
6203static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
6204{
6205 struct bfq_queue *bfqq = RQ_BFQQ(rq);
6206 struct bfq_data *bfqd = bfqq->bfqd;
6207
e21b7a0b
AA
6208 if (rq->rq_flags & RQF_STARTED)
6209 bfqg_stats_update_completion(bfqq_group(bfqq),
6210 rq_start_time_ns(rq),
6211 rq_io_start_time_ns(rq),
6212 rq->cmd_flags);
aee69d78
PV
6213
6214 if (likely(rq->rq_flags & RQF_STARTED)) {
6215 unsigned long flags;
6216
6217 spin_lock_irqsave(&bfqd->lock, flags);
6218
6219 bfq_completed_request(bfqq, bfqd);
6220 bfq_put_rq_priv_body(bfqq);
6221
6222 spin_unlock_irqrestore(&bfqd->lock, flags);
6223 } else {
6224 /*
6225 * Request rq may be still/already in the scheduler,
6226 * in which case we need to remove it. And we cannot
6227 * defer such a check and removal, to avoid
6228 * inconsistencies in the time interval from the end
6229 * of this function to the start of the deferred work.
6230 * This situation seems to occur only in process
6231 * context, as a consequence of a merge. In the
6232 * current version of the code, this implies that the
6233 * lock is held.
6234 */
6235
6236 if (!RB_EMPTY_NODE(&rq->rb_node))
6237 bfq_remove_request(q, rq);
6238 bfq_put_rq_priv_body(bfqq);
6239 }
6240
6241 rq->elv.priv[0] = NULL;
6242 rq->elv.priv[1] = NULL;
6243}
6244
6245/*
6246 * Allocate bfq data structures associated with this request.
6247 */
6248static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
6249 struct bio *bio)
6250{
6251 struct bfq_data *bfqd = q->elevator->elevator_data;
6252 struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
6253 const int is_sync = rq_is_sync(rq);
6254 struct bfq_queue *bfqq;
6255
6256 spin_lock_irq(&bfqd->lock);
6257
6258 bfq_check_ioprio_change(bic, bio);
6259
6260 if (!bic)
6261 goto queue_fail;
6262
e21b7a0b
AA
6263 bfq_bic_update_cgroup(bic, bio);
6264
aee69d78
PV
6265 bfqq = bic_to_bfqq(bic, is_sync);
6266 if (!bfqq || bfqq == &bfqd->oom_bfqq) {
6267 if (bfqq)
6268 bfq_put_queue(bfqq);
6269 bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
6270 bic_set_bfqq(bic, bfqq, is_sync);
6271 }
6272
6273 bfqq->allocated++;
6274 bfqq->ref++;
6275 bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
6276 rq, bfqq, bfqq->ref);
6277
6278 rq->elv.priv[0] = bic;
6279 rq->elv.priv[1] = bfqq;
6280
6281 spin_unlock_irq(&bfqd->lock);
6282
6283 return 0;
6284
6285queue_fail:
6286 spin_unlock_irq(&bfqd->lock);
6287
6288 return 1;
6289}
6290
6291static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
6292{
6293 struct bfq_data *bfqd = bfqq->bfqd;
6294 enum bfqq_expiration reason;
6295 unsigned long flags;
6296
6297 spin_lock_irqsave(&bfqd->lock, flags);
6298 bfq_clear_bfqq_wait_request(bfqq);
6299
6300 if (bfqq != bfqd->in_service_queue) {
6301 spin_unlock_irqrestore(&bfqd->lock, flags);
6302 return;
6303 }
6304
6305 if (bfq_bfqq_budget_timeout(bfqq))
6306 /*
6307 * Also here the queue can be safely expired
6308 * for budget timeout without wasting
6309 * guarantees
6310 */
6311 reason = BFQQE_BUDGET_TIMEOUT;
6312 else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
6313 /*
6314 * The queue may not be empty upon timer expiration,
6315 * because we may not disable the timer when the
6316 * first request of the in-service queue arrives
6317 * during disk idling.
6318 */
6319 reason = BFQQE_TOO_IDLE;
6320 else
6321 goto schedule_dispatch;
6322
6323 bfq_bfqq_expire(bfqd, bfqq, true, reason);
6324
6325schedule_dispatch:
6326 spin_unlock_irqrestore(&bfqd->lock, flags);
6327 bfq_schedule_dispatch(bfqd);
6328}
6329
6330/*
6331 * Handler of the expiration of the timer running if the in-service queue
6332 * is idling inside its time slice.
6333 */
6334static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
6335{
6336 struct bfq_data *bfqd = container_of(timer, struct bfq_data,
6337 idle_slice_timer);
6338 struct bfq_queue *bfqq = bfqd->in_service_queue;
6339
6340 /*
6341 * Theoretical race here: the in-service queue can be NULL or
6342 * different from the queue that was idling if a new request
6343 * arrives for the current queue and there is a full dispatch
6344 * cycle that changes the in-service queue. This can hardly
6345 * happen, but in the worst case we just expire a queue too
6346 * early.
6347 */
6348 if (bfqq)
6349 bfq_idle_slice_timer_body(bfqq);
6350
6351 return HRTIMER_NORESTART;
6352}
6353
6354static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
6355 struct bfq_queue **bfqq_ptr)
6356{
6357 struct bfq_queue *bfqq = *bfqq_ptr;
6358
6359 bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
6360 if (bfqq) {
e21b7a0b
AA
6361 bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
6362
aee69d78
PV
6363 bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
6364 bfqq, bfqq->ref);
6365 bfq_put_queue(bfqq);
6366 *bfqq_ptr = NULL;
6367 }
6368}
6369
6370/*
e21b7a0b
AA
6371 * Release all the bfqg references to its async queues. If we are
6372 * deallocating the group these queues may still contain requests, so
6373 * we reparent them to the root cgroup (i.e., the only one that will
6374 * exist for sure until all the requests on a device are gone).
aee69d78 6375 */
e21b7a0b 6376static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
aee69d78
PV
6377{
6378 int i, j;
6379
6380 for (i = 0; i < 2; i++)
6381 for (j = 0; j < IOPRIO_BE_NR; j++)
e21b7a0b 6382 __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
aee69d78 6383
e21b7a0b 6384 __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
aee69d78
PV
6385}
6386
6387static void bfq_exit_queue(struct elevator_queue *e)
6388{
6389 struct bfq_data *bfqd = e->elevator_data;
6390 struct bfq_queue *bfqq, *n;
6391
6392 hrtimer_cancel(&bfqd->idle_slice_timer);
6393
6394 spin_lock_irq(&bfqd->lock);
6395 list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
e21b7a0b 6396 bfq_deactivate_bfqq(bfqd, bfqq, false, false);
aee69d78
PV
6397 spin_unlock_irq(&bfqd->lock);
6398
6399 hrtimer_cancel(&bfqd->idle_slice_timer);
6400
e21b7a0b
AA
6401#ifdef CONFIG_BFQ_GROUP_IOSCHED
6402 blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
6403#else
6404 spin_lock_irq(&bfqd->lock);
6405 bfq_put_async_queues(bfqd, bfqd->root_group);
6406 kfree(bfqd->root_group);
6407 spin_unlock_irq(&bfqd->lock);
6408#endif
6409
aee69d78
PV
6410 kfree(bfqd);
6411}
6412
e21b7a0b
AA
6413static void bfq_init_root_group(struct bfq_group *root_group,
6414 struct bfq_data *bfqd)
6415{
6416 int i;
6417
6418#ifdef CONFIG_BFQ_GROUP_IOSCHED
6419 root_group->entity.parent = NULL;
6420 root_group->my_entity = NULL;
6421 root_group->bfqd = bfqd;
6422#endif
6423 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
6424 root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
6425 root_group->sched_data.bfq_class_idle_last_service = jiffies;
6426}
6427
aee69d78
PV
6428static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
6429{
6430 struct bfq_data *bfqd;
6431 struct elevator_queue *eq;
aee69d78
PV
6432
6433 eq = elevator_alloc(q, e);
6434 if (!eq)
6435 return -ENOMEM;
6436
6437 bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
6438 if (!bfqd) {
6439 kobject_put(&eq->kobj);
6440 return -ENOMEM;
6441 }
6442 eq->elevator_data = bfqd;
6443
e21b7a0b
AA
6444 spin_lock_irq(q->queue_lock);
6445 q->elevator = eq;
6446 spin_unlock_irq(q->queue_lock);
6447
aee69d78
PV
6448 /*
6449 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
6450 * Grab a permanent reference to it, so that the normal code flow
6451 * will not attempt to free it.
6452 */
6453 bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
6454 bfqd->oom_bfqq.ref++;
6455 bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
6456 bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
6457 bfqd->oom_bfqq.entity.new_weight =
6458 bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
6459 /*
6460 * Trigger weight initialization, according to ioprio, at the
6461 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
6462 * class won't be changed any more.
6463 */
6464 bfqd->oom_bfqq.entity.prio_changed = 1;
6465
6466 bfqd->queue = q;
6467
e21b7a0b 6468 INIT_LIST_HEAD(&bfqd->dispatch);
aee69d78
PV
6469
6470 hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
6471 HRTIMER_MODE_REL);
6472 bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
6473
6474 INIT_LIST_HEAD(&bfqd->active_list);
6475 INIT_LIST_HEAD(&bfqd->idle_list);
6476
6477 bfqd->hw_tag = -1;
6478
6479 bfqd->bfq_max_budget = bfq_default_max_budget;
6480
6481 bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
6482 bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
6483 bfqd->bfq_back_max = bfq_back_max;
6484 bfqd->bfq_back_penalty = bfq_back_penalty;
6485 bfqd->bfq_slice_idle = bfq_slice_idle;
aee69d78
PV
6486 bfqd->bfq_timeout = bfq_timeout;
6487
6488 bfqd->bfq_requests_within_timer = 120;
6489
44e44a1b
PV
6490 bfqd->low_latency = true;
6491
6492 /*
6493 * Trade-off between responsiveness and fairness.
6494 */
6495 bfqd->bfq_wr_coeff = 30;
6496 bfqd->bfq_wr_max_time = 0;
6497 bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
6498 bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
6499
6500 /*
6501 * Begin by assuming, optimistically, that the device is a
6502 * high-speed one, and that its peak rate is equal to 2/3 of
6503 * the highest reference rate.
6504 */
6505 bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
6506 T_fast[blk_queue_nonrot(bfqd->queue)];
6507 bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
6508 bfqd->device_speed = BFQ_BFQD_FAST;

        spin_lock_init(&bfqd->lock);

        /*
         * The invocation of the next bfq_create_group_hierarchy
         * function is the head of a chain of function calls
         * (bfq_create_group_hierarchy->blkcg_activate_policy->
         * blk_mq_freeze_queue) that may lead to the invocation of the
         * has_work hook function. For this reason,
         * bfq_create_group_hierarchy is invoked only after all
         * scheduler data has been initialized, apart from the fields
         * that can be initialized only after invoking
         * bfq_create_group_hierarchy. This, in particular, enables
         * has_work to correctly return false. Of course, to avoid
         * other inconsistencies, the blk-mq stack must then refrain
         * from invoking further scheduler hooks before this init
         * function is finished.
         */
        bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
        if (!bfqd->root_group)
                goto out_free;
        bfq_init_root_group(bfqd->root_group, bfqd);
        bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);

        return 0;

out_free:
        kfree(bfqd);
        kobject_put(&eq->kobj);
        return -ENOMEM;
}

static void bfq_slab_kill(void)
{
        kmem_cache_destroy(bfq_pool);
}

static int __init bfq_slab_setup(void)
{
        bfq_pool = KMEM_CACHE(bfq_queue, 0);
        if (!bfq_pool)
                return -ENOMEM;
        return 0;
}

static ssize_t bfq_var_show(unsigned int var, char *page)
{
        return sprintf(page, "%u\n", var);
}

static ssize_t bfq_var_store(unsigned long *var, const char *page,
                             size_t count)
{
        unsigned long new_val;
        int ret = kstrtoul(page, 10, &new_val);

        if (ret == 0)
                *var = new_val;

        return count;
}
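/*
 * Note on bfq_var_store() above: it returns count even when kstrtoul()
 * fails, so a malformed sysfs write is silently accepted and simply leaves
 * the tunable at its previous value.
 */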

#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
static ssize_t __FUNC(struct elevator_queue *e, char *page) \
{ \
        struct bfq_data *bfqd = e->elevator_data; \
        u64 __data = __VAR; \
        if (__CONV == 1) \
                __data = jiffies_to_msecs(__data); \
        else if (__CONV == 2) \
                __data = div_u64(__data, NSEC_PER_MSEC); \
        return bfq_var_show(__data, (page)); \
}
SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
#undef SHOW_FUNCTION
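/*
 * For reference, an invocation above such as
 * SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0) expands,
 * once the dead __CONV branches are dropped, to roughly:
 *
 *      static ssize_t bfq_back_seek_max_show(struct elevator_queue *e,
 *                                            char *page)
 *      {
 *              struct bfq_data *bfqd = e->elevator_data;
 *              u64 __data = bfqd->bfq_back_max;
 *
 *              return bfq_var_show(__data, (page));
 *      }
 *
 * i.e. each "show" handler copies one bfq_data field into the page buffer,
 * applying a jiffies-to-ms (__CONV == 1) or ns-to-ms (__CONV == 2)
 * conversion where needed.
 */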

#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
static ssize_t __FUNC(struct elevator_queue *e, char *page) \
{ \
        struct bfq_data *bfqd = e->elevator_data; \
        u64 __data = __VAR; \
        __data = div_u64(__data, NSEC_PER_USEC); \
        return bfq_var_show(__data, (page)); \
}
USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
#undef USEC_SHOW_FUNCTION

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
static ssize_t \
__FUNC(struct elevator_queue *e, const char *page, size_t count) \
{ \
        struct bfq_data *bfqd = e->elevator_data; \
        unsigned long uninitialized_var(__data); \
        int ret = bfq_var_store(&__data, (page), count); \
        if (__data < (MIN)) \
                __data = (MIN); \
        else if (__data > (MAX)) \
                __data = (MAX); \
        if (__CONV == 1) \
                *(__PTR) = msecs_to_jiffies(__data); \
        else if (__CONV == 2) \
                *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
        else \
                *(__PTR) = __data; \
        return ret; \
}
STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
                INT_MAX, 2);
STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
                INT_MAX, 2);
STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
                INT_MAX, 0);
STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
#undef STORE_FUNCTION

#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
{ \
        struct bfq_data *bfqd = e->elevator_data; \
        unsigned long uninitialized_var(__data); \
        int ret = bfq_var_store(&__data, (page), count); \
        if (__data < (MIN)) \
                __data = (MIN); \
        else if (__data > (MAX)) \
                __data = (MAX); \
        *(__PTR) = (u64)__data * NSEC_PER_USEC; \
        return ret; \
}
USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
                    UINT_MAX);
#undef USEC_STORE_FUNCTION

static ssize_t bfq_max_budget_store(struct elevator_queue *e,
                                    const char *page, size_t count)
{
        struct bfq_data *bfqd = e->elevator_data;
        unsigned long uninitialized_var(__data);
        int ret = bfq_var_store(&__data, (page), count);

        if (__data == 0)
                bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
        else {
                if (__data > INT_MAX)
                        __data = INT_MAX;
                bfqd->bfq_max_budget = __data;
        }

        bfqd->bfq_user_max_budget = __data;

        return ret;
}

/*
 * Leaving this name to preserve compatibility with cfq's parameter names,
 * but this timeout is used for both sync and async requests.
 */
static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
                                      const char *page, size_t count)
{
        struct bfq_data *bfqd = e->elevator_data;
        unsigned long uninitialized_var(__data);
        int ret = bfq_var_store(&__data, (page), count);

        if (__data < 1)
                __data = 1;
        else if (__data > INT_MAX)
                __data = INT_MAX;

        bfqd->bfq_timeout = msecs_to_jiffies(__data);
        if (bfqd->bfq_user_max_budget == 0)
                bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);

        return ret;
}

static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
                                           const char *page, size_t count)
{
        struct bfq_data *bfqd = e->elevator_data;
        unsigned long uninitialized_var(__data);
        int ret = bfq_var_store(&__data, (page), count);

        if (__data > 1)
                __data = 1;
        if (!bfqd->strict_guarantees && __data == 1
            && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
                bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;

        bfqd->strict_guarantees = __data;

        return ret;
}

static ssize_t bfq_low_latency_store(struct elevator_queue *e,
                                     const char *page, size_t count)
{
        struct bfq_data *bfqd = e->elevator_data;
        unsigned long uninitialized_var(__data);
        int ret = bfq_var_store(&__data, (page), count);

        if (__data > 1)
                __data = 1;
        if (__data == 0 && bfqd->low_latency != 0)
                bfq_end_wr(bfqd);
        bfqd->low_latency = __data;

        return ret;
}

#define BFQ_ATTR(name) \
        __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)

static struct elv_fs_entry bfq_attrs[] = {
        BFQ_ATTR(fifo_expire_sync),
        BFQ_ATTR(fifo_expire_async),
        BFQ_ATTR(back_seek_max),
        BFQ_ATTR(back_seek_penalty),
        BFQ_ATTR(slice_idle),
        BFQ_ATTR(slice_idle_us),
        BFQ_ATTR(max_budget),
        BFQ_ATTR(timeout_sync),
        BFQ_ATTR(strict_guarantees),
        BFQ_ATTR(low_latency),
        __ATTR_NULL
};
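/*
 * For reference, BFQ_ATTR(low_latency) above expands to
 *
 *      __ATTR(low_latency, 0644, bfq_low_latency_show, bfq_low_latency_store)
 *
 * i.e. a read/write (0644) sysfs attribute wired to the show/store pair
 * defined earlier. For an elevator these attributes are typically exposed
 * under /sys/block/<device>/queue/iosched/, so, for example, the
 * low-latency heuristics can be toggled at run time through the
 * "low_latency" file there.
 */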

static struct elevator_type iosched_bfq_mq = {
        .ops.mq = {
                .get_rq_priv            = bfq_get_rq_private,
                .put_rq_priv            = bfq_put_rq_private,
                .exit_icq               = bfq_exit_icq,
                .insert_requests        = bfq_insert_requests,
                .dispatch_request       = bfq_dispatch_request,
                .next_request           = elv_rb_latter_request,
                .former_request         = elv_rb_former_request,
                .allow_merge            = bfq_allow_bio_merge,
                .bio_merge              = bfq_bio_merge,
                .request_merge          = bfq_request_merge,
                .requests_merged        = bfq_requests_merged,
                .request_merged         = bfq_request_merged,
                .has_work               = bfq_has_work,
                .init_sched             = bfq_init_queue,
                .exit_sched             = bfq_exit_queue,
        },

        .uses_mq                = true,
        .icq_size               = sizeof(struct bfq_io_cq),
        .icq_align              = __alignof__(struct bfq_io_cq),
        .elevator_attrs         = bfq_attrs,
        .elevator_name          = "bfq",
        .elevator_owner         = THIS_MODULE,
};
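/*
 * Brief orientation on the table above: it is what registers BFQ with the
 * blk-mq elevator framework. Roughly, init_sched/exit_sched run when the
 * scheduler is attached to or detached from a queue,
 * insert_requests/dispatch_request form the hot path that feeds requests
 * to the device, has_work lets blk-mq ask whether BFQ has pending work,
 * and the merge hooks let the block layer coalesce requests before they
 * are dispatched.
 */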

#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct blkcg_policy blkcg_policy_bfq = {
        .dfl_cftypes            = bfq_blkg_files,
        .legacy_cftypes         = bfq_blkcg_legacy_files,

        .cpd_alloc_fn           = bfq_cpd_alloc,
        .cpd_init_fn            = bfq_cpd_init,
        .cpd_bind_fn            = bfq_cpd_init,
        .cpd_free_fn            = bfq_cpd_free,

        .pd_alloc_fn            = bfq_pd_alloc,
        .pd_init_fn             = bfq_pd_init,
        .pd_offline_fn          = bfq_pd_offline,
        .pd_free_fn             = bfq_pd_free,
        .pd_reset_stats_fn      = bfq_pd_reset_stats,
};
#endif

static int __init bfq_init(void)
{
        int ret;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
        ret = blkcg_policy_register(&blkcg_policy_bfq);
        if (ret)
                return ret;
#endif

        ret = -ENOMEM;
        if (bfq_slab_setup())
                goto err_pol_unreg;

        /*
         * Times to load large popular applications for the typical
         * systems installed on the reference devices (see the
         * comments before the definitions of the next two
         * arrays). Actually, we use slightly slower values, as the
         * estimated peak rate tends to be smaller than the actual
         * peak rate. The reason for this last fact is that estimates
         * are computed over much shorter time intervals than the long
         * intervals typically used for benchmarking. Why? First, to
         * adapt more quickly to variations. Second, because an I/O
         * scheduler cannot rely on a peak-rate-evaluation workload to
         * be run for a long time.
         */
        T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
        T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
        T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
        T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */

        /*
         * Thresholds that determine the switch between speed classes
         * (see the comments before the definition of the array
         * device_speed_thresh). These thresholds are biased towards
         * transitions to the fast class. This is safer than the
         * opposite bias. In fact, a wrong transition to the slow
         * class results in short weight-raising periods, because the
         * speed of the device then tends to be higher than the
         * reference peak rate. On the opposite end, a wrong
         * transition to the fast class tends to increase
         * weight-raising periods, for the opposite reason.
         */
        device_speed_thresh[0] = (4 * R_slow[0]) / 3;
        device_speed_thresh[1] = (4 * R_slow[1]) / 3;
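        /*
         * Worked example with a hypothetical reference rate: if R_slow[0]
         * were 10500 sectors/s, the rotational-device threshold would be
         * 4 * 10500 / 3 = 14000 sectors/s; estimated peak rates above the
         * threshold keep the device in the fast class, lower estimates
         * demote it to the slow class.
         */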

        ret = elv_register(&iosched_bfq_mq);
        if (ret)
                goto err_pol_unreg;

        return 0;

err_pol_unreg:
#ifdef CONFIG_BFQ_GROUP_IOSCHED
        blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
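        /*
         * Note: this shared error path only unregisters the blkcg policy;
         * if the failure happened in elv_register(), the bfq_pool slab
         * cache created by bfq_slab_setup() is not destroyed on this path.
         */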
        return ret;
}

static void __exit bfq_exit(void)
{
        elv_unregister(&iosched_bfq_mq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
        blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
        bfq_slab_kill();
}

module_init(bfq_init);
module_exit(bfq_exit);

MODULE_AUTHOR("Paolo Valente");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");