block/bfq-iosched.c, from commit aee69d78 ("block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler")
1/*
2 * Budget Fair Queueing (BFQ) I/O scheduler.
3 *
4 * Based on ideas and code from CFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 * Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
11 * Arianna Avanzini <avanzini@google.com>
12 *
13 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
14 *
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License as
17 * published by the Free Software Foundation; either version 2 of the
18 * License, or (at your option) any later version.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 * General Public License for more details.
24 *
25 * BFQ is a proportional-share I/O scheduler, with some extra
26 * low-latency capabilities. BFQ also supports full hierarchical
27 * scheduling through cgroups. The next paragraphs provide an
28 * introduction to BFQ's inner workings. Details on BFQ benefits, usage and
29 * limitations can be found in Documentation/block/bfq-iosched.txt.
30 *
31 * BFQ is a proportional-share storage-I/O scheduling algorithm based
32 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
33 * budgets, measured in number of sectors, to processes instead of
34 * time slices. The device is not granted to the in-service process
35 * for a given time slice, but until it has exhausted its assigned
36 * budget. This change from the time to the service domain enables BFQ
37 * to distribute the device throughput among processes as desired,
38 * without any distortion due to throughput fluctuations, or to device
39 * internal queueing. BFQ uses an ad hoc internal scheduler, called
40 * B-WF2Q+, to schedule processes according to their budgets. More
41 * precisely, BFQ schedules queues associated with processes. Each
42 * process/queue is assigned a user-configurable weight, and B-WF2Q+
43 * guarantees that each queue receives a fraction of the throughput
44 * proportional to its weight. Thanks to the accurate policy of
45 * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
46 * processes issuing sequential requests (to boost the throughput),
47 * and yet guarantee a low latency to interactive and soft real-time
48 * applications.
49 *
50 * In particular, to provide these low-latency guarantees, BFQ
51 * explicitly privileges the I/O of two classes of time-sensitive
52 * applications: interactive and soft real-time. This feature enables
53 * BFQ to provide applications in these classes with a very low
54 * latency. Finally, BFQ also features additional heuristics for
55 * preserving both a low latency and a high throughput on NCQ-capable,
56 * rotational or flash-based devices, and to get the job done quickly
57 * for applications consisting of many I/O-bound processes.
58 *
59 * BFQ is described in [1], which also contains a reference to the
60 * initial, more theoretical paper on BFQ. In that paper the interested
61 * reader can find full details on the main algorithm, as well as
62 * formulas of the guarantees and formal proofs of all the properties.
63 * With respect to the version of BFQ presented in these papers, this
64 * implementation adds a few more heuristics, such as the one that
65 * guarantees a low latency to soft real-time applications, and a
66 * hierarchical extension based on H-WF2Q+.
67 *
68 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
69 * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
70 * with O(log N) complexity derives from the one introduced with EEVDF
71 * in [3].
72 *
73 * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
74 * Scheduler", Proceedings of the First Workshop on Mobile System
75 * Technologies (MST-2015), May 2015.
76 * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
77 *
78 * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
79 * Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
80 * Oct 1997.
81 *
82 * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
83 *
84 * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
85 * First: A Flexible and Accurate Mechanism for Proportional Share
86 * Resource Allocation", technical report.
87 *
88 * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
89 */
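/*
 * A worked example of the weight-proportional service described above
 * (illustrative only): suppose two continuously backlogged I/O-bound
 * processes A and B have weights 100 and 200. B-WF2Q+ then grants A a
 * fraction 100 / (100 + 200) = 1/3 of the sectors transferred by the
 * device, and B the remaining 2/3, over any sufficiently long
 * interval. Because service is accounted in sectors rather than time,
 * a drop in device throughput slows both processes by the same factor
 * instead of distorting this ratio.
 */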
90#include <linux/module.h>
91#include <linux/slab.h>
92#include <linux/blkdev.h>
93#include <linux/elevator.h>
94#include <linux/ktime.h>
95#include <linux/rbtree.h>
96#include <linux/ioprio.h>
97#include <linux/sbitmap.h>
98#include <linux/delay.h>
99
100#include "blk.h"
101#include "blk-mq.h"
102#include "blk-mq-tag.h"
103#include "blk-mq-sched.h"
104#include <linux/blktrace_api.h>
105#include <linux/hrtimer.h>
106#include <linux/blk-cgroup.h>
107
108#define BFQ_IOPRIO_CLASSES 3
109#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
110
111#define BFQ_MIN_WEIGHT 1
112#define BFQ_MAX_WEIGHT 1000
113#define BFQ_WEIGHT_CONVERSION_COEFF 10
114
115#define BFQ_DEFAULT_QUEUE_IOPRIO 4
116
117#define BFQ_DEFAULT_GRP_WEIGHT 10
118#define BFQ_DEFAULT_GRP_IOPRIO 0
119#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
120
121struct bfq_entity;
122
123/**
124 * struct bfq_service_tree - per ioprio_class service tree.
125 *
126 * Each service tree represents a B-WF2Q+ scheduler on its own. Each
127 * ioprio_class has its own independent scheduler, and so its own
128 * bfq_service_tree. All the fields are protected by the queue lock
129 * of the containing bfqd.
130 */
131struct bfq_service_tree {
132 /* tree for active entities (i.e., those backlogged) */
133 struct rb_root active;
134 /* tree for idle entities (i.e., not backlogged, with V <= F_i) */
135 struct rb_root idle;
136
137 /* idle entity with minimum F_i */
138 struct bfq_entity *first_idle;
139 /* idle entity with maximum F_i */
140 struct bfq_entity *last_idle;
141
142 /* scheduler virtual time */
143 u64 vtime;
144 /* scheduler weight sum; active and idle entities contribute to it */
145 unsigned long wsum;
146};
147
148/**
149 * struct bfq_sched_data - multi-class scheduler.
150 *
151 * bfq_sched_data is the basic scheduler queue. It supports three
152 * ioprio_classes, and can be used either as a toplevel queue or as
153 * an intermediate queue on a hierarchical setup.
154 * @next_in_service points to the active entity of the sched_data
155 * service trees that will be scheduled next.
156 *
157 * The supported ioprio_classes are the same as in CFQ, in descending
158 * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
159 * Requests from higher priority queues are served before all the
160 * requests from lower priority queues; requests within the same
161 * queue are served according to B-WF2Q+.
162 * All the fields are protected by the queue lock of the containing bfqd.
163 */
164struct bfq_sched_data {
165 /* entity in service */
166 struct bfq_entity *in_service_entity;
167 /* head-of-the-line entity in the scheduler */
168 struct bfq_entity *next_in_service;
169 /* array of service trees, one per ioprio_class */
170 struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
171};
172
173/**
174 * struct bfq_entity - schedulable entity.
175 *
176 * A bfq_entity is used to represent a bfq_queue (leaf node in the upper
177 * level scheduler). Each entity belongs to the sched_data of the parent
178 * group hierarchy. Non-leaf entities also have their own sched_data,
179 * stored in @my_sched_data.
180 *
181 * Each entity stores independently its priority values; this would
182 * allow different weights on different devices, but this
183 * functionality is not exported to userspace for now. Priorities and
184 * weights are updated lazily, first storing the new values into the
185 * new_* fields, then setting the @prio_changed flag. As soon as
186 * there is a transition in the entity state that allows the priority
187 * update to take place the effective and the requested priority
188 * values are synchronized.
189 *
190 * The weight value is calculated from the ioprio to export the same
191 * interface as CFQ. When dealing with ``well-behaved'' queues (i.e.,
192 * queues that do not spend too much time to consume their budget
193 * and have true sequential behavior, and when there are no external
194 * factors breaking anticipation) the relative weights at each level
195 * of the hierarchy should be guaranteed. All the fields are
196 * protected by the queue lock of the containing bfqd.
197 */
198struct bfq_entity {
199 /* service_tree member */
200 struct rb_node rb_node;
201
202 /*
203 * flag, true if the entity is on a tree (either the active or
204 * the idle one of its service_tree).
205 */
206 int on_st;
207
208 /* B-WF2Q+ start and finish timestamps [sectors/weight] */
209 u64 start, finish;
210
211 /* tree the entity is enqueued into; %NULL if not on a tree */
212 struct rb_root *tree;
213
214 /*
215 * minimum start time of the (active) subtree rooted at this
216 * entity; used for O(log N) lookups into active trees
217 */
218 u64 min_start;
219
220 /* amount of service received during the last service slot */
221 int service;
222
223 /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
224 int budget;
225
226 /* weight of the queue */
227 int weight;
228 /* next weight if a change is in progress */
229 int new_weight;
230
231 /* original weight, used to implement weight boosting */
232 int orig_weight;
233
234 /* parent entity, for hierarchical scheduling */
235 struct bfq_entity *parent;
236
237 /*
238 * For non-leaf nodes in the hierarchy, the associated
239 * scheduler queue, %NULL on leaf nodes.
240 */
241 struct bfq_sched_data *my_sched_data;
242 /* the scheduler queue this entity belongs to */
243 struct bfq_sched_data *sched_data;
244
245 /* flag, set to request a weight, ioprio or ioprio_class change */
246 int prio_changed;
247};
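/*
 * Illustrative sketch of the lazy priority/weight update described
 * above (not taken verbatim from the scheduler): a caller that wants
 * to change an entity's weight only records the request,
 *
 *	entity->new_weight = <new value>;
 *	entity->prio_changed = 1;
 *
 * and the effective fields (weight, orig_weight and, for queues, the
 * ioprio/ioprio_class pair) are synchronized later, in
 * __bfq_entity_update_weight_prio(), at the first state transition
 * that allows the change to take effect.
 */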
248
249/**
250 * struct bfq_ttime - per process thinktime stats.
251 */
252struct bfq_ttime {
253 /* completion time of the last request */
254 u64 last_end_request;
255
256 /* total process thinktime */
257 u64 ttime_total;
258 /* number of thinktime samples */
259 unsigned long ttime_samples;
260 /* average process thinktime */
261 u64 ttime_mean;
262};
263
264/**
265 * struct bfq_queue - leaf schedulable entity.
266 *
267 * A bfq_queue is a leaf request queue; it can be associated with one
268 * io_context or, if it is async, with more than one.
269 */
270struct bfq_queue {
271 /* reference counter */
272 int ref;
273 /* parent bfq_data */
274 struct bfq_data *bfqd;
275
276 /* current ioprio and ioprio class */
277 unsigned short ioprio, ioprio_class;
278 /* next ioprio and ioprio class if a change is in progress */
279 unsigned short new_ioprio, new_ioprio_class;
280
281 /* sorted list of pending requests */
282 struct rb_root sort_list;
283 /* if fifo isn't expired, next request to serve */
284 struct request *next_rq;
285 /* number of sync and async requests queued */
286 int queued[2];
287 /* number of requests currently allocated */
288 int allocated;
289 /* number of pending metadata requests */
290 int meta_pending;
291 /* fifo list of requests in sort_list */
292 struct list_head fifo;
293
294 /* entity representing this queue in the scheduler */
295 struct bfq_entity entity;
296
297 /* maximum budget allowed from the feedback mechanism */
298 int max_budget;
299 /* budget expiration (in jiffies) */
300 unsigned long budget_timeout;
301
302 /* number of requests on the dispatch list or inside driver */
303 int dispatched;
304
305 /* status flags */
306 unsigned long flags;
307
308 /* node for active/idle bfqq list inside parent bfqd */
309 struct list_head bfqq_list;
310
311 /* associated @bfq_ttime struct */
312 struct bfq_ttime ttime;
313
314 * bit vector: a 1 for each seeky request in history
315 u32 seek_history;
316 /* position of the last request enqueued */
317 sector_t last_request_pos;
318
319 /* Number of consecutive pairs of request completion and
320 * arrival, such that the queue becomes idle after the
321 * completion, but the next request arrives within an idle
322 * time slice; used only if the queue's IO_bound flag has been
323 * cleared.
324 */
325 unsigned int requests_within_timer;
326
327 /* pid of the process owning the queue, used for logging purposes */
328 pid_t pid;
329};
330
331/**
332 * struct bfq_io_cq - per (request_queue, io_context) structure.
333 */
334struct bfq_io_cq {
335 /* associated io_cq structure */
336 struct io_cq icq; /* must be the first member */
337 /* array of two process queues, the sync and the async */
338 struct bfq_queue *bfqq[2];
339 /* per (request_queue, blkcg) ioprio */
340 int ioprio;
341};
342
343/**
344 * struct bfq_data - per-device data structure.
345 *
346 * All the fields are protected by @lock.
347 */
348struct bfq_data {
349 /* device request queue */
350 struct request_queue *queue;
351 /* dispatch queue */
352 struct list_head dispatch;
353
354 /* root @bfq_sched_data for the device */
355 struct bfq_sched_data sched_data;
356
357 /*
358 * Number of bfq_queues containing requests (including the
359 * queue in service, even if it is idling).
360 */
361 int busy_queues;
362 /* number of queued requests */
363 int queued;
364 /* number of requests dispatched and waiting for completion */
365 int rq_in_driver;
366
367 /*
368 * Maximum number of requests in driver in the last
369 * @hw_tag_samples completed requests.
370 */
371 int max_rq_in_driver;
372 /* number of samples used to calculate hw_tag */
373 int hw_tag_samples;
374 /* flag set to one if the driver is showing a queueing behavior */
375 int hw_tag;
376
377 /* number of budgets assigned */
378 int budgets_assigned;
379
380 /*
381 * Timer set when idling (waiting) for the next request from
382 * the queue in service.
383 */
384 struct hrtimer idle_slice_timer;
385
386 /* bfq_queue in service */
387 struct bfq_queue *in_service_queue;
388 /* bfq_io_cq (bic) associated with the @in_service_queue */
389 struct bfq_io_cq *in_service_bic;
390
391 /* on-disk position of the last served request */
392 sector_t last_position;
393
394 /* beginning of the last budget */
395 ktime_t last_budget_start;
396 /* beginning of the last idle slice */
397 ktime_t last_idling_start;
398 /* number of samples used to calculate @peak_rate */
399 int peak_rate_samples;
400 /*
401 * Peak read/write rate, observed during the service of a
402 * budget [BFQ_RATE_SHIFT * sectors/usec]. The value is
403 * left-shifted by BFQ_RATE_SHIFT to increase precision in
404 * fixed-point calculations.
405 */
406 u64 peak_rate;
407 /* maximum budget allotted to a bfq_queue before rescheduling */
408 int bfq_max_budget;
409
410 /* list of all the bfq_queues active on the device */
411 struct list_head active_list;
412 /* list of all the bfq_queues idle on the device */
413 struct list_head idle_list;
414
415 /*
416 * Timeout for async/sync requests; when it fires, requests
417 * are served in fifo order.
418 */
419 u64 bfq_fifo_expire[2];
420 /* weight of backward seeks wrt forward ones */
421 unsigned int bfq_back_penalty;
422 /* maximum allowed backward seek */
423 unsigned int bfq_back_max;
424 /* maximum idling time */
425 u32 bfq_slice_idle;
426 /* last time CLASS_IDLE was served */
427 u64 bfq_class_idle_last_service;
428
429 /* user-configured max budget value (0 for auto-tuning) */
430 int bfq_user_max_budget;
431 /*
432 * Timeout for bfq_queues to consume their budget; used to
433 * prevent seeky queues from imposing long latencies on
434 * sequential or quasi-sequential ones (this also implies that
435 * seeky queues cannot receive guarantees in the service
436 * domain; after a timeout they are charged for the time they
437 * have been in service, to preserve fairness among them, but
438 * without service-domain guarantees).
439 */
440 unsigned int bfq_timeout;
441
442 /*
443 * Number of consecutive requests that must be issued within
444 * the idle time slice to re-enable idling for a queue that
445 * was marked as non-I/O-bound (see the definition of the
446 * IO_bound flag for further details).
447 */
448 unsigned int bfq_requests_within_timer;
449
450 /*
451 * Force device idling whenever needed to provide accurate
452 * service guarantees, without caring about throughput
453 * issues. CAVEAT: this may even increase latencies, in case
454 * of useless idling for processes that have stopped doing I/O.
455 */
456 bool strict_guarantees;
457
458 /* fallback dummy bfqq for extreme OOM conditions */
459 struct bfq_queue oom_bfqq;
460
461 spinlock_t lock;
462
463 /*
464 * bic associated with the task issuing current bio for
465 * merging. This and the next field are used to perform the
466 * bic lookup, needed by bio-merge functions, before the
467 * scheduler lock is taken, and thus to avoid taking the
468 * request-queue lock while the scheduler lock is being
469 * held.
470 */
471 struct bfq_io_cq *bio_bic;
472 /* bfqq associated with the task issuing current bio for merging */
473 struct bfq_queue *bio_bfqq;
474};
475
476enum bfqq_state_flags {
477 BFQQF_busy = 0, /* has requests or is in service */
478 BFQQF_wait_request, /* waiting for a request */
479 BFQQF_non_blocking_wait_rq, /*
480 * waiting for a request
481 * without idling the device
482 */
483 BFQQF_fifo_expire, /* FIFO checked in this slice */
484 BFQQF_idle_window, /* slice idling enabled */
485 BFQQF_sync, /* synchronous queue */
486 BFQQF_budget_new, /* no completion with this budget */
487 BFQQF_IO_bound, /*
488 * bfqq has timed-out at least once
489 * having consumed at most 2/10 of
490 * its budget
491 */
492};
493
494#define BFQ_BFQQ_FNS(name) \
495static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
496{ \
497 __set_bit(BFQQF_##name, &(bfqq)->flags); \
498} \
499static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
500{ \
501 __clear_bit(BFQQF_##name, &(bfqq)->flags); \
502} \
503static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
504{ \
505 return test_bit(BFQQF_##name, &(bfqq)->flags); \
506}
507
508BFQ_BFQQ_FNS(busy);
509BFQ_BFQQ_FNS(wait_request);
510BFQ_BFQQ_FNS(non_blocking_wait_rq);
511BFQ_BFQQ_FNS(fifo_expire);
512BFQ_BFQQ_FNS(idle_window);
513BFQ_BFQQ_FNS(sync);
514BFQ_BFQQ_FNS(budget_new);
515BFQ_BFQQ_FNS(IO_bound);
516#undef BFQ_BFQQ_FNS
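/*
 * For illustration, each BFQ_BFQQ_FNS(name) invocation above expands
 * to three helpers; BFQ_BFQQ_FNS(busy), for instance, generates
 * bfq_mark_bfqq_busy(), bfq_clear_bfqq_busy() and bfq_bfqq_busy(),
 * which set, clear and test the BFQQF_busy bit in bfqq->flags. The
 * rest of the file manipulates per-queue state only through such
 * helpers; bfq_add_bfqq_busy() below, for example, marks a queue busy
 * with bfq_mark_bfqq_busy(), and bfq_del_bfqq_busy() clears the flag
 * with bfq_clear_bfqq_busy().
 */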
517
518/* Logging facilities. */
519#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
520 blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
521
522#define bfq_log(bfqd, fmt, args...) \
523 blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
524
525/* Expiration reasons. */
526enum bfqq_expiration {
527 BFQQE_TOO_IDLE = 0, /*
528 * queue has been idling for
529 * too long
530 */
531 BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */
532 BFQQE_BUDGET_EXHAUSTED, /* budget consumed */
533 BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */
534 BFQQE_PREEMPTED /* preemption in progress */
535};
536
537static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
538
539static struct bfq_service_tree *
540bfq_entity_service_tree(struct bfq_entity *entity)
541{
542 struct bfq_sched_data *sched_data = entity->sched_data;
543 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
544 unsigned int idx = bfqq ? bfqq->ioprio_class - 1 :
545 BFQ_DEFAULT_GRP_CLASS - 1;
546
547 return sched_data->service_tree + idx;
548}
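/*
 * For illustration: with the kernel's ioprio classes (IOPRIO_CLASS_RT
 * = 1, IOPRIO_CLASS_BE = 2, IOPRIO_CLASS_IDLE = 3), the "- 1" above
 * maps a queue to service_tree[0], [1] or [2] respectively, while
 * entities without an associated queue fall back to the BE tree
 * (BFQ_DEFAULT_GRP_CLASS - 1 = 1).
 */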
549
550static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
551{
552 return bic->bfqq[is_sync];
553}
554
555static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
556 bool is_sync)
557{
558 bic->bfqq[is_sync] = bfqq;
559}
560
561static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
562{
563 return bic->icq.q->elevator->elevator_data;
564}
565
566static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
567static void bfq_put_queue(struct bfq_queue *bfqq);
568static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
569 struct bio *bio, bool is_sync,
570 struct bfq_io_cq *bic);
571static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
572
573/*
574 * Array of async queues for all the processes, one queue
575 * per ioprio value per ioprio_class.
576 */
577struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
578/* Async queue for the idle class (ioprio is ignored) */
579struct bfq_queue *async_idle_bfqq;
580
581/* Expiration time of sync (0) and async (1) requests, in ns. */
582static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
583
584/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
585static const int bfq_back_max = 16 * 1024;
586
587/* Penalty of a backwards seek: multiplier applied to the seek distance. */
588static const int bfq_back_penalty = 2;
589
590/* Idling period duration, in ns. */
591static u64 bfq_slice_idle = NSEC_PER_SEC / 125;
592
593/* Minimum number of assigned budgets for which stats are safe to compute. */
594static const int bfq_stats_min_budgets = 194;
595
596/* Default maximum budget values, in sectors and number of requests. */
597static const int bfq_default_max_budget = 16 * 1024;
598
599/* Default timeout values, in jiffies, approximating CFQ defaults. */
600static const int bfq_timeout = HZ / 8;
601
602static struct kmem_cache *bfq_pool;
603
604/* Below this threshold (in ns), we consider thinktime immediate. */
605#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
606
607/* hw_tag detection: parallel requests threshold and min samples needed. */
608#define BFQ_HW_QUEUE_THRESHOLD 4
609#define BFQ_HW_QUEUE_SAMPLES 32
610
611#define BFQQ_SEEK_THR (sector_t)(8 * 100)
612#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
613#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
614#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
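/*
 * Note on BFQQ_SEEKY() above: seek_history is a 32-bit shift register
 * with one bit per sampled request, so a queue is deemed seeky when
 * more than 32/8 = 4 of its last 32 sampled requests were classified
 * as seeky.
 */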
615
616/* Budget feedback step. */
617#define BFQ_BUDGET_STEP 128
618
619/* Min samples used for peak rate estimation (for autotuning). */
620#define BFQ_PEAK_RATE_SAMPLES 32
621
622/* Shift used for peak rate fixed precision calculations. */
623#define BFQ_RATE_SHIFT 16
624
625#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
626 { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
627
628#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
629#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
630
631/**
632 * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
633 * @icq: the iocontext queue.
634 */
635static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
636{
637 /* bic->icq is the first member, %NULL will convert to %NULL */
638 return container_of(icq, struct bfq_io_cq, icq);
639}
640
641/**
642 * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
643 * @bfqd: the lookup key.
644 * @ioc: the io_context of the process doing I/O.
645 * @q: the request queue.
646 */
647static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
648 struct io_context *ioc,
649 struct request_queue *q)
650{
651 if (ioc) {
652 unsigned long flags;
653 struct bfq_io_cq *icq;
654
655 spin_lock_irqsave(q->queue_lock, flags);
656 icq = icq_to_bic(ioc_lookup_icq(ioc, q));
657 spin_unlock_irqrestore(q->queue_lock, flags);
658
659 return icq;
660 }
661
662 return NULL;
663}
664
665/*
666 * Next two macros are just fake loops for the moment. They will
667 * become true loops in the cgroups-enabled variant of the code. Such
668 * a variant, in its turn, will be introduced by next commit.
669 */
670#define for_each_entity(entity) \
671 for (; entity ; entity = NULL)
672
673#define for_each_entity_safe(entity, parent) \
674 for (parent = NULL; entity ; entity = parent)
675
676static int bfq_update_next_in_service(struct bfq_sched_data *sd)
677{
678 return 0;
679}
680
681static void bfq_check_next_in_service(struct bfq_sched_data *sd,
682 struct bfq_entity *entity)
683{
684}
685
686static void bfq_update_budget(struct bfq_entity *next_in_service)
687{
688}
689
690/*
691 * Shift for timestamp calculations. This actually limits the maximum
692 * service allowed in one timestamp delta (small shift values increase it),
693 * the maximum total weight that can be used for the queues in the system
694 * (big shift values increase it), and the period of virtual time
695 * wraparounds.
696 */
697#define WFQ_SERVICE_SHIFT 22
698
699/**
700 * bfq_gt - compare two timestamps.
701 * @a: first ts.
702 * @b: second ts.
703 *
704 * Return @a > @b, dealing with wrapping correctly.
705 */
706static int bfq_gt(u64 a, u64 b)
707{
708 return (s64)(a - b) > 0;
709}
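/*
 * A small example of the wrap-safe comparison above: with a = 10 and
 * b = U64_MAX - 5, the unsigned difference a - b is 16, which is
 * positive once reinterpreted as s64, so bfq_gt(a, b) returns true:
 * a is considered "after" b even though it is numerically smaller,
 * which is the desired behavior once timestamps wrap around.
 */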
710
711static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
712{
713 struct bfq_queue *bfqq = NULL;
714
715 if (!entity->my_sched_data)
716 bfqq = container_of(entity, struct bfq_queue, entity);
717
718 return bfqq;
719}
720
721
722/**
723 * bfq_delta - map service into the virtual time domain.
724 * @service: amount of service.
725 * @weight: scale factor (weight of an entity or weight sum).
726 */
727static u64 bfq_delta(unsigned long service, unsigned long weight)
728{
729 u64 d = (u64)service << WFQ_SERVICE_SHIFT;
730
731 do_div(d, weight);
732 return d;
733}
734
735/**
736 * bfq_calc_finish - assign the finish time to an entity.
737 * @entity: the entity to act upon.
738 * @service: the service to be charged to the entity.
739 */
740static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
741{
742 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
743
744 entity->finish = entity->start +
745 bfq_delta(service, entity->weight);
746
747 if (bfqq) {
748 bfq_log_bfqq(bfqq->bfqd, bfqq,
749 "calc_finish: serv %lu, w %d",
750 service, entity->weight);
751 bfq_log_bfqq(bfqq->bfqd, bfqq,
752 "calc_finish: start %llu, finish %llu, delta %llu",
753 entity->start, entity->finish,
754 bfq_delta(service, entity->weight));
755 }
756}
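/*
 * A worked example for bfq_delta()/bfq_calc_finish() (illustrative
 * figures): charging a 1024-sector service to an entity of weight 10
 * advances its finish time by
 *
 *	(1024 << WFQ_SERVICE_SHIFT) / 10 = 4294967296 / 10 = 429496729
 *
 * virtual-time units past its start time (integer division). The same
 * service charged to an entity of weight 20 advances the finish time
 * by only about half as much, which is how heavier entities get
 * selected more often for the same amount of service.
 */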
757
758/**
759 * bfq_entity_of - get an entity from a node.
760 * @node: the node field of the entity.
761 *
762 * Convert a node pointer to the relative entity. This is used only
763 * to simplify the logic of some functions and not as the generic
764 * conversion mechanism because, e.g., in the tree walking functions,
765 * the check for a %NULL value would be redundant.
766 */
767static struct bfq_entity *bfq_entity_of(struct rb_node *node)
768{
769 struct bfq_entity *entity = NULL;
770
771 if (node)
772 entity = rb_entry(node, struct bfq_entity, rb_node);
773
774 return entity;
775}
776
777/**
778 * bfq_extract - remove an entity from a tree.
779 * @root: the tree root.
780 * @entity: the entity to remove.
781 */
782static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
783{
784 entity->tree = NULL;
785 rb_erase(&entity->rb_node, root);
786}
787
788/**
789 * bfq_idle_extract - extract an entity from the idle tree.
790 * @st: the service tree of the owning @entity.
791 * @entity: the entity being removed.
792 */
793static void bfq_idle_extract(struct bfq_service_tree *st,
794 struct bfq_entity *entity)
795{
796 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
797 struct rb_node *next;
798
799 if (entity == st->first_idle) {
800 next = rb_next(&entity->rb_node);
801 st->first_idle = bfq_entity_of(next);
802 }
803
804 if (entity == st->last_idle) {
805 next = rb_prev(&entity->rb_node);
806 st->last_idle = bfq_entity_of(next);
807 }
808
809 bfq_extract(&st->idle, entity);
810
811 if (bfqq)
812 list_del(&bfqq->bfqq_list);
813}
814
815/**
816 * bfq_insert - generic tree insertion.
817 * @root: tree root.
818 * @entity: entity to insert.
819 *
820 * This is used for the idle and the active tree, since they are both
821 * ordered by finish time.
822 */
823static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
824{
825 struct bfq_entity *entry;
826 struct rb_node **node = &root->rb_node;
827 struct rb_node *parent = NULL;
828
829 while (*node) {
830 parent = *node;
831 entry = rb_entry(parent, struct bfq_entity, rb_node);
832
833 if (bfq_gt(entry->finish, entity->finish))
834 node = &parent->rb_left;
835 else
836 node = &parent->rb_right;
837 }
838
839 rb_link_node(&entity->rb_node, parent, node);
840 rb_insert_color(&entity->rb_node, root);
841
842 entity->tree = root;
843}
844
845/**
846 * bfq_update_min - update the min_start field of an entity.
847 * @entity: the entity to update.
848 * @node: one of its children.
849 *
850 * This function is called when @entity may store an invalid value for
851 * min_start due to updates to the active tree. The function assumes
852 * that the subtree rooted at @node (which may be its left or its right
853 * child) has a valid min_start value.
854 */
855static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
856{
857 struct bfq_entity *child;
858
859 if (node) {
860 child = rb_entry(node, struct bfq_entity, rb_node);
861 if (bfq_gt(entity->min_start, child->min_start))
862 entity->min_start = child->min_start;
863 }
864}
865
866/**
867 * bfq_update_active_node - recalculate min_start.
868 * @node: the node to update.
869 *
870 * @node may have changed position or one of its children may have moved;
871 * this function updates its min_start value. The left and right subtrees
872 * are assumed to hold a correct min_start value.
873 */
874static void bfq_update_active_node(struct rb_node *node)
875{
876 struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
877
878 entity->min_start = entity->start;
879 bfq_update_min(entity, node->rb_right);
880 bfq_update_min(entity, node->rb_left);
881}
882
883/**
884 * bfq_update_active_tree - update min_start for the whole active tree.
885 * @node: the starting node.
886 *
887 * @node must be the deepest modified node after an update. This function
888 * updates its min_start using the values held by its children, assuming
889 * that they did not change, and then updates all the nodes that may have
890 * changed in the path to the root. The only nodes that may have changed
891 * are the ones in the path or their siblings.
892 */
893static void bfq_update_active_tree(struct rb_node *node)
894{
895 struct rb_node *parent;
896
897up:
898 bfq_update_active_node(node);
899
900 parent = rb_parent(node);
901 if (!parent)
902 return;
903
904 if (node == parent->rb_left && parent->rb_right)
905 bfq_update_active_node(parent->rb_right);
906 else if (parent->rb_left)
907 bfq_update_active_node(parent->rb_left);
908
909 node = parent;
910 goto up;
911}
912
913/**
914 * bfq_active_insert - insert an entity in the active tree of its
915 * group/device.
916 * @st: the service tree of the entity.
917 * @entity: the entity being inserted.
918 *
919 * The active tree is ordered by finish time, but an extra key is kept
920 * in each node, containing the minimum value for the start times of
921 * its children (and the node itself), so it's possible to search for
922 * the eligible node with the lowest finish time in logarithmic time.
923 */
924static void bfq_active_insert(struct bfq_service_tree *st,
925 struct bfq_entity *entity)
926{
927 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
928 struct rb_node *node = &entity->rb_node;
929
930 bfq_insert(&st->active, entity);
931
932 if (node->rb_left)
933 node = node->rb_left;
934 else if (node->rb_right)
935 node = node->rb_right;
936
937 bfq_update_active_tree(node);
938
939 if (bfqq)
940 list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
941}
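/*
 * An example of the augmented key maintained above (illustrative
 * values): consider three active entities with (start, finish) pairs
 * (120, 300), (50, 200) and (90, 400), and suppose the rb-tree happens
 * to have the finish-300 entity at the root, with the finish-200 and
 * finish-400 entities as its left and right children. Each leaf's
 * min_start is its own start, while the root's min_start is
 * min(120, 50, 90) = 50. bfq_first_active_entity() relies on exactly
 * this key to tell whether a subtree may still contain an eligible
 * entity.
 */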
942
943/**
944 * bfq_ioprio_to_weight - calc a weight from an ioprio.
945 * @ioprio: the ioprio value to convert.
946 */
947static unsigned short bfq_ioprio_to_weight(int ioprio)
948{
949 return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
950}
951
952/**
953 * bfq_weight_to_ioprio - calc an ioprio from a weight.
954 * @weight: the weight value to convert.
955 *
956 * To preserve the old ioprio-only user interface as much as possible,
957 * 0 is used as an escape ioprio value for weights (numerically) equal
958 * to or larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
959 */
960static unsigned short bfq_weight_to_ioprio(int weight)
961{
962 return max_t(int, 0,
963 IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
964}
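/*
 * For illustration: with IOPRIO_BE_NR = 8 and
 * BFQ_WEIGHT_CONVERSION_COEFF = 10, bfq_ioprio_to_weight() maps
 * ioprio 0 (highest priority) to weight 80, the default ioprio 4 to
 * weight 40, and ioprio 7 (lowest) to weight 10. In the reverse
 * direction any weight >= 80 hits the escape case described above,
 * e.g. bfq_weight_to_ioprio(90) = max(0, 80 - 90) = 0.
 */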
965
966static void bfq_get_entity(struct bfq_entity *entity)
967{
968 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
969
970 if (bfqq) {
971 bfqq->ref++;
972 bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
973 bfqq, bfqq->ref);
974 }
975}
976
977/**
978 * bfq_find_deepest - find the deepest node that an extraction can modify.
979 * @node: the node being removed.
980 *
981 * Do the first step of an extraction in an rb tree, looking for the
982 * node that will replace @node, and returning the deepest node that
983 * the following modifications to the tree can touch. If @node is the
984 * last node in the tree return %NULL.
985 */
986static struct rb_node *bfq_find_deepest(struct rb_node *node)
987{
988 struct rb_node *deepest;
989
990 if (!node->rb_right && !node->rb_left)
991 deepest = rb_parent(node);
992 else if (!node->rb_right)
993 deepest = node->rb_left;
994 else if (!node->rb_left)
995 deepest = node->rb_right;
996 else {
997 deepest = rb_next(node);
998 if (deepest->rb_right)
999 deepest = deepest->rb_right;
1000 else if (rb_parent(deepest) != node)
1001 deepest = rb_parent(deepest);
1002 }
1003
1004 return deepest;
1005}
1006
1007/**
1008 * bfq_active_extract - remove an entity from the active tree.
1009 * @st: the service_tree containing the tree.
1010 * @entity: the entity being removed.
1011 */
1012static void bfq_active_extract(struct bfq_service_tree *st,
1013 struct bfq_entity *entity)
1014{
1015 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1016 struct rb_node *node;
1017
1018 node = bfq_find_deepest(&entity->rb_node);
1019 bfq_extract(&st->active, entity);
1020
1021 if (node)
1022 bfq_update_active_tree(node);
1023
1024 if (bfqq)
1025 list_del(&bfqq->bfqq_list);
1026}
1027
1028/**
1029 * bfq_idle_insert - insert an entity into the idle tree.
1030 * @st: the service tree containing the tree.
1031 * @entity: the entity to insert.
1032 */
1033static void bfq_idle_insert(struct bfq_service_tree *st,
1034 struct bfq_entity *entity)
1035{
1036 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1037 struct bfq_entity *first_idle = st->first_idle;
1038 struct bfq_entity *last_idle = st->last_idle;
1039
1040 if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
1041 st->first_idle = entity;
1042 if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
1043 st->last_idle = entity;
1044
1045 bfq_insert(&st->idle, entity);
1046
1047 if (bfqq)
1048 list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
1049}
1050
1051/**
1052 * bfq_forget_entity - do not consider entity any longer for scheduling
1053 * @st: the service tree.
1054 * @entity: the entity being removed.
1055 * @is_in_service: true if entity is currently the in-service entity.
1056 *
1057 * Forget everything about @entity. In addition, if entity represents
1058 * a queue, and the latter is not in service, then release the service
1059 * reference to the queue (the one taken through bfq_get_entity). In
1060 * fact, in this case, there is really no more service reference to
1061 * the queue, as the latter is also outside any service tree. If,
1062 * instead, the queue is in service, then __bfq_bfqd_reset_in_service
1063 * will take care of putting the reference when the queue finally
1064 * stops being served.
1065 */
1066static void bfq_forget_entity(struct bfq_service_tree *st,
1067 struct bfq_entity *entity,
1068 bool is_in_service)
1069{
1070 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1071
1072 entity->on_st = 0;
1073 st->wsum -= entity->weight;
1074 if (bfqq && !is_in_service)
1075 bfq_put_queue(bfqq);
1076}
1077
1078/**
1079 * bfq_put_idle_entity - release the idle tree ref of an entity.
1080 * @st: service tree for the entity.
1081 * @entity: the entity being released.
1082 */
1083static void bfq_put_idle_entity(struct bfq_service_tree *st,
1084 struct bfq_entity *entity)
1085{
1086 bfq_idle_extract(st, entity);
1087 bfq_forget_entity(st, entity,
1088 entity == entity->sched_data->in_service_entity);
1089}
1090
1091/**
1092 * bfq_forget_idle - update the idle tree if necessary.
1093 * @st: the service tree to act upon.
1094 *
1095 * To preserve the global O(log N) complexity we only remove one entry here;
1096 * as the idle tree will not grow indefinitely this can be done safely.
1097 */
1098static void bfq_forget_idle(struct bfq_service_tree *st)
1099{
1100 struct bfq_entity *first_idle = st->first_idle;
1101 struct bfq_entity *last_idle = st->last_idle;
1102
1103 if (RB_EMPTY_ROOT(&st->active) && last_idle &&
1104 !bfq_gt(last_idle->finish, st->vtime)) {
1105 /*
1106 * Forget the whole idle tree, increasing the vtime past
1107 * the last finish time of idle entities.
1108 */
1109 st->vtime = last_idle->finish;
1110 }
1111
1112 if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
1113 bfq_put_idle_entity(st, first_idle);
1114}
1115
1116static struct bfq_service_tree *
1117__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
1118 struct bfq_entity *entity)
1119{
1120 struct bfq_service_tree *new_st = old_st;
1121
1122 if (entity->prio_changed) {
1123 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1124 unsigned short prev_weight, new_weight;
1125 struct bfq_data *bfqd = NULL;
1126
1127 if (bfqq)
1128 bfqd = bfqq->bfqd;
1129
1130 old_st->wsum -= entity->weight;
1131
1132 if (entity->new_weight != entity->orig_weight) {
1133 if (entity->new_weight < BFQ_MIN_WEIGHT ||
1134 entity->new_weight > BFQ_MAX_WEIGHT) {
1135 pr_crit("update_weight_prio: new_weight %d\n",
1136 entity->new_weight);
1137 if (entity->new_weight < BFQ_MIN_WEIGHT)
1138 entity->new_weight = BFQ_MIN_WEIGHT;
1139 else
1140 entity->new_weight = BFQ_MAX_WEIGHT;
1141 }
1142 entity->orig_weight = entity->new_weight;
1143 if (bfqq)
1144 bfqq->ioprio =
1145 bfq_weight_to_ioprio(entity->orig_weight);
1146 }
1147
1148 if (bfqq)
1149 bfqq->ioprio_class = bfqq->new_ioprio_class;
1150 entity->prio_changed = 0;
1151
1152 /*
1153 * NOTE: here we may be changing the weight too early,
1154 * this will cause unfairness. The correct approach
1155 * would have required additional complexity to defer
1156 * weight changes to the proper time instants (i.e.,
1157 * when entity->finish <= old_st->vtime).
1158 */
1159 new_st = bfq_entity_service_tree(entity);
1160
1161 prev_weight = entity->weight;
1162 new_weight = entity->orig_weight;
1163 entity->weight = new_weight;
1164
1165 new_st->wsum += entity->weight;
1166
1167 if (new_st != old_st)
1168 entity->start = new_st->vtime;
1169 }
1170
1171 return new_st;
1172}
1173
1174/**
1175 * bfq_bfqq_served - update the scheduler status after selection for
1176 * service.
1177 * @bfqq: the queue being served.
1178 * @served: bytes to transfer.
1179 *
1180 * NOTE: this can be optimized, as the timestamps of upper level entities
1181 * are synchronized every time a new bfqq is selected for service. For now,
1182 * we keep it this way to better check consistency.
1183 */
1184static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
1185{
1186 struct bfq_entity *entity = &bfqq->entity;
1187 struct bfq_service_tree *st;
1188
1189 for_each_entity(entity) {
1190 st = bfq_entity_service_tree(entity);
1191
1192 entity->service += served;
1193
1194 st->vtime += bfq_delta(served, st->wsum);
1195 bfq_forget_idle(st);
1196 }
1197 bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
1198}
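/*
 * A numeric example for the vtime update above (illustrative values):
 * if two queues with weights 10 and 20 are the only contributors to
 * st->wsum (wsum = 30), serving 300 sectors to either of them advances
 * the tree's vtime by bfq_delta(300, 30) = (300 << 22) / 30 = 41943040
 * virtual-time units, i.e. the global virtual clock advances at the
 * pace of the aggregate service, independently of which queue received
 * it.
 */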
1199
1200/**
1201 * bfq_bfqq_charge_full_budget - set the service to the entity budget.
1202 * @bfqq: the queue that needs a service update.
1203 *
1204 * When it's not possible to be fair in the service domain, because
1205 * a queue is not consuming its budget fast enough (the meaning of
1206 * fast depends on the timeout parameter), we charge it a full
1207 * budget. In this way we should obtain a sort of time-domain
1208 * fairness among all the seeky/slow queues.
1209 */
1210static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
1211{
1212 struct bfq_entity *entity = &bfqq->entity;
1213
1214 bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
1215
1216 bfq_bfqq_served(bfqq, entity->budget - entity->service);
1217}
1218
1219/**
1220 * __bfq_activate_entity - activate an entity.
1221 * @entity: the entity being activated.
1222 * @non_blocking_wait_rq: true if this entity was waiting for a request
1223 *
1224 * Called whenever an entity is activated, i.e., it is not active and one
1225 * of its children receives a new request, or has to be reactivated due to
1226 * budget exhaustion. It uses the entity's current budget (and the
1227 * service it has already received, if @entity is active) to calculate
1228 * its timestamps.
1229 */
1230static void __bfq_activate_entity(struct bfq_entity *entity,
1231 bool non_blocking_wait_rq)
1232{
1233 struct bfq_sched_data *sd = entity->sched_data;
1234 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1235 bool backshifted = false;
1236
1237 if (entity == sd->in_service_entity) {
1238 /*
1239 * If we are requeueing the current entity we have
1240 * to take care of not charging to it service it has
1241 * not received.
1242 */
1243 bfq_calc_finish(entity, entity->service);
1244 entity->start = entity->finish;
1245 sd->in_service_entity = NULL;
1246 } else if (entity->tree == &st->active) {
1247 /*
1248 * Requeueing an entity due to a change of some
1249 * next_in_service entity below it. We reuse the
1250 * old start time.
1251 */
1252 bfq_active_extract(st, entity);
1253 } else {
1254 unsigned long long min_vstart;
1255
1256 /* See comments on bfq_bfqq_update_budg_for_activation */
1257 if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
1258 backshifted = true;
1259 min_vstart = entity->finish;
1260 } else
1261 min_vstart = st->vtime;
1262
1263 if (entity->tree == &st->idle) {
1264 /*
1265 * Must be on the idle tree, bfq_idle_extract() will
1266 * check for that.
1267 */
1268 bfq_idle_extract(st, entity);
1269 entity->start = bfq_gt(min_vstart, entity->finish) ?
1270 min_vstart : entity->finish;
1271 } else {
1272 /*
1273 * The finish time of the entity may be invalid, and
1274 * it is in the past for sure, otherwise the queue
1275 * would have been on the idle tree.
1276 */
1277 entity->start = min_vstart;
1278 st->wsum += entity->weight;
1279 /*
1280 * entity is about to be inserted into a service tree,
1281 * and then set in service: get a reference to make
1282 * sure entity does not disappear until it is no
1283 * longer in service or scheduled for service.
1284 */
1285 bfq_get_entity(entity);
1286
1287 entity->on_st = 1;
1288 }
1289 }
1290
1291 st = __bfq_entity_update_weight_prio(st, entity);
1292 bfq_calc_finish(entity, entity->budget);
1293
1294 /*
1295 * If some queues enjoy backshifting for a while, then their
1296 * (virtual) finish timestamps may happen to become lower and
1297 * lower than the system virtual time. In particular, if
1298 * these queues often happen to be idle for short time
1299 * periods, and during such time periods other queues with
1300 * higher timestamps happen to be busy, then the backshifted
1301 * timestamps of the former queues can become much lower than
1302 * the system virtual time. In fact, to serve the queues with
1303 * higher timestamps while the ones with lower timestamps are
1304 * idle, the system virtual time may be pushed-up to much
1305 * higher values than the finish timestamps of the idle
1306 * queues. As a consequence, the finish timestamps of all new
1307 * or newly activated queues may end up being much larger than
1308 * those of lucky queues with backshifted timestamps. The
1309 * latter queues may then monopolize the device for a lot of
1310 * time. This would simply break service guarantees.
1311 *
1312 * To reduce this problem, push up a little bit the
1313 * backshifted timestamps of the queue associated with this
1314 * entity (only a queue can happen to have the backshifted
1315 * flag set): just enough to let the finish timestamp of the
1316 * queue be equal to the current value of the system virtual
1317 * time. This may introduce a little unfairness among queues
1318 * with backshifted timestamps, but it does not break
1319 * worst-case fairness guarantees.
1320 */
1321 if (backshifted && bfq_gt(st->vtime, entity->finish)) {
1322 unsigned long delta = st->vtime - entity->finish;
1323
1324 entity->start += delta;
1325 entity->finish += delta;
1326 }
1327
1328 bfq_active_insert(st, entity);
1329}
1330
1331/**
1332 * bfq_activate_entity - activate an entity and its ancestors if necessary.
1333 * @entity: the entity to activate.
1334 * @non_blocking_wait_rq: true if this entity was waiting for a request
1335 *
1336 * Activate @entity and all the entities on the path from it to the root.
1337 */
1338static void bfq_activate_entity(struct bfq_entity *entity,
1339 bool non_blocking_wait_rq)
1340{
1341 struct bfq_sched_data *sd;
1342
1343 for_each_entity(entity) {
1344 __bfq_activate_entity(entity, non_blocking_wait_rq);
1345
1346 sd = entity->sched_data;
1347 if (!bfq_update_next_in_service(sd))
1348 /*
1349 * No need to propagate the activation to the
1350 * upper entities, as they will be updated when
1351 * the in-service entity is rescheduled.
1352 */
1353 break;
1354 }
1355}
1356
1357/**
1358 * __bfq_deactivate_entity - deactivate an entity from its service tree.
1359 * @entity: the entity to deactivate.
1360 * @requeue: if false, the entity will not be put into the idle tree.
1361 *
1362 * Deactivate an entity, independently of its previous state. If the
1363 * entity was not on a service tree, just return; otherwise, extract it
1364 * from the tree it is on and, if the caller specified @requeue and the
1365 * entity's finish time is still ahead of the tree's vtime, put it on
 * the idle tree.
1366 *
1367 * Return %1 if the caller should update the entity hierarchy, i.e.,
1368 * if the entity was in service or if it was the next_in_service for
1369 * its sched_data; return %0 otherwise.
1370 */
1371static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
1372{
1373 struct bfq_sched_data *sd = entity->sched_data;
1374 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1375 int is_in_service = entity == sd->in_service_entity;
1376 int ret = 0;
1377
1378 if (!entity->on_st)
1379 return 0;
1380
1381 if (is_in_service) {
1382 bfq_calc_finish(entity, entity->service);
1383 sd->in_service_entity = NULL;
1384 } else if (entity->tree == &st->active)
1385 bfq_active_extract(st, entity);
1386 else if (entity->tree == &st->idle)
1387 bfq_idle_extract(st, entity);
1388
1389 if (is_in_service || sd->next_in_service == entity)
1390 ret = bfq_update_next_in_service(sd);
1391
1392 if (!requeue || !bfq_gt(entity->finish, st->vtime))
1393 bfq_forget_entity(st, entity, is_in_service);
1394 else
1395 bfq_idle_insert(st, entity);
1396
1397 return ret;
1398}
1399
1400/**
1401 * bfq_deactivate_entity - deactivate an entity.
1402 * @entity: the entity to deactivate.
1403 * @requeue: true if the entity can be put on the idle tree
1404 */
1405static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
1406{
1407 struct bfq_sched_data *sd;
1408 struct bfq_entity *parent = NULL;
1409
1410 for_each_entity_safe(entity, parent) {
1411 sd = entity->sched_data;
1412
1413 if (!__bfq_deactivate_entity(entity, requeue))
1414 /*
1415 * The parent entity is still backlogged, and
1416 * we don't need to update it as it is still
1417 * in service.
1418 */
1419 break;
1420
1421 if (sd->next_in_service)
1422 /*
1423 * The parent entity is still backlogged and
1424 * the budgets on the path towards the root
1425 * need to be updated.
1426 */
1427 goto update;
1428
1429 /*
1430 * If we get here, then the parent is no longer backlogged and
1431 * we want to propagate the deactivation upwards.
1432 */
1433 requeue = 1;
1434 }
1435
1436 return;
1437
1438update:
1439 entity = parent;
1440 for_each_entity(entity) {
1441 __bfq_activate_entity(entity, false);
1442
1443 sd = entity->sched_data;
1444 if (!bfq_update_next_in_service(sd))
1445 break;
1446 }
1447}
1448
1449/**
1450 * bfq_update_vtime - update vtime if necessary.
1451 * @st: the service tree to act upon.
1452 *
1453 * If necessary update the service tree vtime to have at least one
1454 * eligible entity, skipping to its start time. Assumes that the
1455 * active tree of the device is not empty.
1456 *
1457 * NOTE: this hierarchical implementation updates vtimes quite often,
1458 * we may end up with reactivated processes getting timestamps after a
1459 * vtime skip done because we needed a ->first_active entity on some
1460 * intermediate node.
1461 */
1462static void bfq_update_vtime(struct bfq_service_tree *st)
1463{
1464 struct bfq_entity *entry;
1465 struct rb_node *node = st->active.rb_node;
1466
1467 entry = rb_entry(node, struct bfq_entity, rb_node);
1468 if (bfq_gt(entry->min_start, st->vtime)) {
1469 st->vtime = entry->min_start;
1470 bfq_forget_idle(st);
1471 }
1472}
1473
1474/**
1475 * bfq_first_active_entity - find the eligible entity with
1476 * the smallest finish time
1477 * @st: the service tree to select from.
1478 *
1479 * This function searches for the first schedulable entity, starting from
1480 * the root of the tree and going to the left whenever the left subtree
1481 * contains at least one eligible (start <= vtime) entity. The path on
1482 * the right is followed only if a) the left subtree contains no eligible
1483 * entities and b) no eligible entity has been found yet.
1484 */
1485static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
1486{
1487 struct bfq_entity *entry, *first = NULL;
1488 struct rb_node *node = st->active.rb_node;
1489
1490 while (node) {
1491 entry = rb_entry(node, struct bfq_entity, rb_node);
1492left:
1493 if (!bfq_gt(entry->start, st->vtime))
1494 first = entry;
1495
1496 if (node->rb_left) {
1497 entry = rb_entry(node->rb_left,
1498 struct bfq_entity, rb_node);
1499 if (!bfq_gt(entry->min_start, st->vtime)) {
1500 node = node->rb_left;
1501 goto left;
1502 }
1503 }
1504 if (first)
1505 break;
1506 node = node->rb_right;
1507 }
1508
1509 return first;
1510}
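/*
 * A walk-through of the search above, reusing the example given next
 * to bfq_active_insert(): entities with (start, finish) = (120, 300)
 * at the root, (50, 200) as left child, (90, 400) as right child, and
 * st->vtime = 100. At the root, start = 120 > vtime, so the root is
 * not recorded; the left child's min_start = 50 <= vtime, so the walk
 * descends left. There start = 50 <= vtime, so that entity is recorded
 * as first and, since it has no left child, the loop breaks: the
 * finish-200 entity is returned, i.e. the eligible entity with the
 * smallest finish time.
 */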
1511
1512/**
1513 * __bfq_lookup_next_entity - return the first eligible entity in @st.
1514 * @st: the service tree.
1515 *
1516 * Update the virtual time in @st and return the first eligible entity
1517 * it contains.
1518 */
1519static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
1520 bool force)
1521{
1522 struct bfq_entity *entity, *new_next_in_service = NULL;
1523
1524 if (RB_EMPTY_ROOT(&st->active))
1525 return NULL;
1526
1527 bfq_update_vtime(st);
1528 entity = bfq_first_active_entity(st);
1529
1530 /*
1531 * If the chosen entity does not match the sched_data's
1532 * next_in_service and we are forcibly serving the IDLE priority
1533 * class tree, bubble up the budget update.
1534 */
1535 if (unlikely(force && entity != entity->sched_data->next_in_service)) {
1536 new_next_in_service = entity;
1537 for_each_entity(new_next_in_service)
1538 bfq_update_budget(new_next_in_service);
1539 }
1540
1541 return entity;
1542}
1543
1544/**
1545 * bfq_lookup_next_entity - return the first eligible entity in @sd.
1546 * @sd: the sched_data.
1547 * @extract: if true the returned entity will be also extracted from @sd.
1548 *
1549 * NOTE: since we cache the next_in_service entity at each level of the
1550 * hierarchy, the complexity of the lookup can be decreased with
1551 * absolutely no effort by just returning the cached next_in_service value;
1552 * we prefer to do full lookups to test the consistency of the data
1553 * structures.
1554 */
1555static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
1556 int extract,
1557 struct bfq_data *bfqd)
1558{
1559 struct bfq_service_tree *st = sd->service_tree;
1560 struct bfq_entity *entity;
1561 int i = 0;
1562
1563 /*
1564 * Choose from idle class, if needed to guarantee a minimum
1565 * bandwidth to this class. This should also mitigate
1566 * priority-inversion problems in case a low priority task is
1567 * holding file system resources.
1568 */
1569 if (bfqd &&
1570 jiffies - bfqd->bfq_class_idle_last_service >
1571 BFQ_CL_IDLE_TIMEOUT) {
1572 entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
1573 true);
1574 if (entity) {
1575 i = BFQ_IOPRIO_CLASSES - 1;
1576 bfqd->bfq_class_idle_last_service = jiffies;
1577 sd->next_in_service = entity;
1578 }
1579 }
1580 for (; i < BFQ_IOPRIO_CLASSES; i++) {
1581 entity = __bfq_lookup_next_entity(st + i, false);
1582 if (entity) {
1583 if (extract) {
1584 bfq_check_next_in_service(sd, entity);
1585 bfq_active_extract(st + i, entity);
1586 sd->in_service_entity = entity;
1587 sd->next_in_service = NULL;
1588 }
1589 break;
1590 }
1591 }
1592
1593 return entity;
1594}
1595
1596static bool next_queue_may_preempt(struct bfq_data *bfqd)
1597{
1598 struct bfq_sched_data *sd = &bfqd->sched_data;
1599
1600 return sd->next_in_service != sd->in_service_entity;
1601}
1602
1603
1604/*
1605 * Get next queue for service.
1606 */
1607static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
1608{
1609 struct bfq_entity *entity = NULL;
1610 struct bfq_sched_data *sd;
1611 struct bfq_queue *bfqq;
1612
1613 if (bfqd->busy_queues == 0)
1614 return NULL;
1615
1616 sd = &bfqd->sched_data;
1617 for (; sd ; sd = entity->my_sched_data) {
1618 entity = bfq_lookup_next_entity(sd, 1, bfqd);
1619 entity->service = 0;
1620 }
1621
1622 bfqq = bfq_entity_to_bfqq(entity);
1623
1624 return bfqq;
1625}
1626
1627static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
1628{
1629 struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
1630 struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
1631
1632 if (bfqd->in_service_bic) {
1633 put_io_context(bfqd->in_service_bic->icq.ioc);
1634 bfqd->in_service_bic = NULL;
1635 }
1636
1637 bfq_clear_bfqq_wait_request(in_serv_bfqq);
1638 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
1639 bfqd->in_service_queue = NULL;
1640
1641 /*
1642 * in_serv_entity is no longer in service, so, if it is in no
1643 * service tree either, then release the service reference to
1644 * the queue it represents (taken with bfq_get_entity).
1645 */
1646 if (!in_serv_entity->on_st)
1647 bfq_put_queue(in_serv_bfqq);
1648}
1649
1650static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1651 int requeue)
1652{
1653 struct bfq_entity *entity = &bfqq->entity;
1654
1655 bfq_deactivate_entity(entity, requeue);
1656}
1657
1658static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1659{
1660 struct bfq_entity *entity = &bfqq->entity;
1661
1662 bfq_activate_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq));
1663 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
1664}
1665
1666/*
1667 * Called when the bfqq no longer has requests pending, remove it from
1668 * the service tree.
1669 */
1670static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1671 int requeue)
1672{
1673 bfq_log_bfqq(bfqd, bfqq, "del from busy");
1674
1675 bfq_clear_bfqq_busy(bfqq);
1676
1677 bfqd->busy_queues--;
1678
1679 bfq_deactivate_bfqq(bfqd, bfqq, requeue);
1680}
1681
1682/*
1683 * Called when an inactive queue receives a new request.
1684 */
1685static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1686{
1687 bfq_log_bfqq(bfqd, bfqq, "add to busy");
1688
1689 bfq_activate_bfqq(bfqd, bfqq);
1690
1691 bfq_mark_bfqq_busy(bfqq);
1692 bfqd->busy_queues++;
1693}
1694
1695static void bfq_init_entity(struct bfq_entity *entity)
1696{
1697 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1698
1699 entity->weight = entity->new_weight;
1700 entity->orig_weight = entity->new_weight;
1701
1702 bfqq->ioprio = bfqq->new_ioprio;
1703 bfqq->ioprio_class = bfqq->new_ioprio_class;
1704
1705 entity->sched_data = &bfqq->bfqd->sched_data;
1706}
1707
1708#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
1709#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
1710
1711#define bfq_sample_valid(samples) ((samples) > 80)
1712
1713/*
1714 * Scheduler run of queue, if there are requests pending and no one in the
1715 * driver that will restart queueing.
1716 */
1717static void bfq_schedule_dispatch(struct bfq_data *bfqd)
1718{
1719 if (bfqd->queued != 0) {
1720 bfq_log(bfqd, "schedule dispatch");
1721 blk_mq_run_hw_queues(bfqd->queue, true);
1722 }
1723}
1724
1725/*
1726 * Lifted from AS - choose which of rq1 and rq2 is best served now.
1727 * We choose the request that is closest to the head right now. Distance
1728 * behind the head is penalized and only allowed to a certain extent.
1729 */
1730static struct request *bfq_choose_req(struct bfq_data *bfqd,
1731 struct request *rq1,
1732 struct request *rq2,
1733 sector_t last)
1734{
1735 sector_t s1, s2, d1 = 0, d2 = 0;
1736 unsigned long back_max;
1737#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1738#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1739 unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
1740
1741 if (!rq1 || rq1 == rq2)
1742 return rq2;
1743 if (!rq2)
1744 return rq1;
1745
1746 if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1747 return rq1;
1748 else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1749 return rq2;
1750 if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
1751 return rq1;
1752 else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
1753 return rq2;
1754
1755 s1 = blk_rq_pos(rq1);
1756 s2 = blk_rq_pos(rq2);
1757
1758 /*
1759 * By definition, 1KiB is 2 sectors.
1760 */
1761 back_max = bfqd->bfq_back_max * 2;
1762
1763 /*
1764 * Strict one way elevator _except_ in the case where we allow
1765 * short backward seeks which are biased as twice the cost of a
1766 * similar forward seek.
1767 */
1768 if (s1 >= last)
1769 d1 = s1 - last;
1770 else if (s1 + back_max >= last)
1771 d1 = (last - s1) * bfqd->bfq_back_penalty;
1772 else
1773 wrap |= BFQ_RQ1_WRAP;
1774
1775 if (s2 >= last)
1776 d2 = s2 - last;
1777 else if (s2 + back_max >= last)
1778 d2 = (last - s2) * bfqd->bfq_back_penalty;
1779 else
1780 wrap |= BFQ_RQ2_WRAP;
1781
1782 /* Found required data */
1783
1784 /*
1785 * By doing switch() on the bit mask "wrap" we avoid having to
1786 * check two variables for all permutations: --> faster!
1787 */
1788 switch (wrap) {
1789 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1790 if (d1 < d2)
1791 return rq1;
1792 else if (d2 < d1)
1793 return rq2;
1794
1795 if (s1 >= s2)
1796 return rq1;
1797 else
1798 return rq2;
1799
1800 case BFQ_RQ2_WRAP:
1801 return rq1;
1802 case BFQ_RQ1_WRAP:
1803 return rq2;
1804 case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */
1805 default:
1806 /*
1807 * Since both rqs are wrapped,
1808 * start with the one that's further behind head
1809 * (--> only *one* back seek required),
1810 * since back seek takes more time than forward.
1811 */
1812 if (s1 <= s2)
1813 return rq1;
1814 else
1815 return rq2;
1816 }
1817}
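
/*
 * Illustrative sketch only, kept out of the build: the distance logic
 * of bfq_choose_req reduced to plain sector arithmetic. The sync/META
 * tie-breaks are omitted, and back_max_kb and back_penalty stand in
 * for bfqd->bfq_back_max and bfqd->bfq_back_penalty.
 */
#if 0
static int bfq_sketch_closest_sector(unsigned long long s1,
				     unsigned long long s2,
				     unsigned long long last,
				     unsigned long back_max_kb,
				     unsigned long back_penalty)
{
	unsigned long long d1 = 0, d2 = 0, back_max = back_max_kb * 2;
	unsigned int wrap = 0;	/* bit 0: s1 wraps, bit 1: s2 wraps */

	if (s1 >= last)
		d1 = s1 - last;			 /* forward seek */
	else if (s1 + back_max >= last)
		d1 = (last - s1) * back_penalty; /* short, penalized back seek */
	else
		wrap |= 1;			 /* too far behind the head */

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2 + back_max >= last)
		d2 = (last - s2) * back_penalty;
	else
		wrap |= 2;

	switch (wrap) {
	case 0:		/* neither wraps: closest, then highest, wins */
		return (d1 < d2 || (d1 == d2 && s1 >= s2)) ? 1 : 2;
	case 2:		/* only s2 wraps */
		return 1;
	case 1:		/* only s1 wraps */
		return 2;
	default:	/* both wrap: the one further behind wins */
		return s1 <= s2 ? 1 : 2;
	}
}
#endif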
1818
1819/*
1820 * Return expired entry, or NULL to just start from scratch in rbtree.
1821 */
1822static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
1823 struct request *last)
1824{
1825 struct request *rq;
1826
1827 if (bfq_bfqq_fifo_expire(bfqq))
1828 return NULL;
1829
1830 bfq_mark_bfqq_fifo_expire(bfqq);
1831
1832 rq = rq_entry_fifo(bfqq->fifo.next);
1833
1834 if (rq == last || ktime_get_ns() < rq->fifo_time)
1835 return NULL;
1836
1837 bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
1838 return rq;
1839}
1840
1841static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1842 struct bfq_queue *bfqq,
1843 struct request *last)
1844{
1845 struct rb_node *rbnext = rb_next(&last->rb_node);
1846 struct rb_node *rbprev = rb_prev(&last->rb_node);
1847 struct request *next, *prev = NULL;
1848
1849 /* Follow expired path, else get first next available. */
1850 next = bfq_check_fifo(bfqq, last);
1851 if (next)
1852 return next;
1853
1854 if (rbprev)
1855 prev = rb_entry_rq(rbprev);
1856
1857 if (rbnext)
1858 next = rb_entry_rq(rbnext);
1859 else {
1860 rbnext = rb_first(&bfqq->sort_list);
1861 if (rbnext && rbnext != &last->rb_node)
1862 next = rb_entry_rq(rbnext);
1863 }
1864
1865 return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
1866}
1867
1868static unsigned long bfq_serv_to_charge(struct request *rq,
1869 struct bfq_queue *bfqq)
1870{
1871 return blk_rq_sectors(rq);
1872}
1873
1874/**
1875 * bfq_updated_next_req - update the queue after a new next_rq selection.
1876 * @bfqd: the device data the queue belongs to.
1877 * @bfqq: the queue to update.
1878 *
1879 * If the first request of a queue changes we make sure that the queue
1880 * has enough budget to serve at least its first request (if the
1881 * request has grown). We do this because if the queue does not have enough
1882 * budget for its first request, it has to go through two dispatch
1883 * rounds to actually get it dispatched.
1884 */
1885static void bfq_updated_next_req(struct bfq_data *bfqd,
1886 struct bfq_queue *bfqq)
1887{
1888 struct bfq_entity *entity = &bfqq->entity;
1889 struct request *next_rq = bfqq->next_rq;
1890 unsigned long new_budget;
1891
1892 if (!next_rq)
1893 return;
1894
1895 if (bfqq == bfqd->in_service_queue)
1896 /*
1897 * In order not to break guarantees, budgets cannot be
1898 * changed after an entity has been selected.
1899 */
1900 return;
1901
1902 new_budget = max_t(unsigned long, bfqq->max_budget,
1903 bfq_serv_to_charge(next_rq, bfqq));
1904 if (entity->budget != new_budget) {
1905 entity->budget = new_budget;
1906 bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
1907 new_budget);
1908 bfq_activate_bfqq(bfqd, bfqq);
1909 }
1910}
1911
1912static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
1913{
1914 struct bfq_entity *entity = &bfqq->entity;
1915
1916 return entity->budget - entity->service;
1917}
1918
1919/*
1920 * If enough samples have been computed, return the current max budget
1921 * stored in bfqd, which is dynamically updated according to the
1922 * estimated disk peak rate; otherwise return the default max budget
1923 */
1924static int bfq_max_budget(struct bfq_data *bfqd)
1925{
1926 if (bfqd->budgets_assigned < bfq_stats_min_budgets)
1927 return bfq_default_max_budget;
1928 else
1929 return bfqd->bfq_max_budget;
1930}
1931
1932/*
1933 * Return min budget, which is a fraction of the current or default
1934 * max budget (trying with 1/32)
1935 */
1936static int bfq_min_budget(struct bfq_data *bfqd)
1937{
1938 if (bfqd->budgets_assigned < bfq_stats_min_budgets)
1939 return bfq_default_max_budget / 32;
1940 else
1941 return bfqd->bfq_max_budget / 32;
1942}
1943
1944static void bfq_bfqq_expire(struct bfq_data *bfqd,
1945 struct bfq_queue *bfqq,
1946 bool compensate,
1947 enum bfqq_expiration reason);
1948
1949/*
1950 * The next function, invoked after the input queue bfqq switches from
1951 * idle to busy, updates the budget of bfqq. The function also tells
1952 * whether the in-service queue should be expired, by returning
1953 * true. The purpose of expiring the in-service queue is to give bfqq
1954 * the chance to possibly preempt the in-service queue, and the reason
1955 * for preempting the in-service queue is to achieve the following
1956 * goal: guarantee to bfqq its reserved bandwidth even if bfqq has
1957 * expired because it has remained idle.
1958 *
1959 * In particular, bfqq may have expired for one of the following two
1960 * reasons:
1961 *
1962 * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
1963 * and did not make it to issue a new request before its last
1964 * request was served;
1965 *
1966 * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
1967 * a new request before the expiration of the idling-time.
1968 *
1969 * Even if bfqq has expired for one of the above reasons, the process
1970 * associated with the queue may however be issuing requests greedily,
1971 * and thus be sensitive to the bandwidth it receives (bfqq may have
1972 * remained idle for other reasons: CPU high load, bfqq not enjoying
1973 * idling, I/O throttling somewhere in the path from the process to
1974 * the I/O scheduler, ...). But if, after every expiration for one of
1975 * the above two reasons, bfqq has to wait for the service of at least
1976 * one full budget of another queue before being served again, then
1977 * bfqq is likely to get a much lower bandwidth or resource time than
1978 * its reserved ones. To address this issue, two countermeasures need
1979 * to be taken.
1980 *
1981 * First, the budget and the timestamps of bfqq need to be updated in
1982 * a special way on bfqq reactivation: they need to be updated as if
1983 * bfqq did not remain idle and did not expire. In fact, if they are
1984 * computed as if bfqq expired and remained idle until reactivation,
1985 * then the process associated with bfqq is treated as if, instead of
1986 * being greedy, it stopped issuing requests when bfqq remained idle,
1987 * and restarts issuing requests only on this reactivation. In other
1988 * words, the scheduler does not help the process recover the "service
1989 * hole" between bfqq expiration and reactivation. As a consequence,
1990 * the process receives a lower bandwidth than its reserved one. In
1991 * contrast, to recover this hole, the budget must be updated as if
1992 * bfqq was not expired at all before this reactivation, i.e., it must
1993 * be set to the value of the remaining budget when bfqq was
1994 * expired. Along the same line, timestamps need to be assigned the
1995 * value they had the last time bfqq was selected for service, i.e.,
1996 * before last expiration. Thus timestamps need to be back-shifted
1997 * with respect to their normal computation (see [1] for more details
1998 * on this tricky aspect).
1999 *
2000 * Secondly, to allow the process to recover the hole, the in-service
2001 * queue must be expired too, to give bfqq the chance to preempt it
2002 * immediately. In fact, if bfqq has to wait for a full budget of the
2003 * in-service queue to be completed, then it may become impossible to
2004 * let the process recover the hole, even if the back-shifted
2005 * timestamps of bfqq are lower than those of the in-service queue. If
2006 * this happens for most or all of the holes, then the process may not
2007 * receive its reserved bandwidth. In this respect, it is worth noting
2008 * that, since the service of outstanding requests is not preemptible, a
2009 * small fraction of the holes may nevertheless be unrecoverable, thereby
2010 * causing a small loss of bandwidth.
2011 *
2012 * The last important point is detecting whether bfqq does need this
2013 * bandwidth recovery. In this respect, the next function deems the
2014 * process associated with bfqq greedy, and thus allows it to recover
2015 * the hole, if: 1) the process is waiting for the arrival of a new
2016 * request (which implies that bfqq expired for one of the above two
2017 * reasons), and 2) such a request has arrived soon. The first
2018 * condition is controlled through the flag non_blocking_wait_rq,
2019 * while the second through the flag arrived_in_time. If both
2020 * conditions hold, then the function computes the budget in the
2021 * above-described special way, and signals that the in-service queue
2022 * should be expired. Timestamp back-shifting is done later in
2023 * __bfq_activate_entity.
2024 */
2025static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
2026 struct bfq_queue *bfqq,
2027 bool arrived_in_time)
2028{
2029 struct bfq_entity *entity = &bfqq->entity;
2030
2031 if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
2032 /*
2033 * We do not clear the flag non_blocking_wait_rq here, as
2034 * the latter is used in bfq_activate_bfqq to signal
2035 * that timestamps need to be back-shifted (and is
2036 * cleared right after).
2037 */
2038
2039 /*
2040 * In the next assignment we rely on the fact that neither
2041 * entity->service nor entity->budget is updated
2042 * on expiration if bfqq is empty (see
2043 * __bfq_bfqq_recalc_budget). Thus both quantities
2044 * remain unchanged after such an expiration, and the
2045 * following statement therefore assigns to
2046 * entity->budget the remaining budget on such an
2047 * expiration. For clarity, entity->service is not
2048 * updated on expiration in any case, and, in normal
2049 * operation, is reset only when bfqq is selected for
2050 * service (see bfq_get_next_queue).
2051 */
2052 entity->budget = min_t(unsigned long,
2053 bfq_bfqq_budget_left(bfqq),
2054 bfqq->max_budget);
2055
2056 return true;
2057 }
2058
2059 entity->budget = max_t(unsigned long, bfqq->max_budget,
2060 bfq_serv_to_charge(bfqq->next_rq, bfqq));
2061 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
2062 return false;
2063}
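
/*
 * Illustrative sketch only, kept out of the build: the two budget
 * assignments in bfq_bfqq_update_budg_for_activation on plain
 * integers. Here 'remaining' stands for bfq_bfqq_budget_left(bfqq)
 * and 'next_charge' for bfq_serv_to_charge(bfqq->next_rq, bfqq).
 */
#if 0
static int bfq_sketch_budget_for_activation(int was_non_blocking_wait,
					    int arrived_in_time,
					    unsigned long remaining,
					    unsigned long max_budget,
					    unsigned long next_charge,
					    unsigned long *budget)
{
	if (was_non_blocking_wait && arrived_in_time) {
		/* back-shifting case: keep the budget left at expiration */
		*budget = remaining < max_budget ? remaining : max_budget;
		return 1;	/* the in-service queue may be preempted */
	}

	/* plain activation: make room for at least the next request */
	*budget = max_budget > next_charge ? max_budget : next_charge;
	return 0;
}
#endif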
2064
2065static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
2066 struct bfq_queue *bfqq,
2067 struct request *rq)
2068{
2069 bool bfqq_wants_to_preempt,
2070 /*
2071 * See the comments on
2072 * bfq_bfqq_update_budg_for_activation for
2073 * details on the usage of the next variable.
2074 */
2075 arrived_in_time = ktime_get_ns() <=
2076 bfqq->ttime.last_end_request +
2077 bfqd->bfq_slice_idle * 3;
2078
2079 /*
2080 * Update budget and check whether bfqq may want to preempt
2081 * the in-service queue.
2082 */
2083 bfqq_wants_to_preempt =
2084 bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
2085 arrived_in_time);
2086
2087 if (!bfq_bfqq_IO_bound(bfqq)) {
2088 if (arrived_in_time) {
2089 bfqq->requests_within_timer++;
2090 if (bfqq->requests_within_timer >=
2091 bfqd->bfq_requests_within_timer)
2092 bfq_mark_bfqq_IO_bound(bfqq);
2093 } else
2094 bfqq->requests_within_timer = 0;
2095 }
2096
2097 bfq_add_bfqq_busy(bfqd, bfqq);
2098
2099 /*
2100 * Expire in-service queue only if preemption may be needed
2101 * for guarantees. In this respect, the function
2102 * next_queue_may_preempt just checks a simple, necessary
2103 * condition, and not a sufficient condition based on
2104 * timestamps. In fact, for the latter condition to be
2105 * evaluated, timestamps would need first to be updated, and
2106 * this operation is quite costly (see the comments on the
2107 * function bfq_bfqq_update_budg_for_activation).
2108 */
2109 if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
2110 next_queue_may_preempt(bfqd))
2111 bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
2112 false, BFQQE_PREEMPTED);
2113}
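
/*
 * Illustrative sketch only, kept out of the build: the arrival-in-time
 * test used above, on plain nanosecond values; slice_idle_ns stands
 * for bfqd->bfq_slice_idle.
 */
#if 0
static int bfq_sketch_arrived_in_time(unsigned long long now_ns,
				      unsigned long long last_end_request_ns,
				      unsigned long long slice_idle_ns)
{
	/* request arrived within three idle slices of the last completion */
	return now_ns <= last_end_request_ns + 3 * slice_idle_ns;
}
#endif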
2114
2115static void bfq_add_request(struct request *rq)
2116{
2117 struct bfq_queue *bfqq = RQ_BFQQ(rq);
2118 struct bfq_data *bfqd = bfqq->bfqd;
2119 struct request *next_rq, *prev;
2120
2121 bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
2122 bfqq->queued[rq_is_sync(rq)]++;
2123 bfqd->queued++;
2124
2125 elv_rb_add(&bfqq->sort_list, rq);
2126
2127 /*
2128 * Check if this request is a better next-serve candidate.
2129 */
2130 prev = bfqq->next_rq;
2131 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
2132 bfqq->next_rq = next_rq;
2133
2134 if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
2135 bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, rq);
2136 else if (prev != bfqq->next_rq)
2137 bfq_updated_next_req(bfqd, bfqq);
2138}
2139
2140static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
2141 struct bio *bio,
2142 struct request_queue *q)
2143{
2144 struct bfq_queue *bfqq = bfqd->bio_bfqq;
2145
2146
2147 if (bfqq)
2148 return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
2149
2150 return NULL;
2151}
2152
2153#if 0 /* Still not clear if we can do without next two functions */
2154static void bfq_activate_request(struct request_queue *q, struct request *rq)
2155{
2156 struct bfq_data *bfqd = q->elevator->elevator_data;
2157
2158 bfqd->rq_in_driver++;
2159 bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
2160 bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
2161 (unsigned long long)bfqd->last_position);
2162}
2163
2164static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
2165{
2166 struct bfq_data *bfqd = q->elevator->elevator_data;
2167
2168 bfqd->rq_in_driver--;
2169}
2170#endif
2171
2172static void bfq_remove_request(struct request_queue *q,
2173 struct request *rq)
2174{
2175 struct bfq_queue *bfqq = RQ_BFQQ(rq);
2176 struct bfq_data *bfqd = bfqq->bfqd;
2177 const int sync = rq_is_sync(rq);
2178
2179 if (bfqq->next_rq == rq) {
2180 bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
2181 bfq_updated_next_req(bfqd, bfqq);
2182 }
2183
2184 if (rq->queuelist.prev != &rq->queuelist)
2185 list_del_init(&rq->queuelist);
2186 bfqq->queued[sync]--;
2187 bfqd->queued--;
2188 elv_rb_del(&bfqq->sort_list, rq);
2189
2190 elv_rqhash_del(q, rq);
2191 if (q->last_merge == rq)
2192 q->last_merge = NULL;
2193
2194 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2195 bfqq->next_rq = NULL;
2196
2197 if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
2198 bfq_del_bfqq_busy(bfqd, bfqq, 1);
2199 /*
2200 * bfqq emptied. In normal operation, when
2201 * bfqq is empty, bfqq->entity.service and
2202 * bfqq->entity.budget must contain,
2203 * respectively, the service received and the
2204 * budget used last time bfqq emptied. These
2205 * facts do not hold in this case, as at least
2206 * this last removal occurred while bfqq is
2207 * not in service. To avoid inconsistencies,
2208 * reset both bfqq->entity.service and
2209 * bfqq->entity.budget, if bfqq has still a
2210 * process that may issue I/O requests to it.
2211 */
2212 bfqq->entity.budget = bfqq->entity.service = 0;
2213 }
2214 }
2215
2216 if (rq->cmd_flags & REQ_META)
2217 bfqq->meta_pending--;
2218}
2219
2220static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
2221{
2222 struct request_queue *q = hctx->queue;
2223 struct bfq_data *bfqd = q->elevator->elevator_data;
2224 struct request *free = NULL;
2225 /*
2226 * bfq_bic_lookup grabs the queue_lock: invoke it now and
2227 * store its return value for later use, to avoid nesting
2228 * queue_lock inside the bfqd->lock. We assume that the bic
2229 * returned by bfq_bic_lookup does not go away before
2230 * bfqd->lock is taken.
2231 */
2232 struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
2233 bool ret;
2234
2235 spin_lock_irq(&bfqd->lock);
2236
2237 if (bic)
2238 bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
2239 else
2240 bfqd->bio_bfqq = NULL;
2241 bfqd->bio_bic = bic;
2242
2243 ret = blk_mq_sched_try_merge(q, bio, &free);
2244
2245 if (free)
2246 blk_mq_free_request(free);
2247 spin_unlock_irq(&bfqd->lock);
2248
2249 return ret;
2250}
2251
2252static int bfq_request_merge(struct request_queue *q, struct request **req,
2253 struct bio *bio)
2254{
2255 struct bfq_data *bfqd = q->elevator->elevator_data;
2256 struct request *__rq;
2257
2258 __rq = bfq_find_rq_fmerge(bfqd, bio, q);
2259 if (__rq && elv_bio_merge_ok(__rq, bio)) {
2260 *req = __rq;
2261 return ELEVATOR_FRONT_MERGE;
2262 }
2263
2264 return ELEVATOR_NO_MERGE;
2265}
2266
2267static void bfq_request_merged(struct request_queue *q, struct request *req,
2268 enum elv_merge type)
2269{
2270 if (type == ELEVATOR_FRONT_MERGE &&
2271 rb_prev(&req->rb_node) &&
2272 blk_rq_pos(req) <
2273 blk_rq_pos(container_of(rb_prev(&req->rb_node),
2274 struct request, rb_node))) {
2275 struct bfq_queue *bfqq = RQ_BFQQ(req);
2276 struct bfq_data *bfqd = bfqq->bfqd;
2277 struct request *prev, *next_rq;
2278
2279 /* Reposition request in its sort_list */
2280 elv_rb_del(&bfqq->sort_list, req);
2281 elv_rb_add(&bfqq->sort_list, req);
2282
2283 /* Choose next request to be served for bfqq */
2284 prev = bfqq->next_rq;
2285 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
2286 bfqd->last_position);
2287 bfqq->next_rq = next_rq;
2288 /*
2289 * If next_rq changes, update the queue's budget to fit
2290 * the new request.
2291 */
2292 if (prev != bfqq->next_rq)
2293 bfq_updated_next_req(bfqd, bfqq);
2294 }
2295}
2296
2297static void bfq_requests_merged(struct request_queue *q, struct request *rq,
2298 struct request *next)
2299{
2300 struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
2301
2302 if (!RB_EMPTY_NODE(&rq->rb_node))
2303 return;
2304 spin_lock_irq(&bfqq->bfqd->lock);
2305
2306 /*
2307 * If next and rq belong to the same bfq_queue and next is older
2308 * than rq, then reposition rq in the fifo (by substituting next
2309 * with rq). Otherwise, if next and rq belong to different
2310 * bfq_queues, never reposition rq: in fact, we would have to
2311 * reposition it with respect to next's position in its own fifo,
2312 * which would most certainly be too expensive with respect to
2313 * the benefits.
2314 */
2315 if (bfqq == next_bfqq &&
2316 !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2317 next->fifo_time < rq->fifo_time) {
2318 list_del_init(&rq->queuelist);
2319 list_replace_init(&next->queuelist, &rq->queuelist);
2320 rq->fifo_time = next->fifo_time;
2321 }
2322
2323 if (bfqq->next_rq == next)
2324 bfqq->next_rq = rq;
2325
2326 bfq_remove_request(q, next);
2327
2328 spin_unlock_irq(&bfqq->bfqd->lock);
2329}
2330
2331static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
2332 struct bio *bio)
2333{
2334 struct bfq_data *bfqd = q->elevator->elevator_data;
2335 bool is_sync = op_is_sync(bio->bi_opf);
2336 struct bfq_queue *bfqq = bfqd->bio_bfqq;
2337
2338 /*
2339 * Disallow merge of a sync bio into an async request.
2340 */
2341 if (is_sync && !rq_is_sync(rq))
2342 return false;
2343
2344 /*
2345 * Lookup the bfqq that this bio will be queued with. Allow
2346 * merge only if rq is queued there.
2347 */
2348 if (!bfqq)
2349 return false;
2350
2351 return bfqq == RQ_BFQQ(rq);
2352}
2353
2354static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
2355 struct bfq_queue *bfqq)
2356{
2357 if (bfqq) {
2358 bfq_mark_bfqq_budget_new(bfqq);
2359 bfq_clear_bfqq_fifo_expire(bfqq);
2360
2361 bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;
2362
2363 bfq_log_bfqq(bfqd, bfqq,
2364 "set_in_service_queue, cur-budget = %d",
2365 bfqq->entity.budget);
2366 }
2367
2368 bfqd->in_service_queue = bfqq;
2369}
2370
2371/*
2372 * Get and set a new queue for service.
2373 */
2374static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
2375{
2376 struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
2377
2378 __bfq_set_in_service_queue(bfqd, bfqq);
2379 return bfqq;
2380}
2381
2382/*
2383 * bfq_default_budget - return the default budget for @bfqq on @bfqd.
2384 * @bfqd: the device descriptor.
2385 * @bfqq: the queue to consider.
2386 *
2387 * We use 3/4 of the @bfqd maximum budget as the default value
2388 * for the max_budget field of the queues. This lets the feedback
2389 * mechanism start from some middle ground, then the behavior
2390 * of the process will drive the heuristics towards high values, if
2391 * it behaves as a greedy sequential reader, or towards small values
2392 * if it shows a more intermittent behavior.
2393 */
2394static unsigned long bfq_default_budget(struct bfq_data *bfqd,
2395 struct bfq_queue *bfqq)
2396{
2397 unsigned long budget;
2398
2399 /*
2400 * When we need an estimate of the peak rate we need to avoid
2401 * giving budgets that are too short due to previous
2402 * measurements. So, in the first 10 assignments use a
2403 * ``safe'' budget value. For these first assignments the value
2404 * of bfqd->budgets_assigned happens to be lower than 194.
2405 * See __bfq_set_in_service_queue for the formula by which
2406 * this field is computed.
2407 */
2408 if (bfqd->budgets_assigned < 194 && bfqd->bfq_user_max_budget == 0)
2409 budget = bfq_default_max_budget;
2410 else
2411 budget = bfqd->bfq_max_budget;
2412
2413 return budget - budget / 4;
2414}
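
/*
 * Illustrative sketch only, kept out of the build: the smoothing of
 * budgets_assigned done in __bfq_set_in_service_queue and the 3/4
 * default computed above, on plain integers. Starting from zero, the
 * recurrence (x * 7 + 256) / 8 converges to 256 and stays below 194
 * for the first ten steps, which is what the "194" threshold above
 * relies on.
 */
#if 0
static unsigned long bfq_sketch_next_budgets_assigned(unsigned long x)
{
	return (x * 7 + 256) / 8;	/* 32, 60, 84, ..., 186, 194, -> 256 */
}

static unsigned long bfq_sketch_default_budget(unsigned long budgets_assigned,
					       unsigned long user_max_budget,
					       unsigned long max_budget,
					       unsigned long default_max_budget)
{
	unsigned long budget;

	if (budgets_assigned < 194 && user_max_budget == 0)
		budget = default_max_budget;	/* rate estimate still warming up */
	else
		budget = max_budget;		/* autotuned or user-set value */

	return budget - budget / 4;		/* 3/4 of the chosen max */
}
#endif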
2415
2416static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2417{
2418 struct bfq_queue *bfqq = bfqd->in_service_queue;
2419 struct bfq_io_cq *bic;
2420 u32 sl;
2421
2422 /* Processes have exited, don't wait. */
2423 bic = bfqd->in_service_bic;
2424 if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
2425 return;
2426
2427 bfq_mark_bfqq_wait_request(bfqq);
2428
2429 /*
2430 * We don't want to idle for seeks, but we do want to allow
2431 * fair distribution of slice time for a process doing back-to-back
2432 * seeks. So allow a little bit of time for it to submit a new rq.
2433 */
2434 sl = bfqd->bfq_slice_idle;
2435 /*
2436 * Grant only minimum idle time if the queue is seeky.
2437 */
2438 if (BFQQ_SEEKY(bfqq))
2439 sl = min_t(u64, sl, BFQ_MIN_TT);
2440
2441 bfqd->last_idling_start = ktime_get();
2442 hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
2443 HRTIMER_MODE_REL);
2444}
2445
2446/*
2447 * Set the maximum time for the in-service queue to consume its
2448 * budget. This prevents seeky processes from lowering the disk
2449 * throughput (always guaranteed with a time slice scheme as in CFQ).
2450 */
2451static void bfq_set_budget_timeout(struct bfq_data *bfqd)
2452{
2453 struct bfq_queue *bfqq = bfqd->in_service_queue;
2454 unsigned int timeout_coeff = bfqq->entity.weight /
2455 bfqq->entity.orig_weight;
2456
2457 bfqd->last_budget_start = ktime_get();
2458
2459 bfq_clear_bfqq_budget_new(bfqq);
2460 bfqq->budget_timeout = jiffies +
2461 bfqd->bfq_timeout * timeout_coeff;
2462
2463 bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
2464 jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff));
2465}
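
/*
 * Illustrative sketch only, kept out of the build: the budget-timeout
 * computation above on plain integers. weight and orig_weight stand
 * for bfqq->entity.weight and bfqq->entity.orig_weight; with no
 * weight raising the coefficient is simply 1.
 */
#if 0
static unsigned long bfq_sketch_budget_timeout(unsigned long now_jiffies,
					       unsigned long timeout_jiffies,
					       unsigned int weight,
					       unsigned int orig_weight)
{
	unsigned int timeout_coeff = weight / orig_weight;

	return now_jiffies + timeout_jiffies * timeout_coeff;
}
#endif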
2466
2467/*
2468 * Remove request from internal lists.
2469 */
2470static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
2471{
2472 struct bfq_queue *bfqq = RQ_BFQQ(rq);
2473
2474 /*
2475 * For consistency, the next instruction should have been
2476 * executed after removing the request from the queue and
2477 * dispatching it. Instead, we execute this instruction before
2478 * bfq_remove_request() (and hence introduce a temporary
2479 * inconsistency), for efficiency. In fact, should this
2480 * dispatch occur for a non in-service bfqq, this anticipated
2481 * increment prevents two counters related to bfqq->dispatched
2482 * from being, first, uselessly decremented, and then
2483 * incremented again when the (new) value of bfqq->dispatched
2484 * happens to be taken into account.
2485 */
2486 bfqq->dispatched++;
2487
2488 bfq_remove_request(q, rq);
2489}
2490
2491static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2492{
2493 __bfq_bfqd_reset_in_service(bfqd);
2494
2495 if (RB_EMPTY_ROOT(&bfqq->sort_list))
2496 bfq_del_bfqq_busy(bfqd, bfqq, 1);
2497 else
2498 bfq_activate_bfqq(bfqd, bfqq);
2499}
2500
2501/**
2502 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
2503 * @bfqd: device data.
2504 * @bfqq: queue to update.
2505 * @reason: reason for expiration.
2506 *
2507 * Handle the feedback on @bfqq budget at queue expiration.
2508 * See the body for detailed comments.
2509 */
2510static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
2511 struct bfq_queue *bfqq,
2512 enum bfqq_expiration reason)
2513{
2514 struct request *next_rq;
2515 int budget, min_budget;
2516
2517 budget = bfqq->max_budget;
2518 min_budget = bfq_min_budget(bfqd);
2519
2520 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
2521 bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
2522 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
2523 budget, bfq_min_budget(bfqd));
2524 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
2525 bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
2526
2527 if (bfq_bfqq_sync(bfqq)) {
2528 switch (reason) {
2529 /*
2530 * Caveat: in all the following cases we trade latency
2531 * for throughput.
2532 */
2533 case BFQQE_TOO_IDLE:
2534 if (budget > min_budget + BFQ_BUDGET_STEP)
2535 budget -= BFQ_BUDGET_STEP;
2536 else
2537 budget = min_budget;
2538 break;
2539 case BFQQE_BUDGET_TIMEOUT:
2540 budget = bfq_default_budget(bfqd, bfqq);
2541 break;
2542 case BFQQE_BUDGET_EXHAUSTED:
2543 /*
2544 * The process still has backlog, and did not
2545 * let either the budget timeout or the disk
2546 * idling timeout expire. Hence it is not
2547 * seeky, has a short thinktime and may be
2548 * happy with a higher budget too. So
2549 * definitely increase the budget of this good
2550 * candidate to boost the disk throughput.
2551 */
2552 budget = min(budget + 8 * BFQ_BUDGET_STEP,
2553 bfqd->bfq_max_budget);
2554 break;
2555 case BFQQE_NO_MORE_REQUESTS:
2556 /*
2557 * For queues that expire for this reason, it
2558 * is particularly important to keep the
2559 * budget close to the actual service they
2560 * need. Doing so reduces the timestamp
2561 * misalignment problem described in the
2562 * comments in the body of
2563 * __bfq_activate_entity. In fact, suppose
2564 * that a queue systematically expires for
2565 * BFQQE_NO_MORE_REQUESTS and presents a
2566 * new request in time to enjoy timestamp
2567 * back-shifting. The larger the budget of the
2568 * queue is with respect to the service the
2569 * queue actually requests in each service
2570 * slot, the more times the queue can be
2571 * reactivated with the same virtual finish
2572 * time. It follows that, even if this finish
2573 * time is pushed to the system virtual time
2574 * to reduce the consequent timestamp
2575 * misalignment, the queue unjustly enjoys for
2576 * many re-activations a lower finish time
2577 * than all newly activated queues.
2578 *
2579 * The service needed by bfqq is measured
2580 * quite precisely by bfqq->entity.service.
2581 * Since bfqq does not enjoy device idling,
2582 * bfqq->entity.service is equal to the number
2583 * of sectors that the process associated with
2584 * bfqq requested to read/write before waiting
2585 * for request completions, or blocking for
2586 * other reasons.
2587 */
2588 budget = max_t(int, bfqq->entity.service, min_budget);
2589 break;
2590 default:
2591 return;
2592 }
2593 } else {
2594 /*
2595 * Async queues get always the maximum possible
2596 * budget, as for them we do not care about latency
2597 * (in addition, their ability to dispatch is limited
2598 * by the charging factor).
2599 */
2600 budget = bfqd->bfq_max_budget;
2601 }
2602
2603 bfqq->max_budget = budget;
2604
2605 if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
2606 !bfqd->bfq_user_max_budget)
2607 bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
2608
2609 /*
2610 * If there is still backlog, then assign a new budget, making
2611 * sure that it is large enough for the next request. Since
2612 * the finish time of bfqq must be kept in sync with the
2613 * budget, be sure to call __bfq_bfqq_expire() *after* this
2614 * update.
2615 *
2616 * If there is no backlog, then no need to update the budget;
2617 * it will be updated on the arrival of a new request.
2618 */
2619 next_rq = bfqq->next_rq;
2620 if (next_rq)
2621 bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
2622 bfq_serv_to_charge(next_rq, bfqq));
2623
2624 bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
2625 next_rq ? blk_rq_sectors(next_rq) : 0,
2626 bfqq->entity.budget);
2627}
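
/*
 * Illustrative sketch only, kept out of the build: the budget feedback
 * above for sync queues, on plain integers. step stands for
 * BFQ_BUDGET_STEP and service for bfqq->entity.service.
 */
#if 0
enum bfq_sketch_reason {
	SKETCH_TOO_IDLE,
	SKETCH_BUDGET_TIMEOUT,
	SKETCH_BUDGET_EXHAUSTED,
	SKETCH_NO_MORE_REQUESTS,
};

static int bfq_sketch_next_max_budget(enum bfq_sketch_reason reason,
				      int budget, int min_budget,
				      int max_budget, int default_budget,
				      int step, int service)
{
	switch (reason) {
	case SKETCH_TOO_IDLE:		/* shrink: idling time was wasted */
		return budget > min_budget + step ? budget - step : min_budget;
	case SKETCH_BUDGET_TIMEOUT:	/* reset to the 3/4 default */
		return default_budget;
	case SKETCH_BUDGET_EXHAUSTED:	/* grow aggressively, capped at max */
		return budget + 8 * step < max_budget ?
			budget + 8 * step : max_budget;
	case SKETCH_NO_MORE_REQUESTS:	/* track the service actually used */
		return service > min_budget ? service : min_budget;
	}

	return budget;
}
#endif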
2628
2629static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
2630{
2631 unsigned long max_budget;
2632
2633 /*
2634 * The max_budget calculated when autotuning is equal to the
2635 * number of sectors transferred in one timeout at the estimated
2636 * peak rate. To get this value, peak_rate is, first,
2637 * multiplied by 1000, because timeout is measured in ms,
2638 * while peak_rate is measured in sectors/usecs. Then the
2639 * result of this multiplication is right-shifted by
2640 * BFQ_RATE_SHIFT, because peak_rate is equal to the value of
2641 * the peak rate left-shifted by BFQ_RATE_SHIFT.
2642 */
2643 max_budget = (unsigned long)(peak_rate * 1000 *
2644 timeout >> BFQ_RATE_SHIFT);
2645
2646 return max_budget;
2647}
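
/*
 * Illustrative sketch only, kept out of the build: the fixed-point
 * conversion above. peak_rate_fp is in sectors/usec left-shifted by
 * rate_shift (BFQ_RATE_SHIFT), timeout_ms is in milliseconds, hence
 * the factor 1000.
 */
#if 0
static unsigned long bfq_sketch_calc_max_budget(unsigned long long peak_rate_fp,
						unsigned long long timeout_ms,
						unsigned int rate_shift)
{
	/* sectors = (sectors/usec) * (usec/ms) * ms */
	return (unsigned long)((peak_rate_fp * 1000 * timeout_ms) >> rate_shift);
}
#endif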
2648
2649/*
2650 * In addition to updating the peak rate, checks whether the process
2651 * is "slow", and returns 1 if so. This slow flag is used, in addition
2652 * to the budget timeout, to reduce the amount of service provided to
2653 * seeky processes, and hence reduce their chances to lower the
2654 * throughput. See the code for more details.
2655 */
2656static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2657 bool compensate)
2658{
2659 u64 bw, usecs, expected, timeout;
2660 ktime_t delta;
2661 int update = 0;
2662
2663 if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2664 return false;
2665
2666 if (compensate)
2667 delta = bfqd->last_idling_start;
2668 else
2669 delta = ktime_get();
2670 delta = ktime_sub(delta, bfqd->last_budget_start);
2671 usecs = ktime_to_us(delta);
2672
2673 /* don't use too short time intervals */
2674 if (usecs < 1000)
2675 return false;
2676
2677 /*
2678 * Calculate the bandwidth for the last slice. We use a 64 bit
2679 * value to store the peak rate, in sectors per usec in fixed
2680 * point math. We do so to have enough precision in the estimate
2681 * and to avoid overflows.
2682 */
2683 bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2684 do_div(bw, (unsigned long)usecs);
2685
2686 timeout = jiffies_to_msecs(bfqd->bfq_timeout);
2687
2688 /*
2689 * Use only long (> 20ms) intervals to filter out spikes for
2690 * the peak rate estimation.
2691 */
2692 if (usecs > 20000) {
2693 if (bw > bfqd->peak_rate) {
2694 bfqd->peak_rate = bw;
2695 update = 1;
2696 bfq_log(bfqd, "new peak_rate=%llu", bw);
2697 }
2698
2699 update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
2700
2701 if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
2702 bfqd->peak_rate_samples++;
2703
2704 if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
2705 update && bfqd->bfq_user_max_budget == 0) {
2706 bfqd->bfq_max_budget =
2707 bfq_calc_max_budget(bfqd->peak_rate,
2708 timeout);
2709 bfq_log(bfqd, "new max_budget=%d",
2710 bfqd->bfq_max_budget);
2711 }
2712 }
2713
2714 /*
2715 * A process is considered ``slow'' (i.e., seeky, so that we
2716 * cannot treat it fairly in the service domain, as it would
2717 * slow down the other processes too much) if, when a slice
2718 * ends for whatever reason, it has received service at a
2719 * rate that would not be high enough to complete the budget
2720 * before the budget timeout expiration.
2721 */
2722 expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
2723
2724 /*
2725 * Caveat: processes doing IO in the slower disk zones will
2726 * tend to be slow(er) even if not seeky. And the estimated
2727 * peak rate will actually be an average over the disk
2728 * surface. Hence, to not be too harsh with unlucky processes,
2729 * we keep a budget/3 margin of safety before declaring a
2730 * process slow.
2731 */
2732 return expected > (4 * bfqq->entity.budget) / 3;
2733}
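
/*
 * Illustrative sketch only, kept out of the build: the fixed-point
 * arithmetic of bfq_update_peak_rate on plain integers. The measured
 * bandwidth is in sectors/usec left-shifted by rate_shift
 * (BFQ_RATE_SHIFT); 'expected' is the service that rate yields over
 * one budget timeout (in ms), and the final comparison mirrors the
 * return statement above, including the budget/3 margin.
 */
#if 0
static int bfq_sketch_slice_was_slow(unsigned long long service_sectors,
				     unsigned long long slice_usecs,
				     unsigned long long timeout_ms,
				     unsigned long long budget_sectors,
				     unsigned int rate_shift)
{
	unsigned long long bw, expected;

	bw = (service_sectors << rate_shift) / slice_usecs;
	expected = (bw * 1000 * timeout_ms) >> rate_shift;

	return expected > (4 * budget_sectors) / 3;
}
#endif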
2734
2735/*
2736 * Return the farthest past time instant according to jiffies
2737 * macros.
2738 */
2739static unsigned long bfq_smallest_from_now(void)
2740{
2741 return jiffies - MAX_JIFFY_OFFSET;
2742}
2743
2744/**
2745 * bfq_bfqq_expire - expire a queue.
2746 * @bfqd: device owning the queue.
2747 * @bfqq: the queue to expire.
2748 * @compensate: if true, compensate for the time spent idling.
2749 * @reason: the reason causing the expiration.
2750 *
2751 *
2752 * If the process associated with the queue is slow (i.e., seeky), or
2753 * in case of budget timeout, or, finally, if it is async, we
2754 * artificially charge it an entire budget (independently of the
2755 * actual service it received). As a consequence, the queue will get
2756 * higher timestamps than the correct ones upon reactivation, and
2757 * hence it will be rescheduled as if it had received more service
2758 * than what it actually received. In the end, this class of processes
2759 * will receive less service in proportion to how slowly they consume
2760 * their budgets (and hence how seriously they tend to lower the
2761 * throughput).
2762 *
2763 * In contrast, when a queue expires because it has been idling for
2764 * too much or because it exhausted its budget, we do not touch the
2765 * amount of service it has received. Hence when the queue will be
2766 * reactivated and its timestamps updated, the latter will be in sync
2767 * with the actual service received by the queue until expiration.
2768 *
2769 * Charging a full budget to the first type of queues and the exact
2770 * service to the others has the effect of using the WF2Q+ policy to
2771 * schedule the former on a timeslice basis, without violating the
2772 * service domain guarantees of the latter.
2773 */
2774static void bfq_bfqq_expire(struct bfq_data *bfqd,
2775 struct bfq_queue *bfqq,
2776 bool compensate,
2777 enum bfqq_expiration reason)
2778{
2779 bool slow;
2780 int ref;
2781
2782 /*
2783 * Update device peak rate for autotuning and check whether the
2784 * process is slow (see bfq_update_peak_rate).
2785 */
2786 slow = bfq_update_peak_rate(bfqd, bfqq, compensate);
2787
2788 /*
2789 * As above explained, 'punish' slow (i.e., seeky), timed-out
2790 * and async queues, to favor sequential sync workloads.
2791 */
2792 if (slow || reason == BFQQE_BUDGET_TIMEOUT)
2793 bfq_bfqq_charge_full_budget(bfqq);
2794
2795 if (reason == BFQQE_TOO_IDLE &&
2796 bfqq->entity.service <= 2 * bfqq->entity.budget / 10)
2797 bfq_clear_bfqq_IO_bound(bfqq);
2798
2799 bfq_log_bfqq(bfqd, bfqq,
2800 "expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
2801 slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
2802
2803 /*
2804 * Increase, decrease or leave budget unchanged according to
2805 * reason.
2806 */
2807 __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
2808 ref = bfqq->ref;
2809 __bfq_bfqq_expire(bfqd, bfqq);
2810
2811 /* mark bfqq as waiting a request only if a bic still points to it */
2812 if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
2813 reason != BFQQE_BUDGET_TIMEOUT &&
2814 reason != BFQQE_BUDGET_EXHAUSTED)
2815 bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
2816}
2817
2818/*
2819 * Budget timeout is not implemented through a dedicated timer, but
2820 * just checked on request arrivals and completions, as well as on
2821 * idle timer expirations.
2822 */
2823static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
2824{
2825 if (bfq_bfqq_budget_new(bfqq) ||
2826 time_is_after_jiffies(bfqq->budget_timeout))
2827 return false;
2828 return true;
2829}
2830
2831/*
2832 * If we expire a queue that is actively waiting (i.e., with the
2833 * device idled) for the arrival of a new request, then we may incur
2834 * the timestamp misalignment problem described in the body of the
2835 * function __bfq_activate_entity. Hence we return true only if this
2836 * condition does not hold, or if the queue is slow enough to deserve
2837 * only to be kicked off for preserving a high throughput.
2838 */
2839static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
2840{
2841 bfq_log_bfqq(bfqq->bfqd, bfqq,
2842 "may_budget_timeout: wait_request %d left %d timeout %d",
2843 bfq_bfqq_wait_request(bfqq),
2844 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
2845 bfq_bfqq_budget_timeout(bfqq));
2846
2847 return (!bfq_bfqq_wait_request(bfqq) ||
2848 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
2849 &&
2850 bfq_bfqq_budget_timeout(bfqq);
2851}
2852
2853/*
2854 * For a queue that becomes empty, device idling is allowed only if
2855 * this function returns true for the queue. And this function returns
2856 * true only if idling is beneficial for throughput.
2857 */
2858static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
2859{
2860 struct bfq_data *bfqd = bfqq->bfqd;
2861 bool idling_boosts_thr;
2862
2863 if (bfqd->strict_guarantees)
2864 return true;
2865
2866 /*
2867 * The value of the next variable is computed considering that
2868 * idling is usually beneficial for the throughput if:
2869 * (a) the device is not NCQ-capable, or
2870 * (b) regardless of the presence of NCQ, the request pattern
2871 * for bfqq is I/O-bound (possible throughput losses
2872 * caused by granting idling to seeky queues are mitigated
2873 * by the fact that, in all scenarios where boosting
2874 * throughput is the best thing to do, i.e., in all
2875 * symmetric scenarios, only a minimal idle time is
2876 * allowed to seeky queues).
2877 */
2878 idling_boosts_thr = !bfqd->hw_tag || bfq_bfqq_IO_bound(bfqq);
2879
2880 /*
2881 * We have now the components we need to compute the return
2882 * value of the function, which is true only if both the
2883 * following conditions hold:
2884 * 1) bfqq is sync, because idling makes sense only for sync queues;
2885 * 2) idling boosts the throughput.
2886 */
2887 return bfq_bfqq_sync(bfqq) && idling_boosts_thr;
2888}
2889
2890/*
2891 * If the in-service queue is empty but the function bfq_bfqq_may_idle
2892 * returns true, then:
2893 * 1) the queue must remain in service and cannot be expired, and
2894 * 2) the device must be idled to wait for the possible arrival of a new
2895 * request for the queue.
2896 * See the comments on the function bfq_bfqq_may_idle for the reasons
2897 * why performing device idling is the best choice to boost the throughput
2898 * and preserve service guarantees when bfq_bfqq_may_idle itself
2899 * returns true.
2900 */
2901static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
2902{
2903 struct bfq_data *bfqd = bfqq->bfqd;
2904
2905 return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
2906 bfq_bfqq_may_idle(bfqq);
2907}
2908
2909/*
2910 * Select a queue for service. If we have a current queue in service,
2911 * check whether to continue servicing it, or retrieve and set a new one.
2912 */
2913static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
2914{
2915 struct bfq_queue *bfqq;
2916 struct request *next_rq;
2917 enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;
2918
2919 bfqq = bfqd->in_service_queue;
2920 if (!bfqq)
2921 goto new_queue;
2922
2923 bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
2924
2925 if (bfq_may_expire_for_budg_timeout(bfqq) &&
2926 !bfq_bfqq_wait_request(bfqq) &&
2927 !bfq_bfqq_must_idle(bfqq))
2928 goto expire;
2929
2930check_queue:
2931 /*
2932 * This loop is rarely executed more than once. Even when it
2933 * happens, it is much more convenient to re-execute this loop
2934 * than to return NULL and trigger a new dispatch to get a
2935 * request served.
2936 */
2937 next_rq = bfqq->next_rq;
2938 /*
2939 * If bfqq has requests queued and it has enough budget left to
2940 * serve them, keep the queue, otherwise expire it.
2941 */
2942 if (next_rq) {
2943 if (bfq_serv_to_charge(next_rq, bfqq) >
2944 bfq_bfqq_budget_left(bfqq)) {
2945 /*
2946 * Expire the queue for budget exhaustion,
2947 * which makes sure that the next budget is
2948 * enough to serve the next request, even if
2949 * it comes from the fifo expired path.
2950 */
2951 reason = BFQQE_BUDGET_EXHAUSTED;
2952 goto expire;
2953 } else {
2954 /*
2955 * The idle timer may be pending because we may
2956 * not disable disk idling even when a new request
2957 * arrives.
2958 */
2959 if (bfq_bfqq_wait_request(bfqq)) {
2960 /*
2961 * If we get here: 1) at least one new request
2962 * has arrived but we have not disabled the
2963 * timer because the request was too small,
2964 * 2) then the block layer has unplugged
2965 * the device, causing the dispatch to be
2966 * invoked.
2967 *
2968 * Since the device is unplugged, now the
2969 * requests are probably large enough to
2970 * provide a reasonable throughput.
2971 * So we disable idling.
2972 */
2973 bfq_clear_bfqq_wait_request(bfqq);
2974 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
2975 }
2976 goto keep_queue;
2977 }
2978 }
2979
2980 /*
2981 * No requests pending. However, if the in-service queue is idling
2982 * for a new request, or has requests waiting for a completion and
2983 * may idle after their completion, then keep it anyway.
2984 */
2985 if (bfq_bfqq_wait_request(bfqq) ||
2986 (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
2987 bfqq = NULL;
2988 goto keep_queue;
2989 }
2990
2991 reason = BFQQE_NO_MORE_REQUESTS;
2992expire:
2993 bfq_bfqq_expire(bfqd, bfqq, false, reason);
2994new_queue:
2995 bfqq = bfq_set_in_service_queue(bfqd);
2996 if (bfqq) {
2997 bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
2998 goto check_queue;
2999 }
3000keep_queue:
3001 if (bfqq)
3002 bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
3003 else
3004 bfq_log(bfqd, "select_queue: no queue returned");
3005
3006 return bfqq;
3007}
3008
3009/*
3010 * Dispatch next request from bfqq.
3011 */
3012static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
3013 struct bfq_queue *bfqq)
3014{
3015 struct request *rq = bfqq->next_rq;
3016 unsigned long service_to_charge;
3017
3018 service_to_charge = bfq_serv_to_charge(rq, bfqq);
3019
3020 bfq_bfqq_served(bfqq, service_to_charge);
3021
3022 bfq_dispatch_remove(bfqd->queue, rq);
3023
3024 if (!bfqd->in_service_bic) {
3025 atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
3026 bfqd->in_service_bic = RQ_BIC(rq);
3027 }
3028
3029 /*
3030 * Expire bfqq, pretending that its budget expired, if bfqq
3031 * belongs to CLASS_IDLE and other queues are waiting for
3032 * service.
3033 */
3034 if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
3035 goto expire;
3036
3037 return rq;
3038
3039expire:
3040 bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
3041 return rq;
3042}
3043
3044static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
3045{
3046 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
3047
3048 /*
3049 * Avoiding the lock: a race on bfqd->busy_queues should cause at
3050 * most a useless dispatch call
3051 */
3052 return !list_empty_careful(&bfqd->dispatch) ||
3053 bfqd->busy_queues > 0;
3054}
3055
3056static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3057{
3058 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
3059 struct request *rq = NULL;
3060 struct bfq_queue *bfqq = NULL;
3061
3062 if (!list_empty(&bfqd->dispatch)) {
3063 rq = list_first_entry(&bfqd->dispatch, struct request,
3064 queuelist);
3065 list_del_init(&rq->queuelist);
3066
3067 bfqq = RQ_BFQQ(rq);
3068
3069 if (bfqq) {
3070 /*
3071 * Increment counters here, because this
3072 * dispatch does not follow the standard
3073 * dispatch flow (where counters are
3074 * incremented)
3075 */
3076 bfqq->dispatched++;
3077
3078 goto inc_in_driver_start_rq;
3079 }
3080
3081 /*
3082 * We exploit the put_rq_private hook to decrement
3083 * rq_in_driver, but put_rq_private will not be
3084 * invoked on this request. So, to avoid unbalance,
3085 * just start this request, without incrementing
3086 * rq_in_driver. As a negative consequence,
3087 * rq_in_driver is deceptively lower than it should be
3088 * while this request is in service. This may cause
3089 * bfq_schedule_dispatch to be invoked uselessly.
3090 *
3091 * As for implementing an exact solution, the
3092 * put_request hook, if defined, is probably invoked
3093 * also on this request. So, by exploiting this hook,
3094 * we could 1) increment rq_in_driver here, and 2)
3095 * decrement it in put_request. Such a solution would
3096 * let the value of the counter be always accurate,
3097 * but it would entail using an extra interface
3098 * function. This cost seems higher than the benefit,
3099 * since the frequency of non-elevator-private
3100 * requests is very low.
3101 */
3102 goto start_rq;
3103 }
3104
3105 bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
3106
3107 if (bfqd->busy_queues == 0)
3108 goto exit;
3109
3110 /*
3111 * Force device to serve one request at a time if
3112 * strict_guarantees is true. Forcing this service scheme is
3113 * currently the ONLY way to guarantee that the request
3114 * service order enforced by the scheduler is respected by a
3115 * queueing device. Otherwise the device is free even to make
3116 * some unlucky request wait for as long as the device
3117 * wishes.
3118 *
3119 * Of course, serving one request at a time may cause loss of
3120 * throughput.
3121 */
3122 if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
3123 goto exit;
3124
3125 bfqq = bfq_select_queue(bfqd);
3126 if (!bfqq)
3127 goto exit;
3128
3129 rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);
3130
3131 if (rq) {
3132inc_in_driver_start_rq:
3133 bfqd->rq_in_driver++;
3134start_rq:
3135 rq->rq_flags |= RQF_STARTED;
3136 }
3137exit:
3138 return rq;
3139}
3140
3141static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
3142{
3143 struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
3144 struct request *rq;
3145
3146 spin_lock_irq(&bfqd->lock);
3147 rq = __bfq_dispatch_request(hctx);
3148 spin_unlock_irq(&bfqd->lock);
3149
3150 return rq;
3151}
3152
3153/*
3154 * Task holds one reference to the queue, dropped when task exits. Each rq
3155 * in-flight on this queue also holds a reference, dropped when rq is freed.
3156 *
3157 * Scheduler lock must be held here. Recall not to use bfqq after calling
3158 * this function on it.
3159 */
3160static void bfq_put_queue(struct bfq_queue *bfqq)
3161{
3162 if (bfqq->bfqd)
3163 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
3164 bfqq, bfqq->ref);
3165
3166 bfqq->ref--;
3167 if (bfqq->ref)
3168 return;
3169
3170 kmem_cache_free(bfq_pool, bfqq);
3171}
3172
3173static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3174{
3175 if (bfqq == bfqd->in_service_queue) {
3176 __bfq_bfqq_expire(bfqd, bfqq);
3177 bfq_schedule_dispatch(bfqd);
3178 }
3179
3180 bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
3181
3182 bfq_put_queue(bfqq); /* release process reference */
3183}
3184
3185static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
3186{
3187 struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
3188 struct bfq_data *bfqd;
3189
3190 if (bfqq)
3191 bfqd = bfqq->bfqd; /* NULL if scheduler already exited */
3192
3193 if (bfqq && bfqd) {
3194 unsigned long flags;
3195
3196 spin_lock_irqsave(&bfqd->lock, flags);
3197 bfq_exit_bfqq(bfqd, bfqq);
3198 bic_set_bfqq(bic, NULL, is_sync);
3199 spin_unlock_irqrestore(&bfqd->lock, flags);
3200 }
3201}
3202
3203static void bfq_exit_icq(struct io_cq *icq)
3204{
3205 struct bfq_io_cq *bic = icq_to_bic(icq);
3206
3207 bfq_exit_icq_bfqq(bic, true);
3208 bfq_exit_icq_bfqq(bic, false);
3209}
3210
3211/*
3212 * Update the entity prio values; note that the new values will not
3213 * be used until the next (re)activation.
3214 */
3215static void
3216bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
3217{
3218 struct task_struct *tsk = current;
3219 int ioprio_class;
3220 struct bfq_data *bfqd = bfqq->bfqd;
3221
3222 if (!bfqd)
3223 return;
3224
3225 ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3226 switch (ioprio_class) {
3227 default:
3228 dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
3229 "bfq: bad prio class %d\n", ioprio_class);
3230 case IOPRIO_CLASS_NONE:
3231 /*
3232 * No prio set, inherit CPU scheduling settings.
3233 */
3234 bfqq->new_ioprio = task_nice_ioprio(tsk);
3235 bfqq->new_ioprio_class = task_nice_ioclass(tsk);
3236 break;
3237 case IOPRIO_CLASS_RT:
3238 bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3239 bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
3240 break;
3241 case IOPRIO_CLASS_BE:
3242 bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3243 bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
3244 break;
3245 case IOPRIO_CLASS_IDLE:
3246 bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
3247 bfqq->new_ioprio = 7;
3248 bfq_clear_bfqq_idle_window(bfqq);
3249 break;
3250 }
3251
3252 if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
3253 pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
3254 bfqq->new_ioprio);
3255 bfqq->new_ioprio = IOPRIO_BE_NR;
3256 }
3257
3258 bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
3259 bfqq->entity.prio_changed = 1;
3260}
3261
3262static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
3263{
3264 struct bfq_data *bfqd = bic_to_bfqd(bic);
3265 struct bfq_queue *bfqq;
3266 int ioprio = bic->icq.ioc->ioprio;
3267
3268 /*
3269 * This condition may trigger on a newly created bic, be sure to
3270 * drop the lock before returning.
3271 */
3272 if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
3273 return;
3274
3275 bic->ioprio = ioprio;
3276
3277 bfqq = bic_to_bfqq(bic, false);
3278 if (bfqq) {
3279 /* release process reference on this queue */
3280 bfq_put_queue(bfqq);
3281 bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
3282 bic_set_bfqq(bic, bfqq, false);
3283 }
3284
3285 bfqq = bic_to_bfqq(bic, true);
3286 if (bfqq)
3287 bfq_set_next_ioprio_data(bfqq, bic);
3288}
3289
3290static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3291 struct bfq_io_cq *bic, pid_t pid, int is_sync)
3292{
3293 RB_CLEAR_NODE(&bfqq->entity.rb_node);
3294 INIT_LIST_HEAD(&bfqq->fifo);
3295
3296 bfqq->ref = 0;
3297 bfqq->bfqd = bfqd;
3298
3299 if (bic)
3300 bfq_set_next_ioprio_data(bfqq, bic);
3301
3302 if (is_sync) {
3303 if (!bfq_class_idle(bfqq))
3304 bfq_mark_bfqq_idle_window(bfqq);
3305 bfq_mark_bfqq_sync(bfqq);
3306 } else
3307 bfq_clear_bfqq_sync(bfqq);
3308
3309 /* set end request to minus infinity from now */
3310 bfqq->ttime.last_end_request = ktime_get_ns() + 1;
3311
3312 bfq_mark_bfqq_IO_bound(bfqq);
3313
3314 bfqq->pid = pid;
3315
3316 /* Tentative initial value to trade off between thr and lat */
3317 bfqq->max_budget = bfq_default_budget(bfqd, bfqq);
3318 bfqq->budget_timeout = bfq_smallest_from_now();
3319 bfqq->pid = pid;
3320
3321 /* first request is almost certainly seeky */
3322 bfqq->seek_history = 1;
3323}
3324
3325static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
3326 int ioprio_class, int ioprio)
3327{
3328 switch (ioprio_class) {
3329 case IOPRIO_CLASS_RT:
3330 return &async_bfqq[0][ioprio];
3331 case IOPRIO_CLASS_NONE:
3332 ioprio = IOPRIO_NORM;
3333 /* fall through */
3334 case IOPRIO_CLASS_BE:
3335 return &async_bfqq[1][ioprio];
3336 case IOPRIO_CLASS_IDLE:
3337 return &async_idle_bfqq;
3338 default:
3339 return NULL;
3340 }
3341}
3342
3343static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3344 struct bio *bio, bool is_sync,
3345 struct bfq_io_cq *bic)
3346{
3347 const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3348 const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3349 struct bfq_queue **async_bfqq = NULL;
3350 struct bfq_queue *bfqq;
3351
3352 rcu_read_lock();
3353
3354 if (!is_sync) {
3355 async_bfqq = bfq_async_queue_prio(bfqd, ioprio_class,
3356 ioprio);
3357 bfqq = *async_bfqq;
3358 if (bfqq)
3359 goto out;
3360 }
3361
3362 bfqq = kmem_cache_alloc_node(bfq_pool,
3363 GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
3364 bfqd->queue->node);
3365
3366 if (bfqq) {
3367 bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
3368 is_sync);
3369 bfq_init_entity(&bfqq->entity);
3370 bfq_log_bfqq(bfqd, bfqq, "allocated");
3371 } else {
3372 bfqq = &bfqd->oom_bfqq;
3373 bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
3374 goto out;
3375 }
3376
3377 /*
3378 * Pin the queue now that it's allocated, scheduler exit will
3379 * prune it.
3380 */
3381 if (async_bfqq) {
3382 bfqq->ref++;
3383 bfq_log_bfqq(bfqd, bfqq,
3384 "get_queue, bfqq not in async: %p, %d",
3385 bfqq, bfqq->ref);
3386 *async_bfqq = bfqq;
3387 }
3388
3389out:
3390 bfqq->ref++; /* get a process reference to this queue */
3391 bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
3392 rcu_read_unlock();
3393 return bfqq;
3394}
3395
3396static void bfq_update_io_thinktime(struct bfq_data *bfqd,
3397 struct bfq_queue *bfqq)
3398{
3399 struct bfq_ttime *ttime = &bfqq->ttime;
3400 u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
3401
3402 elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
3403
3404 ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
3405 ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
3406 ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
3407 ttime->ttime_samples);
3408}
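
/*
 * Illustrative sketch only, kept out of the build: the exponentially
 * weighted think-time update above, on plain integers. elapsed_ns is
 * assumed to be already clamped to 2 * bfq_slice_idle, as in the
 * function above.
 */
#if 0
struct bfq_sketch_ttime {
	unsigned long long samples;	/* weighted sample count, converges to 256 */
	unsigned long long total;	/* weighted sum of think times */
	unsigned long long mean;	/* rounded total / samples */
};

static void bfq_sketch_update_thinktime(struct bfq_sketch_ttime *t,
					unsigned long long elapsed_ns)
{
	t->samples = (7 * t->samples + 256) / 8;
	t->total = (7 * t->total + 256 * elapsed_ns) / 8;
	t->mean = (t->total + 128) / t->samples;
}
#endif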
3409
3410static void
3411bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3412 struct request *rq)
3413{
3414 sector_t sdist = 0;
3415
3416 if (bfqq->last_request_pos) {
3417 if (bfqq->last_request_pos < blk_rq_pos(rq))
3418 sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
3419 else
3420 sdist = bfqq->last_request_pos - blk_rq_pos(rq);
3421 }
3422
3423 bfqq->seek_history <<= 1;
3424 bfqq->seek_history |= sdist > BFQQ_SEEK_THR &&
3425 (!blk_queue_nonrot(bfqd->queue) ||
3426 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
3427}
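
/*
 * Illustrative sketch only, kept out of the build: the seek-history
 * update above on plain integers. seek_thr and small_rq_thr stand for
 * BFQQ_SEEK_THR and BFQQ_SECT_THR_NONROT, which are defined elsewhere
 * in this file; each request shifts one bit in, set when the request
 * is distant from the previous one and, on non-rotational devices,
 * also small.
 */
#if 0
static unsigned int bfq_sketch_seek_history(unsigned int seek_history,
					    unsigned long long last_pos,
					    unsigned long long pos,
					    unsigned long long sectors,
					    int nonrot,
					    unsigned long long seek_thr,
					    unsigned long long small_rq_thr)
{
	unsigned long long sdist = pos > last_pos ? pos - last_pos
						  : last_pos - pos;

	seek_history <<= 1;
	seek_history |= sdist > seek_thr &&
		(!nonrot || sectors < small_rq_thr);

	return seek_history;
}
#endif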
3428
3429/*
3430 * Disable idle window if the process thinks too long or seeks so much that
3431 * it doesn't matter.
3432 */
3433static void bfq_update_idle_window(struct bfq_data *bfqd,
3434 struct bfq_queue *bfqq,
3435 struct bfq_io_cq *bic)
3436{
3437 int enable_idle;
3438
3439 /* Don't idle for async or idle io prio class. */
3440 if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
3441 return;
3442
3443 enable_idle = bfq_bfqq_idle_window(bfqq);
3444
3445 if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
3446 bfqd->bfq_slice_idle == 0 ||
3447 (bfqd->hw_tag && BFQQ_SEEKY(bfqq)))
3448 enable_idle = 0;
3449 else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
3450 if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)
3451 enable_idle = 0;
3452 else
3453 enable_idle = 1;
3454 }
3455 bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
3456 enable_idle);
3457
3458 if (enable_idle)
3459 bfq_mark_bfqq_idle_window(bfqq);
3460 else
3461 bfq_clear_bfqq_idle_window(bfqq);
3462}
3463
3464/*
3465 * Called when a new fs request (rq) is added to bfqq. Check if there's
3466 * something we should do about it.
3467 */
3468static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3469 struct request *rq)
3470{
3471 struct bfq_io_cq *bic = RQ_BIC(rq);
3472
3473 if (rq->cmd_flags & REQ_META)
3474 bfqq->meta_pending++;
3475
3476 bfq_update_io_thinktime(bfqd, bfqq);
3477 bfq_update_io_seektime(bfqd, bfqq, rq);
3478 if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
3479 !BFQQ_SEEKY(bfqq))
3480 bfq_update_idle_window(bfqd, bfqq, bic);
3481
3482 bfq_log_bfqq(bfqd, bfqq,
3483 "rq_enqueued: idle_window=%d (seeky %d)",
3484 bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
3485
3486 bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3487
3488 if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
3489 bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
3490 blk_rq_sectors(rq) < 32;
3491 bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
3492
3493 /*
3494 * There is just this request queued: if the request
3495 * is small and the queue is not to be expired, then
3496 * just exit.
3497 *
3498 * In this way, if the device is being idled to wait
3499 * for a new request from the in-service queue, we
3500 * avoid unplugging the device and committing the
3501 * device to serve just a small request. On the
3502 * contrary, we wait for the block layer to decide
3503 * when to unplug the device: hopefully, new requests
3504 * will be merged to this one quickly, then the device
3505 * will be unplugged and larger requests will be
3506 * dispatched.
3507 */
3508 if (small_req && !budget_timeout)
3509 return;
3510
3511 /*
3512 * A large enough request arrived, or the queue is to
3513 * be expired: in both cases disk idling is to be
3514 * stopped, so clear wait_request flag and reset
3515 * timer.
3516 */
3517 bfq_clear_bfqq_wait_request(bfqq);
3518 hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
3519
3520 /*
3521 * The queue is not empty, because a new request just
3522 * arrived. Hence we can safely expire the queue, in
3523 * case of budget timeout, without risking that the
3524 * timestamps of the queue are not updated correctly.
3525 * See [1] for more details.
3526 */
3527 if (budget_timeout)
3528 bfq_bfqq_expire(bfqd, bfqq, false,
3529 BFQQE_BUDGET_TIMEOUT);
3530 }
3531}
3532
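/*
 * Add rq to its bfq_queue, set its fifo expiration time, append it to the
 * queue's fifo list, and run the per-enqueue heuristics (think time, seek
 * history, idling).
 */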
3533static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
3534{
3535 struct bfq_queue *bfqq = RQ_BFQQ(rq);
3536
3537 bfq_add_request(rq);
3538
3539 rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
3540 list_add_tail(&rq->queuelist, &bfqq->fifo);
3541
3542 bfq_rq_enqueued(bfqd, bfqq, rq);
3543}
3544
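/*
 * blk-mq insertion hook. First try to merge rq with a request already in
 * the scheduler; otherwise, put passthrough and at_head requests directly
 * on the dispatch list, and insert everything else into its bfq_queue and
 * into the elevator merge hash.
 */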
3545static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
3546 bool at_head)
3547{
3548 struct request_queue *q = hctx->queue;
3549 struct bfq_data *bfqd = q->elevator->elevator_data;
3550
3551 spin_lock_irq(&bfqd->lock);
3552 if (blk_mq_sched_try_insert_merge(q, rq)) {
3553 spin_unlock_irq(&bfqd->lock);
3554 return;
3555 }
3556
3557 spin_unlock_irq(&bfqd->lock);
3558
3559 blk_mq_sched_request_inserted(rq);
3560
3561 spin_lock_irq(&bfqd->lock);
3562 if (at_head || blk_rq_is_passthrough(rq)) {
3563 if (at_head)
3564 list_add(&rq->queuelist, &bfqd->dispatch);
3565 else
3566 list_add_tail(&rq->queuelist, &bfqd->dispatch);
3567 } else {
3568 __bfq_insert_request(bfqd, rq);
3569
3570 if (rq_mergeable(rq)) {
3571 elv_rqhash_add(q, rq);
3572 if (!q->last_merge)
3573 q->last_merge = rq;
3574 }
3575 }
3576
3577 spin_unlock_irq(&bfqd->lock);
3578}
3579
3580static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
3581 struct list_head *list, bool at_head)
3582{
3583 while (!list_empty(list)) {
3584 struct request *rq;
3585
3586 rq = list_first_entry(list, struct request, queuelist);
3587 list_del_init(&rq->queuelist);
3588 bfq_insert_request(hctx, rq, at_head);
3589 }
3590}
3591
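/*
 * Heuristic estimate of whether the device supports internal command
 * queueing (e.g. NCQ): after enough samples taken with a sufficient number
 * of outstanding requests, set hw_tag according to the maximum number of
 * requests observed in the driver.
 */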
3592static void bfq_update_hw_tag(struct bfq_data *bfqd)
3593{
3594 bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
3595 bfqd->rq_in_driver);
3596
3597 if (bfqd->hw_tag == 1)
3598 return;
3599
3600	/*
3601	 * This sample is valid only if the number of outstanding
3602	 * requests is large enough to allow queueing behavior. Note
3603	 * that the sum is not exact, as it does not account for
3604	 * deactivated requests.
3605	 */
3606 if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
3607 return;
3608
3609 if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
3610 return;
3611
3612 bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
3613 bfqd->max_rq_in_driver = 0;
3614 bfqd->hw_tag_samples = 0;
3615}
3616
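/*
 * Per-completion bookkeeping: refresh the hw_tag estimate, decrement the
 * in-driver and dispatched counters, record the completion time for
 * think-time accounting and, if bfqq is the in-service queue, decide
 * whether to idle or to expire it.
 */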
3617static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
3618{
3619 bfq_update_hw_tag(bfqd);
3620
3621 bfqd->rq_in_driver--;
3622 bfqq->dispatched--;
3623
3624 bfqq->ttime.last_end_request = ktime_get_ns();
3625
3626 /*
3627 * If this is the in-service queue, check if it needs to be expired,
3628 * or if we want to idle in case it has no pending requests.
3629 */
3630 if (bfqd->in_service_queue == bfqq) {
3631 if (bfq_bfqq_budget_new(bfqq))
3632 bfq_set_budget_timeout(bfqd);
3633
3634 if (bfq_bfqq_must_idle(bfqq)) {
3635 bfq_arm_slice_timer(bfqd);
3636 return;
3637 } else if (bfq_may_expire_for_budg_timeout(bfqq))
3638 bfq_bfqq_expire(bfqd, bfqq, false,
3639 BFQQE_BUDGET_TIMEOUT);
3640 else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
3641 (bfqq->dispatched == 0 ||
3642 !bfq_bfqq_may_idle(bfqq)))
3643 bfq_bfqq_expire(bfqd, bfqq, false,
3644 BFQQE_NO_MORE_REQUESTS);
3645 }
3646}
3647
3648static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
3649{
3650 bfqq->allocated--;
3651
3652 bfq_put_queue(bfqq);
3653}
3654
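/*
 * Called when a request is freed: complete the accounting and drop the
 * allocation and queue reference taken in bfq_get_rq_private. If the
 * request was never started, it may still sit in the scheduler and is
 * removed here.
 */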
3655static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
3656{
3657 struct bfq_queue *bfqq = RQ_BFQQ(rq);
3658 struct bfq_data *bfqd = bfqq->bfqd;
3659
3660
3661 if (likely(rq->rq_flags & RQF_STARTED)) {
3662 unsigned long flags;
3663
3664 spin_lock_irqsave(&bfqd->lock, flags);
3665
3666 bfq_completed_request(bfqq, bfqd);
3667 bfq_put_rq_priv_body(bfqq);
3668
3669 spin_unlock_irqrestore(&bfqd->lock, flags);
3670 } else {
3671		/*
3672		 * Request rq may still/already be in the scheduler,
3673		 * in which case we need to remove it here. The check
3674		 * and removal cannot be deferred, as that would leave
3675		 * an inconsistency window between the end of this
3676		 * function and the start of the deferred work. This
3677		 * situation seems to occur only in process context,
3678		 * as a consequence of a merge. In the current
3679		 * version of the code, this implies that the lock
3680		 * is held.
3681		 */
3682
3683 if (!RB_EMPTY_NODE(&rq->rb_node))
3684 bfq_remove_request(q, rq);
3685 bfq_put_rq_priv_body(bfqq);
3686 }
3687
3688 rq->elv.priv[0] = NULL;
3689 rq->elv.priv[1] = NULL;
3690}
3691
3692/*
3693 * Allocate bfq data structures associated with this request.
3694 */
3695static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
3696 struct bio *bio)
3697{
3698 struct bfq_data *bfqd = q->elevator->elevator_data;
3699 struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
3700 const int is_sync = rq_is_sync(rq);
3701 struct bfq_queue *bfqq;
3702
3703 spin_lock_irq(&bfqd->lock);
3704
3705	if (!bic)
3706		goto queue_fail;
3707
3708	bfq_check_ioprio_change(bic, bio);
3709
3710 bfqq = bic_to_bfqq(bic, is_sync);
3711 if (!bfqq || bfqq == &bfqd->oom_bfqq) {
3712 if (bfqq)
3713 bfq_put_queue(bfqq);
3714 bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
3715 bic_set_bfqq(bic, bfqq, is_sync);
3716 }
3717
3718 bfqq->allocated++;
3719 bfqq->ref++;
3720 bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
3721 rq, bfqq, bfqq->ref);
3722
3723 rq->elv.priv[0] = bic;
3724 rq->elv.priv[1] = bfqq;
3725
3726 spin_unlock_irq(&bfqd->lock);
3727
3728 return 0;
3729
3730queue_fail:
3731 spin_unlock_irq(&bfqd->lock);
3732
3733 return 1;
3734}
3735
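/*
 * Body of the idle-slice timer handler: if bfqq is still the in-service
 * queue, expire it for budget timeout or for being too idle, if either
 * applies, then schedule a new dispatch round.
 */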
3736static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
3737{
3738 struct bfq_data *bfqd = bfqq->bfqd;
3739 enum bfqq_expiration reason;
3740 unsigned long flags;
3741
3742 spin_lock_irqsave(&bfqd->lock, flags);
3743 bfq_clear_bfqq_wait_request(bfqq);
3744
3745 if (bfqq != bfqd->in_service_queue) {
3746 spin_unlock_irqrestore(&bfqd->lock, flags);
3747 return;
3748 }
3749
3750 if (bfq_bfqq_budget_timeout(bfqq))
3751		/*
3752		 * Here, too, the queue can safely be expired
3753		 * for budget timeout without compromising
3754		 * service guarantees.
3755		 */
3756 reason = BFQQE_BUDGET_TIMEOUT;
3757 else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
3758 /*
3759 * The queue may not be empty upon timer expiration,
3760 * because we may not disable the timer when the
3761 * first request of the in-service queue arrives
3762 * during disk idling.
3763 */
3764 reason = BFQQE_TOO_IDLE;
3765 else
3766 goto schedule_dispatch;
3767
3768 bfq_bfqq_expire(bfqd, bfqq, true, reason);
3769
3770schedule_dispatch:
3771 spin_unlock_irqrestore(&bfqd->lock, flags);
3772 bfq_schedule_dispatch(bfqd);
3773}
3774
3775/*
3776 * Handler for the expiration of the timer that runs while the in-service
3777 * queue is idling inside its time slice.
3778 */
3779static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
3780{
3781 struct bfq_data *bfqd = container_of(timer, struct bfq_data,
3782 idle_slice_timer);
3783 struct bfq_queue *bfqq = bfqd->in_service_queue;
3784
3785 /*
3786 * Theoretical race here: the in-service queue can be NULL or
3787 * different from the queue that was idling if a new request
3788 * arrives for the current queue and there is a full dispatch
3789 * cycle that changes the in-service queue. This can hardly
3790 * happen, but in the worst case we just expire a queue too
3791 * early.
3792 */
3793 if (bfqq)
3794 bfq_idle_slice_timer_body(bfqq);
3795
3796 return HRTIMER_NORESTART;
3797}
3798
3799static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
3800 struct bfq_queue **bfqq_ptr)
3801{
3802 struct bfq_queue *bfqq = *bfqq_ptr;
3803
3804 bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
3805 if (bfqq) {
3806 bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
3807 bfqq, bfqq->ref);
3808 bfq_put_queue(bfqq);
3809 *bfqq_ptr = NULL;
3810 }
3811}
3812
3813/*
3814 * Release the extra reference held on each async queue as the scheduler
3815 * instance goes away.
3816 */
3817static void bfq_put_async_queues(struct bfq_data *bfqd)
3818{
3819 int i, j;
3820
3821 for (i = 0; i < 2; i++)
3822 for (j = 0; j < IOPRIO_BE_NR; j++)
3823 __bfq_put_async_bfqq(bfqd, &async_bfqq[i][j]);
3824
3825 __bfq_put_async_bfqq(bfqd, &async_idle_bfqq);
3826}
3827
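/*
 * Tear down the scheduler instance: cancel the idle-slice timer, deactivate
 * the queues still on the idle list, drop the async queue references and
 * free bfqd.
 */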
3828static void bfq_exit_queue(struct elevator_queue *e)
3829{
3830 struct bfq_data *bfqd = e->elevator_data;
3831 struct bfq_queue *bfqq, *n;
3832
3833 hrtimer_cancel(&bfqd->idle_slice_timer);
3834
3835 spin_lock_irq(&bfqd->lock);
3836 list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
3837 bfq_deactivate_bfqq(bfqd, bfqq, false);
3838 bfq_put_async_queues(bfqd);
3839 spin_unlock_irq(&bfqd->lock);
3840
3841 hrtimer_cancel(&bfqd->idle_slice_timer);
3842
3843 kfree(bfqd);
3844}
3845
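/*
 * Set up a BFQ instance for request queue q: allocate bfqd, initialize the
 * permanent oom_bfqq fallback queue, the per-class service trees, the
 * idle-slice timer and the default values of the tunables.
 */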
3846static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
3847{
3848 struct bfq_data *bfqd;
3849 struct elevator_queue *eq;
3850 int i;
3851
3852 eq = elevator_alloc(q, e);
3853 if (!eq)
3854 return -ENOMEM;
3855
3856 bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
3857 if (!bfqd) {
3858 kobject_put(&eq->kobj);
3859 return -ENOMEM;
3860 }
3861 eq->elevator_data = bfqd;
3862
3863	/*
3864	 * Our fallback bfqq if bfq_get_queue() runs into OOM issues.
3865	 * Grab a permanent reference to it, so that the normal code flow
3866	 * will not attempt to free it.
3867	 */
3868 bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
3869 bfqd->oom_bfqq.ref++;
3870 bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
3871 bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
3872 bfqd->oom_bfqq.entity.new_weight =
3873 bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
3874 /*
3875 * Trigger weight initialization, according to ioprio, at the
3876 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
3877 * class won't be changed any more.
3878 */
3879 bfqd->oom_bfqq.entity.prio_changed = 1;
3880
3881 bfqd->queue = q;
3882
3883 for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
3884 bfqd->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
3885
3886 hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
3887 HRTIMER_MODE_REL);
3888 bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
3889
3890 INIT_LIST_HEAD(&bfqd->active_list);
3891 INIT_LIST_HEAD(&bfqd->idle_list);
3892
3893 bfqd->hw_tag = -1;
3894
3895 bfqd->bfq_max_budget = bfq_default_max_budget;
3896
3897 bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
3898 bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
3899 bfqd->bfq_back_max = bfq_back_max;
3900 bfqd->bfq_back_penalty = bfq_back_penalty;
3901 bfqd->bfq_slice_idle = bfq_slice_idle;
3902 bfqd->bfq_class_idle_last_service = 0;
3903 bfqd->bfq_timeout = bfq_timeout;
3904
3905 bfqd->bfq_requests_within_timer = 120;
3906
3907 spin_lock_init(&bfqd->lock);
3908 INIT_LIST_HEAD(&bfqd->dispatch);
3909
3910 q->elevator = eq;
3911
3912 return 0;
3913}
3914
3915static void bfq_slab_kill(void)
3916{
3917 kmem_cache_destroy(bfq_pool);
3918}
3919
3920static int __init bfq_slab_setup(void)
3921{
3922 bfq_pool = KMEM_CACHE(bfq_queue, 0);
3923 if (!bfq_pool)
3924 return -ENOMEM;
3925 return 0;
3926}
3927
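/*
 * Helpers for the sysfs tunables below: print and parse a single unsigned
 * integer value. Note that bfq_var_store ignores parse errors: it returns
 * count and leaves *var unmodified.
 */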
3928static ssize_t bfq_var_show(unsigned int var, char *page)
3929{
3930 return sprintf(page, "%u\n", var);
3931}
3932
3933static ssize_t bfq_var_store(unsigned long *var, const char *page,
3934 size_t count)
3935{
3936 unsigned long new_val;
3937 int ret = kstrtoul(page, 10, &new_val);
3938
3939 if (ret == 0)
3940 *var = new_val;
3941
3942 return count;
3943}
3944
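/*
 * __CONV selects the unit conversion applied when showing/storing a value:
 * 0 = none, 1 = jiffies <-> milliseconds, 2 = nanoseconds <-> milliseconds.
 */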
3945#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
3946static ssize_t __FUNC(struct elevator_queue *e, char *page) \
3947{ \
3948 struct bfq_data *bfqd = e->elevator_data; \
3949 u64 __data = __VAR; \
3950 if (__CONV == 1) \
3951 __data = jiffies_to_msecs(__data); \
3952 else if (__CONV == 2) \
3953 __data = div_u64(__data, NSEC_PER_MSEC); \
3954 return bfq_var_show(__data, (page)); \
3955}
3956SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
3957SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
3958SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
3959SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
3960SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
3961SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
3962SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
3963SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
3964#undef SHOW_FUNCTION
3965
3966#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
3967static ssize_t __FUNC(struct elevator_queue *e, char *page) \
3968{ \
3969 struct bfq_data *bfqd = e->elevator_data; \
3970 u64 __data = __VAR; \
3971 __data = div_u64(__data, NSEC_PER_USEC); \
3972 return bfq_var_show(__data, (page)); \
3973}
3974USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
3975#undef USEC_SHOW_FUNCTION
3976
3977#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
3978static ssize_t \
3979__FUNC(struct elevator_queue *e, const char *page, size_t count) \
3980{ \
3981 struct bfq_data *bfqd = e->elevator_data; \
3982 unsigned long uninitialized_var(__data); \
3983 int ret = bfq_var_store(&__data, (page), count); \
3984 if (__data < (MIN)) \
3985 __data = (MIN); \
3986 else if (__data > (MAX)) \
3987 __data = (MAX); \
3988 if (__CONV == 1) \
3989 *(__PTR) = msecs_to_jiffies(__data); \
3990 else if (__CONV == 2) \
3991 *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
3992 else \
3993 *(__PTR) = __data; \
3994 return ret; \
3995}
3996STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
3997 INT_MAX, 2);
3998STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
3999 INT_MAX, 2);
4000STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4001STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4002 INT_MAX, 0);
4003STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
4004#undef STORE_FUNCTION
4005
4006#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
4007static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
4008{ \
4009 struct bfq_data *bfqd = e->elevator_data; \
4010 unsigned long uninitialized_var(__data); \
4011 int ret = bfq_var_store(&__data, (page), count); \
4012 if (__data < (MIN)) \
4013 __data = (MIN); \
4014 else if (__data > (MAX)) \
4015 __data = (MAX); \
4016 *(__PTR) = (u64)__data * NSEC_PER_USEC; \
4017 return ret; \
4018}
4019USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
4020 UINT_MAX);
4021#undef USEC_STORE_FUNCTION
4022
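/*
 * Return a max budget derived from the measured peak rate and the queue
 * timeout, if enough rate samples have been collected; otherwise fall back
 * to the default max budget.
 */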
4023static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4024{
4025 u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout);
4026
4027 if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4028 return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4029 else
4030 return bfq_default_max_budget;
4031}
4032
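/*
 * Writing 0 re-enables auto-tuning of max_budget from the estimated peak
 * rate; any other value is used directly, capped at INT_MAX.
 */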
4033static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4034 const char *page, size_t count)
4035{
4036 struct bfq_data *bfqd = e->elevator_data;
4037 unsigned long uninitialized_var(__data);
4038 int ret = bfq_var_store(&__data, (page), count);
4039
4040 if (__data == 0)
4041 bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4042 else {
4043 if (__data > INT_MAX)
4044 __data = INT_MAX;
4045 bfqd->bfq_max_budget = __data;
4046 }
4047
4048 bfqd->bfq_user_max_budget = __data;
4049
4050 return ret;
4051}
4052
4053/*
4054 * The name is kept for compatibility with cfq parameters, but this
4055 * timeout is used for both sync and async requests.
4056 */
4057static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4058 const char *page, size_t count)
4059{
4060 struct bfq_data *bfqd = e->elevator_data;
4061 unsigned long uninitialized_var(__data);
4062 int ret = bfq_var_store(&__data, (page), count);
4063
4064 if (__data < 1)
4065 __data = 1;
4066 else if (__data > INT_MAX)
4067 __data = INT_MAX;
4068
4069 bfqd->bfq_timeout = msecs_to_jiffies(__data);
4070 if (bfqd->bfq_user_max_budget == 0)
4071 bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4072
4073 return ret;
4074}
4075
4076static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
4077 const char *page, size_t count)
4078{
4079 struct bfq_data *bfqd = e->elevator_data;
4080 unsigned long uninitialized_var(__data);
4081 int ret = bfq_var_store(&__data, (page), count);
4082
4083 if (__data > 1)
4084 __data = 1;
4085 if (!bfqd->strict_guarantees && __data == 1
4086 && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
4087 bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
4088
4089 bfqd->strict_guarantees = __data;
4090
4091 return ret;
4092}
4093
4094#define BFQ_ATTR(name) \
4095 __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
4096
4097static struct elv_fs_entry bfq_attrs[] = {
4098 BFQ_ATTR(fifo_expire_sync),
4099 BFQ_ATTR(fifo_expire_async),
4100 BFQ_ATTR(back_seek_max),
4101 BFQ_ATTR(back_seek_penalty),
4102 BFQ_ATTR(slice_idle),
4103 BFQ_ATTR(slice_idle_us),
4104 BFQ_ATTR(max_budget),
4105 BFQ_ATTR(timeout_sync),
4106 BFQ_ATTR(strict_guarantees),
4107 __ATTR_NULL
4108};
4109
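/*
 * Registration glue: hook BFQ's operations into the blk-mq elevator
 * framework.
 */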
4110static struct elevator_type iosched_bfq_mq = {
4111 .ops.mq = {
4112 .get_rq_priv = bfq_get_rq_private,
4113 .put_rq_priv = bfq_put_rq_private,
4114 .exit_icq = bfq_exit_icq,
4115 .insert_requests = bfq_insert_requests,
4116 .dispatch_request = bfq_dispatch_request,
4117 .next_request = elv_rb_latter_request,
4118 .former_request = elv_rb_former_request,
4119 .allow_merge = bfq_allow_bio_merge,
4120 .bio_merge = bfq_bio_merge,
4121 .request_merge = bfq_request_merge,
4122 .requests_merged = bfq_requests_merged,
4123 .request_merged = bfq_request_merged,
4124 .has_work = bfq_has_work,
4125 .init_sched = bfq_init_queue,
4126 .exit_sched = bfq_exit_queue,
4127 },
4128
4129 .uses_mq = true,
4130 .icq_size = sizeof(struct bfq_io_cq),
4131 .icq_align = __alignof__(struct bfq_io_cq),
4132 .elevator_attrs = bfq_attrs,
4133 .elevator_name = "bfq",
4134 .elevator_owner = THIS_MODULE,
4135};
4136
4137static int __init bfq_init(void)
4138{
4139 int ret;
4140
4141 ret = -ENOMEM;
4142 if (bfq_slab_setup())
4143 goto err_pol_unreg;
4144
4145 ret = elv_register(&iosched_bfq_mq);
4146 if (ret)
4147 goto err_pol_unreg;
4148
4149 return 0;
4150
4151err_pol_unreg:
4152 return ret;
4153}
4154
4155static void __exit bfq_exit(void)
4156{
4157 elv_unregister(&iosched_bfq_mq);
4158 bfq_slab_kill();
4159}
4160
4161module_init(bfq_init);
4162module_exit(bfq_exit);
4163
4164MODULE_AUTHOR("Paolo Valente");
4165MODULE_LICENSE("GPL");
4166MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");