]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - block/blk-throttle.c
block: use standard blktrace API to output cgroup info for debug notes
[mirror_ubuntu-bionic-kernel.git] / block / blk-throttle.c
CommitLineData
e43473b7
VG
1/*
2 * Interface for controlling IO bandwidth on a request queue
3 *
4 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
5 */
6
7#include <linux/module.h>
8#include <linux/slab.h>
9#include <linux/blkdev.h>
10#include <linux/bio.h>
11#include <linux/blktrace_api.h>
eea8f41c 12#include <linux/blk-cgroup.h>
bc9fcbf9 13#include "blk.h"
e43473b7
VG
14
15/* Max dispatch from a group in 1 round */
16static int throtl_grp_quantum = 8;
17
18/* Total max dispatch from all groups in one round */
19static int throtl_quantum = 32;
20
d61fcfa4
SL
21/* Throttling is performed over a slice and after that slice is renewed */
22#define DFL_THROTL_SLICE_HD (HZ / 10)
23#define DFL_THROTL_SLICE_SSD (HZ / 50)
297e3d85 24#define MAX_THROTL_SLICE (HZ)
9e234eea 25#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
9bb67aeb
SL
26#define MIN_THROTL_BPS (320 * 1024)
27#define MIN_THROTL_IOPS (10)
b4f428ef
SL
28#define DFL_LATENCY_TARGET (-1L)
29#define DFL_IDLE_THRESHOLD (0)
6679a90c
SL
30#define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */
31#define LATENCY_FILTERED_SSD (0)
32/*
33 * For HD, very small latency comes from sequential IO. Such IO is helpless to
34 * help determine if its IO is impacted by others, hence we ignore the IO
35 */
36#define LATENCY_FILTERED_HD (1000L) /* 1ms */
e43473b7 37
b9147dd1
SL
38#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
39
3c798398 40static struct blkcg_policy blkcg_policy_throtl;
0381411e 41
450adcbe
VG
42/* A workqueue to queue throttle related work */
43static struct workqueue_struct *kthrotld_workqueue;
450adcbe 44
c5cc2070
TH
45/*
46 * To implement hierarchical throttling, throtl_grps form a tree and bios
47 * are dispatched upwards level by level until they reach the top and get
48 * issued. When dispatching bios from the children and local group at each
49 * level, if the bios are dispatched into a single bio_list, there's a risk
50 * of a local or child group which can queue many bios at once filling up
51 * the list starving others.
52 *
53 * To avoid such starvation, dispatched bios are queued separately
54 * according to where they came from. When they are again dispatched to
55 * the parent, they're popped in round-robin order so that no single source
56 * hogs the dispatch window.
57 *
58 * throtl_qnode is used to keep the queued bios separated by their sources.
59 * Bios are queued to throtl_qnode which in turn is queued to
60 * throtl_service_queue and then dispatched in round-robin order.
61 *
62 * It's also used to track the reference counts on blkg's. A qnode always
63 * belongs to a throtl_grp and gets queued on itself or the parent, so
64 * incrementing the reference of the associated throtl_grp when a qnode is
65 * queued and decrementing when dequeued is enough to keep the whole blkg
66 * tree pinned while bios are in flight.
67 */
68struct throtl_qnode {
69 struct list_head node; /* service_queue->queued[] */
70 struct bio_list bios; /* queued bios */
71 struct throtl_grp *tg; /* tg this qnode belongs to */
72};
73
c9e0332e 74struct throtl_service_queue {
77216b04
TH
75 struct throtl_service_queue *parent_sq; /* the parent service_queue */
76
73f0d49a
TH
77 /*
78 * Bios queued directly to this service_queue or dispatched from
79 * children throtl_grp's.
80 */
c5cc2070 81 struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
73f0d49a
TH
82 unsigned int nr_queued[2]; /* number of queued bios */
83
84 /*
85 * RB tree of active children throtl_grp's, which are sorted by
86 * their ->disptime.
87 */
c9e0332e
TH
88 struct rb_root pending_tree; /* RB tree of active tgs */
89 struct rb_node *first_pending; /* first node in the tree */
90 unsigned int nr_pending; /* # queued in the tree */
91 unsigned long first_pending_disptime; /* disptime of the first tg */
69df0ab0 92 struct timer_list pending_timer; /* fires on first_pending_disptime */
e43473b7
VG
93};
94
5b2c16aa
TH
95enum tg_state_flags {
96 THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
0e9f4164 97 THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
5b2c16aa
TH
98};
99
e43473b7
VG
100#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
101
9f626e37 102enum {
cd5ab1b0 103 LIMIT_LOW,
9f626e37
SL
104 LIMIT_MAX,
105 LIMIT_CNT,
106};
107
e43473b7 108struct throtl_grp {
f95a04af
TH
109 /* must be the first member */
110 struct blkg_policy_data pd;
111
c9e0332e 112 /* active throtl group service_queue member */
e43473b7
VG
113 struct rb_node rb_node;
114
0f3457f6
TH
115 /* throtl_data this group belongs to */
116 struct throtl_data *td;
117
49a2f1e3
TH
118 /* this group's service queue */
119 struct throtl_service_queue service_queue;
120
c5cc2070
TH
121 /*
122 * qnode_on_self is used when bios are directly queued to this
123 * throtl_grp so that local bios compete fairly with bios
124 * dispatched from children. qnode_on_parent is used when bios are
125 * dispatched from this throtl_grp into its parent and will compete
126 * with the sibling qnode_on_parents and the parent's
127 * qnode_on_self.
128 */
129 struct throtl_qnode qnode_on_self[2];
130 struct throtl_qnode qnode_on_parent[2];
131
e43473b7
VG
132 /*
133 * Dispatch time in jiffies. This is the estimated time when group
134 * will unthrottle and is ready to dispatch more bio. It is used as
135 * key to sort active groups in service tree.
136 */
137 unsigned long disptime;
138
e43473b7
VG
139 unsigned int flags;
140
693e751e
TH
141 /* are there any throtl rules between this group and td? */
142 bool has_rules[2];
143
cd5ab1b0 144 /* internally used bytes per second rate limits */
9f626e37 145 uint64_t bps[2][LIMIT_CNT];
cd5ab1b0
SL
146 /* user configured bps limits */
147 uint64_t bps_conf[2][LIMIT_CNT];
e43473b7 148
cd5ab1b0 149 /* internally used IOPS limits */
9f626e37 150 unsigned int iops[2][LIMIT_CNT];
cd5ab1b0
SL
151 /* user configured IOPS limits */
152 unsigned int iops_conf[2][LIMIT_CNT];
8e89d13f 153
e43473b7
VG
154 /* Number of bytes disptached in current slice */
155 uint64_t bytes_disp[2];
8e89d13f
VG
156 /* Number of bio's dispatched in current slice */
157 unsigned int io_disp[2];
e43473b7 158
3f0abd80
SL
159 unsigned long last_low_overflow_time[2];
160
161 uint64_t last_bytes_disp[2];
162 unsigned int last_io_disp[2];
163
164 unsigned long last_check_time;
165
ec80991d 166 unsigned long latency_target; /* us */
5b81fc3c 167 unsigned long latency_target_conf; /* us */
e43473b7
VG
168 /* When did we start a new slice */
169 unsigned long slice_start[2];
170 unsigned long slice_end[2];
9e234eea
SL
171
172 unsigned long last_finish_time; /* ns / 1024 */
173 unsigned long checked_last_finish_time; /* ns / 1024 */
174 unsigned long avg_idletime; /* ns / 1024 */
175 unsigned long idletime_threshold; /* us */
5b81fc3c 176 unsigned long idletime_threshold_conf; /* us */
53696b8d
SL
177
178 unsigned int bio_cnt; /* total bios */
179 unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
180 unsigned long bio_cnt_reset_time;
e43473b7
VG
181};
182
b9147dd1
SL
183/* We measure latency for request size from <= 4k to >= 1M */
184#define LATENCY_BUCKET_SIZE 9
185
186struct latency_bucket {
187 unsigned long total_latency; /* ns / 1024 */
188 int samples;
189};
190
191struct avg_latency_bucket {
192 unsigned long latency; /* ns / 1024 */
193 bool valid;
194};
195
e43473b7
VG
196struct throtl_data
197{
e43473b7 198 /* service tree for active throtl groups */
c9e0332e 199 struct throtl_service_queue service_queue;
e43473b7 200
e43473b7
VG
201 struct request_queue *queue;
202
203 /* Total Number of queued bios on READ and WRITE lists */
204 unsigned int nr_queued[2];
205
297e3d85
SL
206 unsigned int throtl_slice;
207
e43473b7 208 /* Work for dispatching throttled bios */
69df0ab0 209 struct work_struct dispatch_work;
9f626e37
SL
210 unsigned int limit_index;
211 bool limit_valid[LIMIT_CNT];
3f0abd80
SL
212
213 unsigned long low_upgrade_time;
214 unsigned long low_downgrade_time;
7394e31f
SL
215
216 unsigned int scale;
b9147dd1
SL
217
218 struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
219 struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
220 struct latency_bucket __percpu *latency_buckets;
221 unsigned long last_calculate_time;
6679a90c 222 unsigned long filtered_latency;
b9147dd1
SL
223
224 bool track_bio_latency;
e43473b7
VG
225};
226
69df0ab0
TH
227static void throtl_pending_timer_fn(unsigned long arg);
228
f95a04af
TH
229static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
230{
231 return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
232}
233
3c798398 234static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
0381411e 235{
f95a04af 236 return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
0381411e
TH
237}
238
3c798398 239static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
0381411e 240{
f95a04af 241 return pd_to_blkg(&tg->pd);
0381411e
TH
242}
243
fda6f272
TH
244/**
245 * sq_to_tg - return the throl_grp the specified service queue belongs to
246 * @sq: the throtl_service_queue of interest
247 *
248 * Return the throtl_grp @sq belongs to. If @sq is the top-level one
249 * embedded in throtl_data, %NULL is returned.
250 */
251static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
252{
253 if (sq && sq->parent_sq)
254 return container_of(sq, struct throtl_grp, service_queue);
255 else
256 return NULL;
257}
258
259/**
260 * sq_to_td - return throtl_data the specified service queue belongs to
261 * @sq: the throtl_service_queue of interest
262 *
b43daedc 263 * A service_queue can be embedded in either a throtl_grp or throtl_data.
fda6f272
TH
264 * Determine the associated throtl_data accordingly and return it.
265 */
266static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
267{
268 struct throtl_grp *tg = sq_to_tg(sq);
269
270 if (tg)
271 return tg->td;
272 else
273 return container_of(sq, struct throtl_data, service_queue);
274}
275
7394e31f
SL
276/*
277 * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
278 * make the IO dispatch more smooth.
279 * Scale up: linearly scale up according to lapsed time since upgrade. For
280 * every throtl_slice, the limit scales up 1/2 .low limit till the
281 * limit hits .max limit
282 * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
283 */
284static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
285{
286 /* arbitrary value to avoid too big scale */
287 if (td->scale < 4096 && time_after_eq(jiffies,
288 td->low_upgrade_time + td->scale * td->throtl_slice))
289 td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;
290
291 return low + (low >> 1) * td->scale;
292}
293
9f626e37
SL
294static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
295{
b22c417c 296 struct blkcg_gq *blkg = tg_to_blkg(tg);
7394e31f 297 struct throtl_data *td;
b22c417c
SL
298 uint64_t ret;
299
300 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
301 return U64_MAX;
7394e31f
SL
302
303 td = tg->td;
304 ret = tg->bps[rw][td->limit_index];
9bb67aeb
SL
305 if (ret == 0 && td->limit_index == LIMIT_LOW) {
306 /* intermediate node or iops isn't 0 */
307 if (!list_empty(&blkg->blkcg->css.children) ||
308 tg->iops[rw][td->limit_index])
309 return U64_MAX;
310 else
311 return MIN_THROTL_BPS;
312 }
7394e31f
SL
313
314 if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
315 tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
316 uint64_t adjusted;
317
318 adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);
319 ret = min(tg->bps[rw][LIMIT_MAX], adjusted);
320 }
b22c417c 321 return ret;
9f626e37
SL
322}
323
324static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
325{
b22c417c 326 struct blkcg_gq *blkg = tg_to_blkg(tg);
7394e31f 327 struct throtl_data *td;
b22c417c
SL
328 unsigned int ret;
329
330 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
331 return UINT_MAX;
9bb67aeb 332
7394e31f
SL
333 td = tg->td;
334 ret = tg->iops[rw][td->limit_index];
9bb67aeb
SL
335 if (ret == 0 && tg->td->limit_index == LIMIT_LOW) {
336 /* intermediate node or bps isn't 0 */
337 if (!list_empty(&blkg->blkcg->css.children) ||
338 tg->bps[rw][td->limit_index])
339 return UINT_MAX;
340 else
341 return MIN_THROTL_IOPS;
342 }
7394e31f
SL
343
344 if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
345 tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
346 uint64_t adjusted;
347
348 adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);
349 if (adjusted > UINT_MAX)
350 adjusted = UINT_MAX;
351 ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);
352 }
b22c417c 353 return ret;
9f626e37
SL
354}
355
b9147dd1
SL
356#define request_bucket_index(sectors) \
357 clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
358
fda6f272
TH
359/**
360 * throtl_log - log debug message via blktrace
361 * @sq: the service_queue being reported
362 * @fmt: printf format string
363 * @args: printf args
364 *
365 * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
366 * throtl_grp; otherwise, just "throtl".
fda6f272
TH
367 */
368#define throtl_log(sq, fmt, args...) do { \
369 struct throtl_grp *__tg = sq_to_tg((sq)); \
370 struct throtl_data *__td = sq_to_td((sq)); \
371 \
372 (void)__td; \
59fa0224
SL
373 if (likely(!blk_trace_note_message_enabled(__td->queue))) \
374 break; \
fda6f272 375 if ((__tg)) { \
35fe6d76
SL
376 blk_add_cgroup_trace_msg(__td->queue, \
377 tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\
fda6f272
TH
378 } else { \
379 blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \
380 } \
54e7ed12 381} while (0)
e43473b7 382
c5cc2070
TH
383static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
384{
385 INIT_LIST_HEAD(&qn->node);
386 bio_list_init(&qn->bios);
387 qn->tg = tg;
388}
389
390/**
391 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
392 * @bio: bio being added
393 * @qn: qnode to add bio to
394 * @queued: the service_queue->queued[] list @qn belongs to
395 *
396 * Add @bio to @qn and put @qn on @queued if it's not already on.
397 * @qn->tg's reference count is bumped when @qn is activated. See the
398 * comment on top of throtl_qnode definition for details.
399 */
400static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
401 struct list_head *queued)
402{
403 bio_list_add(&qn->bios, bio);
404 if (list_empty(&qn->node)) {
405 list_add_tail(&qn->node, queued);
406 blkg_get(tg_to_blkg(qn->tg));
407 }
408}
409
410/**
411 * throtl_peek_queued - peek the first bio on a qnode list
412 * @queued: the qnode list to peek
413 */
414static struct bio *throtl_peek_queued(struct list_head *queued)
415{
416 struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
417 struct bio *bio;
418
419 if (list_empty(queued))
420 return NULL;
421
422 bio = bio_list_peek(&qn->bios);
423 WARN_ON_ONCE(!bio);
424 return bio;
425}
426
427/**
428 * throtl_pop_queued - pop the first bio form a qnode list
429 * @queued: the qnode list to pop a bio from
430 * @tg_to_put: optional out argument for throtl_grp to put
431 *
432 * Pop the first bio from the qnode list @queued. After popping, the first
433 * qnode is removed from @queued if empty or moved to the end of @queued so
434 * that the popping order is round-robin.
435 *
436 * When the first qnode is removed, its associated throtl_grp should be put
437 * too. If @tg_to_put is NULL, this function automatically puts it;
438 * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
439 * responsible for putting it.
440 */
441static struct bio *throtl_pop_queued(struct list_head *queued,
442 struct throtl_grp **tg_to_put)
443{
444 struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
445 struct bio *bio;
446
447 if (list_empty(queued))
448 return NULL;
449
450 bio = bio_list_pop(&qn->bios);
451 WARN_ON_ONCE(!bio);
452
453 if (bio_list_empty(&qn->bios)) {
454 list_del_init(&qn->node);
455 if (tg_to_put)
456 *tg_to_put = qn->tg;
457 else
458 blkg_put(tg_to_blkg(qn->tg));
459 } else {
460 list_move_tail(&qn->node, queued);
461 }
462
463 return bio;
464}
465
49a2f1e3 466/* init a service_queue, assumes the caller zeroed it */
b2ce2643 467static void throtl_service_queue_init(struct throtl_service_queue *sq)
49a2f1e3 468{
c5cc2070
TH
469 INIT_LIST_HEAD(&sq->queued[0]);
470 INIT_LIST_HEAD(&sq->queued[1]);
49a2f1e3 471 sq->pending_tree = RB_ROOT;
69df0ab0
TH
472 setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
473 (unsigned long)sq);
474}
475
001bea73
TH
476static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
477{
4fb72036 478 struct throtl_grp *tg;
24bdb8ef 479 int rw;
4fb72036
TH
480
481 tg = kzalloc_node(sizeof(*tg), gfp, node);
482 if (!tg)
77ea7338 483 return NULL;
4fb72036 484
b2ce2643
TH
485 throtl_service_queue_init(&tg->service_queue);
486
487 for (rw = READ; rw <= WRITE; rw++) {
488 throtl_qnode_init(&tg->qnode_on_self[rw], tg);
489 throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
490 }
491
492 RB_CLEAR_NODE(&tg->rb_node);
9f626e37
SL
493 tg->bps[READ][LIMIT_MAX] = U64_MAX;
494 tg->bps[WRITE][LIMIT_MAX] = U64_MAX;
495 tg->iops[READ][LIMIT_MAX] = UINT_MAX;
496 tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;
cd5ab1b0
SL
497 tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;
498 tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
499 tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
500 tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
501 /* LIMIT_LOW will have default value 0 */
b2ce2643 502
ec80991d 503 tg->latency_target = DFL_LATENCY_TARGET;
5b81fc3c 504 tg->latency_target_conf = DFL_LATENCY_TARGET;
b4f428ef
SL
505 tg->idletime_threshold = DFL_IDLE_THRESHOLD;
506 tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
ec80991d 507
4fb72036 508 return &tg->pd;
001bea73
TH
509}
510
a9520cd6 511static void throtl_pd_init(struct blkg_policy_data *pd)
a29a171e 512{
a9520cd6
TH
513 struct throtl_grp *tg = pd_to_tg(pd);
514 struct blkcg_gq *blkg = tg_to_blkg(tg);
77216b04 515 struct throtl_data *td = blkg->q->td;
b2ce2643 516 struct throtl_service_queue *sq = &tg->service_queue;
cd1604fa 517
9138125b 518 /*
aa6ec29b 519 * If on the default hierarchy, we switch to properly hierarchical
9138125b
TH
520 * behavior where limits on a given throtl_grp are applied to the
521 * whole subtree rather than just the group itself. e.g. If 16M
522 * read_bps limit is set on the root group, the whole system can't
523 * exceed 16M for the device.
524 *
aa6ec29b 525 * If not on the default hierarchy, the broken flat hierarchy
9138125b
TH
526 * behavior is retained where all throtl_grps are treated as if
527 * they're all separate root groups right below throtl_data.
528 * Limits of a group don't interact with limits of other groups
529 * regardless of the position of the group in the hierarchy.
530 */
b2ce2643 531 sq->parent_sq = &td->service_queue;
9e10a130 532 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
b2ce2643 533 sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
77216b04 534 tg->td = td;
8a3d2615
TH
535}
536
693e751e
TH
537/*
538 * Set has_rules[] if @tg or any of its parents have limits configured.
539 * This doesn't require walking up to the top of the hierarchy as the
540 * parent's has_rules[] is guaranteed to be correct.
541 */
542static void tg_update_has_rules(struct throtl_grp *tg)
543{
544 struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
9f626e37 545 struct throtl_data *td = tg->td;
693e751e
TH
546 int rw;
547
548 for (rw = READ; rw <= WRITE; rw++)
549 tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
9f626e37
SL
550 (td->limit_valid[td->limit_index] &&
551 (tg_bps_limit(tg, rw) != U64_MAX ||
552 tg_iops_limit(tg, rw) != UINT_MAX));
693e751e
TH
553}
554
a9520cd6 555static void throtl_pd_online(struct blkg_policy_data *pd)
693e751e 556{
aec24246 557 struct throtl_grp *tg = pd_to_tg(pd);
693e751e
TH
558 /*
559 * We don't want new groups to escape the limits of its ancestors.
560 * Update has_rules[] after a new group is brought online.
561 */
aec24246 562 tg_update_has_rules(tg);
693e751e
TH
563}
564
cd5ab1b0
SL
565static void blk_throtl_update_limit_valid(struct throtl_data *td)
566{
567 struct cgroup_subsys_state *pos_css;
568 struct blkcg_gq *blkg;
569 bool low_valid = false;
570
571 rcu_read_lock();
572 blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
573 struct throtl_grp *tg = blkg_to_tg(blkg);
574
575 if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
576 tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
577 low_valid = true;
578 }
579 rcu_read_unlock();
580
581 td->limit_valid[LIMIT_LOW] = low_valid;
582}
583
c79892c5 584static void throtl_upgrade_state(struct throtl_data *td);
cd5ab1b0
SL
585static void throtl_pd_offline(struct blkg_policy_data *pd)
586{
587 struct throtl_grp *tg = pd_to_tg(pd);
588
589 tg->bps[READ][LIMIT_LOW] = 0;
590 tg->bps[WRITE][LIMIT_LOW] = 0;
591 tg->iops[READ][LIMIT_LOW] = 0;
592 tg->iops[WRITE][LIMIT_LOW] = 0;
593
594 blk_throtl_update_limit_valid(tg->td);
595
c79892c5
SL
596 if (!tg->td->limit_valid[tg->td->limit_index])
597 throtl_upgrade_state(tg->td);
cd5ab1b0
SL
598}
599
001bea73
TH
600static void throtl_pd_free(struct blkg_policy_data *pd)
601{
4fb72036
TH
602 struct throtl_grp *tg = pd_to_tg(pd);
603
b2ce2643 604 del_timer_sync(&tg->service_queue.pending_timer);
4fb72036 605 kfree(tg);
001bea73
TH
606}
607
0049af73
TH
608static struct throtl_grp *
609throtl_rb_first(struct throtl_service_queue *parent_sq)
e43473b7
VG
610{
611 /* Service tree is empty */
0049af73 612 if (!parent_sq->nr_pending)
e43473b7
VG
613 return NULL;
614
0049af73
TH
615 if (!parent_sq->first_pending)
616 parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
e43473b7 617
0049af73
TH
618 if (parent_sq->first_pending)
619 return rb_entry_tg(parent_sq->first_pending);
e43473b7
VG
620
621 return NULL;
622}
623
624static void rb_erase_init(struct rb_node *n, struct rb_root *root)
625{
626 rb_erase(n, root);
627 RB_CLEAR_NODE(n);
628}
629
0049af73
TH
630static void throtl_rb_erase(struct rb_node *n,
631 struct throtl_service_queue *parent_sq)
e43473b7 632{
0049af73
TH
633 if (parent_sq->first_pending == n)
634 parent_sq->first_pending = NULL;
635 rb_erase_init(n, &parent_sq->pending_tree);
636 --parent_sq->nr_pending;
e43473b7
VG
637}
638
0049af73 639static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
e43473b7
VG
640{
641 struct throtl_grp *tg;
642
0049af73 643 tg = throtl_rb_first(parent_sq);
e43473b7
VG
644 if (!tg)
645 return;
646
0049af73 647 parent_sq->first_pending_disptime = tg->disptime;
e43473b7
VG
648}
649
77216b04 650static void tg_service_queue_add(struct throtl_grp *tg)
e43473b7 651{
77216b04 652 struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
0049af73 653 struct rb_node **node = &parent_sq->pending_tree.rb_node;
e43473b7
VG
654 struct rb_node *parent = NULL;
655 struct throtl_grp *__tg;
656 unsigned long key = tg->disptime;
657 int left = 1;
658
659 while (*node != NULL) {
660 parent = *node;
661 __tg = rb_entry_tg(parent);
662
663 if (time_before(key, __tg->disptime))
664 node = &parent->rb_left;
665 else {
666 node = &parent->rb_right;
667 left = 0;
668 }
669 }
670
671 if (left)
0049af73 672 parent_sq->first_pending = &tg->rb_node;
e43473b7
VG
673
674 rb_link_node(&tg->rb_node, parent, node);
0049af73 675 rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
e43473b7
VG
676}
677
77216b04 678static void __throtl_enqueue_tg(struct throtl_grp *tg)
e43473b7 679{
77216b04 680 tg_service_queue_add(tg);
5b2c16aa 681 tg->flags |= THROTL_TG_PENDING;
77216b04 682 tg->service_queue.parent_sq->nr_pending++;
e43473b7
VG
683}
684
77216b04 685static void throtl_enqueue_tg(struct throtl_grp *tg)
e43473b7 686{
5b2c16aa 687 if (!(tg->flags & THROTL_TG_PENDING))
77216b04 688 __throtl_enqueue_tg(tg);
e43473b7
VG
689}
690
77216b04 691static void __throtl_dequeue_tg(struct throtl_grp *tg)
e43473b7 692{
77216b04 693 throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
5b2c16aa 694 tg->flags &= ~THROTL_TG_PENDING;
e43473b7
VG
695}
696
77216b04 697static void throtl_dequeue_tg(struct throtl_grp *tg)
e43473b7 698{
5b2c16aa 699 if (tg->flags & THROTL_TG_PENDING)
77216b04 700 __throtl_dequeue_tg(tg);
e43473b7
VG
701}
702
a9131a27 703/* Call with queue lock held */
69df0ab0
TH
704static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
705 unsigned long expires)
a9131a27 706{
a41b816c 707 unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice;
06cceedc
SL
708
709 /*
710 * Since we are adjusting the throttle limit dynamically, the sleep
711 * time calculated according to previous limit might be invalid. It's
712 * possible the cgroup sleep time is very long and no other cgroups
713 * have IO running so notify the limit changes. Make sure the cgroup
714 * doesn't sleep too long to avoid the missed notification.
715 */
716 if (time_after(expires, max_expire))
717 expires = max_expire;
69df0ab0
TH
718 mod_timer(&sq->pending_timer, expires);
719 throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
720 expires - jiffies, jiffies);
a9131a27
TH
721}
722
7f52f98c
TH
723/**
724 * throtl_schedule_next_dispatch - schedule the next dispatch cycle
725 * @sq: the service_queue to schedule dispatch for
726 * @force: force scheduling
727 *
728 * Arm @sq->pending_timer so that the next dispatch cycle starts on the
729 * dispatch time of the first pending child. Returns %true if either timer
730 * is armed or there's no pending child left. %false if the current
731 * dispatch window is still open and the caller should continue
732 * dispatching.
733 *
734 * If @force is %true, the dispatch timer is always scheduled and this
735 * function is guaranteed to return %true. This is to be used when the
736 * caller can't dispatch itself and needs to invoke pending_timer
737 * unconditionally. Note that forced scheduling is likely to induce short
738 * delay before dispatch starts even if @sq->first_pending_disptime is not
739 * in the future and thus shouldn't be used in hot paths.
740 */
741static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
742 bool force)
e43473b7 743{
6a525600 744 /* any pending children left? */
c9e0332e 745 if (!sq->nr_pending)
7f52f98c 746 return true;
e43473b7 747
c9e0332e 748 update_min_dispatch_time(sq);
e43473b7 749
69df0ab0 750 /* is the next dispatch time in the future? */
7f52f98c 751 if (force || time_after(sq->first_pending_disptime, jiffies)) {
69df0ab0 752 throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
7f52f98c 753 return true;
69df0ab0
TH
754 }
755
7f52f98c
TH
756 /* tell the caller to continue dispatching */
757 return false;
e43473b7
VG
758}
759
32ee5bc4
VG
760static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
761 bool rw, unsigned long start)
762{
763 tg->bytes_disp[rw] = 0;
764 tg->io_disp[rw] = 0;
765
766 /*
767 * Previous slice has expired. We must have trimmed it after last
768 * bio dispatch. That means since start of last slice, we never used
769 * that bandwidth. Do try to make use of that bandwidth while giving
770 * credit.
771 */
772 if (time_after_eq(start, tg->slice_start[rw]))
773 tg->slice_start[rw] = start;
774
297e3d85 775 tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
32ee5bc4
VG
776 throtl_log(&tg->service_queue,
777 "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
778 rw == READ ? 'R' : 'W', tg->slice_start[rw],
779 tg->slice_end[rw], jiffies);
780}
781
0f3457f6 782static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
e43473b7
VG
783{
784 tg->bytes_disp[rw] = 0;
8e89d13f 785 tg->io_disp[rw] = 0;
e43473b7 786 tg->slice_start[rw] = jiffies;
297e3d85 787 tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
fda6f272
TH
788 throtl_log(&tg->service_queue,
789 "[%c] new slice start=%lu end=%lu jiffies=%lu",
790 rw == READ ? 'R' : 'W', tg->slice_start[rw],
791 tg->slice_end[rw], jiffies);
e43473b7
VG
792}
793
0f3457f6
TH
794static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
795 unsigned long jiffy_end)
d1ae8ffd 796{
297e3d85 797 tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
d1ae8ffd
VG
798}
799
0f3457f6
TH
800static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
801 unsigned long jiffy_end)
e43473b7 802{
297e3d85 803 tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
fda6f272
TH
804 throtl_log(&tg->service_queue,
805 "[%c] extend slice start=%lu end=%lu jiffies=%lu",
806 rw == READ ? 'R' : 'W', tg->slice_start[rw],
807 tg->slice_end[rw], jiffies);
e43473b7
VG
808}
809
810/* Determine if previously allocated or extended slice is complete or not */
0f3457f6 811static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
e43473b7
VG
812{
813 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
5cf8c227 814 return false;
e43473b7
VG
815
816 return 1;
817}
818
819/* Trim the used slices and adjust slice start accordingly */
0f3457f6 820static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
e43473b7 821{
3aad5d3e
VG
822 unsigned long nr_slices, time_elapsed, io_trim;
823 u64 bytes_trim, tmp;
e43473b7
VG
824
825 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
826
827 /*
828 * If bps are unlimited (-1), then time slice don't get
829 * renewed. Don't try to trim the slice if slice is used. A new
830 * slice will start when appropriate.
831 */
0f3457f6 832 if (throtl_slice_used(tg, rw))
e43473b7
VG
833 return;
834
d1ae8ffd
VG
835 /*
836 * A bio has been dispatched. Also adjust slice_end. It might happen
837 * that initially cgroup limit was very low resulting in high
838 * slice_end, but later limit was bumped up and bio was dispached
839 * sooner, then we need to reduce slice_end. A high bogus slice_end
840 * is bad because it does not allow new slice to start.
841 */
842
297e3d85 843 throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
d1ae8ffd 844
e43473b7
VG
845 time_elapsed = jiffies - tg->slice_start[rw];
846
297e3d85 847 nr_slices = time_elapsed / tg->td->throtl_slice;
e43473b7
VG
848
849 if (!nr_slices)
850 return;
297e3d85 851 tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices;
3aad5d3e
VG
852 do_div(tmp, HZ);
853 bytes_trim = tmp;
e43473b7 854
297e3d85
SL
855 io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) /
856 HZ;
e43473b7 857
8e89d13f 858 if (!bytes_trim && !io_trim)
e43473b7
VG
859 return;
860
861 if (tg->bytes_disp[rw] >= bytes_trim)
862 tg->bytes_disp[rw] -= bytes_trim;
863 else
864 tg->bytes_disp[rw] = 0;
865
8e89d13f
VG
866 if (tg->io_disp[rw] >= io_trim)
867 tg->io_disp[rw] -= io_trim;
868 else
869 tg->io_disp[rw] = 0;
870
297e3d85 871 tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;
e43473b7 872
fda6f272
TH
873 throtl_log(&tg->service_queue,
874 "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
875 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
876 tg->slice_start[rw], tg->slice_end[rw], jiffies);
e43473b7
VG
877}
878
0f3457f6
TH
879static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
880 unsigned long *wait)
e43473b7
VG
881{
882 bool rw = bio_data_dir(bio);
8e89d13f 883 unsigned int io_allowed;
e43473b7 884 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
c49c06e4 885 u64 tmp;
e43473b7 886
8e89d13f 887 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
e43473b7 888
8e89d13f
VG
889 /* Slice has just started. Consider one slice interval */
890 if (!jiffy_elapsed)
297e3d85 891 jiffy_elapsed_rnd = tg->td->throtl_slice;
8e89d13f 892
297e3d85 893 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
8e89d13f 894
c49c06e4
VG
895 /*
896 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
897 * 1 then at max jiffy elapsed should be equivalent of 1 second as we
898 * will allow dispatch after 1 second and after that slice should
899 * have been trimmed.
900 */
901
9f626e37 902 tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
c49c06e4
VG
903 do_div(tmp, HZ);
904
905 if (tmp > UINT_MAX)
906 io_allowed = UINT_MAX;
907 else
908 io_allowed = tmp;
8e89d13f
VG
909
910 if (tg->io_disp[rw] + 1 <= io_allowed) {
e43473b7
VG
911 if (wait)
912 *wait = 0;
5cf8c227 913 return true;
e43473b7
VG
914 }
915
8e89d13f 916 /* Calc approx time to dispatch */
9f626e37 917 jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1;
8e89d13f
VG
918
919 if (jiffy_wait > jiffy_elapsed)
920 jiffy_wait = jiffy_wait - jiffy_elapsed;
921 else
922 jiffy_wait = 1;
923
924 if (wait)
925 *wait = jiffy_wait;
926 return 0;
927}
928
0f3457f6
TH
929static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
930 unsigned long *wait)
8e89d13f
VG
931{
932 bool rw = bio_data_dir(bio);
3aad5d3e 933 u64 bytes_allowed, extra_bytes, tmp;
8e89d13f 934 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
e43473b7
VG
935
936 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
937
938 /* Slice has just started. Consider one slice interval */
939 if (!jiffy_elapsed)
297e3d85 940 jiffy_elapsed_rnd = tg->td->throtl_slice;
e43473b7 941
297e3d85 942 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
e43473b7 943
9f626e37 944 tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
5e901a2b 945 do_div(tmp, HZ);
3aad5d3e 946 bytes_allowed = tmp;
e43473b7 947
4f024f37 948 if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
e43473b7
VG
949 if (wait)
950 *wait = 0;
5cf8c227 951 return true;
e43473b7
VG
952 }
953
954 /* Calc approx time to dispatch */
4f024f37 955 extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
9f626e37 956 jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
e43473b7
VG
957
958 if (!jiffy_wait)
959 jiffy_wait = 1;
960
961 /*
962 * This wait time is without taking into consideration the rounding
963 * up we did. Add that time also.
964 */
965 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
e43473b7
VG
966 if (wait)
967 *wait = jiffy_wait;
8e89d13f
VG
968 return 0;
969}
970
971/*
972 * Returns whether one can dispatch a bio or not. Also returns approx number
973 * of jiffies to wait before this bio is with-in IO rate and can be dispatched
974 */
0f3457f6
TH
975static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
976 unsigned long *wait)
8e89d13f
VG
977{
978 bool rw = bio_data_dir(bio);
979 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
980
981 /*
982 * Currently whole state machine of group depends on first bio
983 * queued in the group bio list. So one should not be calling
984 * this function with a different bio if there are other bios
985 * queued.
986 */
73f0d49a 987 BUG_ON(tg->service_queue.nr_queued[rw] &&
c5cc2070 988 bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
e43473b7 989
8e89d13f 990 /* If tg->bps = -1, then BW is unlimited */
9f626e37
SL
991 if (tg_bps_limit(tg, rw) == U64_MAX &&
992 tg_iops_limit(tg, rw) == UINT_MAX) {
8e89d13f
VG
993 if (wait)
994 *wait = 0;
5cf8c227 995 return true;
8e89d13f
VG
996 }
997
998 /*
999 * If previous slice expired, start a new one otherwise renew/extend
1000 * existing slice to make sure it is at least throtl_slice interval
164c80ed
VG
1001 * long since now. New slice is started only for empty throttle group.
1002 * If there is queued bio, that means there should be an active
1003 * slice and it should be extended instead.
8e89d13f 1004 */
164c80ed 1005 if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
0f3457f6 1006 throtl_start_new_slice(tg, rw);
8e89d13f 1007 else {
297e3d85
SL
1008 if (time_before(tg->slice_end[rw],
1009 jiffies + tg->td->throtl_slice))
1010 throtl_extend_slice(tg, rw,
1011 jiffies + tg->td->throtl_slice);
8e89d13f
VG
1012 }
1013
0f3457f6
TH
1014 if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
1015 tg_with_in_iops_limit(tg, bio, &iops_wait)) {
8e89d13f
VG
1016 if (wait)
1017 *wait = 0;
1018 return 1;
1019 }
1020
1021 max_wait = max(bps_wait, iops_wait);
1022
1023 if (wait)
1024 *wait = max_wait;
1025
1026 if (time_before(tg->slice_end[rw], jiffies + max_wait))
0f3457f6 1027 throtl_extend_slice(tg, rw, jiffies + max_wait);
e43473b7
VG
1028
1029 return 0;
1030}
1031
1032static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
1033{
1034 bool rw = bio_data_dir(bio);
e43473b7
VG
1035
1036 /* Charge the bio to the group */
4f024f37 1037 tg->bytes_disp[rw] += bio->bi_iter.bi_size;
8e89d13f 1038 tg->io_disp[rw]++;
3f0abd80
SL
1039 tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
1040 tg->last_io_disp[rw]++;
e43473b7 1041
2a0f61e6 1042 /*
8d2bbd4c 1043 * BIO_THROTTLED is used to prevent the same bio to be throttled
2a0f61e6
TH
1044 * more than once as a throttled bio will go through blk-throtl the
1045 * second time when it eventually gets issued. Set it when a bio
1046 * is being charged to a tg.
2a0f61e6 1047 */
8d2bbd4c
CH
1048 if (!bio_flagged(bio, BIO_THROTTLED))
1049 bio_set_flag(bio, BIO_THROTTLED);
e43473b7
VG
1050}
1051
c5cc2070
TH
1052/**
1053 * throtl_add_bio_tg - add a bio to the specified throtl_grp
1054 * @bio: bio to add
1055 * @qn: qnode to use
1056 * @tg: the target throtl_grp
1057 *
1058 * Add @bio to @tg's service_queue using @qn. If @qn is not specified,
1059 * tg->qnode_on_self[] is used.
1060 */
1061static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
1062 struct throtl_grp *tg)
e43473b7 1063{
73f0d49a 1064 struct throtl_service_queue *sq = &tg->service_queue;
e43473b7
VG
1065 bool rw = bio_data_dir(bio);
1066
c5cc2070
TH
1067 if (!qn)
1068 qn = &tg->qnode_on_self[rw];
1069
0e9f4164
TH
1070 /*
1071 * If @tg doesn't currently have any bios queued in the same
1072 * direction, queueing @bio can change when @tg should be
1073 * dispatched. Mark that @tg was empty. This is automatically
1074 * cleaered on the next tg_update_disptime().
1075 */
1076 if (!sq->nr_queued[rw])
1077 tg->flags |= THROTL_TG_WAS_EMPTY;
1078
c5cc2070
TH
1079 throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
1080
73f0d49a 1081 sq->nr_queued[rw]++;
77216b04 1082 throtl_enqueue_tg(tg);
e43473b7
VG
1083}
1084
77216b04 1085static void tg_update_disptime(struct throtl_grp *tg)
e43473b7 1086{
73f0d49a 1087 struct throtl_service_queue *sq = &tg->service_queue;
e43473b7
VG
1088 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
1089 struct bio *bio;
1090
d609af3a
ME
1091 bio = throtl_peek_queued(&sq->queued[READ]);
1092 if (bio)
0f3457f6 1093 tg_may_dispatch(tg, bio, &read_wait);
e43473b7 1094
d609af3a
ME
1095 bio = throtl_peek_queued(&sq->queued[WRITE]);
1096 if (bio)
0f3457f6 1097 tg_may_dispatch(tg, bio, &write_wait);
e43473b7
VG
1098
1099 min_wait = min(read_wait, write_wait);
1100 disptime = jiffies + min_wait;
1101
e43473b7 1102 /* Update dispatch time */
77216b04 1103 throtl_dequeue_tg(tg);
e43473b7 1104 tg->disptime = disptime;
77216b04 1105 throtl_enqueue_tg(tg);
0e9f4164
TH
1106
1107 /* see throtl_add_bio_tg() */
1108 tg->flags &= ~THROTL_TG_WAS_EMPTY;
e43473b7
VG
1109}
1110
32ee5bc4
VG
1111static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
1112 struct throtl_grp *parent_tg, bool rw)
1113{
1114 if (throtl_slice_used(parent_tg, rw)) {
1115 throtl_start_new_slice_with_credit(parent_tg, rw,
1116 child_tg->slice_start[rw]);
1117 }
1118
1119}
1120
77216b04 1121static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
e43473b7 1122{
73f0d49a 1123 struct throtl_service_queue *sq = &tg->service_queue;
6bc9c2b4
TH
1124 struct throtl_service_queue *parent_sq = sq->parent_sq;
1125 struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
c5cc2070 1126 struct throtl_grp *tg_to_put = NULL;
e43473b7
VG
1127 struct bio *bio;
1128
c5cc2070
TH
1129 /*
1130 * @bio is being transferred from @tg to @parent_sq. Popping a bio
1131 * from @tg may put its reference and @parent_sq might end up
1132 * getting released prematurely. Remember the tg to put and put it
1133 * after @bio is transferred to @parent_sq.
1134 */
1135 bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
73f0d49a 1136 sq->nr_queued[rw]--;
e43473b7
VG
1137
1138 throtl_charge_bio(tg, bio);
6bc9c2b4
TH
1139
1140 /*
1141 * If our parent is another tg, we just need to transfer @bio to
1142 * the parent using throtl_add_bio_tg(). If our parent is
1143 * @td->service_queue, @bio is ready to be issued. Put it on its
1144 * bio_lists[] and decrease total number queued. The caller is
1145 * responsible for issuing these bios.
1146 */
1147 if (parent_tg) {
c5cc2070 1148 throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
32ee5bc4 1149 start_parent_slice_with_credit(tg, parent_tg, rw);
6bc9c2b4 1150 } else {
c5cc2070
TH
1151 throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
1152 &parent_sq->queued[rw]);
6bc9c2b4
TH
1153 BUG_ON(tg->td->nr_queued[rw] <= 0);
1154 tg->td->nr_queued[rw]--;
1155 }
e43473b7 1156
0f3457f6 1157 throtl_trim_slice(tg, rw);
6bc9c2b4 1158
c5cc2070
TH
1159 if (tg_to_put)
1160 blkg_put(tg_to_blkg(tg_to_put));
e43473b7
VG
1161}
1162
77216b04 1163static int throtl_dispatch_tg(struct throtl_grp *tg)
e43473b7 1164{
73f0d49a 1165 struct throtl_service_queue *sq = &tg->service_queue;
e43473b7
VG
1166 unsigned int nr_reads = 0, nr_writes = 0;
1167 unsigned int max_nr_reads = throtl_grp_quantum*3/4;
c2f6805d 1168 unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
e43473b7
VG
1169 struct bio *bio;
1170
1171 /* Try to dispatch 75% READS and 25% WRITES */
1172
c5cc2070 1173 while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
0f3457f6 1174 tg_may_dispatch(tg, bio, NULL)) {
e43473b7 1175
77216b04 1176 tg_dispatch_one_bio(tg, bio_data_dir(bio));
e43473b7
VG
1177 nr_reads++;
1178
1179 if (nr_reads >= max_nr_reads)
1180 break;
1181 }
1182
c5cc2070 1183 while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
0f3457f6 1184 tg_may_dispatch(tg, bio, NULL)) {
e43473b7 1185
77216b04 1186 tg_dispatch_one_bio(tg, bio_data_dir(bio));
e43473b7
VG
1187 nr_writes++;
1188
1189 if (nr_writes >= max_nr_writes)
1190 break;
1191 }
1192
1193 return nr_reads + nr_writes;
1194}
1195
651930bc 1196static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
e43473b7
VG
1197{
1198 unsigned int nr_disp = 0;
e43473b7
VG
1199
1200 while (1) {
73f0d49a
TH
1201 struct throtl_grp *tg = throtl_rb_first(parent_sq);
1202 struct throtl_service_queue *sq = &tg->service_queue;
e43473b7
VG
1203
1204 if (!tg)
1205 break;
1206
1207 if (time_before(jiffies, tg->disptime))
1208 break;
1209
77216b04 1210 throtl_dequeue_tg(tg);
e43473b7 1211
77216b04 1212 nr_disp += throtl_dispatch_tg(tg);
e43473b7 1213
73f0d49a 1214 if (sq->nr_queued[0] || sq->nr_queued[1])
77216b04 1215 tg_update_disptime(tg);
e43473b7
VG
1216
1217 if (nr_disp >= throtl_quantum)
1218 break;
1219 }
1220
1221 return nr_disp;
1222}
1223
c79892c5
SL
1224static bool throtl_can_upgrade(struct throtl_data *td,
1225 struct throtl_grp *this_tg);
6e1a5704
TH
1226/**
1227 * throtl_pending_timer_fn - timer function for service_queue->pending_timer
1228 * @arg: the throtl_service_queue being serviced
1229 *
1230 * This timer is armed when a child throtl_grp with active bio's become
1231 * pending and queued on the service_queue's pending_tree and expires when
1232 * the first child throtl_grp should be dispatched. This function
2e48a530
TH
1233 * dispatches bio's from the children throtl_grps to the parent
1234 * service_queue.
1235 *
1236 * If the parent's parent is another throtl_grp, dispatching is propagated
1237 * by either arming its pending_timer or repeating dispatch directly. If
1238 * the top-level service_tree is reached, throtl_data->dispatch_work is
1239 * kicked so that the ready bio's are issued.
6e1a5704 1240 */
69df0ab0
TH
1241static void throtl_pending_timer_fn(unsigned long arg)
1242{
1243 struct throtl_service_queue *sq = (void *)arg;
2e48a530 1244 struct throtl_grp *tg = sq_to_tg(sq);
69df0ab0 1245 struct throtl_data *td = sq_to_td(sq);
cb76199c 1246 struct request_queue *q = td->queue;
2e48a530
TH
1247 struct throtl_service_queue *parent_sq;
1248 bool dispatched;
6e1a5704 1249 int ret;
e43473b7
VG
1250
1251 spin_lock_irq(q->queue_lock);
c79892c5
SL
1252 if (throtl_can_upgrade(td, NULL))
1253 throtl_upgrade_state(td);
1254
2e48a530
TH
1255again:
1256 parent_sq = sq->parent_sq;
1257 dispatched = false;
e43473b7 1258
7f52f98c
TH
1259 while (true) {
1260 throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
2e48a530
TH
1261 sq->nr_queued[READ] + sq->nr_queued[WRITE],
1262 sq->nr_queued[READ], sq->nr_queued[WRITE]);
7f52f98c
TH
1263
1264 ret = throtl_select_dispatch(sq);
1265 if (ret) {
7f52f98c
TH
1266 throtl_log(sq, "bios disp=%u", ret);
1267 dispatched = true;
1268 }
e43473b7 1269
7f52f98c
TH
1270 if (throtl_schedule_next_dispatch(sq, false))
1271 break;
e43473b7 1272
7f52f98c
TH
1273 /* this dispatch windows is still open, relax and repeat */
1274 spin_unlock_irq(q->queue_lock);
1275 cpu_relax();
1276 spin_lock_irq(q->queue_lock);
651930bc 1277 }
e43473b7 1278
2e48a530
TH
1279 if (!dispatched)
1280 goto out_unlock;
6e1a5704 1281
2e48a530
TH
1282 if (parent_sq) {
1283 /* @parent_sq is another throl_grp, propagate dispatch */
1284 if (tg->flags & THROTL_TG_WAS_EMPTY) {
1285 tg_update_disptime(tg);
1286 if (!throtl_schedule_next_dispatch(parent_sq, false)) {
1287 /* window is already open, repeat dispatching */
1288 sq = parent_sq;
1289 tg = sq_to_tg(sq);
1290 goto again;
1291 }
1292 }
1293 } else {
1294 /* reached the top-level, queue issueing */
1295 queue_work(kthrotld_workqueue, &td->dispatch_work);
1296 }
1297out_unlock:
e43473b7 1298 spin_unlock_irq(q->queue_lock);
6e1a5704 1299}
e43473b7 1300
6e1a5704
TH
1301/**
1302 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
1303 * @work: work item being executed
1304 *
1305 * This function is queued for execution when bio's reach the bio_lists[]
1306 * of throtl_data->service_queue. Those bio's are ready and issued by this
1307 * function.
1308 */
8876e140 1309static void blk_throtl_dispatch_work_fn(struct work_struct *work)
6e1a5704
TH
1310{
1311 struct throtl_data *td = container_of(work, struct throtl_data,
1312 dispatch_work);
1313 struct throtl_service_queue *td_sq = &td->service_queue;
1314 struct request_queue *q = td->queue;
1315 struct bio_list bio_list_on_stack;
1316 struct bio *bio;
1317 struct blk_plug plug;
1318 int rw;
1319
1320 bio_list_init(&bio_list_on_stack);
1321
1322 spin_lock_irq(q->queue_lock);
c5cc2070
TH
1323 for (rw = READ; rw <= WRITE; rw++)
1324 while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
1325 bio_list_add(&bio_list_on_stack, bio);
6e1a5704
TH
1326 spin_unlock_irq(q->queue_lock);
1327
1328 if (!bio_list_empty(&bio_list_on_stack)) {
69d60eb9 1329 blk_start_plug(&plug);
e43473b7
VG
1330 while((bio = bio_list_pop(&bio_list_on_stack)))
1331 generic_make_request(bio);
69d60eb9 1332 blk_finish_plug(&plug);
e43473b7 1333 }
e43473b7
VG
1334}
1335
f95a04af
TH
1336static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
1337 int off)
60c2bc2d 1338{
f95a04af
TH
1339 struct throtl_grp *tg = pd_to_tg(pd);
1340 u64 v = *(u64 *)((void *)tg + off);
60c2bc2d 1341
2ab5492d 1342 if (v == U64_MAX)
60c2bc2d 1343 return 0;
f95a04af 1344 return __blkg_prfill_u64(sf, pd, v);
60c2bc2d
TH
1345}
1346
f95a04af
TH
1347static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
1348 int off)
e43473b7 1349{
f95a04af
TH
1350 struct throtl_grp *tg = pd_to_tg(pd);
1351 unsigned int v = *(unsigned int *)((void *)tg + off);
fe071437 1352
2ab5492d 1353 if (v == UINT_MAX)
af133ceb 1354 return 0;
f95a04af 1355 return __blkg_prfill_u64(sf, pd, v);
e43473b7
VG
1356}
1357
2da8ca82 1358static int tg_print_conf_u64(struct seq_file *sf, void *v)
8e89d13f 1359{
2da8ca82
TH
1360 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
1361 &blkcg_policy_throtl, seq_cft(sf)->private, false);
af133ceb 1362 return 0;
8e89d13f
VG
1363}
1364
2da8ca82 1365static int tg_print_conf_uint(struct seq_file *sf, void *v)
8e89d13f 1366{
2da8ca82
TH
1367 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
1368 &blkcg_policy_throtl, seq_cft(sf)->private, false);
af133ceb 1369 return 0;
60c2bc2d
TH
1370}
1371
9bb67aeb 1372static void tg_conf_updated(struct throtl_grp *tg, bool global)
60c2bc2d 1373{
69948b07 1374 struct throtl_service_queue *sq = &tg->service_queue;
492eb21b 1375 struct cgroup_subsys_state *pos_css;
69948b07 1376 struct blkcg_gq *blkg;
af133ceb 1377
fda6f272
TH
1378 throtl_log(&tg->service_queue,
1379 "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
9f626e37
SL
1380 tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE),
1381 tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE));
632b4493 1382
693e751e
TH
1383 /*
1384 * Update has_rules[] flags for the updated tg's subtree. A tg is
1385 * considered to have rules if either the tg itself or any of its
1386 * ancestors has rules. This identifies groups without any
1387 * restrictions in the whole hierarchy and allows them to bypass
1388 * blk-throttle.
1389 */
9bb67aeb
SL
1390 blkg_for_each_descendant_pre(blkg, pos_css,
1391 global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) {
5b81fc3c
SL
1392 struct throtl_grp *this_tg = blkg_to_tg(blkg);
1393 struct throtl_grp *parent_tg;
1394
1395 tg_update_has_rules(this_tg);
1396 /* ignore root/second level */
1397 if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent ||
1398 !blkg->parent->parent)
1399 continue;
1400 parent_tg = blkg_to_tg(blkg->parent);
1401 /*
1402 * make sure all children has lower idle time threshold and
1403 * higher latency target
1404 */
1405 this_tg->idletime_threshold = min(this_tg->idletime_threshold,
1406 parent_tg->idletime_threshold);
1407 this_tg->latency_target = max(this_tg->latency_target,
1408 parent_tg->latency_target);
1409 }
693e751e 1410
632b4493
TH
1411 /*
1412 * We're already holding queue_lock and know @tg is valid. Let's
1413 * apply the new config directly.
1414 *
1415 * Restart the slices for both READ and WRITES. It might happen
1416 * that a group's limit are dropped suddenly and we don't want to
1417 * account recently dispatched IO with new low rate.
1418 */
0f3457f6
TH
1419 throtl_start_new_slice(tg, 0);
1420 throtl_start_new_slice(tg, 1);
632b4493 1421
5b2c16aa 1422 if (tg->flags & THROTL_TG_PENDING) {
77216b04 1423 tg_update_disptime(tg);
7f52f98c 1424 throtl_schedule_next_dispatch(sq->parent_sq, true);
632b4493 1425 }
69948b07
TH
1426}
1427
1428static ssize_t tg_set_conf(struct kernfs_open_file *of,
1429 char *buf, size_t nbytes, loff_t off, bool is_u64)
1430{
1431 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1432 struct blkg_conf_ctx ctx;
1433 struct throtl_grp *tg;
1434 int ret;
1435 u64 v;
1436
1437 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
1438 if (ret)
1439 return ret;
1440
1441 ret = -EINVAL;
1442 if (sscanf(ctx.body, "%llu", &v) != 1)
1443 goto out_finish;
1444 if (!v)
2ab5492d 1445 v = U64_MAX;
69948b07
TH
1446
1447 tg = blkg_to_tg(ctx.blkg);
1448
1449 if (is_u64)
1450 *(u64 *)((void *)tg + of_cft(of)->private) = v;
1451 else
1452 *(unsigned int *)((void *)tg + of_cft(of)->private) = v;
60c2bc2d 1453
9bb67aeb 1454 tg_conf_updated(tg, false);
36aa9e5f
TH
1455 ret = 0;
1456out_finish:
60c2bc2d 1457 blkg_conf_finish(&ctx);
36aa9e5f 1458 return ret ?: nbytes;
8e89d13f
VG
1459}
1460
451af504
TH
1461static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
1462 char *buf, size_t nbytes, loff_t off)
60c2bc2d 1463{
451af504 1464 return tg_set_conf(of, buf, nbytes, off, true);
60c2bc2d
TH
1465}
1466
451af504
TH
1467static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
1468 char *buf, size_t nbytes, loff_t off)
60c2bc2d 1469{
451af504 1470 return tg_set_conf(of, buf, nbytes, off, false);
60c2bc2d
TH
1471}
1472
880f50e2 1473static struct cftype throtl_legacy_files[] = {
60c2bc2d
TH
1474 {
1475 .name = "throttle.read_bps_device",
9f626e37 1476 .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]),
2da8ca82 1477 .seq_show = tg_print_conf_u64,
451af504 1478 .write = tg_set_conf_u64,
60c2bc2d
TH
1479 },
1480 {
1481 .name = "throttle.write_bps_device",
9f626e37 1482 .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]),
2da8ca82 1483 .seq_show = tg_print_conf_u64,
451af504 1484 .write = tg_set_conf_u64,
60c2bc2d
TH
1485 },
1486 {
1487 .name = "throttle.read_iops_device",
9f626e37 1488 .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]),
2da8ca82 1489 .seq_show = tg_print_conf_uint,
451af504 1490 .write = tg_set_conf_uint,
60c2bc2d
TH
1491 },
1492 {
1493 .name = "throttle.write_iops_device",
9f626e37 1494 .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]),
2da8ca82 1495 .seq_show = tg_print_conf_uint,
451af504 1496 .write = tg_set_conf_uint,
60c2bc2d
TH
1497 },
1498 {
1499 .name = "throttle.io_service_bytes",
77ea7338
TH
1500 .private = (unsigned long)&blkcg_policy_throtl,
1501 .seq_show = blkg_print_stat_bytes,
60c2bc2d
TH
1502 },
1503 {
1504 .name = "throttle.io_serviced",
77ea7338
TH
1505 .private = (unsigned long)&blkcg_policy_throtl,
1506 .seq_show = blkg_print_stat_ios,
60c2bc2d
TH
1507 },
1508 { } /* terminate */
1509};
1510
cd5ab1b0 1511static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
2ee867dc
TH
1512 int off)
1513{
1514 struct throtl_grp *tg = pd_to_tg(pd);
1515 const char *dname = blkg_dev_name(pd->blkg);
1516 char bufs[4][21] = { "max", "max", "max", "max" };
cd5ab1b0
SL
1517 u64 bps_dft;
1518 unsigned int iops_dft;
ada75b6e 1519 char idle_time[26] = "";
ec80991d 1520 char latency_time[26] = "";
2ee867dc
TH
1521
1522 if (!dname)
1523 return 0;
9f626e37 1524
cd5ab1b0
SL
1525 if (off == LIMIT_LOW) {
1526 bps_dft = 0;
1527 iops_dft = 0;
1528 } else {
1529 bps_dft = U64_MAX;
1530 iops_dft = UINT_MAX;
1531 }
1532
1533 if (tg->bps_conf[READ][off] == bps_dft &&
1534 tg->bps_conf[WRITE][off] == bps_dft &&
1535 tg->iops_conf[READ][off] == iops_dft &&
ada75b6e 1536 tg->iops_conf[WRITE][off] == iops_dft &&
ec80991d 1537 (off != LIMIT_LOW ||
b4f428ef 1538 (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD &&
5b81fc3c 1539 tg->latency_target_conf == DFL_LATENCY_TARGET)))
2ee867dc
TH
1540 return 0;
1541
9bb67aeb 1542 if (tg->bps_conf[READ][off] != U64_MAX)
9f626e37 1543 snprintf(bufs[0], sizeof(bufs[0]), "%llu",
cd5ab1b0 1544 tg->bps_conf[READ][off]);
9bb67aeb 1545 if (tg->bps_conf[WRITE][off] != U64_MAX)
9f626e37 1546 snprintf(bufs[1], sizeof(bufs[1]), "%llu",
cd5ab1b0 1547 tg->bps_conf[WRITE][off]);
9bb67aeb 1548 if (tg->iops_conf[READ][off] != UINT_MAX)
9f626e37 1549 snprintf(bufs[2], sizeof(bufs[2]), "%u",
cd5ab1b0 1550 tg->iops_conf[READ][off]);
9bb67aeb 1551 if (tg->iops_conf[WRITE][off] != UINT_MAX)
9f626e37 1552 snprintf(bufs[3], sizeof(bufs[3]), "%u",
cd5ab1b0 1553 tg->iops_conf[WRITE][off]);
ada75b6e 1554 if (off == LIMIT_LOW) {
5b81fc3c 1555 if (tg->idletime_threshold_conf == ULONG_MAX)
ada75b6e
SL
1556 strcpy(idle_time, " idle=max");
1557 else
1558 snprintf(idle_time, sizeof(idle_time), " idle=%lu",
5b81fc3c 1559 tg->idletime_threshold_conf);
ec80991d 1560
5b81fc3c 1561 if (tg->latency_target_conf == ULONG_MAX)
ec80991d
SL
1562 strcpy(latency_time, " latency=max");
1563 else
1564 snprintf(latency_time, sizeof(latency_time),
5b81fc3c 1565 " latency=%lu", tg->latency_target_conf);
ada75b6e 1566 }
2ee867dc 1567
ec80991d
SL
1568 seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
1569 dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
1570 latency_time);
2ee867dc
TH
1571 return 0;
1572}
1573
cd5ab1b0 1574static int tg_print_limit(struct seq_file *sf, void *v)
2ee867dc 1575{
cd5ab1b0 1576 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit,
2ee867dc
TH
1577 &blkcg_policy_throtl, seq_cft(sf)->private, false);
1578 return 0;
1579}
1580
cd5ab1b0 1581static ssize_t tg_set_limit(struct kernfs_open_file *of,
2ee867dc
TH
1582 char *buf, size_t nbytes, loff_t off)
1583{
1584 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1585 struct blkg_conf_ctx ctx;
1586 struct throtl_grp *tg;
1587 u64 v[4];
ada75b6e 1588 unsigned long idle_time;
ec80991d 1589 unsigned long latency_time;
2ee867dc 1590 int ret;
cd5ab1b0 1591 int index = of_cft(of)->private;
2ee867dc
TH
1592
1593 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
1594 if (ret)
1595 return ret;
1596
1597 tg = blkg_to_tg(ctx.blkg);
1598
cd5ab1b0
SL
1599 v[0] = tg->bps_conf[READ][index];
1600 v[1] = tg->bps_conf[WRITE][index];
1601 v[2] = tg->iops_conf[READ][index];
1602 v[3] = tg->iops_conf[WRITE][index];
2ee867dc 1603
5b81fc3c
SL
1604 idle_time = tg->idletime_threshold_conf;
1605 latency_time = tg->latency_target_conf;
2ee867dc
TH
1606 while (true) {
1607 char tok[27]; /* wiops=18446744073709551616 */
1608 char *p;
2ab5492d 1609 u64 val = U64_MAX;
2ee867dc
TH
1610 int len;
1611
1612 if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
1613 break;
1614 if (tok[0] == '\0')
1615 break;
1616 ctx.body += len;
1617
1618 ret = -EINVAL;
1619 p = tok;
1620 strsep(&p, "=");
1621 if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
1622 goto out_finish;
1623
1624 ret = -ERANGE;
1625 if (!val)
1626 goto out_finish;
1627
1628 ret = -EINVAL;
1629 if (!strcmp(tok, "rbps"))
1630 v[0] = val;
1631 else if (!strcmp(tok, "wbps"))
1632 v[1] = val;
1633 else if (!strcmp(tok, "riops"))
1634 v[2] = min_t(u64, val, UINT_MAX);
1635 else if (!strcmp(tok, "wiops"))
1636 v[3] = min_t(u64, val, UINT_MAX);
ada75b6e
SL
1637 else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
1638 idle_time = val;
ec80991d
SL
1639 else if (off == LIMIT_LOW && !strcmp(tok, "latency"))
1640 latency_time = val;
2ee867dc
TH
1641 else
1642 goto out_finish;
1643 }
1644
cd5ab1b0
SL
1645 tg->bps_conf[READ][index] = v[0];
1646 tg->bps_conf[WRITE][index] = v[1];
1647 tg->iops_conf[READ][index] = v[2];
1648 tg->iops_conf[WRITE][index] = v[3];
2ee867dc 1649
cd5ab1b0
SL
1650 if (index == LIMIT_MAX) {
1651 tg->bps[READ][index] = v[0];
1652 tg->bps[WRITE][index] = v[1];
1653 tg->iops[READ][index] = v[2];
1654 tg->iops[WRITE][index] = v[3];
1655 }
1656 tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],
1657 tg->bps_conf[READ][LIMIT_MAX]);
1658 tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],
1659 tg->bps_conf[WRITE][LIMIT_MAX]);
1660 tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],
1661 tg->iops_conf[READ][LIMIT_MAX]);
1662 tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
1663 tg->iops_conf[WRITE][LIMIT_MAX]);
b4f428ef
SL
1664 tg->idletime_threshold_conf = idle_time;
1665 tg->latency_target_conf = latency_time;
1666
1667 /* force user to configure all settings for low limit */
1668 if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] ||
1669 tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) ||
1670 tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD ||
1671 tg->latency_target_conf == DFL_LATENCY_TARGET) {
1672 tg->bps[READ][LIMIT_LOW] = 0;
1673 tg->bps[WRITE][LIMIT_LOW] = 0;
1674 tg->iops[READ][LIMIT_LOW] = 0;
1675 tg->iops[WRITE][LIMIT_LOW] = 0;
1676 tg->idletime_threshold = DFL_IDLE_THRESHOLD;
1677 tg->latency_target = DFL_LATENCY_TARGET;
1678 } else if (index == LIMIT_LOW) {
5b81fc3c 1679 tg->idletime_threshold = tg->idletime_threshold_conf;
5b81fc3c 1680 tg->latency_target = tg->latency_target_conf;
cd5ab1b0 1681 }
b4f428ef
SL
1682
1683 blk_throtl_update_limit_valid(tg->td);
1684 if (tg->td->limit_valid[LIMIT_LOW]) {
1685 if (index == LIMIT_LOW)
1686 tg->td->limit_index = LIMIT_LOW;
1687 } else
1688 tg->td->limit_index = LIMIT_MAX;
9bb67aeb
SL
1689 tg_conf_updated(tg, index == LIMIT_LOW &&
1690 tg->td->limit_valid[LIMIT_LOW]);
2ee867dc
TH
1691 ret = 0;
1692out_finish:
1693 blkg_conf_finish(&ctx);
1694 return ret ?: nbytes;
1695}
1696
1697static struct cftype throtl_files[] = {
cd5ab1b0
SL
1698#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
1699 {
1700 .name = "low",
1701 .flags = CFTYPE_NOT_ON_ROOT,
1702 .seq_show = tg_print_limit,
1703 .write = tg_set_limit,
1704 .private = LIMIT_LOW,
1705 },
1706#endif
2ee867dc
TH
1707 {
1708 .name = "max",
1709 .flags = CFTYPE_NOT_ON_ROOT,
cd5ab1b0
SL
1710 .seq_show = tg_print_limit,
1711 .write = tg_set_limit,
1712 .private = LIMIT_MAX,
2ee867dc
TH
1713 },
1714 { } /* terminate */
1715};
1716
da527770 1717static void throtl_shutdown_wq(struct request_queue *q)
e43473b7
VG
1718{
1719 struct throtl_data *td = q->td;
1720
69df0ab0 1721 cancel_work_sync(&td->dispatch_work);
e43473b7
VG
1722}
1723
3c798398 1724static struct blkcg_policy blkcg_policy_throtl = {
2ee867dc 1725 .dfl_cftypes = throtl_files,
880f50e2 1726 .legacy_cftypes = throtl_legacy_files,
f9fcc2d3 1727
001bea73 1728 .pd_alloc_fn = throtl_pd_alloc,
f9fcc2d3 1729 .pd_init_fn = throtl_pd_init,
693e751e 1730 .pd_online_fn = throtl_pd_online,
cd5ab1b0 1731 .pd_offline_fn = throtl_pd_offline,
001bea73 1732 .pd_free_fn = throtl_pd_free,
e43473b7
VG
1733};
1734
3f0abd80
SL
1735static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
1736{
1737 unsigned long rtime = jiffies, wtime = jiffies;
1738
1739 if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
1740 rtime = tg->last_low_overflow_time[READ];
1741 if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
1742 wtime = tg->last_low_overflow_time[WRITE];
1743 return min(rtime, wtime);
1744}
1745
1746/* tg should not be an intermediate node */
1747static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
1748{
1749 struct throtl_service_queue *parent_sq;
1750 struct throtl_grp *parent = tg;
1751 unsigned long ret = __tg_last_low_overflow_time(tg);
1752
1753 while (true) {
1754 parent_sq = parent->service_queue.parent_sq;
1755 parent = sq_to_tg(parent_sq);
1756 if (!parent)
1757 break;
1758
1759 /*
1760 * The parent doesn't have low limit, it always reaches low
1761 * limit. Its overflow time is useless for children
1762 */
1763 if (!parent->bps[READ][LIMIT_LOW] &&
1764 !parent->iops[READ][LIMIT_LOW] &&
1765 !parent->bps[WRITE][LIMIT_LOW] &&
1766 !parent->iops[WRITE][LIMIT_LOW])
1767 continue;
1768 if (time_after(__tg_last_low_overflow_time(parent), ret))
1769 ret = __tg_last_low_overflow_time(parent);
1770 }
1771 return ret;
1772}
1773
9e234eea
SL
1774static bool throtl_tg_is_idle(struct throtl_grp *tg)
1775{
1776 /*
1777 * cgroup is idle if:
1778 * - single idle is too long, longer than a fixed value (in case user
b4f428ef 1779 * configure a too big threshold) or 4 times of idletime threshold
9e234eea 1780 * - average think time is more than threshold
53696b8d 1781 * - IO latency is largely below threshold
9e234eea 1782 */
b4f428ef 1783 unsigned long time;
4cff729f 1784 bool ret;
9e234eea 1785
b4f428ef
SL
1786 time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
1787 ret = tg->latency_target == DFL_LATENCY_TARGET ||
1788 tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
1789 (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
1790 tg->avg_idletime > tg->idletime_threshold ||
1791 (tg->latency_target && tg->bio_cnt &&
53696b8d 1792 tg->bad_bio_cnt * 5 < tg->bio_cnt);
4cff729f
SL
1793 throtl_log(&tg->service_queue,
1794 "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d",
1795 tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt,
1796 tg->bio_cnt, ret, tg->td->scale);
1797 return ret;
9e234eea
SL
1798}
1799
c79892c5
SL
1800static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
1801{
1802 struct throtl_service_queue *sq = &tg->service_queue;
1803 bool read_limit, write_limit;
1804
1805 /*
1806 * if cgroup reaches low limit (if low limit is 0, the cgroup always
1807 * reaches), it's ok to upgrade to next limit
1808 */
1809 read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
1810 write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
1811 if (!read_limit && !write_limit)
1812 return true;
1813 if (read_limit && sq->nr_queued[READ] &&
1814 (!write_limit || sq->nr_queued[WRITE]))
1815 return true;
1816 if (write_limit && sq->nr_queued[WRITE] &&
1817 (!read_limit || sq->nr_queued[READ]))
1818 return true;
aec24246
SL
1819
1820 if (time_after_eq(jiffies,
fa6fb5aa
SL
1821 tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
1822 throtl_tg_is_idle(tg))
aec24246 1823 return true;
c79892c5
SL
1824 return false;
1825}
1826
1827static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
1828{
1829 while (true) {
1830 if (throtl_tg_can_upgrade(tg))
1831 return true;
1832 tg = sq_to_tg(tg->service_queue.parent_sq);
1833 if (!tg || !tg_to_blkg(tg)->parent)
1834 return false;
1835 }
1836 return false;
1837}
1838
1839static bool throtl_can_upgrade(struct throtl_data *td,
1840 struct throtl_grp *this_tg)
1841{
1842 struct cgroup_subsys_state *pos_css;
1843 struct blkcg_gq *blkg;
1844
1845 if (td->limit_index != LIMIT_LOW)
1846 return false;
1847
297e3d85 1848 if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
3f0abd80
SL
1849 return false;
1850
c79892c5
SL
1851 rcu_read_lock();
1852 blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
1853 struct throtl_grp *tg = blkg_to_tg(blkg);
1854
1855 if (tg == this_tg)
1856 continue;
1857 if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
1858 continue;
1859 if (!throtl_hierarchy_can_upgrade(tg)) {
1860 rcu_read_unlock();
1861 return false;
1862 }
1863 }
1864 rcu_read_unlock();
1865 return true;
1866}
1867
fa6fb5aa
SL
1868static void throtl_upgrade_check(struct throtl_grp *tg)
1869{
1870 unsigned long now = jiffies;
1871
1872 if (tg->td->limit_index != LIMIT_LOW)
1873 return;
1874
1875 if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
1876 return;
1877
1878 tg->last_check_time = now;
1879
1880 if (!time_after_eq(now,
1881 __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
1882 return;
1883
1884 if (throtl_can_upgrade(tg->td, NULL))
1885 throtl_upgrade_state(tg->td);
1886}
1887
c79892c5
SL
1888static void throtl_upgrade_state(struct throtl_data *td)
1889{
1890 struct cgroup_subsys_state *pos_css;
1891 struct blkcg_gq *blkg;
1892
4cff729f 1893 throtl_log(&td->service_queue, "upgrade to max");
c79892c5 1894 td->limit_index = LIMIT_MAX;
3f0abd80 1895 td->low_upgrade_time = jiffies;
7394e31f 1896 td->scale = 0;
c79892c5
SL
1897 rcu_read_lock();
1898 blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
1899 struct throtl_grp *tg = blkg_to_tg(blkg);
1900 struct throtl_service_queue *sq = &tg->service_queue;
1901
1902 tg->disptime = jiffies - 1;
1903 throtl_select_dispatch(sq);
1904 throtl_schedule_next_dispatch(sq, false);
1905 }
1906 rcu_read_unlock();
1907 throtl_select_dispatch(&td->service_queue);
1908 throtl_schedule_next_dispatch(&td->service_queue, false);
1909 queue_work(kthrotld_workqueue, &td->dispatch_work);
1910}
1911
3f0abd80
SL
1912static void throtl_downgrade_state(struct throtl_data *td, int new)
1913{
7394e31f
SL
1914 td->scale /= 2;
1915
4cff729f 1916 throtl_log(&td->service_queue, "downgrade, scale %d", td->scale);
7394e31f
SL
1917 if (td->scale) {
1918 td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
1919 return;
1920 }
1921
3f0abd80
SL
1922 td->limit_index = new;
1923 td->low_downgrade_time = jiffies;
1924}
1925
1926static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
1927{
1928 struct throtl_data *td = tg->td;
1929 unsigned long now = jiffies;
1930
1931 /*
1932 * If cgroup is below low limit, consider downgrade and throttle other
1933 * cgroups
1934 */
297e3d85
SL
1935 if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
1936 time_after_eq(now, tg_last_low_overflow_time(tg) +
fa6fb5aa
SL
1937 td->throtl_slice) &&
1938 (!throtl_tg_is_idle(tg) ||
1939 !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
3f0abd80
SL
1940 return true;
1941 return false;
1942}
1943
1944static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
1945{
1946 while (true) {
1947 if (!throtl_tg_can_downgrade(tg))
1948 return false;
1949 tg = sq_to_tg(tg->service_queue.parent_sq);
1950 if (!tg || !tg_to_blkg(tg)->parent)
1951 break;
1952 }
1953 return true;
1954}
1955
1956static void throtl_downgrade_check(struct throtl_grp *tg)
1957{
1958 uint64_t bps;
1959 unsigned int iops;
1960 unsigned long elapsed_time;
1961 unsigned long now = jiffies;
1962
1963 if (tg->td->limit_index != LIMIT_MAX ||
1964 !tg->td->limit_valid[LIMIT_LOW])
1965 return;
1966 if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
1967 return;
297e3d85 1968 if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
3f0abd80
SL
1969 return;
1970
1971 elapsed_time = now - tg->last_check_time;
1972 tg->last_check_time = now;
1973
297e3d85
SL
1974 if (time_before(now, tg_last_low_overflow_time(tg) +
1975 tg->td->throtl_slice))
3f0abd80
SL
1976 return;
1977
1978 if (tg->bps[READ][LIMIT_LOW]) {
1979 bps = tg->last_bytes_disp[READ] * HZ;
1980 do_div(bps, elapsed_time);
1981 if (bps >= tg->bps[READ][LIMIT_LOW])
1982 tg->last_low_overflow_time[READ] = now;
1983 }
1984
1985 if (tg->bps[WRITE][LIMIT_LOW]) {
1986 bps = tg->last_bytes_disp[WRITE] * HZ;
1987 do_div(bps, elapsed_time);
1988 if (bps >= tg->bps[WRITE][LIMIT_LOW])
1989 tg->last_low_overflow_time[WRITE] = now;
1990 }
1991
1992 if (tg->iops[READ][LIMIT_LOW]) {
1993 iops = tg->last_io_disp[READ] * HZ / elapsed_time;
1994 if (iops >= tg->iops[READ][LIMIT_LOW])
1995 tg->last_low_overflow_time[READ] = now;
1996 }
1997
1998 if (tg->iops[WRITE][LIMIT_LOW]) {
1999 iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
2000 if (iops >= tg->iops[WRITE][LIMIT_LOW])
2001 tg->last_low_overflow_time[WRITE] = now;
2002 }
2003
2004 /*
2005 * If cgroup is below low limit, consider downgrade and throttle other
2006 * cgroups
2007 */
2008 if (throtl_hierarchy_can_downgrade(tg))
2009 throtl_downgrade_state(tg->td, LIMIT_LOW);
2010
2011 tg->last_bytes_disp[READ] = 0;
2012 tg->last_bytes_disp[WRITE] = 0;
2013 tg->last_io_disp[READ] = 0;
2014 tg->last_io_disp[WRITE] = 0;
2015}
2016
9e234eea
SL
2017static void blk_throtl_update_idletime(struct throtl_grp *tg)
2018{
2019 unsigned long now = ktime_get_ns() >> 10;
2020 unsigned long last_finish_time = tg->last_finish_time;
2021
2022 if (now <= last_finish_time || last_finish_time == 0 ||
2023 last_finish_time == tg->checked_last_finish_time)
2024 return;
2025
2026 tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
2027 tg->checked_last_finish_time = last_finish_time;
2028}
2029
b9147dd1
SL
2030#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2031static void throtl_update_latency_buckets(struct throtl_data *td)
2032{
2033 struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
2034 int i, cpu;
2035 unsigned long last_latency = 0;
2036 unsigned long latency;
2037
2038 if (!blk_queue_nonrot(td->queue))
2039 return;
2040 if (time_before(jiffies, td->last_calculate_time + HZ))
2041 return;
2042 td->last_calculate_time = jiffies;
2043
2044 memset(avg_latency, 0, sizeof(avg_latency));
2045 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
2046 struct latency_bucket *tmp = &td->tmp_buckets[i];
2047
2048 for_each_possible_cpu(cpu) {
2049 struct latency_bucket *bucket;
2050
2051 /* this isn't race free, but ok in practice */
2052 bucket = per_cpu_ptr(td->latency_buckets, cpu);
2053 tmp->total_latency += bucket[i].total_latency;
2054 tmp->samples += bucket[i].samples;
2055 bucket[i].total_latency = 0;
2056 bucket[i].samples = 0;
2057 }
2058
2059 if (tmp->samples >= 32) {
2060 int samples = tmp->samples;
2061
2062 latency = tmp->total_latency;
2063
2064 tmp->total_latency = 0;
2065 tmp->samples = 0;
2066 latency /= samples;
2067 if (latency == 0)
2068 continue;
2069 avg_latency[i].latency = latency;
2070 }
2071 }
2072
2073 for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
2074 if (!avg_latency[i].latency) {
2075 if (td->avg_buckets[i].latency < last_latency)
2076 td->avg_buckets[i].latency = last_latency;
2077 continue;
2078 }
2079
2080 if (!td->avg_buckets[i].valid)
2081 latency = avg_latency[i].latency;
2082 else
2083 latency = (td->avg_buckets[i].latency * 7 +
2084 avg_latency[i].latency) >> 3;
2085
2086 td->avg_buckets[i].latency = max(latency, last_latency);
2087 td->avg_buckets[i].valid = true;
2088 last_latency = td->avg_buckets[i].latency;
2089 }
4cff729f
SL
2090
2091 for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
2092 throtl_log(&td->service_queue,
2093 "Latency bucket %d: latency=%ld, valid=%d", i,
2094 td->avg_buckets[i].latency, td->avg_buckets[i].valid);
b9147dd1
SL
2095}
2096#else
2097static inline void throtl_update_latency_buckets(struct throtl_data *td)
2098{
2099}
2100#endif
2101
2bc19cd5
JA
2102static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
2103{
2104#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
007cc56b 2105 if (bio->bi_css)
2bc19cd5
JA
2106 bio->bi_cg_private = tg;
2107 blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
2bc19cd5
JA
2108#endif
2109}
2110
ae118896
TH
2111bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
2112 struct bio *bio)
e43473b7 2113{
c5cc2070 2114 struct throtl_qnode *qn = NULL;
ae118896 2115 struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
73f0d49a 2116 struct throtl_service_queue *sq;
0e9f4164 2117 bool rw = bio_data_dir(bio);
bc16a4f9 2118 bool throttled = false;
b9147dd1 2119 struct throtl_data *td = tg->td;
e43473b7 2120
ae118896
TH
2121 WARN_ON_ONCE(!rcu_read_lock_held());
2122
2a0f61e6 2123 /* see throtl_charge_bio() */
8d2bbd4c 2124 if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw])
bc16a4f9 2125 goto out;
e43473b7
VG
2126
2127 spin_lock_irq(q->queue_lock);
c9589f03 2128
b9147dd1
SL
2129 throtl_update_latency_buckets(td);
2130
c9589f03 2131 if (unlikely(blk_queue_bypass(q)))
bc16a4f9 2132 goto out_unlock;
f469a7b4 2133
2bc19cd5 2134 blk_throtl_assoc_bio(tg, bio);
9e234eea
SL
2135 blk_throtl_update_idletime(tg);
2136
73f0d49a
TH
2137 sq = &tg->service_queue;
2138
c79892c5 2139again:
9e660acf 2140 while (true) {
3f0abd80
SL
2141 if (tg->last_low_overflow_time[rw] == 0)
2142 tg->last_low_overflow_time[rw] = jiffies;
2143 throtl_downgrade_check(tg);
fa6fb5aa 2144 throtl_upgrade_check(tg);
9e660acf
TH
2145 /* throtl is FIFO - if bios are already queued, should queue */
2146 if (sq->nr_queued[rw])
2147 break;
de701c74 2148
9e660acf 2149 /* if above limits, break to queue */
c79892c5 2150 if (!tg_may_dispatch(tg, bio, NULL)) {
3f0abd80 2151 tg->last_low_overflow_time[rw] = jiffies;
b9147dd1
SL
2152 if (throtl_can_upgrade(td, tg)) {
2153 throtl_upgrade_state(td);
c79892c5
SL
2154 goto again;
2155 }
9e660acf 2156 break;
c79892c5 2157 }
9e660acf
TH
2158
2159 /* within limits, let's charge and dispatch directly */
e43473b7 2160 throtl_charge_bio(tg, bio);
04521db0
VG
2161
2162 /*
2163 * We need to trim slice even when bios are not being queued
2164 * otherwise it might happen that a bio is not queued for
2165 * a long time and slice keeps on extending and trim is not
2166 * called for a long time. Now if limits are reduced suddenly
2167 * we take into account all the IO dispatched so far at new
2168 * low rate and * newly queued IO gets a really long dispatch
2169 * time.
2170 *
2171 * So keep on trimming slice even if bio is not queued.
2172 */
0f3457f6 2173 throtl_trim_slice(tg, rw);
9e660acf
TH
2174
2175 /*
2176 * @bio passed through this layer without being throttled.
2177 * Climb up the ladder. If we''re already at the top, it
2178 * can be executed directly.
2179 */
c5cc2070 2180 qn = &tg->qnode_on_parent[rw];
9e660acf
TH
2181 sq = sq->parent_sq;
2182 tg = sq_to_tg(sq);
2183 if (!tg)
2184 goto out_unlock;
e43473b7
VG
2185 }
2186
9e660acf 2187 /* out-of-limit, queue to @tg */
fda6f272
TH
2188 throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
2189 rw == READ ? 'R' : 'W',
9f626e37
SL
2190 tg->bytes_disp[rw], bio->bi_iter.bi_size,
2191 tg_bps_limit(tg, rw),
2192 tg->io_disp[rw], tg_iops_limit(tg, rw),
fda6f272 2193 sq->nr_queued[READ], sq->nr_queued[WRITE]);
e43473b7 2194
3f0abd80
SL
2195 tg->last_low_overflow_time[rw] = jiffies;
2196
b9147dd1 2197 td->nr_queued[rw]++;
c5cc2070 2198 throtl_add_bio_tg(bio, qn, tg);
bc16a4f9 2199 throttled = true;
e43473b7 2200
7f52f98c
TH
2201 /*
2202 * Update @tg's dispatch time and force schedule dispatch if @tg
2203 * was empty before @bio. The forced scheduling isn't likely to
2204 * cause undue delay as @bio is likely to be dispatched directly if
2205 * its @tg's disptime is not in the future.
2206 */
0e9f4164 2207 if (tg->flags & THROTL_TG_WAS_EMPTY) {
77216b04 2208 tg_update_disptime(tg);
7f52f98c 2209 throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
e43473b7
VG
2210 }
2211
bc16a4f9 2212out_unlock:
e43473b7 2213 spin_unlock_irq(q->queue_lock);
bc16a4f9 2214out:
2a0f61e6
TH
2215 /*
2216 * As multiple blk-throtls may stack in the same issue path, we
2217 * don't want bios to leave with the flag set. Clear the flag if
2218 * being issued.
2219 */
2220 if (!throttled)
8d2bbd4c 2221 bio_clear_flag(bio, BIO_THROTTLED);
b9147dd1
SL
2222
2223#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2224 if (throttled || !td->track_bio_latency)
2225 bio->bi_issue_stat.stat |= SKIP_LATENCY;
2226#endif
bc16a4f9 2227 return throttled;
e43473b7
VG
2228}
2229
9e234eea 2230#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
b9147dd1
SL
2231static void throtl_track_latency(struct throtl_data *td, sector_t size,
2232 int op, unsigned long time)
2233{
2234 struct latency_bucket *latency;
2235 int index;
2236
2237 if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
2238 !blk_queue_nonrot(td->queue))
2239 return;
2240
2241 index = request_bucket_index(size);
2242
2243 latency = get_cpu_ptr(td->latency_buckets);
2244 latency[index].total_latency += time;
2245 latency[index].samples++;
2246 put_cpu_ptr(td->latency_buckets);
2247}
2248
2249void blk_throtl_stat_add(struct request *rq, u64 time_ns)
2250{
2251 struct request_queue *q = rq->q;
2252 struct throtl_data *td = q->td;
2253
2254 throtl_track_latency(td, blk_stat_size(&rq->issue_stat),
2255 req_op(rq), time_ns >> 10);
2256}
2257
9e234eea
SL
2258void blk_throtl_bio_endio(struct bio *bio)
2259{
2260 struct throtl_grp *tg;
b9147dd1
SL
2261 u64 finish_time_ns;
2262 unsigned long finish_time;
2263 unsigned long start_time;
2264 unsigned long lat;
9e234eea
SL
2265
2266 tg = bio->bi_cg_private;
2267 if (!tg)
2268 return;
2269 bio->bi_cg_private = NULL;
2270
b9147dd1
SL
2271 finish_time_ns = ktime_get_ns();
2272 tg->last_finish_time = finish_time_ns >> 10;
2273
2274 start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
2275 finish_time = __blk_stat_time(finish_time_ns) >> 10;
53696b8d
SL
2276 if (!start_time || finish_time <= start_time)
2277 return;
2278
2279 lat = finish_time - start_time;
b9147dd1 2280 /* this is only for bio based driver */
53696b8d 2281 if (!(bio->bi_issue_stat.stat & SKIP_LATENCY))
b9147dd1
SL
2282 throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
2283 bio_op(bio), lat);
53696b8d 2284
6679a90c 2285 if (tg->latency_target && lat >= tg->td->filtered_latency) {
53696b8d
SL
2286 int bucket;
2287 unsigned int threshold;
2288
2289 bucket = request_bucket_index(
2290 blk_stat_size(&bio->bi_issue_stat));
2291 threshold = tg->td->avg_buckets[bucket].latency +
2292 tg->latency_target;
2293 if (lat > threshold)
2294 tg->bad_bio_cnt++;
2295 /*
2296 * Not race free, could get wrong count, which means cgroups
2297 * will be throttled
2298 */
2299 tg->bio_cnt++;
2300 }
2301
2302 if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
2303 tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
2304 tg->bio_cnt /= 2;
2305 tg->bad_bio_cnt /= 2;
b9147dd1 2306 }
9e234eea
SL
2307}
2308#endif
2309
2a12f0dc
TH
2310/*
2311 * Dispatch all bios from all children tg's queued on @parent_sq. On
2312 * return, @parent_sq is guaranteed to not have any active children tg's
2313 * and all bios from previously active tg's are on @parent_sq->bio_lists[].
2314 */
2315static void tg_drain_bios(struct throtl_service_queue *parent_sq)
2316{
2317 struct throtl_grp *tg;
2318
2319 while ((tg = throtl_rb_first(parent_sq))) {
2320 struct throtl_service_queue *sq = &tg->service_queue;
2321 struct bio *bio;
2322
2323 throtl_dequeue_tg(tg);
2324
c5cc2070 2325 while ((bio = throtl_peek_queued(&sq->queued[READ])))
2a12f0dc 2326 tg_dispatch_one_bio(tg, bio_data_dir(bio));
c5cc2070 2327 while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
2a12f0dc
TH
2328 tg_dispatch_one_bio(tg, bio_data_dir(bio));
2329 }
2330}
2331
c9a929dd
TH
2332/**
2333 * blk_throtl_drain - drain throttled bios
2334 * @q: request_queue to drain throttled bios for
2335 *
2336 * Dispatch all currently throttled bios on @q through ->make_request_fn().
2337 */
2338void blk_throtl_drain(struct request_queue *q)
2339 __releases(q->queue_lock) __acquires(q->queue_lock)
2340{
2341 struct throtl_data *td = q->td;
2a12f0dc 2342 struct blkcg_gq *blkg;
492eb21b 2343 struct cgroup_subsys_state *pos_css;
c9a929dd 2344 struct bio *bio;
651930bc 2345 int rw;
c9a929dd 2346
8bcb6c7d 2347 queue_lockdep_assert_held(q);
2a12f0dc 2348 rcu_read_lock();
c9a929dd 2349
2a12f0dc
TH
2350 /*
2351 * Drain each tg while doing post-order walk on the blkg tree, so
2352 * that all bios are propagated to td->service_queue. It'd be
2353 * better to walk service_queue tree directly but blkg walk is
2354 * easier.
2355 */
492eb21b 2356 blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg)
2a12f0dc 2357 tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
73f0d49a 2358
2a12f0dc
TH
2359 /* finally, transfer bios from top-level tg's into the td */
2360 tg_drain_bios(&td->service_queue);
2361
2362 rcu_read_unlock();
c9a929dd
TH
2363 spin_unlock_irq(q->queue_lock);
2364
2a12f0dc 2365 /* all bios now should be in td->service_queue, issue them */
651930bc 2366 for (rw = READ; rw <= WRITE; rw++)
c5cc2070
TH
2367 while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
2368 NULL)))
651930bc 2369 generic_make_request(bio);
c9a929dd
TH
2370
2371 spin_lock_irq(q->queue_lock);
2372}
2373
e43473b7
VG
2374int blk_throtl_init(struct request_queue *q)
2375{
2376 struct throtl_data *td;
a2b1693b 2377 int ret;
e43473b7
VG
2378
2379 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
2380 if (!td)
2381 return -ENOMEM;
b9147dd1
SL
2382 td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
2383 LATENCY_BUCKET_SIZE, __alignof__(u64));
2384 if (!td->latency_buckets) {
2385 kfree(td);
2386 return -ENOMEM;
2387 }
e43473b7 2388
69df0ab0 2389 INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
b2ce2643 2390 throtl_service_queue_init(&td->service_queue);
e43473b7 2391
cd1604fa 2392 q->td = td;
29b12589 2393 td->queue = q;
02977e4a 2394
9f626e37 2395 td->limit_valid[LIMIT_MAX] = true;
cd5ab1b0 2396 td->limit_index = LIMIT_MAX;
3f0abd80
SL
2397 td->low_upgrade_time = jiffies;
2398 td->low_downgrade_time = jiffies;
9e234eea 2399
a2b1693b 2400 /* activate policy */
3c798398 2401 ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
b9147dd1
SL
2402 if (ret) {
2403 free_percpu(td->latency_buckets);
f51b802c 2404 kfree(td);
b9147dd1 2405 }
a2b1693b 2406 return ret;
e43473b7
VG
2407}
2408
2409void blk_throtl_exit(struct request_queue *q)
2410{
c875f4d0 2411 BUG_ON(!q->td);
da527770 2412 throtl_shutdown_wq(q);
3c798398 2413 blkcg_deactivate_policy(q, &blkcg_policy_throtl);
b9147dd1 2414 free_percpu(q->td->latency_buckets);
c9a929dd 2415 kfree(q->td);
e43473b7
VG
2416}
2417
d61fcfa4
SL
2418void blk_throtl_register_queue(struct request_queue *q)
2419{
2420 struct throtl_data *td;
6679a90c 2421 int i;
d61fcfa4
SL
2422
2423 td = q->td;
2424 BUG_ON(!td);
2425
6679a90c 2426 if (blk_queue_nonrot(q)) {
d61fcfa4 2427 td->throtl_slice = DFL_THROTL_SLICE_SSD;
6679a90c
SL
2428 td->filtered_latency = LATENCY_FILTERED_SSD;
2429 } else {
d61fcfa4 2430 td->throtl_slice = DFL_THROTL_SLICE_HD;
6679a90c
SL
2431 td->filtered_latency = LATENCY_FILTERED_HD;
2432 for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
2433 td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY;
2434 }
d61fcfa4
SL
2435#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
2436 /* if no low limit, use previous default */
2437 td->throtl_slice = DFL_THROTL_SLICE_HD;
2438#endif
9e234eea 2439
b9147dd1
SL
2440 td->track_bio_latency = !q->mq_ops && !q->request_fn;
2441 if (!td->track_bio_latency)
2442 blk_stat_enable_accounting(q);
d61fcfa4
SL
2443}
2444
297e3d85
SL
2445#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2446ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
2447{
2448 if (!q->td)
2449 return -EINVAL;
2450 return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
2451}
2452
2453ssize_t blk_throtl_sample_time_store(struct request_queue *q,
2454 const char *page, size_t count)
2455{
2456 unsigned long v;
2457 unsigned long t;
2458
2459 if (!q->td)
2460 return -EINVAL;
2461 if (kstrtoul(page, 10, &v))
2462 return -EINVAL;
2463 t = msecs_to_jiffies(v);
2464 if (t == 0 || t > MAX_THROTL_SLICE)
2465 return -EINVAL;
2466 q->td->throtl_slice = t;
2467 return count;
2468}
2469#endif
2470
e43473b7
VG
2471static int __init throtl_init(void)
2472{
450adcbe
VG
2473 kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
2474 if (!kthrotld_workqueue)
2475 panic("Failed to create kthrotld\n");
2476
3c798398 2477 return blkcg_policy_register(&blkcg_policy_throtl);
e43473b7
VG
2478}
2479
2480module_init(throtl_init);