blk-throttle: choose a small throtl_slice for SSD
[mirror_ubuntu-bionic-kernel.git] / block / blk-throttle.c
e43473b7
VG
1/*
2 * Interface for controlling IO bandwidth on a request queue
3 *
4 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
5 */
6
7#include <linux/module.h>
8#include <linux/slab.h>
9#include <linux/blkdev.h>
10#include <linux/bio.h>
11#include <linux/blktrace_api.h>
eea8f41c 12#include <linux/blk-cgroup.h>
bc9fcbf9 13#include "blk.h"
e43473b7
VG
14
15/* Max dispatch from a group in 1 round */
16static int throtl_grp_quantum = 8;
17
18/* Total max dispatch from all groups in one round */
19static int throtl_quantum = 32;
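/*
 * Editorial note (not part of the original source): with the defaults
 * above, a single group moves at most 8 bios per dispatch round, split
 * roughly 6 reads / 2 writes (see throtl_dispatch_tg()), and all groups
 * together move at most 32 per round (see throtl_select_dispatch()).
 */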
20
d61fcfa4
SL
21/* Throttling is performed over a slice and after that slice is renewed */
22#define DFL_THROTL_SLICE_HD (HZ / 10)
23#define DFL_THROTL_SLICE_SSD (HZ / 50)
297e3d85 24#define MAX_THROTL_SLICE (HZ)
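/*
 * Editorial note (not part of the original source): with HZ timer ticks
 * per second, DFL_THROTL_SLICE_HD (HZ / 10) is ~100ms and
 * DFL_THROTL_SLICE_SSD (HZ / 50) is ~20ms, while MAX_THROTL_SLICE caps a
 * slice at one second. The commit named in the title above selects the
 * smaller default for SSDs so throttling decisions are re-evaluated more
 * often on low-latency devices.
 */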
e43473b7 25
3c798398 26static struct blkcg_policy blkcg_policy_throtl;
0381411e 27
450adcbe
VG
28/* A workqueue to queue throttle related work */
29static struct workqueue_struct *kthrotld_workqueue;
450adcbe 30
c5cc2070
TH
31/*
32 * To implement hierarchical throttling, throtl_grps form a tree and bios
33 * are dispatched upwards level by level until they reach the top and get
34 * issued. When dispatching bios from the children and local group at each
35 * level, if the bios are dispatched into a single bio_list, there's a risk
36 * of a local or child group which can queue many bios at once filling up
37 * the list starving others.
38 *
39 * To avoid such starvation, dispatched bios are queued separately
40 * according to where they came from. When they are again dispatched to
41 * the parent, they're popped in round-robin order so that no single source
42 * hogs the dispatch window.
43 *
44 * throtl_qnode is used to keep the queued bios separated by their sources.
45 * Bios are queued to throtl_qnode which in turn is queued to
46 * throtl_service_queue and then dispatched in round-robin order.
47 *
48 * It's also used to track the reference counts on blkg's. A qnode always
49 * belongs to a throtl_grp and gets queued on itself or the parent, so
50 * incrementing the reference of the associated throtl_grp when a qnode is
51 * queued and decrementing when dequeued is enough to keep the whole blkg
52 * tree pinned while bios are in flight.
53 */
54struct throtl_qnode {
55 struct list_head node; /* service_queue->queued[] */
56 struct bio_list bios; /* queued bios */
57 struct throtl_grp *tg; /* tg this qnode belongs to */
58};
59
c9e0332e 60struct throtl_service_queue {
77216b04
TH
61 struct throtl_service_queue *parent_sq; /* the parent service_queue */
62
73f0d49a
TH
63 /*
64 * Bios queued directly to this service_queue or dispatched from
65 * children throtl_grp's.
66 */
c5cc2070 67 struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
73f0d49a
TH
68 unsigned int nr_queued[2]; /* number of queued bios */
69
70 /*
71 * RB tree of active children throtl_grp's, which are sorted by
72 * their ->disptime.
73 */
c9e0332e
TH
74 struct rb_root pending_tree; /* RB tree of active tgs */
75 struct rb_node *first_pending; /* first node in the tree */
76 unsigned int nr_pending; /* # queued in the tree */
77 unsigned long first_pending_disptime; /* disptime of the first tg */
69df0ab0 78 struct timer_list pending_timer; /* fires on first_pending_disptime */
e43473b7
VG
79};
80
5b2c16aa
TH
81enum tg_state_flags {
82 THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
0e9f4164 83 THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
5b2c16aa
TH
84};
85
e43473b7
VG
86#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
87
9f626e37 88enum {
cd5ab1b0 89 LIMIT_LOW,
9f626e37
SL
90 LIMIT_MAX,
91 LIMIT_CNT,
92};
93
e43473b7 94struct throtl_grp {
f95a04af
TH
95 /* must be the first member */
96 struct blkg_policy_data pd;
97
c9e0332e 98 /* active throtl group service_queue member */
e43473b7
VG
99 struct rb_node rb_node;
100
0f3457f6
TH
101 /* throtl_data this group belongs to */
102 struct throtl_data *td;
103
49a2f1e3
TH
104 /* this group's service queue */
105 struct throtl_service_queue service_queue;
106
c5cc2070
TH
107 /*
108 * qnode_on_self is used when bios are directly queued to this
109 * throtl_grp so that local bios compete fairly with bios
110 * dispatched from children. qnode_on_parent is used when bios are
111 * dispatched from this throtl_grp into its parent and will compete
112 * with the sibling qnode_on_parents and the parent's
113 * qnode_on_self.
114 */
115 struct throtl_qnode qnode_on_self[2];
116 struct throtl_qnode qnode_on_parent[2];
117
e43473b7
VG
118 /*
119 * Dispatch time in jiffies. This is the estimated time when the group
120 * will unthrottle and be ready to dispatch more bios. It is used as
121 * the key to sort active groups in the service tree.
122 */
123 unsigned long disptime;
124
e43473b7
VG
125 unsigned int flags;
126
693e751e
TH
127 /* are there any throtl rules between this group and td? */
128 bool has_rules[2];
129
cd5ab1b0 130 /* internally used bytes per second rate limits */
9f626e37 131 uint64_t bps[2][LIMIT_CNT];
cd5ab1b0
SL
132 /* user configured bps limits */
133 uint64_t bps_conf[2][LIMIT_CNT];
e43473b7 134
cd5ab1b0 135 /* internally used IOPS limits */
9f626e37 136 unsigned int iops[2][LIMIT_CNT];
cd5ab1b0
SL
137 /* user configured IOPS limits */
138 unsigned int iops_conf[2][LIMIT_CNT];
8e89d13f 139
e43473b7
VG
140 /* Number of bytes dispatched in current slice */
141 uint64_t bytes_disp[2];
8e89d13f
VG
142 /* Number of bio's dispatched in current slice */
143 unsigned int io_disp[2];
e43473b7 144
3f0abd80
SL
145 unsigned long last_low_overflow_time[2];
146
147 uint64_t last_bytes_disp[2];
148 unsigned int last_io_disp[2];
149
150 unsigned long last_check_time;
151
e43473b7
VG
152 /* When did we start a new slice */
153 unsigned long slice_start[2];
154 unsigned long slice_end[2];
155};
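/*
 * Editorial note (not part of the original source): throughout this
 * struct the [2]-sized arrays are indexed by direction (READ/WRITE) and,
 * where a second index exists, by limit level (LIMIT_LOW/LIMIT_MAX from
 * the enum above). bps[]/iops[] hold the limits actually enforced while
 * bps_conf[]/iops_conf[] remember what the user configured.
 */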
156
157struct throtl_data
158{
e43473b7 159 /* service tree for active throtl groups */
c9e0332e 160 struct throtl_service_queue service_queue;
e43473b7 161
e43473b7
VG
162 struct request_queue *queue;
163
164 /* Total Number of queued bios on READ and WRITE lists */
165 unsigned int nr_queued[2];
166
297e3d85
SL
167 unsigned int throtl_slice;
168
e43473b7 169 /* Work for dispatching throttled bios */
69df0ab0 170 struct work_struct dispatch_work;
9f626e37
SL
171 unsigned int limit_index;
172 bool limit_valid[LIMIT_CNT];
3f0abd80
SL
173
174 unsigned long low_upgrade_time;
175 unsigned long low_downgrade_time;
e43473b7
VG
176};
177
69df0ab0
TH
178static void throtl_pending_timer_fn(unsigned long arg);
179
f95a04af
TH
180static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
181{
182 return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
183}
184
3c798398 185static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
0381411e 186{
f95a04af 187 return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
0381411e
TH
188}
189
3c798398 190static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
0381411e 191{
f95a04af 192 return pd_to_blkg(&tg->pd);
0381411e
TH
193}
194
fda6f272
TH
195/**
196 * sq_to_tg - return the throtl_grp the specified service queue belongs to
197 * @sq: the throtl_service_queue of interest
198 *
199 * Return the throtl_grp @sq belongs to. If @sq is the top-level one
200 * embedded in throtl_data, %NULL is returned.
201 */
202static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
203{
204 if (sq && sq->parent_sq)
205 return container_of(sq, struct throtl_grp, service_queue);
206 else
207 return NULL;
208}
209
210/**
211 * sq_to_td - return throtl_data the specified service queue belongs to
212 * @sq: the throtl_service_queue of interest
213 *
b43daedc 214 * A service_queue can be embedded in either a throtl_grp or throtl_data.
fda6f272
TH
215 * Determine the associated throtl_data accordingly and return it.
216 */
217static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
218{
219 struct throtl_grp *tg = sq_to_tg(sq);
220
221 if (tg)
222 return tg->td;
223 else
224 return container_of(sq, struct throtl_data, service_queue);
225}
226
9f626e37
SL
227static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
228{
b22c417c
SL
229 struct blkcg_gq *blkg = tg_to_blkg(tg);
230 uint64_t ret;
231
232 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
233 return U64_MAX;
234 ret = tg->bps[rw][tg->td->limit_index];
235 if (ret == 0 && tg->td->limit_index == LIMIT_LOW)
236 return tg->bps[rw][LIMIT_MAX];
237 return ret;
9f626e37
SL
238}
239
240static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
241{
b22c417c
SL
242 struct blkcg_gq *blkg = tg_to_blkg(tg);
243 unsigned int ret;
244
245 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
246 return UINT_MAX;
247 ret = tg->iops[rw][tg->td->limit_index];
248 if (ret == 0 && tg->td->limit_index == LIMIT_LOW)
249 return tg->iops[rw][LIMIT_MAX];
250 return ret;
9f626e37
SL
251}
252
fda6f272
TH
253/**
254 * throtl_log - log debug message via blktrace
255 * @sq: the service_queue being reported
256 * @fmt: printf format string
257 * @args: printf args
258 *
259 * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
260 * throtl_grp; otherwise, just "throtl".
fda6f272
TH
261 */
262#define throtl_log(sq, fmt, args...) do { \
263 struct throtl_grp *__tg = sq_to_tg((sq)); \
264 struct throtl_data *__td = sq_to_td((sq)); \
265 \
266 (void)__td; \
59fa0224
SL
267 if (likely(!blk_trace_note_message_enabled(__td->queue))) \
268 break; \
fda6f272
TH
269 if ((__tg)) { \
270 char __pbuf[128]; \
54e7ed12 271 \
fda6f272
TH
272 blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \
273 blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
274 } else { \
275 blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \
276 } \
54e7ed12 277} while (0)
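/*
 * Editorial note (illustrative, not part of the original source): when
 * blktrace message recording is enabled on the queue, callers of
 * throtl_log() emit messages such as "throtl /cgrp bios disp=4" for a
 * throtl_grp's service_queue (the path comes from blkg_path(), "/cgrp"
 * is a made-up example), or plain "throtl schedule timer. delay=..."
 * for the top-level service_queue embedded in throtl_data.
 */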
e43473b7 278
c5cc2070
TH
279static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
280{
281 INIT_LIST_HEAD(&qn->node);
282 bio_list_init(&qn->bios);
283 qn->tg = tg;
284}
285
286/**
287 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
288 * @bio: bio being added
289 * @qn: qnode to add bio to
290 * @queued: the service_queue->queued[] list @qn belongs to
291 *
292 * Add @bio to @qn and put @qn on @queued if it's not already on.
293 * @qn->tg's reference count is bumped when @qn is activated. See the
294 * comment on top of throtl_qnode definition for details.
295 */
296static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
297 struct list_head *queued)
298{
299 bio_list_add(&qn->bios, bio);
300 if (list_empty(&qn->node)) {
301 list_add_tail(&qn->node, queued);
302 blkg_get(tg_to_blkg(qn->tg));
303 }
304}
305
306/**
307 * throtl_peek_queued - peek the first bio on a qnode list
308 * @queued: the qnode list to peek
309 */
310static struct bio *throtl_peek_queued(struct list_head *queued)
311{
312 struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
313 struct bio *bio;
314
315 if (list_empty(queued))
316 return NULL;
317
318 bio = bio_list_peek(&qn->bios);
319 WARN_ON_ONCE(!bio);
320 return bio;
321}
322
323/**
324 * throtl_pop_queued - pop the first bio from a qnode list
325 * @queued: the qnode list to pop a bio from
326 * @tg_to_put: optional out argument for throtl_grp to put
327 *
328 * Pop the first bio from the qnode list @queued. After popping, the first
329 * qnode is removed from @queued if empty or moved to the end of @queued so
330 * that the popping order is round-robin.
331 *
332 * When the first qnode is removed, its associated throtl_grp should be put
333 * too. If @tg_to_put is NULL, this function automatically puts it;
334 * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
335 * responsible for putting it.
336 */
337static struct bio *throtl_pop_queued(struct list_head *queued,
338 struct throtl_grp **tg_to_put)
339{
340 struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
341 struct bio *bio;
342
343 if (list_empty(queued))
344 return NULL;
345
346 bio = bio_list_pop(&qn->bios);
347 WARN_ON_ONCE(!bio);
348
349 if (bio_list_empty(&qn->bios)) {
350 list_del_init(&qn->node);
351 if (tg_to_put)
352 *tg_to_put = qn->tg;
353 else
354 blkg_put(tg_to_blkg(qn->tg));
355 } else {
356 list_move_tail(&qn->node, queued);
357 }
358
359 return bio;
360}
361
49a2f1e3 362/* init a service_queue, assumes the caller zeroed it */
b2ce2643 363static void throtl_service_queue_init(struct throtl_service_queue *sq)
49a2f1e3 364{
c5cc2070
TH
365 INIT_LIST_HEAD(&sq->queued[0]);
366 INIT_LIST_HEAD(&sq->queued[1]);
49a2f1e3 367 sq->pending_tree = RB_ROOT;
69df0ab0
TH
368 setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
369 (unsigned long)sq);
370}
371
001bea73
TH
372static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
373{
4fb72036 374 struct throtl_grp *tg;
24bdb8ef 375 int rw;
4fb72036
TH
376
377 tg = kzalloc_node(sizeof(*tg), gfp, node);
378 if (!tg)
77ea7338 379 return NULL;
4fb72036 380
b2ce2643
TH
381 throtl_service_queue_init(&tg->service_queue);
382
383 for (rw = READ; rw <= WRITE; rw++) {
384 throtl_qnode_init(&tg->qnode_on_self[rw], tg);
385 throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
386 }
387
388 RB_CLEAR_NODE(&tg->rb_node);
9f626e37
SL
389 tg->bps[READ][LIMIT_MAX] = U64_MAX;
390 tg->bps[WRITE][LIMIT_MAX] = U64_MAX;
391 tg->iops[READ][LIMIT_MAX] = UINT_MAX;
392 tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;
cd5ab1b0
SL
393 tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;
394 tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
395 tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
396 tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
397 /* LIMIT_LOW will have default value 0 */
b2ce2643 398
4fb72036 399 return &tg->pd;
001bea73
TH
400}
401
a9520cd6 402static void throtl_pd_init(struct blkg_policy_data *pd)
a29a171e 403{
a9520cd6
TH
404 struct throtl_grp *tg = pd_to_tg(pd);
405 struct blkcg_gq *blkg = tg_to_blkg(tg);
77216b04 406 struct throtl_data *td = blkg->q->td;
b2ce2643 407 struct throtl_service_queue *sq = &tg->service_queue;
cd1604fa 408
9138125b 409 /*
aa6ec29b 410 * If on the default hierarchy, we switch to properly hierarchical
411 * behavior where limits on a given throtl_grp are applied to the
412 * whole subtree rather than just the group itself. e.g. If 16M
413 * read_bps limit is set on the root group, the whole system can't
414 * exceed 16M for the device.
415 *
aa6ec29b 416 * If not on the default hierarchy, the broken flat hierarchy
417 * behavior is retained where all throtl_grps are treated as if
418 * they're all separate root groups right below throtl_data.
419 * Limits of a group don't interact with limits of other groups
420 * regardless of the position of the group in the hierarchy.
421 */
b2ce2643 422 sq->parent_sq = &td->service_queue;
9e10a130 423 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
b2ce2643 424 sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
77216b04 425 tg->td = td;
8a3d2615
TH
426}
427
693e751e
TH
428/*
429 * Set has_rules[] if @tg or any of its parents have limits configured.
430 * This doesn't require walking up to the top of the hierarchy as the
431 * parent's has_rules[] is guaranteed to be correct.
432 */
433static void tg_update_has_rules(struct throtl_grp *tg)
434{
435 struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
9f626e37 436 struct throtl_data *td = tg->td;
693e751e
TH
437 int rw;
438
439 for (rw = READ; rw <= WRITE; rw++)
440 tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
9f626e37
SL
441 (td->limit_valid[td->limit_index] &&
442 (tg_bps_limit(tg, rw) != U64_MAX ||
443 tg_iops_limit(tg, rw) != UINT_MAX));
693e751e
TH
444}
445
a9520cd6 446static void throtl_pd_online(struct blkg_policy_data *pd)
693e751e
TH
447{
448 /*
449 * We don't want new groups to escape the limits of their ancestors.
450 * Update has_rules[] after a new group is brought online.
451 */
a9520cd6 452 tg_update_has_rules(pd_to_tg(pd));
693e751e
TH
453}
454
cd5ab1b0
SL
455static void blk_throtl_update_limit_valid(struct throtl_data *td)
456{
457 struct cgroup_subsys_state *pos_css;
458 struct blkcg_gq *blkg;
459 bool low_valid = false;
460
461 rcu_read_lock();
462 blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
463 struct throtl_grp *tg = blkg_to_tg(blkg);
464
465 if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
466 tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
467 low_valid = true;
468 }
469 rcu_read_unlock();
470
471 td->limit_valid[LIMIT_LOW] = low_valid;
472}
473
c79892c5 474static void throtl_upgrade_state(struct throtl_data *td);
cd5ab1b0
SL
475static void throtl_pd_offline(struct blkg_policy_data *pd)
476{
477 struct throtl_grp *tg = pd_to_tg(pd);
478
479 tg->bps[READ][LIMIT_LOW] = 0;
480 tg->bps[WRITE][LIMIT_LOW] = 0;
481 tg->iops[READ][LIMIT_LOW] = 0;
482 tg->iops[WRITE][LIMIT_LOW] = 0;
483
484 blk_throtl_update_limit_valid(tg->td);
485
c79892c5
SL
486 if (!tg->td->limit_valid[tg->td->limit_index])
487 throtl_upgrade_state(tg->td);
cd5ab1b0
SL
488}
489
001bea73
TH
490static void throtl_pd_free(struct blkg_policy_data *pd)
491{
4fb72036
TH
492 struct throtl_grp *tg = pd_to_tg(pd);
493
b2ce2643 494 del_timer_sync(&tg->service_queue.pending_timer);
4fb72036 495 kfree(tg);
001bea73
TH
496}
497
0049af73
TH
498static struct throtl_grp *
499throtl_rb_first(struct throtl_service_queue *parent_sq)
e43473b7
VG
500{
501 /* Service tree is empty */
0049af73 502 if (!parent_sq->nr_pending)
e43473b7
VG
503 return NULL;
504
0049af73
TH
505 if (!parent_sq->first_pending)
506 parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
e43473b7 507
0049af73
TH
508 if (parent_sq->first_pending)
509 return rb_entry_tg(parent_sq->first_pending);
e43473b7
VG
510
511 return NULL;
512}
513
514static void rb_erase_init(struct rb_node *n, struct rb_root *root)
515{
516 rb_erase(n, root);
517 RB_CLEAR_NODE(n);
518}
519
0049af73
TH
520static void throtl_rb_erase(struct rb_node *n,
521 struct throtl_service_queue *parent_sq)
e43473b7 522{
0049af73
TH
523 if (parent_sq->first_pending == n)
524 parent_sq->first_pending = NULL;
525 rb_erase_init(n, &parent_sq->pending_tree);
526 --parent_sq->nr_pending;
e43473b7
VG
527}
528
0049af73 529static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
e43473b7
VG
530{
531 struct throtl_grp *tg;
532
0049af73 533 tg = throtl_rb_first(parent_sq);
e43473b7
VG
534 if (!tg)
535 return;
536
0049af73 537 parent_sq->first_pending_disptime = tg->disptime;
e43473b7
VG
538}
539
77216b04 540static void tg_service_queue_add(struct throtl_grp *tg)
e43473b7 541{
77216b04 542 struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
0049af73 543 struct rb_node **node = &parent_sq->pending_tree.rb_node;
e43473b7
VG
544 struct rb_node *parent = NULL;
545 struct throtl_grp *__tg;
546 unsigned long key = tg->disptime;
547 int left = 1;
548
549 while (*node != NULL) {
550 parent = *node;
551 __tg = rb_entry_tg(parent);
552
553 if (time_before(key, __tg->disptime))
554 node = &parent->rb_left;
555 else {
556 node = &parent->rb_right;
557 left = 0;
558 }
559 }
560
561 if (left)
0049af73 562 parent_sq->first_pending = &tg->rb_node;
e43473b7
VG
563
564 rb_link_node(&tg->rb_node, parent, node);
0049af73 565 rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
e43473b7
VG
566}
567
77216b04 568static void __throtl_enqueue_tg(struct throtl_grp *tg)
e43473b7 569{
77216b04 570 tg_service_queue_add(tg);
5b2c16aa 571 tg->flags |= THROTL_TG_PENDING;
77216b04 572 tg->service_queue.parent_sq->nr_pending++;
e43473b7
VG
573}
574
77216b04 575static void throtl_enqueue_tg(struct throtl_grp *tg)
e43473b7 576{
5b2c16aa 577 if (!(tg->flags & THROTL_TG_PENDING))
77216b04 578 __throtl_enqueue_tg(tg);
e43473b7
VG
579}
580
77216b04 581static void __throtl_dequeue_tg(struct throtl_grp *tg)
e43473b7 582{
77216b04 583 throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
5b2c16aa 584 tg->flags &= ~THROTL_TG_PENDING;
e43473b7
VG
585}
586
77216b04 587static void throtl_dequeue_tg(struct throtl_grp *tg)
e43473b7 588{
5b2c16aa 589 if (tg->flags & THROTL_TG_PENDING)
77216b04 590 __throtl_dequeue_tg(tg);
e43473b7
VG
591}
592
a9131a27 593/* Call with queue lock held */
69df0ab0
TH
594static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
595 unsigned long expires)
a9131a27 596{
297e3d85 597 unsigned long max_expire = jiffies + 8 * sq_to_tg(sq)->td->throtl_slice;
06cceedc
SL
598
599 /*
600 * Since we are adjusting the throttle limit dynamically, the sleep
601 * time calculated according to previous limit might be invalid. It's
602 * possible the cgroup sleep time is very long and no other cgroups
603 * have IO running so notify the limit changes. Make sure the cgroup
604 * doesn't sleep too long to avoid the missed notification.
605 */
606 if (time_after(expires, max_expire))
607 expires = max_expire;
69df0ab0
TH
608 mod_timer(&sq->pending_timer, expires);
609 throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
610 expires - jiffies, jiffies);
a9131a27
TH
611}
612
7f52f98c
TH
613/**
614 * throtl_schedule_next_dispatch - schedule the next dispatch cycle
615 * @sq: the service_queue to schedule dispatch for
616 * @force: force scheduling
617 *
618 * Arm @sq->pending_timer so that the next dispatch cycle starts on the
619 * dispatch time of the first pending child. Returns %true if either timer
620 * is armed or there's no pending child left. %false if the current
621 * dispatch window is still open and the caller should continue
622 * dispatching.
623 *
624 * If @force is %true, the dispatch timer is always scheduled and this
625 * function is guaranteed to return %true. This is to be used when the
626 * caller can't dispatch itself and needs to invoke pending_timer
627 * unconditionally. Note that forced scheduling is likely to induce short
628 * delay before dispatch starts even if @sq->first_pending_disptime is not
629 * in the future and thus shouldn't be used in hot paths.
630 */
631static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
632 bool force)
e43473b7 633{
6a525600 634 /* any pending children left? */
c9e0332e 635 if (!sq->nr_pending)
7f52f98c 636 return true;
e43473b7 637
c9e0332e 638 update_min_dispatch_time(sq);
e43473b7 639
69df0ab0 640 /* is the next dispatch time in the future? */
7f52f98c 641 if (force || time_after(sq->first_pending_disptime, jiffies)) {
69df0ab0 642 throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
7f52f98c 643 return true;
69df0ab0
TH
644 }
645
7f52f98c
TH
646 /* tell the caller to continue dispatching */
647 return false;
e43473b7
VG
648}
649
32ee5bc4
VG
650static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
651 bool rw, unsigned long start)
652{
653 tg->bytes_disp[rw] = 0;
654 tg->io_disp[rw] = 0;
655
656 /*
657 * Previous slice has expired. We must have trimmed it after last
658 * bio dispatch. That means since start of last slice, we never used
659 * that bandwidth. Do try to make use of that bandwidth while giving
660 * credit.
661 */
662 if (time_after_eq(start, tg->slice_start[rw]))
663 tg->slice_start[rw] = start;
664
297e3d85 665 tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
32ee5bc4
VG
666 throtl_log(&tg->service_queue,
667 "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
668 rw == READ ? 'R' : 'W', tg->slice_start[rw],
669 tg->slice_end[rw], jiffies);
670}
671
0f3457f6 672static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
e43473b7
VG
673{
674 tg->bytes_disp[rw] = 0;
8e89d13f 675 tg->io_disp[rw] = 0;
e43473b7 676 tg->slice_start[rw] = jiffies;
297e3d85 677 tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
fda6f272
TH
678 throtl_log(&tg->service_queue,
679 "[%c] new slice start=%lu end=%lu jiffies=%lu",
680 rw == READ ? 'R' : 'W', tg->slice_start[rw],
681 tg->slice_end[rw], jiffies);
e43473b7
VG
682}
683
0f3457f6
TH
684static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
685 unsigned long jiffy_end)
d1ae8ffd 686{
297e3d85 687 tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
d1ae8ffd
VG
688}
689
0f3457f6
TH
690static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
691 unsigned long jiffy_end)
e43473b7 692{
297e3d85 693 tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
fda6f272
TH
694 throtl_log(&tg->service_queue,
695 "[%c] extend slice start=%lu end=%lu jiffies=%lu",
696 rw == READ ? 'R' : 'W', tg->slice_start[rw],
697 tg->slice_end[rw], jiffies);
e43473b7
VG
698}
699
700/* Determine if previously allocated or extended slice is complete or not */
0f3457f6 701static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
e43473b7
VG
702{
703 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
5cf8c227 704 return false;
e43473b7
VG
705
706 return 1;
707}
708
709/* Trim the used slices and adjust slice start accordingly */
0f3457f6 710static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
e43473b7 711{
3aad5d3e
VG
712 unsigned long nr_slices, time_elapsed, io_trim;
713 u64 bytes_trim, tmp;
e43473b7
VG
714
715 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
716
717 /*
718 * If bps is unlimited (-1), then the time slice doesn't get
719 * renewed. Don't try to trim a slice that has already expired. A new
720 * slice will start when appropriate.
721 */
0f3457f6 722 if (throtl_slice_used(tg, rw))
e43473b7
VG
723 return;
724
d1ae8ffd
VG
725 /*
726 * A bio has been dispatched. Also adjust slice_end. It might happen
727 * that the cgroup limit was initially very low, resulting in a high
728 * slice_end, but the limit was later bumped up and the bio was dispatched
729 * sooner; then we need to reduce slice_end. A bogus high slice_end
730 * is bad because it does not allow a new slice to start.
731 */
732
297e3d85 733 throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
d1ae8ffd 734
e43473b7
VG
735 time_elapsed = jiffies - tg->slice_start[rw];
736
297e3d85 737 nr_slices = time_elapsed / tg->td->throtl_slice;
e43473b7
VG
738
739 if (!nr_slices)
740 return;
297e3d85 741 tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices;
3aad5d3e
VG
742 do_div(tmp, HZ);
743 bytes_trim = tmp;
e43473b7 744
297e3d85
SL
745 io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) /
746 HZ;
e43473b7 747
8e89d13f 748 if (!bytes_trim && !io_trim)
e43473b7
VG
749 return;
750
751 if (tg->bytes_disp[rw] >= bytes_trim)
752 tg->bytes_disp[rw] -= bytes_trim;
753 else
754 tg->bytes_disp[rw] = 0;
755
8e89d13f
VG
756 if (tg->io_disp[rw] >= io_trim)
757 tg->io_disp[rw] -= io_trim;
758 else
759 tg->io_disp[rw] = 0;
760
297e3d85 761 tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;
e43473b7 762
fda6f272
TH
763 throtl_log(&tg->service_queue,
764 "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
765 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
766 tg->slice_start[rw], tg->slice_end[rw], jiffies);
e43473b7
VG
767}
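/*
 * Editorial note (illustrative, not part of the original source):
 * assuming HZ=1000 and throtl_slice=100 (100ms), a group limited to
 * 1048576 bytes/s with time_elapsed=350ms gets nr_slices = 3, so
 * bytes_trim = 1048576 * 100 * 3 / 1000 = 314572 and slice_start
 * advances by 300ms; the remaining 50ms stays charged to the current
 * slice.
 */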
768
0f3457f6
TH
769static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
770 unsigned long *wait)
e43473b7
VG
771{
772 bool rw = bio_data_dir(bio);
8e89d13f 773 unsigned int io_allowed;
e43473b7 774 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
c49c06e4 775 u64 tmp;
e43473b7 776
8e89d13f 777 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
e43473b7 778
8e89d13f
VG
779 /* Slice has just started. Consider one slice interval */
780 if (!jiffy_elapsed)
297e3d85 781 jiffy_elapsed_rnd = tg->td->throtl_slice;
8e89d13f 782
297e3d85 783 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
8e89d13f 784
c49c06e4
VG
785 /*
786 * jiffy_elapsed_rnd should not be a big value: since the minimum iops
787 * is 1, at most jiffy_elapsed should be equivalent to 1 second, as we
788 * will allow dispatch after 1 second and after that the slice should
789 * have been trimmed.
790 */
791
9f626e37 792 tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
c49c06e4
VG
793 do_div(tmp, HZ);
794
795 if (tmp > UINT_MAX)
796 io_allowed = UINT_MAX;
797 else
798 io_allowed = tmp;
8e89d13f
VG
799
800 if (tg->io_disp[rw] + 1 <= io_allowed) {
e43473b7
VG
801 if (wait)
802 *wait = 0;
5cf8c227 803 return true;
e43473b7
VG
804 }
805
8e89d13f 806 /* Calc approx time to dispatch */
9f626e37 807 jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1;
8e89d13f
VG
808
809 if (jiffy_wait > jiffy_elapsed)
810 jiffy_wait = jiffy_wait - jiffy_elapsed;
811 else
812 jiffy_wait = 1;
813
814 if (wait)
815 *wait = jiffy_wait;
816 return 0;
817}
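/*
 * Editorial note (illustrative, not part of the original source):
 * assuming HZ=1000, throtl_slice=100 and an iops limit of 100, a slice
 * that is 30ms old is rounded up to 100ms, so io_allowed = 100 * 100 /
 * 1000 = 10. If io_disp is already 10, the next bio waits
 * (11 * 1000 / 100 + 1) - 30 = 81 jiffies.
 */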
818
0f3457f6
TH
819static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
820 unsigned long *wait)
8e89d13f
VG
821{
822 bool rw = bio_data_dir(bio);
3aad5d3e 823 u64 bytes_allowed, extra_bytes, tmp;
8e89d13f 824 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
e43473b7
VG
825
826 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
827
828 /* Slice has just started. Consider one slice interval */
829 if (!jiffy_elapsed)
297e3d85 830 jiffy_elapsed_rnd = tg->td->throtl_slice;
e43473b7 831
297e3d85 832 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
e43473b7 833
9f626e37 834 tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
5e901a2b 835 do_div(tmp, HZ);
3aad5d3e 836 bytes_allowed = tmp;
e43473b7 837
4f024f37 838 if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
e43473b7
VG
839 if (wait)
840 *wait = 0;
5cf8c227 841 return true;
e43473b7
VG
842 }
843
844 /* Calc approx time to dispatch */
4f024f37 845 extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
9f626e37 846 jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
e43473b7
VG
847
848 if (!jiffy_wait)
849 jiffy_wait = 1;
850
851 /*
852 * This wait time is without taking into consideration the rounding
853 * up we did. Add that time also.
854 */
855 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
e43473b7
VG
856 if (wait)
857 *wait = jiffy_wait;
8e89d13f
VG
858 return 0;
859}
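/*
 * Editorial note (illustrative, not part of the original source):
 * assuming HZ=1000, throtl_slice=100, a bps limit of 1048576 bytes/s,
 * 60ms into the slice with 80000 bytes already dispatched and a 64KiB
 * bio: the elapsed time rounds up to 100ms, bytes_allowed = 1048576 *
 * 100 / 1000 = 104857, extra_bytes = 80000 + 65536 - 104857 = 40679,
 * and jiffy_wait = 40679 * 1000 / 1048576 + (100 - 60) ≈ 78 jiffies.
 */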
860
861/*
862 * Returns whether one can dispatch a bio or not. Also returns the approx number
863 * of jiffies to wait before this bio is within the IO rate limit and can be dispatched
864 */
0f3457f6
TH
865static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
866 unsigned long *wait)
8e89d13f
VG
867{
868 bool rw = bio_data_dir(bio);
869 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
870
871 /*
872 * Currently whole state machine of group depends on first bio
873 * queued in the group bio list. So one should not be calling
874 * this function with a different bio if there are other bios
875 * queued.
876 */
73f0d49a 877 BUG_ON(tg->service_queue.nr_queued[rw] &&
c5cc2070 878 bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
e43473b7 879
8e89d13f 880 /* If tg->bps = -1, then BW is unlimited */
9f626e37
SL
881 if (tg_bps_limit(tg, rw) == U64_MAX &&
882 tg_iops_limit(tg, rw) == UINT_MAX) {
8e89d13f
VG
883 if (wait)
884 *wait = 0;
5cf8c227 885 return true;
8e89d13f
VG
886 }
887
888 /*
889 * If the previous slice expired, start a new one; otherwise renew/extend the
890 * existing slice to make sure it is at least throtl_slice interval
891 * long since now. A new slice is started only for an empty throttle group.
892 * If there is a queued bio, that means there should be an active
893 * slice and it should be extended instead.
8e89d13f 894 */
164c80ed 895 if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
0f3457f6 896 throtl_start_new_slice(tg, rw);
8e89d13f 897 else {
297e3d85
SL
898 if (time_before(tg->slice_end[rw],
899 jiffies + tg->td->throtl_slice))
900 throtl_extend_slice(tg, rw,
901 jiffies + tg->td->throtl_slice);
8e89d13f
VG
902 }
903
0f3457f6
TH
904 if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
905 tg_with_in_iops_limit(tg, bio, &iops_wait)) {
8e89d13f
VG
906 if (wait)
907 *wait = 0;
908 return 1;
909 }
910
911 max_wait = max(bps_wait, iops_wait);
912
913 if (wait)
914 *wait = max_wait;
915
916 if (time_before(tg->slice_end[rw], jiffies + max_wait))
0f3457f6 917 throtl_extend_slice(tg, rw, jiffies + max_wait);
e43473b7
VG
918
919 return 0;
920}
921
922static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
923{
924 bool rw = bio_data_dir(bio);
e43473b7
VG
925
926 /* Charge the bio to the group */
4f024f37 927 tg->bytes_disp[rw] += bio->bi_iter.bi_size;
8e89d13f 928 tg->io_disp[rw]++;
3f0abd80
SL
929 tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
930 tg->last_io_disp[rw]++;
e43473b7 931
2a0f61e6 932 /*
8d2bbd4c 933 * BIO_THROTTLED is used to prevent the same bio to be throttled
2a0f61e6
TH
934 * more than once as a throttled bio will go through blk-throtl the
935 * second time when it eventually gets issued. Set it when a bio
936 * is being charged to a tg.
2a0f61e6 937 */
8d2bbd4c
CH
938 if (!bio_flagged(bio, BIO_THROTTLED))
939 bio_set_flag(bio, BIO_THROTTLED);
e43473b7
VG
940}
941
c5cc2070
TH
942/**
943 * throtl_add_bio_tg - add a bio to the specified throtl_grp
944 * @bio: bio to add
945 * @qn: qnode to use
946 * @tg: the target throtl_grp
947 *
948 * Add @bio to @tg's service_queue using @qn. If @qn is not specified,
949 * tg->qnode_on_self[] is used.
950 */
951static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
952 struct throtl_grp *tg)
e43473b7 953{
73f0d49a 954 struct throtl_service_queue *sq = &tg->service_queue;
e43473b7
VG
955 bool rw = bio_data_dir(bio);
956
c5cc2070
TH
957 if (!qn)
958 qn = &tg->qnode_on_self[rw];
959
0e9f4164
TH
960 /*
961 * If @tg doesn't currently have any bios queued in the same
962 * direction, queueing @bio can change when @tg should be
963 * dispatched. Mark that @tg was empty. This is automatically
964 * cleared on the next tg_update_disptime().
965 */
966 if (!sq->nr_queued[rw])
967 tg->flags |= THROTL_TG_WAS_EMPTY;
968
c5cc2070
TH
969 throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
970
73f0d49a 971 sq->nr_queued[rw]++;
77216b04 972 throtl_enqueue_tg(tg);
e43473b7
VG
973}
974
77216b04 975static void tg_update_disptime(struct throtl_grp *tg)
e43473b7 976{
73f0d49a 977 struct throtl_service_queue *sq = &tg->service_queue;
e43473b7
VG
978 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
979 struct bio *bio;
980
d609af3a
ME
981 bio = throtl_peek_queued(&sq->queued[READ]);
982 if (bio)
0f3457f6 983 tg_may_dispatch(tg, bio, &read_wait);
e43473b7 984
d609af3a
ME
985 bio = throtl_peek_queued(&sq->queued[WRITE]);
986 if (bio)
0f3457f6 987 tg_may_dispatch(tg, bio, &write_wait);
e43473b7
VG
988
989 min_wait = min(read_wait, write_wait);
990 disptime = jiffies + min_wait;
991
e43473b7 992 /* Update dispatch time */
77216b04 993 throtl_dequeue_tg(tg);
e43473b7 994 tg->disptime = disptime;
77216b04 995 throtl_enqueue_tg(tg);
0e9f4164
TH
996
997 /* see throtl_add_bio_tg() */
998 tg->flags &= ~THROTL_TG_WAS_EMPTY;
e43473b7
VG
999}
1000
32ee5bc4
VG
1001static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
1002 struct throtl_grp *parent_tg, bool rw)
1003{
1004 if (throtl_slice_used(parent_tg, rw)) {
1005 throtl_start_new_slice_with_credit(parent_tg, rw,
1006 child_tg->slice_start[rw]);
1007 }
1008
1009}
1010
77216b04 1011static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
e43473b7 1012{
73f0d49a 1013 struct throtl_service_queue *sq = &tg->service_queue;
6bc9c2b4
TH
1014 struct throtl_service_queue *parent_sq = sq->parent_sq;
1015 struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
c5cc2070 1016 struct throtl_grp *tg_to_put = NULL;
e43473b7
VG
1017 struct bio *bio;
1018
c5cc2070
TH
1019 /*
1020 * @bio is being transferred from @tg to @parent_sq. Popping a bio
1021 * from @tg may put its reference and @parent_sq might end up
1022 * getting released prematurely. Remember the tg to put and put it
1023 * after @bio is transferred to @parent_sq.
1024 */
1025 bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
73f0d49a 1026 sq->nr_queued[rw]--;
e43473b7
VG
1027
1028 throtl_charge_bio(tg, bio);
6bc9c2b4
TH
1029
1030 /*
1031 * If our parent is another tg, we just need to transfer @bio to
1032 * the parent using throtl_add_bio_tg(). If our parent is
1033 * @td->service_queue, @bio is ready to be issued. Put it on its
1034 * bio_lists[] and decrease total number queued. The caller is
1035 * responsible for issuing these bios.
1036 */
1037 if (parent_tg) {
c5cc2070 1038 throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
32ee5bc4 1039 start_parent_slice_with_credit(tg, parent_tg, rw);
6bc9c2b4 1040 } else {
c5cc2070
TH
1041 throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
1042 &parent_sq->queued[rw]);
6bc9c2b4
TH
1043 BUG_ON(tg->td->nr_queued[rw] <= 0);
1044 tg->td->nr_queued[rw]--;
1045 }
e43473b7 1046
0f3457f6 1047 throtl_trim_slice(tg, rw);
6bc9c2b4 1048
c5cc2070
TH
1049 if (tg_to_put)
1050 blkg_put(tg_to_blkg(tg_to_put));
e43473b7
VG
1051}
1052
77216b04 1053static int throtl_dispatch_tg(struct throtl_grp *tg)
e43473b7 1054{
73f0d49a 1055 struct throtl_service_queue *sq = &tg->service_queue;
e43473b7
VG
1056 unsigned int nr_reads = 0, nr_writes = 0;
1057 unsigned int max_nr_reads = throtl_grp_quantum*3/4;
c2f6805d 1058 unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
e43473b7
VG
1059 struct bio *bio;
1060
1061 /* Try to dispatch 75% READS and 25% WRITES */
1062
c5cc2070 1063 while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
0f3457f6 1064 tg_may_dispatch(tg, bio, NULL)) {
e43473b7 1065
77216b04 1066 tg_dispatch_one_bio(tg, bio_data_dir(bio));
e43473b7
VG
1067 nr_reads++;
1068
1069 if (nr_reads >= max_nr_reads)
1070 break;
1071 }
1072
c5cc2070 1073 while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
0f3457f6 1074 tg_may_dispatch(tg, bio, NULL)) {
e43473b7 1075
77216b04 1076 tg_dispatch_one_bio(tg, bio_data_dir(bio));
e43473b7
VG
1077 nr_writes++;
1078
1079 if (nr_writes >= max_nr_writes)
1080 break;
1081 }
1082
1083 return nr_reads + nr_writes;
1084}
1085
651930bc 1086static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
e43473b7
VG
1087{
1088 unsigned int nr_disp = 0;
e43473b7
VG
1089
1090 while (1) {
73f0d49a
TH
1091 struct throtl_grp *tg = throtl_rb_first(parent_sq);
1092 struct throtl_service_queue *sq = &tg->service_queue;
e43473b7
VG
1093
1094 if (!tg)
1095 break;
1096
1097 if (time_before(jiffies, tg->disptime))
1098 break;
1099
77216b04 1100 throtl_dequeue_tg(tg);
e43473b7 1101
77216b04 1102 nr_disp += throtl_dispatch_tg(tg);
e43473b7 1103
73f0d49a 1104 if (sq->nr_queued[0] || sq->nr_queued[1])
77216b04 1105 tg_update_disptime(tg);
e43473b7
VG
1106
1107 if (nr_disp >= throtl_quantum)
1108 break;
1109 }
1110
1111 return nr_disp;
1112}
1113
c79892c5
SL
1114static bool throtl_can_upgrade(struct throtl_data *td,
1115 struct throtl_grp *this_tg);
6e1a5704
TH
1116/**
1117 * throtl_pending_timer_fn - timer function for service_queue->pending_timer
1118 * @arg: the throtl_service_queue being serviced
1119 *
1120 * This timer is armed when a child throtl_grp with active bio's becomes
1121 * pending and queued on the service_queue's pending_tree and expires when
1122 * the first child throtl_grp should be dispatched. This function
2e48a530
TH
1123 * dispatches bio's from the children throtl_grps to the parent
1124 * service_queue.
1125 *
1126 * If the parent's parent is another throtl_grp, dispatching is propagated
1127 * by either arming its pending_timer or repeating dispatch directly. If
1128 * the top-level service_tree is reached, throtl_data->dispatch_work is
1129 * kicked so that the ready bio's are issued.
6e1a5704 1130 */
69df0ab0
TH
1131static void throtl_pending_timer_fn(unsigned long arg)
1132{
1133 struct throtl_service_queue *sq = (void *)arg;
2e48a530 1134 struct throtl_grp *tg = sq_to_tg(sq);
69df0ab0 1135 struct throtl_data *td = sq_to_td(sq);
cb76199c 1136 struct request_queue *q = td->queue;
2e48a530
TH
1137 struct throtl_service_queue *parent_sq;
1138 bool dispatched;
6e1a5704 1139 int ret;
e43473b7
VG
1140
1141 spin_lock_irq(q->queue_lock);
c79892c5
SL
1142 if (throtl_can_upgrade(td, NULL))
1143 throtl_upgrade_state(td);
1144
2e48a530
TH
1145again:
1146 parent_sq = sq->parent_sq;
1147 dispatched = false;
e43473b7 1148
7f52f98c
TH
1149 while (true) {
1150 throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
2e48a530
TH
1151 sq->nr_queued[READ] + sq->nr_queued[WRITE],
1152 sq->nr_queued[READ], sq->nr_queued[WRITE]);
7f52f98c
TH
1153
1154 ret = throtl_select_dispatch(sq);
1155 if (ret) {
7f52f98c
TH
1156 throtl_log(sq, "bios disp=%u", ret);
1157 dispatched = true;
1158 }
e43473b7 1159
7f52f98c
TH
1160 if (throtl_schedule_next_dispatch(sq, false))
1161 break;
e43473b7 1162
7f52f98c
TH
1163 /* this dispatch window is still open, relax and repeat */
1164 spin_unlock_irq(q->queue_lock);
1165 cpu_relax();
1166 spin_lock_irq(q->queue_lock);
651930bc 1167 }
e43473b7 1168
2e48a530
TH
1169 if (!dispatched)
1170 goto out_unlock;
6e1a5704 1171
2e48a530
TH
1172 if (parent_sq) {
1173 /* @parent_sq is another throtl_grp, propagate dispatch */
1174 if (tg->flags & THROTL_TG_WAS_EMPTY) {
1175 tg_update_disptime(tg);
1176 if (!throtl_schedule_next_dispatch(parent_sq, false)) {
1177 /* window is already open, repeat dispatching */
1178 sq = parent_sq;
1179 tg = sq_to_tg(sq);
1180 goto again;
1181 }
1182 }
1183 } else {
1184 /* reached the top-level, queue issuing */
1185 queue_work(kthrotld_workqueue, &td->dispatch_work);
1186 }
1187out_unlock:
e43473b7 1188 spin_unlock_irq(q->queue_lock);
6e1a5704 1189}
e43473b7 1190
6e1a5704
TH
1191/**
1192 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
1193 * @work: work item being executed
1194 *
1195 * This function is queued for execution when bio's reach the bio_lists[]
1196 * of throtl_data->service_queue. Those bio's are ready and issued by this
1197 * function.
1198 */
8876e140 1199static void blk_throtl_dispatch_work_fn(struct work_struct *work)
6e1a5704
TH
1200{
1201 struct throtl_data *td = container_of(work, struct throtl_data,
1202 dispatch_work);
1203 struct throtl_service_queue *td_sq = &td->service_queue;
1204 struct request_queue *q = td->queue;
1205 struct bio_list bio_list_on_stack;
1206 struct bio *bio;
1207 struct blk_plug plug;
1208 int rw;
1209
1210 bio_list_init(&bio_list_on_stack);
1211
1212 spin_lock_irq(q->queue_lock);
c5cc2070
TH
1213 for (rw = READ; rw <= WRITE; rw++)
1214 while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
1215 bio_list_add(&bio_list_on_stack, bio);
6e1a5704
TH
1216 spin_unlock_irq(q->queue_lock);
1217
1218 if (!bio_list_empty(&bio_list_on_stack)) {
69d60eb9 1219 blk_start_plug(&plug);
e43473b7
VG
1220 while((bio = bio_list_pop(&bio_list_on_stack)))
1221 generic_make_request(bio);
69d60eb9 1222 blk_finish_plug(&plug);
e43473b7 1223 }
e43473b7
VG
1224}
1225
f95a04af
TH
1226static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
1227 int off)
60c2bc2d 1228{
f95a04af
TH
1229 struct throtl_grp *tg = pd_to_tg(pd);
1230 u64 v = *(u64 *)((void *)tg + off);
60c2bc2d 1231
2ab5492d 1232 if (v == U64_MAX)
60c2bc2d 1233 return 0;
f95a04af 1234 return __blkg_prfill_u64(sf, pd, v);
60c2bc2d
TH
1235}
1236
f95a04af
TH
1237static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
1238 int off)
e43473b7 1239{
f95a04af
TH
1240 struct throtl_grp *tg = pd_to_tg(pd);
1241 unsigned int v = *(unsigned int *)((void *)tg + off);
fe071437 1242
2ab5492d 1243 if (v == UINT_MAX)
af133ceb 1244 return 0;
f95a04af 1245 return __blkg_prfill_u64(sf, pd, v);
e43473b7
VG
1246}
1247
2da8ca82 1248static int tg_print_conf_u64(struct seq_file *sf, void *v)
8e89d13f 1249{
2da8ca82
TH
1250 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
1251 &blkcg_policy_throtl, seq_cft(sf)->private, false);
af133ceb 1252 return 0;
8e89d13f
VG
1253}
1254
2da8ca82 1255static int tg_print_conf_uint(struct seq_file *sf, void *v)
8e89d13f 1256{
2da8ca82
TH
1257 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
1258 &blkcg_policy_throtl, seq_cft(sf)->private, false);
af133ceb 1259 return 0;
60c2bc2d
TH
1260}
1261
69948b07 1262static void tg_conf_updated(struct throtl_grp *tg)
60c2bc2d 1263{
69948b07 1264 struct throtl_service_queue *sq = &tg->service_queue;
492eb21b 1265 struct cgroup_subsys_state *pos_css;
69948b07 1266 struct blkcg_gq *blkg;
af133ceb 1267
fda6f272
TH
1268 throtl_log(&tg->service_queue,
1269 "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
9f626e37
SL
1270 tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE),
1271 tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE));
632b4493 1272
693e751e
TH
1273 /*
1274 * Update has_rules[] flags for the updated tg's subtree. A tg is
1275 * considered to have rules if either the tg itself or any of its
1276 * ancestors has rules. This identifies groups without any
1277 * restrictions in the whole hierarchy and allows them to bypass
1278 * blk-throttle.
1279 */
69948b07 1280 blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg))
693e751e
TH
1281 tg_update_has_rules(blkg_to_tg(blkg));
1282
632b4493
TH
1283 /*
1284 * We're already holding queue_lock and know @tg is valid. Let's
1285 * apply the new config directly.
1286 *
1287 * Restart the slices for both READ and WRITE. It might happen
1288 * that a group's limits are dropped suddenly and we don't want to
1289 * account recently dispatched IO with the new low rate.
1290 */
0f3457f6
TH
1291 throtl_start_new_slice(tg, 0);
1292 throtl_start_new_slice(tg, 1);
632b4493 1293
5b2c16aa 1294 if (tg->flags & THROTL_TG_PENDING) {
77216b04 1295 tg_update_disptime(tg);
7f52f98c 1296 throtl_schedule_next_dispatch(sq->parent_sq, true);
632b4493 1297 }
69948b07
TH
1298}
1299
1300static ssize_t tg_set_conf(struct kernfs_open_file *of,
1301 char *buf, size_t nbytes, loff_t off, bool is_u64)
1302{
1303 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1304 struct blkg_conf_ctx ctx;
1305 struct throtl_grp *tg;
1306 int ret;
1307 u64 v;
1308
1309 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
1310 if (ret)
1311 return ret;
1312
1313 ret = -EINVAL;
1314 if (sscanf(ctx.body, "%llu", &v) != 1)
1315 goto out_finish;
1316 if (!v)
2ab5492d 1317 v = U64_MAX;
69948b07
TH
1318
1319 tg = blkg_to_tg(ctx.blkg);
1320
1321 if (is_u64)
1322 *(u64 *)((void *)tg + of_cft(of)->private) = v;
1323 else
1324 *(unsigned int *)((void *)tg + of_cft(of)->private) = v;
60c2bc2d 1325
69948b07 1326 tg_conf_updated(tg);
36aa9e5f
TH
1327 ret = 0;
1328out_finish:
60c2bc2d 1329 blkg_conf_finish(&ctx);
36aa9e5f 1330 return ret ?: nbytes;
8e89d13f
VG
1331}
1332
451af504
TH
1333static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
1334 char *buf, size_t nbytes, loff_t off)
60c2bc2d 1335{
451af504 1336 return tg_set_conf(of, buf, nbytes, off, true);
60c2bc2d
TH
1337}
1338
451af504
TH
1339static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
1340 char *buf, size_t nbytes, loff_t off)
60c2bc2d 1341{
451af504 1342 return tg_set_conf(of, buf, nbytes, off, false);
60c2bc2d
TH
1343}
1344
880f50e2 1345static struct cftype throtl_legacy_files[] = {
60c2bc2d
TH
1346 {
1347 .name = "throttle.read_bps_device",
9f626e37 1348 .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]),
2da8ca82 1349 .seq_show = tg_print_conf_u64,
451af504 1350 .write = tg_set_conf_u64,
60c2bc2d
TH
1351 },
1352 {
1353 .name = "throttle.write_bps_device",
9f626e37 1354 .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]),
2da8ca82 1355 .seq_show = tg_print_conf_u64,
451af504 1356 .write = tg_set_conf_u64,
60c2bc2d
TH
1357 },
1358 {
1359 .name = "throttle.read_iops_device",
9f626e37 1360 .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]),
2da8ca82 1361 .seq_show = tg_print_conf_uint,
451af504 1362 .write = tg_set_conf_uint,
60c2bc2d
TH
1363 },
1364 {
1365 .name = "throttle.write_iops_device",
9f626e37 1366 .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]),
2da8ca82 1367 .seq_show = tg_print_conf_uint,
451af504 1368 .write = tg_set_conf_uint,
60c2bc2d
TH
1369 },
1370 {
1371 .name = "throttle.io_service_bytes",
77ea7338
TH
1372 .private = (unsigned long)&blkcg_policy_throtl,
1373 .seq_show = blkg_print_stat_bytes,
60c2bc2d
TH
1374 },
1375 {
1376 .name = "throttle.io_serviced",
77ea7338
TH
1377 .private = (unsigned long)&blkcg_policy_throtl,
1378 .seq_show = blkg_print_stat_ios,
60c2bc2d
TH
1379 },
1380 { } /* terminate */
1381};
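/*
 * Editorial note (illustrative, not part of the original source): on the
 * legacy (cgroup v1) hierarchy these files appear under the blkio
 * controller; e.g. writing "8:16 1048576" to
 * blkio.throttle.read_bps_device caps reads on device 8:16 at 1 MB/s,
 * and writing "8:16 0" removes the limit (tg_set_conf() stores 0 as
 * "unlimited").
 */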
1382
cd5ab1b0 1383static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
2ee867dc
TH
1384 int off)
1385{
1386 struct throtl_grp *tg = pd_to_tg(pd);
1387 const char *dname = blkg_dev_name(pd->blkg);
1388 char bufs[4][21] = { "max", "max", "max", "max" };
cd5ab1b0
SL
1389 u64 bps_dft;
1390 unsigned int iops_dft;
2ee867dc
TH
1391
1392 if (!dname)
1393 return 0;
9f626e37 1394
cd5ab1b0
SL
1395 if (off == LIMIT_LOW) {
1396 bps_dft = 0;
1397 iops_dft = 0;
1398 } else {
1399 bps_dft = U64_MAX;
1400 iops_dft = UINT_MAX;
1401 }
1402
1403 if (tg->bps_conf[READ][off] == bps_dft &&
1404 tg->bps_conf[WRITE][off] == bps_dft &&
1405 tg->iops_conf[READ][off] == iops_dft &&
1406 tg->iops_conf[WRITE][off] == iops_dft)
2ee867dc
TH
1407 return 0;
1408
cd5ab1b0 1409 if (tg->bps_conf[READ][off] != bps_dft)
9f626e37 1410 snprintf(bufs[0], sizeof(bufs[0]), "%llu",
cd5ab1b0
SL
1411 tg->bps_conf[READ][off]);
1412 if (tg->bps_conf[WRITE][off] != bps_dft)
9f626e37 1413 snprintf(bufs[1], sizeof(bufs[1]), "%llu",
cd5ab1b0
SL
1414 tg->bps_conf[WRITE][off]);
1415 if (tg->iops_conf[READ][off] != iops_dft)
9f626e37 1416 snprintf(bufs[2], sizeof(bufs[2]), "%u",
cd5ab1b0
SL
1417 tg->iops_conf[READ][off]);
1418 if (tg->iops_conf[WRITE][off] != iops_dft)
9f626e37 1419 snprintf(bufs[3], sizeof(bufs[3]), "%u",
cd5ab1b0 1420 tg->iops_conf[WRITE][off]);
2ee867dc
TH
1421
1422 seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
1423 dname, bufs[0], bufs[1], bufs[2], bufs[3]);
1424 return 0;
1425}
1426
cd5ab1b0 1427static int tg_print_limit(struct seq_file *sf, void *v)
2ee867dc 1428{
cd5ab1b0 1429 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit,
2ee867dc
TH
1430 &blkcg_policy_throtl, seq_cft(sf)->private, false);
1431 return 0;
1432}
1433
cd5ab1b0 1434static ssize_t tg_set_limit(struct kernfs_open_file *of,
2ee867dc
TH
1435 char *buf, size_t nbytes, loff_t off)
1436{
1437 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1438 struct blkg_conf_ctx ctx;
1439 struct throtl_grp *tg;
1440 u64 v[4];
1441 int ret;
cd5ab1b0 1442 int index = of_cft(of)->private;
2ee867dc
TH
1443
1444 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
1445 if (ret)
1446 return ret;
1447
1448 tg = blkg_to_tg(ctx.blkg);
1449
cd5ab1b0
SL
1450 v[0] = tg->bps_conf[READ][index];
1451 v[1] = tg->bps_conf[WRITE][index];
1452 v[2] = tg->iops_conf[READ][index];
1453 v[3] = tg->iops_conf[WRITE][index];
2ee867dc
TH
1454
1455 while (true) {
1456 char tok[27]; /* wiops=18446744073709551616 */
1457 char *p;
2ab5492d 1458 u64 val = U64_MAX;
2ee867dc
TH
1459 int len;
1460
1461 if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
1462 break;
1463 if (tok[0] == '\0')
1464 break;
1465 ctx.body += len;
1466
1467 ret = -EINVAL;
1468 p = tok;
1469 strsep(&p, "=");
1470 if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
1471 goto out_finish;
1472
1473 ret = -ERANGE;
1474 if (!val)
1475 goto out_finish;
1476
1477 ret = -EINVAL;
1478 if (!strcmp(tok, "rbps"))
1479 v[0] = val;
1480 else if (!strcmp(tok, "wbps"))
1481 v[1] = val;
1482 else if (!strcmp(tok, "riops"))
1483 v[2] = min_t(u64, val, UINT_MAX);
1484 else if (!strcmp(tok, "wiops"))
1485 v[3] = min_t(u64, val, UINT_MAX);
1486 else
1487 goto out_finish;
1488 }
1489
cd5ab1b0
SL
1490 tg->bps_conf[READ][index] = v[0];
1491 tg->bps_conf[WRITE][index] = v[1];
1492 tg->iops_conf[READ][index] = v[2];
1493 tg->iops_conf[WRITE][index] = v[3];
2ee867dc 1494
cd5ab1b0
SL
1495 if (index == LIMIT_MAX) {
1496 tg->bps[READ][index] = v[0];
1497 tg->bps[WRITE][index] = v[1];
1498 tg->iops[READ][index] = v[2];
1499 tg->iops[WRITE][index] = v[3];
1500 }
1501 tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],
1502 tg->bps_conf[READ][LIMIT_MAX]);
1503 tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],
1504 tg->bps_conf[WRITE][LIMIT_MAX]);
1505 tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],
1506 tg->iops_conf[READ][LIMIT_MAX]);
1507 tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
1508 tg->iops_conf[WRITE][LIMIT_MAX]);
1509
1510 if (index == LIMIT_LOW) {
1511 blk_throtl_update_limit_valid(tg->td);
1512 if (tg->td->limit_valid[LIMIT_LOW])
1513 tg->td->limit_index = LIMIT_LOW;
1514 }
2ee867dc
TH
1515 tg_conf_updated(tg);
1516 ret = 0;
1517out_finish:
1518 blkg_conf_finish(&ctx);
1519 return ret ?: nbytes;
1520}
1521
1522static struct cftype throtl_files[] = {
cd5ab1b0
SL
1523#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
1524 {
1525 .name = "low",
1526 .flags = CFTYPE_NOT_ON_ROOT,
1527 .seq_show = tg_print_limit,
1528 .write = tg_set_limit,
1529 .private = LIMIT_LOW,
1530 },
1531#endif
2ee867dc
TH
1532 {
1533 .name = "max",
1534 .flags = CFTYPE_NOT_ON_ROOT,
cd5ab1b0
SL
1535 .seq_show = tg_print_limit,
1536 .write = tg_set_limit,
1537 .private = LIMIT_MAX,
2ee867dc
TH
1538 },
1539 { } /* terminate */
1540};
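/*
 * Editorial note (illustrative, not part of the original source): on the
 * default (cgroup v2) hierarchy the "max" file above (and "low" when
 * CONFIG_BLK_DEV_THROTTLING_LOW is set) is parsed by tg_set_limit() and
 * accepts lines such as "8:16 rbps=2097152 wiops=max", where "8:16" is
 * the MAJ:MIN handled by blkg_conf_prep() and each key sets one of the
 * four limits, "max" meaning unlimited.
 */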
1541
da527770 1542static void throtl_shutdown_wq(struct request_queue *q)
e43473b7
VG
1543{
1544 struct throtl_data *td = q->td;
1545
69df0ab0 1546 cancel_work_sync(&td->dispatch_work);
e43473b7
VG
1547}
1548
3c798398 1549static struct blkcg_policy blkcg_policy_throtl = {
2ee867dc 1550 .dfl_cftypes = throtl_files,
880f50e2 1551 .legacy_cftypes = throtl_legacy_files,
f9fcc2d3 1552
001bea73 1553 .pd_alloc_fn = throtl_pd_alloc,
f9fcc2d3 1554 .pd_init_fn = throtl_pd_init,
693e751e 1555 .pd_online_fn = throtl_pd_online,
cd5ab1b0 1556 .pd_offline_fn = throtl_pd_offline,
001bea73 1557 .pd_free_fn = throtl_pd_free,
e43473b7
VG
1558};
1559
3f0abd80
SL
1560static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
1561{
1562 unsigned long rtime = jiffies, wtime = jiffies;
1563
1564 if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
1565 rtime = tg->last_low_overflow_time[READ];
1566 if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
1567 wtime = tg->last_low_overflow_time[WRITE];
1568 return min(rtime, wtime);
1569}
1570
1571/* tg should not be an intermediate node */
1572static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
1573{
1574 struct throtl_service_queue *parent_sq;
1575 struct throtl_grp *parent = tg;
1576 unsigned long ret = __tg_last_low_overflow_time(tg);
1577
1578 while (true) {
1579 parent_sq = parent->service_queue.parent_sq;
1580 parent = sq_to_tg(parent_sq);
1581 if (!parent)
1582 break;
1583
1584 /*
1585 * The parent doesn't have a low limit, so it always reaches its low
1586 * limit. Its overflow time is useless for its children.
1587 */
1588 if (!parent->bps[READ][LIMIT_LOW] &&
1589 !parent->iops[READ][LIMIT_LOW] &&
1590 !parent->bps[WRITE][LIMIT_LOW] &&
1591 !parent->iops[WRITE][LIMIT_LOW])
1592 continue;
1593 if (time_after(__tg_last_low_overflow_time(parent), ret))
1594 ret = __tg_last_low_overflow_time(parent);
1595 }
1596 return ret;
1597}
1598
c79892c5
SL
1599static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
1600{
1601 struct throtl_service_queue *sq = &tg->service_queue;
1602 bool read_limit, write_limit;
1603
1604 /*
1605 * If the cgroup reaches its low limit (a low limit of 0 is always
1606 * reached), it's ok to upgrade to the next limit.
1607 */
1608 read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
1609 write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
1610 if (!read_limit && !write_limit)
1611 return true;
1612 if (read_limit && sq->nr_queued[READ] &&
1613 (!write_limit || sq->nr_queued[WRITE]))
1614 return true;
1615 if (write_limit && sq->nr_queued[WRITE] &&
1616 (!read_limit || sq->nr_queued[READ]))
1617 return true;
1618 return false;
1619}
1620
1621static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
1622{
1623 while (true) {
1624 if (throtl_tg_can_upgrade(tg))
1625 return true;
1626 tg = sq_to_tg(tg->service_queue.parent_sq);
1627 if (!tg || !tg_to_blkg(tg)->parent)
1628 return false;
1629 }
1630 return false;
1631}
1632
static bool throtl_can_upgrade(struct throtl_data *td,
                               struct throtl_grp *this_tg)
{
        struct cgroup_subsys_state *pos_css;
        struct blkcg_gq *blkg;

        if (td->limit_index != LIMIT_LOW)
                return false;

        if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
                return false;

        rcu_read_lock();
        blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
                struct throtl_grp *tg = blkg_to_tg(blkg);

                if (tg == this_tg)
                        continue;
                if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
                        continue;
                if (!throtl_hierarchy_can_upgrade(tg)) {
                        rcu_read_unlock();
                        return false;
                }
        }
        rcu_read_unlock();
        return true;
}

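/*
 * Switch the whole tree to LIMIT_MAX.  Every group's dispatch time is reset
 * and its service queue is re-dispatched so that bios already queued under
 * the low limits are re-evaluated against the max limits right away.
 */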
static void throtl_upgrade_state(struct throtl_data *td)
{
        struct cgroup_subsys_state *pos_css;
        struct blkcg_gq *blkg;

        td->limit_index = LIMIT_MAX;
        td->low_upgrade_time = jiffies;
        rcu_read_lock();
        blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
                struct throtl_grp *tg = blkg_to_tg(blkg);
                struct throtl_service_queue *sq = &tg->service_queue;

                tg->disptime = jiffies - 1;
                throtl_select_dispatch(sq);
                throtl_schedule_next_dispatch(sq, false);
        }
        rcu_read_unlock();
        throtl_select_dispatch(&td->service_queue);
        throtl_schedule_next_dispatch(&td->service_queue, false);
        queue_work(kthrotld_workqueue, &td->dispatch_work);
}

static void throtl_downgrade_state(struct throtl_data *td, int new)
{
        td->limit_index = new;
        td->low_downgrade_time = jiffies;
}

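/*
 * A group may drop back to LIMIT_LOW only after a full throtl_slice has
 * passed since the last upgrade and since the group (or any ancestor with a
 * low limit) last exceeded its low limit.
 */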
static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
{
        struct throtl_data *td = tg->td;
        unsigned long now = jiffies;

        /*
         * If the cgroup is below its low limit, consider downgrading and
         * throttling the other cgroups.
         */
        if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
            time_after_eq(now, tg_last_low_overflow_time(tg) +
                               td->throtl_slice))
                return true;
        return false;
}

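/*
 * All groups from @tg up to, but not including, the root must qualify;
 * otherwise stay at LIMIT_MAX.
 */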
static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
{
        while (true) {
                if (!throtl_tg_can_downgrade(tg))
                        return false;
                tg = sq_to_tg(tg->service_queue.parent_sq);
                if (!tg || !tg_to_blkg(tg)->parent)
                        break;
        }
        return true;
}

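/*
 * throtl_downgrade_check() estimates the rate a group actually achieved
 * since the last check as bytes * HZ / elapsed_jiffies (and likewise for
 * IOs).  For example, assuming HZ=1000, dispatching 4 MiB over 100 jiffies
 * (100 ms) gives 4194304 * 1000 / 100 = 41943040 B/s (~40 MiB/s); if that
 * is at or above the group's low limit, its last_low_overflow_time is
 * refreshed, which blocks a downgrade for at least another throtl_slice.
 */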
static void throtl_downgrade_check(struct throtl_grp *tg)
{
        uint64_t bps;
        unsigned int iops;
        unsigned long elapsed_time;
        unsigned long now = jiffies;

        if (tg->td->limit_index != LIMIT_MAX ||
            !tg->td->limit_valid[LIMIT_LOW])
                return;
        if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
                return;
        if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
                return;

        elapsed_time = now - tg->last_check_time;
        tg->last_check_time = now;

        if (time_before(now, tg_last_low_overflow_time(tg) +
                             tg->td->throtl_slice))
                return;

        if (tg->bps[READ][LIMIT_LOW]) {
                bps = tg->last_bytes_disp[READ] * HZ;
                do_div(bps, elapsed_time);
                if (bps >= tg->bps[READ][LIMIT_LOW])
                        tg->last_low_overflow_time[READ] = now;
        }

        if (tg->bps[WRITE][LIMIT_LOW]) {
                bps = tg->last_bytes_disp[WRITE] * HZ;
                do_div(bps, elapsed_time);
                if (bps >= tg->bps[WRITE][LIMIT_LOW])
                        tg->last_low_overflow_time[WRITE] = now;
        }

        if (tg->iops[READ][LIMIT_LOW]) {
                iops = tg->last_io_disp[READ] * HZ / elapsed_time;
                if (iops >= tg->iops[READ][LIMIT_LOW])
                        tg->last_low_overflow_time[READ] = now;
        }

        if (tg->iops[WRITE][LIMIT_LOW]) {
                iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
                if (iops >= tg->iops[WRITE][LIMIT_LOW])
                        tg->last_low_overflow_time[WRITE] = now;
        }

        /*
         * If the cgroup is below its low limit, consider downgrading and
         * throttling the other cgroups.
         */
        if (throtl_hierarchy_can_downgrade(tg))
                throtl_downgrade_state(tg->td, LIMIT_LOW);

        tg->last_bytes_disp[READ] = 0;
        tg->last_bytes_disp[WRITE] = 0;
        tg->last_io_disp[READ] = 0;
        tg->last_io_disp[WRITE] = 0;
}

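/*
 * Main entry point for throttling a bio.  Called under RCU; takes the
 * queue_lock internally.  Returns true if @bio has been queued for later
 * dispatch (the caller must not issue it), false if the caller may issue
 * it immediately.
 */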
bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
                    struct bio *bio)
{
        struct throtl_qnode *qn = NULL;
        struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
        struct throtl_service_queue *sq;
        bool rw = bio_data_dir(bio);
        bool throttled = false;

        WARN_ON_ONCE(!rcu_read_lock_held());

        /* see throtl_charge_bio() */
        if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw])
                goto out;

        spin_lock_irq(q->queue_lock);

        if (unlikely(blk_queue_bypass(q)))
                goto out_unlock;

        sq = &tg->service_queue;

again:
        while (true) {
                if (tg->last_low_overflow_time[rw] == 0)
                        tg->last_low_overflow_time[rw] = jiffies;
                throtl_downgrade_check(tg);
                /* throtl is FIFO - if bios are already queued, should queue */
                if (sq->nr_queued[rw])
                        break;

                /* if above limits, break to queue */
                if (!tg_may_dispatch(tg, bio, NULL)) {
                        tg->last_low_overflow_time[rw] = jiffies;
                        if (throtl_can_upgrade(tg->td, tg)) {
                                throtl_upgrade_state(tg->td);
                                goto again;
                        }
                        break;
                }

                /* within limits, let's charge and dispatch directly */
                throtl_charge_bio(tg, bio);

                /*
                 * We need to trim the slice even when bios are not being
                 * queued, otherwise it might happen that a bio is not queued
                 * for a long time and the slice keeps on extending and trim
                 * is not called for a long time.  Now if limits are reduced
                 * suddenly, we take into account all the IO dispatched so
                 * far at the new low rate and newly queued IO gets a really
                 * long dispatch time.
                 *
                 * So keep on trimming the slice even if the bio is not
                 * queued.
                 */
                throtl_trim_slice(tg, rw);

                /*
                 * @bio passed through this layer without being throttled.
                 * Climb up the ladder.  If we're already at the top, it
                 * can be executed directly.
                 */
                qn = &tg->qnode_on_parent[rw];
                sq = sq->parent_sq;
                tg = sq_to_tg(sq);
                if (!tg)
                        goto out_unlock;
        }

        /* out-of-limit, queue to @tg */
        throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
                   rw == READ ? 'R' : 'W',
                   tg->bytes_disp[rw], bio->bi_iter.bi_size,
                   tg_bps_limit(tg, rw),
                   tg->io_disp[rw], tg_iops_limit(tg, rw),
                   sq->nr_queued[READ], sq->nr_queued[WRITE]);

        tg->last_low_overflow_time[rw] = jiffies;

        bio_associate_current(bio);
        tg->td->nr_queued[rw]++;
        throtl_add_bio_tg(bio, qn, tg);
        throttled = true;

        /*
         * Update @tg's dispatch time and force schedule dispatch if @tg
         * was empty before @bio.  The forced scheduling isn't likely to
         * cause undue delay as @bio is likely to be dispatched directly if
         * its @tg's disptime is not in the future.
         */
        if (tg->flags & THROTL_TG_WAS_EMPTY) {
                tg_update_disptime(tg);
                throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
        }

out_unlock:
        spin_unlock_irq(q->queue_lock);
out:
        /*
         * As multiple blk-throtls may stack in the same issue path, we
         * don't want bios to leave with the flag set.  Clear the flag if
         * the bio is being issued.
         */
        if (!throttled)
                bio_clear_flag(bio, BIO_THROTTLED);
        return throttled;
}

/*
 * Dispatch all bios from all children tg's queued on @parent_sq.  On
 * return, @parent_sq is guaranteed to not have any active children tg's
 * and all bios from previously active tg's are on @parent_sq->queued[].
 */
static void tg_drain_bios(struct throtl_service_queue *parent_sq)
{
        struct throtl_grp *tg;

        while ((tg = throtl_rb_first(parent_sq))) {
                struct throtl_service_queue *sq = &tg->service_queue;
                struct bio *bio;

                throtl_dequeue_tg(tg);

                while ((bio = throtl_peek_queued(&sq->queued[READ])))
                        tg_dispatch_one_bio(tg, bio_data_dir(bio));
                while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
                        tg_dispatch_one_bio(tg, bio_data_dir(bio));
        }
}

/**
 * blk_throtl_drain - drain throttled bios
 * @q: request_queue to drain throttled bios for
 *
 * Dispatch all currently throttled bios on @q through ->make_request_fn().
 */
void blk_throtl_drain(struct request_queue *q)
        __releases(q->queue_lock) __acquires(q->queue_lock)
{
        struct throtl_data *td = q->td;
        struct blkcg_gq *blkg;
        struct cgroup_subsys_state *pos_css;
        struct bio *bio;
        int rw;

        queue_lockdep_assert_held(q);
        rcu_read_lock();

        /*
         * Drain each tg while doing post-order walk on the blkg tree, so
         * that all bios are propagated to td->service_queue.  It'd be
         * better to walk service_queue tree directly but blkg walk is
         * easier.
         */
        blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg)
                tg_drain_bios(&blkg_to_tg(blkg)->service_queue);

        /* finally, transfer bios from top-level tg's into the td */
        tg_drain_bios(&td->service_queue);

        rcu_read_unlock();
        spin_unlock_irq(q->queue_lock);

        /* all bios now should be in td->service_queue, issue them */
        for (rw = READ; rw <= WRITE; rw++)
                while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
                                                NULL)))
                        generic_make_request(bio);

        spin_lock_irq(q->queue_lock);
}

int blk_throtl_init(struct request_queue *q)
{
        struct throtl_data *td;
        int ret;

        td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
        if (!td)
                return -ENOMEM;

        INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
        throtl_service_queue_init(&td->service_queue);

        q->td = td;
        td->queue = q;

        td->limit_valid[LIMIT_MAX] = true;
        td->limit_index = LIMIT_MAX;
        td->low_upgrade_time = jiffies;
        td->low_downgrade_time = jiffies;
        /* activate policy */
        ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
        if (ret)
                kfree(td);
        return ret;
}

void blk_throtl_exit(struct request_queue *q)
{
        BUG_ON(!q->td);
        throtl_shutdown_wq(q);
        blkcg_deactivate_policy(q, &blkcg_policy_throtl);
        kfree(q->td);
}

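/*
 * Pick the throttling slice when the queue is registered: 20 ms
 * (DFL_THROTL_SLICE_SSD = HZ / 50) for non-rotational devices, 100 ms
 * (DFL_THROTL_SLICE_HD = HZ / 10) for rotational ones.  Without
 * CONFIG_BLK_DEV_THROTTLING_LOW the 100 ms value is kept for all devices,
 * matching the previous default.
 */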
void blk_throtl_register_queue(struct request_queue *q)
{
        struct throtl_data *td;

        td = q->td;
        BUG_ON(!td);

        if (blk_queue_nonrot(q))
                td->throtl_slice = DFL_THROTL_SLICE_SSD;
        else
                td->throtl_slice = DFL_THROTL_SLICE_HD;
#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
        /* if no low limit, use previous default */
        td->throtl_slice = DFL_THROTL_SLICE_HD;
#endif
}

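/*
 * The show/store pair below backs the queue's "throttle_sample_time" sysfs
 * attribute (only present with CONFIG_BLK_DEV_THROTTLING_LOW).  Values are
 * in milliseconds; a write is rejected unless it converts to a slice in
 * (0, MAX_THROTL_SLICE] jiffies, e.g. writing 50 selects a 50 ms sampling
 * window.
 */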
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
{
        if (!q->td)
                return -EINVAL;
        return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
}

ssize_t blk_throtl_sample_time_store(struct request_queue *q,
                                     const char *page, size_t count)
{
        unsigned long v;
        unsigned long t;

        if (!q->td)
                return -EINVAL;
        if (kstrtoul(page, 10, &v))
                return -EINVAL;
        t = msecs_to_jiffies(v);
        if (t == 0 || t > MAX_THROTL_SLICE)
                return -EINVAL;
        q->td->throtl_slice = t;
        return count;
}
#endif

static int __init throtl_init(void)
{
        kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
        if (!kthrotld_workqueue)
                panic("Failed to create kthrotld\n");

        return blkcg_policy_register(&blkcg_policy_throtl);
}

module_init(throtl_init);