/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_MUTEX(blkcg_pol_mutex);
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

struct blkio_cgroup blkio_root_cgroup = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKCG_MAX_POLS];

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
        return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
                            struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
        return container_of(task_subsys_state(tsk, blkio_subsys_id),
                            struct blkio_cgroup, css);
}

struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
        if (bio && bio->bi_css)
                return container_of(bio->bi_css, struct blkio_cgroup, css);
        return task_blkio_cgroup(current);
}
EXPORT_SYMBOL_GPL(bio_blkio_cgroup);

static bool blkcg_policy_enabled(struct request_queue *q,
                                 const struct blkio_policy_type *pol)
{
        return pol && test_bit(pol->plid, q->blkcg_pols);
}

static size_t blkg_pd_size(const struct blkio_policy_type *pol)
{
        return sizeof(struct blkg_policy_data) + pol->pdata_size;
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
        int i;

        if (!blkg)
                return;

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkio_policy_type *pol = blkio_policy[i];
                struct blkg_policy_data *pd = blkg->pd[i];

                if (!pd)
                        continue;

                if (pol && pol->ops.blkio_exit_group_fn)
                        pol->ops.blkio_exit_group_fn(blkg);

                kfree(pd);
        }

        kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
                                      struct request_queue *q)
{
        struct blkio_group *blkg;
        int i;

        /* alloc and init base part */
        blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
        if (!blkg)
                return NULL;

        blkg->q = q;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;
        blkg->refcnt = 1;
        cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkio_policy_type *pol = blkio_policy[i];
                struct blkg_policy_data *pd;

                if (!blkcg_policy_enabled(q, pol))
                        continue;

                /* alloc per-policy data and attach it to blkg */
                pd = kzalloc_node(blkg_pd_size(pol), GFP_ATOMIC, q->node);
                if (!pd) {
                        blkg_free(blkg);
                        return NULL;
                }

                blkg->pd[i] = pd;
                pd->blkg = blkg;
        }

        /* invoke per-policy init */
        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkio_policy_type *pol = blkio_policy[i];

                if (blkcg_policy_enabled(blkg->q, pol))
                        pol->ops.blkio_init_group_fn(blkg);
        }

        return blkg;
}

static struct blkio_group *__blkg_lookup(struct blkio_cgroup *blkcg,
                                         struct request_queue *q)
{
        struct blkio_group *blkg;
        struct hlist_node *n;

        hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
                if (blkg->q == q)
                        return blkg;
        return NULL;
}

/**
 * blkg_lookup - lookup blkg for the specified blkcg - q pair
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  This function should be called
 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
 * - see blk_queue_bypass_start() for details.
 */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
                                struct request_queue *q)
{
        WARN_ON_ONCE(!rcu_read_lock_held());

        if (unlikely(blk_queue_bypass(q)))
                return NULL;
        return __blkg_lookup(blkcg, q);
}
EXPORT_SYMBOL_GPL(blkg_lookup);
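
/*
 * Illustrative sketch (not part of this file): blkg_lookup() is meant to be
 * called under the RCU read lock, e.g. keyed off a bio's blkcg:
 *
 *      rcu_read_lock();
 *      blkg = blkg_lookup(bio_blkio_cgroup(bio), q);
 *      if (blkg)
 *              ...     use the group; %NULL means @q is bypassing or no
 *                      blkg has been created for this blkcg - q pair yet
 *      rcu_read_unlock();
 */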

struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
                                       struct request_queue *q,
                                       bool for_root)
        __releases(q->queue_lock) __acquires(q->queue_lock)
{
        struct blkio_group *blkg;

        WARN_ON_ONCE(!rcu_read_lock_held());
        lockdep_assert_held(q->queue_lock);

        /*
         * This could be the first entry point of blkcg implementation and
         * we shouldn't allow anything to go through for a bypassing queue.
         */
        if (unlikely(blk_queue_bypass(q)) && !for_root)
                return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

        blkg = __blkg_lookup(blkcg, q);
        if (blkg)
                return blkg;

        /* blkg holds a reference to blkcg */
        if (!css_tryget(&blkcg->css))
                return ERR_PTR(-EINVAL);

        /*
         * Allocate and initialize.
         */
        blkg = blkg_alloc(blkcg, q);

        /* did alloc fail? */
        if (unlikely(!blkg)) {
                blkg = ERR_PTR(-ENOMEM);
                goto out;
        }

        /* insert */
        spin_lock(&blkcg->lock);
        hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
        list_add(&blkg->q_node, &q->blkg_list);
        spin_unlock(&blkcg->lock);
out:
        return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);
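
/*
 * Illustrative sketch (not part of this file): on-demand creation is done
 * with both the RCU read lock and the queue lock held, and callers must be
 * prepared for an ERR_PTR() return while @q is bypassing or dead:
 *
 *      rcu_read_lock();
 *      spin_lock_irq(q->queue_lock);
 *      blkg = blkg_lookup_create(blkcg, q, false);
 *      if (IS_ERR(blkg))
 *              blkg = NULL;    (fall back, e.g. to the root group)
 *      spin_unlock_irq(q->queue_lock);
 *      rcu_read_unlock();
 */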

static void blkg_destroy(struct blkio_group *blkg)
{
        struct request_queue *q = blkg->q;
        struct blkio_cgroup *blkcg = blkg->blkcg;

        lockdep_assert_held(q->queue_lock);
        lockdep_assert_held(&blkcg->lock);

        /* Something wrong if we are trying to remove same group twice */
        WARN_ON_ONCE(list_empty(&blkg->q_node));
        WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
        list_del_init(&blkg->q_node);
        hlist_del_init_rcu(&blkg->blkcg_node);

        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
         */
        blkg_put(blkg);
}

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 * @destroy_root: whether to destroy root blkg or not
 *
 * Destroy blkgs associated with @q.  If @destroy_root is %true, all are
 * destroyed; otherwise, root blkg is left alone.
 */
void blkg_destroy_all(struct request_queue *q, bool destroy_root)
{
        struct blkio_group *blkg, *n;

        spin_lock_irq(q->queue_lock);

        list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
                struct blkio_cgroup *blkcg = blkg->blkcg;

                /* skip root? */
                if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
                        continue;

                spin_lock(&blkcg->lock);
                blkg_destroy(blkg);
                spin_unlock(&blkcg->lock);
        }

        spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blkg_destroy_all);

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
        blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}

void __blkg_release(struct blkio_group *blkg)
{
        /* release the extra blkcg reference this blkg has been holding */
        css_put(&blkg->blkcg->css);

        /*
         * A group is freed in RCU manner.  But having an RCU lock does not
         * mean that one can access all the fields of blkg and assume these
         * are valid.  For example, don't try to follow throtl_data and
         * request queue links.
         *
         * Having a reference to blkg under an RCU read lock allows access
         * only to values local to groups like group stats and group rate
         * limits.
         */
        call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
        struct blkio_group *blkg;
        struct hlist_node *n;
        int i;

        mutex_lock(&blkcg_pol_mutex);
        spin_lock_irq(&blkcg->lock);

        /*
         * Note that stat reset is racy - it doesn't synchronize against
         * stat updates.  This is a debug feature which shouldn't exist
         * anyway.  If you get hit by a race, retry.
         */
        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkio_policy_type *pol = blkio_policy[i];

                        if (blkcg_policy_enabled(blkg->q, pol) &&
                            pol->ops.blkio_reset_group_stats_fn)
                                pol->ops.blkio_reset_group_stats_fn(blkg);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        mutex_unlock(&blkcg_pol_mutex);
        return 0;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
        /* some drivers (floppy) instantiate a queue w/o disk registered */
        if (blkg->q->backing_dev_info.dev)
                return dev_name(blkg->q->backing_dev_info.dev);
        return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data.  If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
                       u64 (*prfill)(struct seq_file *, void *, int),
                       const struct blkio_policy_type *pol, int data,
                       bool show_total)
{
        struct blkio_group *blkg;
        struct hlist_node *n;
        u64 total = 0;

        spin_lock_irq(&blkcg->lock);
        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
                if (blkcg_policy_enabled(blkg->q, pol))
                        total += prfill(sf, blkg->pd[pol->plid]->pdata, data);
        spin_unlock_irq(&blkcg->lock);

        if (show_total)
                seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pdata: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pdata.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, void *pdata, u64 v)
{
        const char *dname = blkg_dev_name(pdata_to_blkg(pdata));

        if (!dname)
                return 0;

        seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
        return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pdata: policy private data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pdata.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, void *pdata,
                         const struct blkg_rwstat *rwstat)
{
        static const char *rwstr[] = {
                [BLKG_RWSTAT_READ]      = "Read",
                [BLKG_RWSTAT_WRITE]     = "Write",
                [BLKG_RWSTAT_SYNC]      = "Sync",
                [BLKG_RWSTAT_ASYNC]     = "Async",
        };
        const char *dname = blkg_dev_name(pdata_to_blkg(pdata));
        u64 v;
        int i;

        if (!dname)
                return 0;

        for (i = 0; i < BLKG_RWSTAT_NR; i++)
                seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
                           (unsigned long long)rwstat->cnt[i]);

        v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
        seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
        return v;
}

/**
 * blkg_prfill_stat - prfill callback for blkg_stat
 * @sf: seq_file to print to
 * @pdata: policy private data of interest
 * @off: offset to the blkg_stat in @pdata
 *
 * prfill callback for printing a blkg_stat.
 */
u64 blkg_prfill_stat(struct seq_file *sf, void *pdata, int off)
{
        return __blkg_prfill_u64(sf, pdata, blkg_stat_read(pdata + off));
}
EXPORT_SYMBOL_GPL(blkg_prfill_stat);

/**
 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
 * @sf: seq_file to print to
 * @pdata: policy private data of interest
 * @off: offset to the blkg_rwstat in @pdata
 *
 * prfill callback for printing a blkg_rwstat.
 */
u64 blkg_prfill_rwstat(struct seq_file *sf, void *pdata, int off)
{
        struct blkg_rwstat rwstat = blkg_rwstat_read(pdata + off);

        return __blkg_prfill_rwstat(sf, pdata, &rwstat);
}
EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
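
/*
 * Illustrative sketch (not part of this file): a policy typically wraps
 * blkcg_print_blkgs() with one of the prfill callbacks above in its
 * cftype->read_seq_string method, e.g.
 *
 *      static int foo_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
 *                                  struct seq_file *sf)
 *      {
 *              struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 *
 *              blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat,
 *                                &blkio_policy_foo,
 *                                offsetof(struct foo_group_pdata, rwstat),
 *                                true);
 *              return 0;
 *      }
 *
 * blkio_policy_foo, struct foo_group_pdata and its rwstat member are
 * hypothetical; @off is the offset of the stat inside the policy's pdata.
 */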

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
 * value.  This function returns with RCU read lock and queue lock held and
 * must be paired with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkio_cgroup *blkcg,
                   const struct blkio_policy_type *pol, const char *input,
                   struct blkg_conf_ctx *ctx)
        __acquires(rcu) __acquires(disk->queue->queue_lock)
{
        struct gendisk *disk;
        struct blkio_group *blkg;
        unsigned int major, minor;
        unsigned long long v;
        int part, ret;

        if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
                return -EINVAL;

        disk = get_gendisk(MKDEV(major, minor), &part);
        if (!disk || part)
                return -EINVAL;

        rcu_read_lock();
        spin_lock_irq(disk->queue->queue_lock);

        if (blkcg_policy_enabled(disk->queue, pol))
                blkg = blkg_lookup_create(blkcg, disk->queue, false);
        else
                blkg = ERR_PTR(-EINVAL);

        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
                rcu_read_unlock();
                spin_unlock_irq(disk->queue->queue_lock);
                put_disk(disk);
                /*
                 * If queue was bypassing, we should retry.  Do so after a
                 * short msleep().  It isn't strictly necessary but queue
                 * can be bypassing for some time and it's always nice to
                 * avoid busy looping.
                 */
                if (ret == -EBUSY) {
                        msleep(10);
                        ret = restart_syscall();
                }
                return ret;
        }

        ctx->disk = disk;
        ctx->blkg = blkg;
        ctx->v = v;
        return 0;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
        __releases(ctx->disk->queue->queue_lock) __releases(rcu)
{
        spin_unlock_irq(ctx->disk->queue->queue_lock);
        rcu_read_unlock();
        put_disk(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
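
/*
 * Illustrative sketch (not part of this file): a policy's per-device config
 * write handler is expected to bracket the update with the pair above, e.g.
 *
 *      static int foo_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
 *                                       const char *buf)
 *      {
 *              struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 *              struct blkg_conf_ctx ctx;
 *              int ret;
 *
 *              ret = blkg_conf_prep(blkcg, &blkio_policy_foo, buf, &ctx);
 *              if (ret)
 *                      return ret;
 *
 *              (ctx.blkg and ctx.v are valid here, with the queue lock and
 *               the RCU read lock held)
 *              ret = foo_apply_weight(ctx.blkg, ctx.v);
 *
 *              blkg_conf_finish(&ctx);
 *              return ret;
 *      }
 *
 * foo_set_weight_device, blkio_policy_foo and foo_apply_weight are
 * hypothetical names standing in for a real policy's cftype write_string
 * method and its helpers.
 */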

struct cftype blkio_files[] = {
        {
                .name = "reset_stats",
                .write_u64 = blkiocg_reset_stats,
        },
        { }     /* terminate */
};

/**
 * blkiocg_pre_destroy - cgroup pre_destroy callback
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and responsible
 * for shooting down all blkgs associated with @cgroup.  blkgs should be
 * removed while holding both q and blkcg locks.  As blkcg lock is nested
 * inside q lock, this function performs reverse double lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkiocg_pre_destroy(struct cgroup *cgroup)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

        spin_lock_irq(&blkcg->lock);

        while (!hlist_empty(&blkcg->blkg_list)) {
                struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
                                                struct blkio_group, blkcg_node);
                struct request_queue *q = blkg->q;

                if (spin_trylock(q->queue_lock)) {
                        blkg_destroy(blkg);
                        spin_unlock(q->queue_lock);
                } else {
                        spin_unlock_irq(&blkcg->lock);
                        cpu_relax();
                        spin_lock_irq(&blkcg->lock);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        return 0;
}

static void blkiocg_destroy(struct cgroup *cgroup)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

        if (blkcg != &blkio_root_cgroup)
                kfree(blkcg);
}

static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
{
        static atomic64_t id_seq = ATOMIC64_INIT(0);
        struct blkio_cgroup *blkcg;
        struct cgroup *parent = cgroup->parent;

        if (!parent) {
                blkcg = &blkio_root_cgroup;
                goto done;
        }

        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
        if (!blkcg)
                return ERR_PTR(-ENOMEM);

        blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
        blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
done:
        spin_lock_init(&blkcg->lock);
        INIT_HLIST_HEAD(&blkcg->blkg_list);

        return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node().  Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
        int ret;

        might_sleep();

        ret = blk_throtl_init(q);
        if (ret)
                return ret;

        mutex_lock(&all_q_mutex);
        INIT_LIST_HEAD(&q->all_q_node);
        list_add_tail(&q->all_q_node, &all_q_list);
        mutex_unlock(&all_q_mutex);

        return 0;
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
        lockdep_assert_held(q->queue_lock);

        blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
        mutex_lock(&all_q_mutex);
        list_del_init(&q->all_q_node);
        mutex_unlock(&all_q_mutex);

        blkg_destroy_all(q, true);

        blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct io_context *ioc;
        int ret = 0;

        /* task_lock() is needed to avoid races with exit_io_context() */
        cgroup_taskset_for_each(task, cgrp, tset) {
                task_lock(task);
                ioc = task->io_context;
                if (ioc && atomic_read(&ioc->nr_tasks) > 1)
                        ret = -EINVAL;
                task_unlock(task);
                if (ret)
                        break;
        }
        return ret;
}

struct cgroup_subsys blkio_subsys = {
        .name = "blkio",
        .create = blkiocg_create,
        .can_attach = blkiocg_can_attach,
        .pre_destroy = blkiocg_pre_destroy,
        .destroy = blkiocg_destroy,
        .subsys_id = blkio_subsys_id,
        .base_cftypes = blkio_files,
        .module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @q bypassed, so nobody would be accessing blkgs
 * from IO path.  Update of each blkg is protected by both queue and blkcg
 * locks so that holding either lock and testing blkcg_policy_enabled() is
 * always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations.  Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct request_queue *q,
                          const struct blkio_policy_type *pol)
{
        LIST_HEAD(pds);
        struct blkio_group *blkg;
        struct blkg_policy_data *pd, *n;
        int cnt = 0, ret;

        if (blkcg_policy_enabled(q, pol))
                return 0;

        blk_queue_bypass_start(q);

        /* make sure the root blkg exists and count the existing blkgs */
        spin_lock_irq(q->queue_lock);

        rcu_read_lock();
        blkg = blkg_lookup_create(&blkio_root_cgroup, q, true);
        rcu_read_unlock();

        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
                goto out_unlock;
        }
        q->root_blkg = blkg;

        list_for_each_entry(blkg, &q->blkg_list, q_node)
                cnt++;

        spin_unlock_irq(q->queue_lock);

        /* allocate policy_data for all existing blkgs */
        while (cnt--) {
                pd = kzalloc_node(blkg_pd_size(pol), GFP_KERNEL, q->node);
                if (!pd) {
                        ret = -ENOMEM;
                        goto out_free;
                }
                list_add_tail(&pd->alloc_node, &pds);
        }

        /*
         * Install the allocated pds.  With @q bypassing, no new blkg
         * should have been created while the queue lock was dropped.
         */
        spin_lock_irq(q->queue_lock);

        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                if (WARN_ON(list_empty(&pds))) {
                        /* umm... this shouldn't happen, just abort */
                        ret = -ENOMEM;
                        goto out_unlock;
                }
                pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
                list_del_init(&pd->alloc_node);

                /* grab blkcg lock too while installing @pd on @blkg */
                spin_lock(&blkg->blkcg->lock);

                blkg->pd[pol->plid] = pd;
                pd->blkg = blkg;
                pol->ops.blkio_init_group_fn(blkg);

                spin_unlock(&blkg->blkcg->lock);
        }

        __set_bit(pol->plid, q->blkcg_pols);
        ret = 0;
out_unlock:
        spin_unlock_irq(q->queue_lock);
out_free:
        blk_queue_bypass_end(q);
        list_for_each_entry_safe(pd, n, &pds, alloc_node)
                kfree(pd);
        return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct request_queue *q,
                             const struct blkio_policy_type *pol)
{
        struct blkio_group *blkg;

        if (!blkcg_policy_enabled(q, pol))
                return;

        blk_queue_bypass_start(q);
        spin_lock_irq(q->queue_lock);

        __clear_bit(pol->plid, q->blkcg_pols);

        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                /* grab blkcg lock too while removing @pd from @blkg */
                spin_lock(&blkg->blkcg->lock);

                if (pol->ops.blkio_exit_group_fn)
                        pol->ops.blkio_exit_group_fn(blkg);

                kfree(blkg->pd[pol->plid]);
                blkg->pd[pol->plid] = NULL;

                spin_unlock(&blkg->blkcg->lock);
        }

        spin_unlock_irq(q->queue_lock);
        blk_queue_bypass_end(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
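
/*
 * Illustrative sketch (not part of this file): a policy enables itself for a
 * particular request_queue from process context, typically while the queue
 * (or its elevator) is being set up, and disables itself again on teardown.
 * blkio_policy_foo and the foo_* functions are hypothetical:
 *
 *      static int foo_init_queue(struct request_queue *q)
 *      {
 *              return blkcg_activate_policy(q, &blkio_policy_foo);
 *      }
 *
 *      static void foo_exit_queue(struct request_queue *q)
 *      {
 *              blkcg_deactivate_policy(q, &blkio_policy_foo);
 *      }
 *
 * Once activated, every existing and future blkg of @q carries pdata_size
 * bytes of per-group data for the policy at blkg->pd[plid]->pdata.
 */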

/**
 * blkio_policy_register - register a blkcg policy
 * @blkiop: blkcg policy to register
 *
 * Register @blkiop with blkcg core.  Might sleep and @blkiop may be
 * modified on successful registration.  Returns 0 on success and -errno on
 * failure.
 */
int blkio_policy_register(struct blkio_policy_type *blkiop)
{
        int i, ret;

        mutex_lock(&blkcg_pol_mutex);

        /* find an empty slot */
        ret = -ENOSPC;
        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (!blkio_policy[i])
                        break;
        if (i >= BLKCG_MAX_POLS)
                goto out_unlock;

        /* register and update blkgs */
        blkiop->plid = i;
        blkio_policy[i] = blkiop;

        /* everything is in place, add intf files for the new policy */
        if (blkiop->cftypes)
                WARN_ON(cgroup_add_cftypes(&blkio_subsys, blkiop->cftypes));
        ret = 0;
out_unlock:
        mutex_unlock(&blkcg_pol_mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

/**
 * blkio_policy_unregister - unregister a blkcg policy
 * @blkiop: blkcg policy to unregister
 *
 * Undo blkio_policy_register(@blkiop).  Might sleep.
 */
void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
        mutex_lock(&blkcg_pol_mutex);

        if (WARN_ON(blkio_policy[blkiop->plid] != blkiop))
                goto out_unlock;

        /* kill the intf files first */
        if (blkiop->cftypes)
                cgroup_rm_cftypes(&blkio_subsys, blkiop->cftypes);

        /* unregister and update blkgs */
        blkio_policy[blkiop->plid] = NULL;
out_unlock:
        mutex_unlock(&blkcg_pol_mutex);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);
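
/*
 * Illustrative sketch (not part of this file): a policy module describes
 * itself with a blkio_policy_type and registers it once at module init,
 * pairing that with blkio_policy_unregister() at module exit.  plid is
 * assigned by blkio_policy_register(); all foo_* names are hypothetical:
 *
 *      static struct blkio_policy_type blkio_policy_foo = {
 *              .ops = {
 *                      .blkio_init_group_fn            = foo_init_blkio_group,
 *                      .blkio_exit_group_fn            = foo_exit_blkio_group,
 *                      .blkio_reset_group_stats_fn     = foo_reset_group_stats,
 *              },
 *              .pdata_size     = sizeof(struct foo_group_pdata),
 *              .cftypes        = foo_blkcg_files,
 *      };
 *
 *      static int __init foo_init(void)
 *      {
 *              return blkio_policy_register(&blkio_policy_foo);
 *      }
 *
 *      static void __exit foo_exit(void)
 *      {
 *              blkio_policy_unregister(&blkio_policy_foo);
 *      }
 */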