block/blk-cgroup.c (mirror_ubuntu-artful-kernel.git, at commit "blkcg: drop stuff unused after per-queue policy activation update")
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkio_cgroup blkio_root_cgroup = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKCG_MAX_POLS];

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
        return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
                            struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
        return container_of(task_subsys_state(tsk, blkio_subsys_id),
                            struct blkio_cgroup, css);
}

struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
        if (bio && bio->bi_css)
                return container_of(bio->bi_css, struct blkio_cgroup, css);
        return task_blkio_cgroup(current);
}
EXPORT_SYMBOL_GPL(bio_blkio_cgroup);

static bool blkcg_policy_enabled(struct request_queue *q,
                                 const struct blkio_policy_type *pol)
{
        return pol && test_bit(pol->plid, q->blkcg_pols);
}

static size_t blkg_pd_size(const struct blkio_policy_type *pol)
{
        return sizeof(struct blkg_policy_data) + pol->pdata_size;
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
        int i;

        if (!blkg)
                return;

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkio_policy_type *pol = blkio_policy[i];
                struct blkg_policy_data *pd = blkg->pd[i];

                if (!pd)
                        continue;

                if (pol && pol->ops.blkio_exit_group_fn)
                        pol->ops.blkio_exit_group_fn(blkg);

                kfree(pd);
        }

        kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
                                      struct request_queue *q)
{
        struct blkio_group *blkg;
        int i;

        /* alloc and init base part */
        blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
        if (!blkg)
                return NULL;

        blkg->q = q;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;
        blkg->refcnt = 1;
        cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkio_policy_type *pol = blkio_policy[i];
                struct blkg_policy_data *pd;

                if (!blkcg_policy_enabled(q, pol))
                        continue;

                /* alloc per-policy data and attach it to blkg */
                pd = kzalloc_node(blkg_pd_size(pol), GFP_ATOMIC, q->node);
                if (!pd) {
                        blkg_free(blkg);
                        return NULL;
                }

                blkg->pd[i] = pd;
                pd->blkg = blkg;
        }

        /* invoke per-policy init */
        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkio_policy_type *pol = blkio_policy[i];

                if (blkcg_policy_enabled(blkg->q, pol))
                        pol->ops.blkio_init_group_fn(blkg);
        }

        return blkg;
}

static struct blkio_group *__blkg_lookup(struct blkio_cgroup *blkcg,
                                         struct request_queue *q)
{
        struct blkio_group *blkg;
        struct hlist_node *n;

        hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
                if (blkg->q == q)
                        return blkg;
        return NULL;
}

/**
 * blkg_lookup - lookup blkg for the specified blkcg - q pair
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  This function should be called
 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
 * - see blk_queue_bypass_start() for details.
 */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
                                struct request_queue *q)
{
        WARN_ON_ONCE(!rcu_read_lock_held());

        if (unlikely(blk_queue_bypass(q)))
                return NULL;
        return __blkg_lookup(blkcg, q);
}
EXPORT_SYMBOL_GPL(blkg_lookup);

static struct blkio_group *__blkg_lookup_create(struct blkio_cgroup *blkcg,
                                                struct request_queue *q)
        __releases(q->queue_lock) __acquires(q->queue_lock)
{
        struct blkio_group *blkg;

        WARN_ON_ONCE(!rcu_read_lock_held());
        lockdep_assert_held(q->queue_lock);

        blkg = __blkg_lookup(blkcg, q);
        if (blkg)
                return blkg;

        /* blkg holds a reference to blkcg */
        if (!css_tryget(&blkcg->css))
                return ERR_PTR(-EINVAL);

        /*
         * Allocate and initialize.
         */
        blkg = blkg_alloc(blkcg, q);

        /* did alloc fail? */
        if (unlikely(!blkg)) {
                blkg = ERR_PTR(-ENOMEM);
                goto out;
        }

        /* insert */
        spin_lock(&blkcg->lock);
        hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
        list_add(&blkg->q_node, &q->blkg_list);
        spin_unlock(&blkcg->lock);
out:
        return blkg;
}

struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
                                       struct request_queue *q)
{
        /*
         * This could be the first entry point of blkcg implementation and
         * we shouldn't allow anything to go through for a bypassing queue.
         */
        if (unlikely(blk_queue_bypass(q)))
                return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
        return __blkg_lookup_create(blkcg, q);
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);
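
/*
 * Editor's illustrative sketch (not part of the original file): how a
 * caller on the request path is expected to use blkg_lookup_create().
 * The call must be made with both the RCU read lock and the queue lock
 * held; a bypassing queue yields an ERR_PTR, which the caller simply
 * treats as "no group".  The example_* name is hypothetical.
 */
static struct blkio_group *example_current_blkg(struct request_queue *q)
{
        struct blkio_cgroup *blkcg;
        struct blkio_group *blkg;

        rcu_read_lock();
        spin_lock_irq(q->queue_lock);

        blkcg = bio_blkio_cgroup(NULL);         /* falls back to %current */
        blkg = blkg_lookup_create(blkcg, q);

        spin_unlock_irq(q->queue_lock);
        rcu_read_unlock();

        return IS_ERR(blkg) ? NULL : blkg;
}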

static void blkg_destroy(struct blkio_group *blkg)
{
        struct request_queue *q = blkg->q;
        struct blkio_cgroup *blkcg = blkg->blkcg;

        lockdep_assert_held(q->queue_lock);
        lockdep_assert_held(&blkcg->lock);

        /* Something wrong if we are trying to remove same group twice */
        WARN_ON_ONCE(list_empty(&blkg->q_node));
        WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
        list_del_init(&blkg->q_node);
        hlist_del_init_rcu(&blkg->blkcg_node);

        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
         */
        blkg_put(blkg);
}

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 *
 * Destroy all blkgs associated with @q.
 */
static void blkg_destroy_all(struct request_queue *q)
{
        struct blkio_group *blkg, *n;

        spin_lock_irq(q->queue_lock);

        list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
                struct blkio_cgroup *blkcg = blkg->blkcg;

                spin_lock(&blkcg->lock);
                blkg_destroy(blkg);
                spin_unlock(&blkcg->lock);
        }

        spin_unlock_irq(q->queue_lock);
}

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
        blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}

void __blkg_release(struct blkio_group *blkg)
{
        /* release the extra blkcg reference this blkg has been holding */
        css_put(&blkg->blkcg->css);

        /*
         * A group is freed in rcu manner. But having an rcu lock does not
         * mean that one can access all the fields of blkg and assume these
         * are valid. For example, don't try to follow throtl_data and
         * request queue links.
         *
         * Having a reference to blkg under an rcu allows access only to
         * values local to groups like group stats and group rate limits.
         */
        call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
        struct blkio_group *blkg;
        struct hlist_node *n;
        int i;

        mutex_lock(&blkcg_pol_mutex);
        spin_lock_irq(&blkcg->lock);

        /*
         * Note that stat reset is racy - it doesn't synchronize against
         * stat updates.  This is a debug feature which shouldn't exist
         * anyway.  If you get hit by a race, retry.
         */
        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkio_policy_type *pol = blkio_policy[i];

                        if (blkcg_policy_enabled(blkg->q, pol) &&
                            pol->ops.blkio_reset_group_stats_fn)
                                pol->ops.blkio_reset_group_stats_fn(blkg);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        mutex_unlock(&blkcg_pol_mutex);
        return 0;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
        /* some drivers (floppy) instantiate a queue w/o disk registered */
        if (blkg->q->backing_dev_info.dev)
                return dev_name(blkg->q->backing_dev_info.dev);
        return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data.  If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
                       u64 (*prfill)(struct seq_file *, void *, int),
                       const struct blkio_policy_type *pol, int data,
                       bool show_total)
{
        struct blkio_group *blkg;
        struct hlist_node *n;
        u64 total = 0;

        spin_lock_irq(&blkcg->lock);
        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
                if (blkcg_policy_enabled(blkg->q, pol))
                        total += prfill(sf, blkg->pd[pol->plid]->pdata, data);
        spin_unlock_irq(&blkcg->lock);

        if (show_total)
                seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pdata: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pdata.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, void *pdata, u64 v)
{
        const char *dname = blkg_dev_name(pdata_to_blkg(pdata));

        if (!dname)
                return 0;

        seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
        return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pdata: policy private data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pdata.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, void *pdata,
                         const struct blkg_rwstat *rwstat)
{
        static const char *rwstr[] = {
                [BLKG_RWSTAT_READ]      = "Read",
                [BLKG_RWSTAT_WRITE]     = "Write",
                [BLKG_RWSTAT_SYNC]      = "Sync",
                [BLKG_RWSTAT_ASYNC]     = "Async",
        };
        const char *dname = blkg_dev_name(pdata_to_blkg(pdata));
        u64 v;
        int i;

        if (!dname)
                return 0;

        for (i = 0; i < BLKG_RWSTAT_NR; i++)
                seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
                           (unsigned long long)rwstat->cnt[i]);

        v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
        seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
        return v;
}

/**
 * blkg_prfill_stat - prfill callback for blkg_stat
 * @sf: seq_file to print to
 * @pdata: policy private data of interest
 * @off: offset to the blkg_stat in @pdata
 *
 * prfill callback for printing a blkg_stat.
 */
u64 blkg_prfill_stat(struct seq_file *sf, void *pdata, int off)
{
        return __blkg_prfill_u64(sf, pdata, blkg_stat_read(pdata + off));
}
EXPORT_SYMBOL_GPL(blkg_prfill_stat);

/**
 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
 * @sf: seq_file to print to
 * @pdata: policy private data of interest
 * @off: offset to the blkg_rwstat in @pdata
 *
 * prfill callback for printing a blkg_rwstat.
 */
u64 blkg_prfill_rwstat(struct seq_file *sf, void *pdata, int off)
{
        struct blkg_rwstat rwstat = blkg_rwstat_read(pdata + off);

        return __blkg_prfill_rwstat(sf, pdata, &rwstat);
}
EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
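
/*
 * Editor's illustrative sketch (not part of the original file): how a
 * policy would wire blkcg_print_blkgs() and a prfill callback into a
 * cftype->read_seq_string handler.  struct example_pdata and example_pol
 * are hypothetical stand-ins for a policy's per-blkg data and its
 * blkio_policy_type (example_pol is fully defined in the registration
 * sketch at the end of the file).
 */
struct example_pdata {
        struct blkg_stat        serviced;       /* hypothetical per-blkg stat */
};

static struct blkio_policy_type example_pol;

static int example_print_serviced(struct cgroup *cgrp, struct cftype *cft,
                                  struct seq_file *sf)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

        /* one "<device> <value>" line per blkg that carries our pdata */
        blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &example_pol,
                          offsetof(struct example_pdata, serviced), false);
        return 0;
}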

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
 * value.  This function returns with RCU read lock and queue lock held and
 * must be paired with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkio_cgroup *blkcg,
                   const struct blkio_policy_type *pol, const char *input,
                   struct blkg_conf_ctx *ctx)
        __acquires(rcu) __acquires(disk->queue->queue_lock)
{
        struct gendisk *disk;
        struct blkio_group *blkg;
        unsigned int major, minor;
        unsigned long long v;
        int part, ret;

        if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
                return -EINVAL;

        disk = get_gendisk(MKDEV(major, minor), &part);
        if (!disk || part)
                return -EINVAL;

        rcu_read_lock();
        spin_lock_irq(disk->queue->queue_lock);

        if (blkcg_policy_enabled(disk->queue, pol))
                blkg = blkg_lookup_create(blkcg, disk->queue);
        else
                blkg = ERR_PTR(-EINVAL);

        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
                rcu_read_unlock();
                spin_unlock_irq(disk->queue->queue_lock);
                put_disk(disk);
                /*
                 * If queue was bypassing, we should retry.  Do so after a
                 * short msleep().  It isn't strictly necessary but queue
                 * can be bypassing for some time and it's always nice to
                 * avoid busy looping.
                 */
                if (ret == -EBUSY) {
                        msleep(10);
                        ret = restart_syscall();
                }
                return ret;
        }

        ctx->disk = disk;
        ctx->blkg = blkg;
        ctx->v = v;
        return 0;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
        __releases(ctx->disk->queue->queue_lock) __releases(rcu)
{
        spin_unlock_irq(ctx->disk->queue->queue_lock);
        rcu_read_unlock();
        put_disk(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
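
/*
 * Editor's illustrative sketch (not part of the original file): the
 * pairing of blkg_conf_prep() and blkg_conf_finish() in a
 * cftype->write_string style handler.  ctx.v carries the value parsed
 * from the "MAJ:MIN VAL" input; example_pol is the hypothetical policy
 * introduced in the sketch above.
 */
static int example_set_limit(struct cgroup *cgrp, struct cftype *cft,
                             const char *buf)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
        struct blkg_conf_ctx ctx;
        int ret;

        ret = blkg_conf_prep(blkcg, &example_pol, buf, &ctx);
        if (ret)
                return ret;

        /*
         * Queue lock and RCU read lock are held until blkg_conf_finish().
         * ctx.blkg->pd[example_pol.plid]->pdata is the policy's per-blkg
         * data; a real handler would apply ctx.v to it here.
         */

        blkg_conf_finish(&ctx);
        return 0;
}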

struct cftype blkio_files[] = {
        {
                .name = "reset_stats",
                .write_u64 = blkiocg_reset_stats,
        },
        { }     /* terminate */
};

/**
 * blkiocg_pre_destroy - cgroup pre_destroy callback
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and is
 * responsible for shooting down all blkgs associated with @cgroup.
 * blkgs should be removed while holding both q and blkcg locks.  As
 * blkcg lock is nested inside q lock, this function performs reverse
 * double lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkiocg_pre_destroy(struct cgroup *cgroup)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

        spin_lock_irq(&blkcg->lock);

        while (!hlist_empty(&blkcg->blkg_list)) {
                struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
                                                struct blkio_group, blkcg_node);
                struct request_queue *q = blkg->q;

                if (spin_trylock(q->queue_lock)) {
                        blkg_destroy(blkg);
                        spin_unlock(q->queue_lock);
                } else {
                        spin_unlock_irq(&blkcg->lock);
                        cpu_relax();
                        spin_lock_irq(&blkcg->lock);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        return 0;
}

static void blkiocg_destroy(struct cgroup *cgroup)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

        if (blkcg != &blkio_root_cgroup)
                kfree(blkcg);
}

static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
{
        static atomic64_t id_seq = ATOMIC64_INIT(0);
        struct blkio_cgroup *blkcg;
        struct cgroup *parent = cgroup->parent;

        if (!parent) {
                blkcg = &blkio_root_cgroup;
                goto done;
        }

        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
        if (!blkcg)
                return ERR_PTR(-ENOMEM);

        blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
        blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
done:
        spin_lock_init(&blkcg->lock);
        INIT_HLIST_HEAD(&blkcg->blkg_list);

        return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
        might_sleep();

        return blk_throtl_init(q);
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
        lockdep_assert_held(q->queue_lock);

        blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
        blkg_destroy_all(q);
        blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct io_context *ioc;
        int ret = 0;

        /* task_lock() is needed to avoid races with exit_io_context() */
        cgroup_taskset_for_each(task, cgrp, tset) {
                task_lock(task);
                ioc = task->io_context;
                if (ioc && atomic_read(&ioc->nr_tasks) > 1)
                        ret = -EINVAL;
                task_unlock(task);
                if (ret)
                        break;
        }
        return ret;
}

struct cgroup_subsys blkio_subsys = {
        .name = "blkio",
        .create = blkiocg_create,
        .can_attach = blkiocg_can_attach,
        .pre_destroy = blkiocg_pre_destroy,
        .destroy = blkiocg_destroy,
        .subsys_id = blkio_subsys_id,
        .base_cftypes = blkio_files,
        .module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @q bypassed, so nobody would be accessing blkgs
 * from the IO path.  Update of each blkg is protected by both queue and
 * blkcg locks so that holding either lock and testing
 * blkcg_policy_enabled() is always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations.  Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct request_queue *q,
                          const struct blkio_policy_type *pol)
{
        LIST_HEAD(pds);
        struct blkio_group *blkg;
        struct blkg_policy_data *pd, *n;
        int cnt = 0, ret;

        if (blkcg_policy_enabled(q, pol))
                return 0;

        blk_queue_bypass_start(q);

        /* make sure the root blkg exists and count the existing blkgs */
        spin_lock_irq(q->queue_lock);

        rcu_read_lock();
        blkg = __blkg_lookup_create(&blkio_root_cgroup, q);
        rcu_read_unlock();

        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
                goto out_unlock;
        }
        q->root_blkg = blkg;

        list_for_each_entry(blkg, &q->blkg_list, q_node)
                cnt++;

        spin_unlock_irq(q->queue_lock);

        /* allocate policy_data for all existing blkgs */
        while (cnt--) {
                pd = kzalloc_node(blkg_pd_size(pol), GFP_KERNEL, q->node);
                if (!pd) {
                        ret = -ENOMEM;
                        goto out_free;
                }
                list_add_tail(&pd->alloc_node, &pds);
        }

        /*
         * Install the allocated pds.  With @q bypassing, no new blkg
         * should have been created while the queue lock was dropped.
         */
        spin_lock_irq(q->queue_lock);

        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                if (WARN_ON(list_empty(&pds))) {
                        /* umm... this shouldn't happen, just abort */
                        ret = -ENOMEM;
                        goto out_unlock;
                }
                pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
                list_del_init(&pd->alloc_node);

                /* grab blkcg lock too while installing @pd on @blkg */
                spin_lock(&blkg->blkcg->lock);

                blkg->pd[pol->plid] = pd;
                pd->blkg = blkg;
                pol->ops.blkio_init_group_fn(blkg);

                spin_unlock(&blkg->blkcg->lock);
        }

        __set_bit(pol->plid, q->blkcg_pols);
        ret = 0;
out_unlock:
        spin_unlock_irq(q->queue_lock);
out_free:
        blk_queue_bypass_end(q);
        list_for_each_entry_safe(pd, n, &pds, alloc_node)
                kfree(pd);
        return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct request_queue *q,
                             const struct blkio_policy_type *pol)
{
        struct blkio_group *blkg;

        if (!blkcg_policy_enabled(q, pol))
                return;

        blk_queue_bypass_start(q);
        spin_lock_irq(q->queue_lock);

        __clear_bit(pol->plid, q->blkcg_pols);

        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                /* grab blkcg lock too while removing @pd from @blkg */
                spin_lock(&blkg->blkcg->lock);

                if (pol->ops.blkio_exit_group_fn)
                        pol->ops.blkio_exit_group_fn(blkg);

                kfree(blkg->pd[pol->plid]);
                blkg->pd[pol->plid] = NULL;

                spin_unlock(&blkg->blkcg->lock);
        }

        spin_unlock_irq(q->queue_lock);
        blk_queue_bypass_end(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
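
/*
 * Editor's illustrative sketch (not part of the original file): a policy
 * enables itself on a queue with blkcg_activate_policy() when it is set
 * up for that queue and undoes that on teardown.  Once activation
 * succeeds, blkg->pd[pol->plid] is valid for every blkg of the queue.
 * The example_* names are hypothetical.
 */
static int example_init_queue(struct request_queue *q)
{
        return blkcg_activate_policy(q, &example_pol);
}

static void example_exit_queue(struct request_queue *q)
{
        blkcg_deactivate_policy(q, &example_pol);
}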

/**
 * blkio_policy_register - register a blkcg policy
 * @blkiop: blkcg policy to register
 *
 * Register @blkiop with blkcg core.  Might sleep and @blkiop may be
 * modified on successful registration.  Returns 0 on success and -errno on
 * failure.
 */
int blkio_policy_register(struct blkio_policy_type *blkiop)
{
        int i, ret;

        mutex_lock(&blkcg_pol_mutex);

        /* find an empty slot */
        ret = -ENOSPC;
        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (!blkio_policy[i])
                        break;
        if (i >= BLKCG_MAX_POLS)
                goto out_unlock;

        /* register and update blkgs */
        blkiop->plid = i;
        blkio_policy[i] = blkiop;

        /* everything is in place, add intf files for the new policy */
        if (blkiop->cftypes)
                WARN_ON(cgroup_add_cftypes(&blkio_subsys, blkiop->cftypes));
        ret = 0;
out_unlock:
        mutex_unlock(&blkcg_pol_mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

/**
 * blkio_policy_unregister - unregister a blkcg policy
 * @blkiop: blkcg policy to unregister
 *
 * Undo blkio_policy_register(@blkiop).  Might sleep.
 */
void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
        mutex_lock(&blkcg_pol_mutex);

        if (WARN_ON(blkio_policy[blkiop->plid] != blkiop))
                goto out_unlock;

        /* kill the intf files first */
        if (blkiop->cftypes)
                cgroup_rm_cftypes(&blkio_subsys, blkiop->cftypes);

        /* unregister and update blkgs */
        blkio_policy[blkiop->plid] = NULL;
out_unlock:
        mutex_unlock(&blkcg_pol_mutex);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);
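
/*
 * Editor's illustrative sketch (not part of the original file): the
 * registration lifecycle of a policy module.  blkio_init_group_fn is
 * called unconditionally by blkg_alloc() and blkcg_activate_policy(), so
 * a policy must provide it; pdata_size sizes the per-blkg policy data
 * allocated via blkg_pd_size().  The example_* identifiers, their cftype
 * wiring and the module glue are hypothetical.
 */
static void example_init_group(struct blkio_group *blkg)
{
        /* the per-blkg policy data was zeroed by kzalloc; nothing more to do */
}

static struct cftype example_files[] = {
        {
                .name = "example.serviced",
                .read_seq_string = example_print_serviced,
        },
        {
                .name = "example.limit",
                .write_string = example_set_limit,
        },
        { }     /* terminate */
};

static struct blkio_policy_type example_pol = {
        .ops = {
                .blkio_init_group_fn    = example_init_group,
        },
        .pdata_size     = sizeof(struct example_pdata),
        .cftypes        = example_files,
};

static int __init example_module_init(void)
{
        return blkio_policy_register(&example_pol);
}

static void __exit example_module_exit(void)
{
        blkio_policy_unregister(&example_pol);
}

module_init(example_module_init);
module_exit(example_module_exit);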