block/blk-cgroup.c
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *		      Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

/* List of groups pending per cpu stats allocation */
static DEFINE_SPINLOCK(alloc_list_lock);
static LIST_HEAD(alloc_list);

static void blkio_stat_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
						  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
			      struct cgroup_taskset *);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
			   struct cgroup_taskset *);
static int blkiocg_pre_destroy(struct cgroup_subsys *, struct cgroup *);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);

/* for encoding cft->private value on file */
#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
/* What policy owns the file, proportional or throttle */
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.attach = blkiocg_attach,
	.pre_destroy = blkiocg_pre_destroy,
	.destroy = blkiocg_destroy,
	.populate = blkiocg_populate,
	.subsys_id = blkio_subsys_id,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}

struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
	if (bio && bio->bi_css)
		return container_of(bio->bi_css, struct blkio_cgroup, css);
	return task_blkio_cgroup(current);
}
EXPORT_SYMBOL_GPL(bio_blkio_cgroup);

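/*
 * Pass a weight change on @blkg down to the policy identified by @plid;
 * policies that do not own the group are skipped.
 */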
static inline void blkio_update_group_weight(struct blkio_group *blkg,
					     int plid, unsigned int weight)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {
		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;
		if (blkiop->ops.blkio_update_group_weight_fn)
			blkiop->ops.blkio_update_group_weight_fn(blkg->q,
							blkg, weight);
	}
}

static inline void blkio_update_group_bps(struct blkio_group *blkg, int plid,
					  u64 bps, int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;

		if (fileid == BLKIO_THROTL_read_bps_device
		    && blkiop->ops.blkio_update_group_read_bps_fn)
			blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
								blkg, bps);

		if (fileid == BLKIO_THROTL_write_bps_device
		    && blkiop->ops.blkio_update_group_write_bps_fn)
			blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
								blkg, bps);
	}
}

static inline void blkio_update_group_iops(struct blkio_group *blkg,
					   int plid, unsigned int iops,
					   int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {

		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != plid)
			continue;

		if (fileid == BLKIO_THROTL_read_iops_device
		    && blkiop->ops.blkio_update_group_read_iops_fn)
			blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
								blkg, iops);

		if (fileid == BLKIO_THROTL_write_iops_device
		    && blkiop->ops.blkio_update_group_write_iops_fn)
			blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
								blkg, iops);
	}
}

/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with queue_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
			   bool sync)
{
	if (direction)
		stat[BLKIO_STAT_WRITE] += add;
	else
		stat[BLKIO_STAT_READ] += add;
	if (sync)
		stat[BLKIO_STAT_SYNC] += add;
	else
		stat[BLKIO_STAT_ASYNC] += add;
}

/*
 * Decrements the appropriate stat variable if non-zero depending on the
 * request type. Panics on value being zero.
 * This should be called with the queue_lock held.
 */
static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
{
	if (direction) {
		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
		stat[BLKIO_STAT_WRITE]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_READ] == 0);
		stat[BLKIO_STAT_READ]--;
	}
	if (sync) {
		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
		stat[BLKIO_STAT_SYNC]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
		stat[BLKIO_STAT_ASYNC]--;
	}
}

#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the queue_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					    struct blkio_policy_type *pol,
					    struct blkio_group *curr_blkg)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];

	if (blkio_blkg_waiting(&pd->stats))
		return;
	if (blkg == curr_blkg)
		return;
	pd->stats.start_group_wait_time = sched_clock();
	blkio_mark_blkg_waiting(&pd->stats);
}

/* This should be called with the queue_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		stats->group_wait_time += now - stats->start_group_wait_time;
	blkio_clear_blkg_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		stats->empty_time += now - stats->start_empty_time;
	blkio_clear_blkg_empty(stats);
}

void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
					struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);
	BUG_ON(blkio_blkg_idling(stats));

	stats->start_idle_time = sched_clock();
	blkio_mark_blkg_idling(stats);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);

void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	if (blkio_blkg_idling(stats)) {
		unsigned long long now = sched_clock();

		if (time_after64(now, stats->start_idle_time)) {
			u64_stats_update_begin(&stats->syncp);
			stats->idle_time += now - stats->start_idle_time;
			u64_stats_update_end(&stats->syncp);
		}
		blkio_clear_blkg_idling(stats);
	}
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);

void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
					 struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	stats->avg_queue_size_sum +=
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
	stats->avg_queue_size_samples++;
	blkio_update_group_wait_time(stats);
	u64_stats_update_end(&stats->syncp);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);

void blkiocg_set_start_empty_time(struct blkio_group *blkg,
				  struct blkio_policy_type *pol)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
	    stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE])
		return;

	/*
	 * group is already marked empty. This can happen if cfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (blkio_blkg_empty(stats))
		return;

	stats->start_empty_time = sched_clock();
	blkio_mark_blkg_empty(stats);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);

void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
				  struct blkio_policy_type *pol,
				  unsigned long dequeue)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];

	lockdep_assert_held(blkg->q->queue_lock);

	pd->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					struct blkio_policy_type *pol,
					struct blkio_group *curr_blkg) { }
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) { }
#endif

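/* Account a request added to @blkg's queue; caller must hold the queue_lock. */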
void blkiocg_update_io_add_stats(struct blkio_group *blkg,
				 struct blkio_policy_type *pol,
				 struct blkio_group *curr_blkg, bool direction,
				 bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_QUEUED], 1, direction, sync);
	blkio_end_empty_time(stats);
	u64_stats_update_end(&stats->syncp);

	blkio_set_start_group_wait_time(blkg, pol, curr_blkg);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);

void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol,
				    bool direction, bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	blkio_check_and_dec_stat(stats->stat_arr[BLKIO_STAT_QUEUED], direction,
				 sync);
	u64_stats_update_end(&stats->syncp);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);

void blkiocg_update_timeslice_used(struct blkio_group *blkg,
				   struct blkio_policy_type *pol,
				   unsigned long time,
				   unsigned long unaccounted_time)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	stats->time += time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	stats->unaccounted_time += unaccounted_time;
#endif
	u64_stats_update_end(&stats->syncp);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);

/*
 * should be called under rcu read lock or queue lock to make sure blkg pointer
 * is valid.
 */
void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				   struct blkio_policy_type *pol,
				   uint64_t bytes, bool direction, bool sync)
{
	struct blkg_policy_data *pd = blkg->pd[pol->plid];
	struct blkio_group_stats_cpu *stats_cpu;
	unsigned long flags;

	/* If per cpu stats are not allocated yet, don't do any accounting. */
	if (pd->stats_cpu == NULL)
		return;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(pd->stats_cpu);

	u64_stats_update_begin(&stats_cpu->syncp);
	stats_cpu->sectors += bytes >> 9;
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
			1, direction, sync);
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
			bytes, direction, sync);
	u64_stats_update_end(&stats_cpu->syncp);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);

void blkiocg_update_completion_stats(struct blkio_group *blkg,
				     struct blkio_policy_type *pol,
				     uint64_t start_time,
				     uint64_t io_start_time, bool direction,
				     bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
	unsigned long long now = sched_clock();

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	if (time_after64(now, io_start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
				now - io_start_time, direction, sync);
	if (time_after64(io_start_time, start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
				io_start_time - start_time, direction, sync);
	u64_stats_update_end(&stats->syncp);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

/* Merged stats are per cpu. */
void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
				    struct blkio_policy_type *pol,
				    bool direction, bool sync)
{
	struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;

	lockdep_assert_held(blkg->q->queue_lock);

	u64_stats_update_begin(&stats->syncp);
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_MERGED], 1, direction, sync);
	u64_stats_update_end(&stats->syncp);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);

/*
 * Worker for allocating per cpu stat for blk groups. This is scheduled on
 * the system_nrt_wq once there are some groups on the alloc_list waiting
 * for allocation.
 */
static void blkio_stat_alloc_fn(struct work_struct *work)
{
	static void *pcpu_stats[BLKIO_NR_POLICIES];
	struct delayed_work *dwork = to_delayed_work(work);
	struct blkio_group *blkg;
	int i;
	bool empty = false;

alloc_stats:
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		if (pcpu_stats[i] != NULL)
			continue;

		pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);

		/* Allocation failed. Try again after some time. */
		if (pcpu_stats[i] == NULL) {
			queue_delayed_work(system_nrt_wq, dwork,
						msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&blkio_list_lock);
	spin_lock(&alloc_list_lock);

	/* cgroup got deleted or queue exited. */
	if (!list_empty(&alloc_list)) {
		blkg = list_first_entry(&alloc_list, struct blkio_group,
					alloc_node);
		for (i = 0; i < BLKIO_NR_POLICIES; i++) {
			struct blkg_policy_data *pd = blkg->pd[i];

			if (blkio_policy[i] && pd && !pd->stats_cpu)
				swap(pd->stats_cpu, pcpu_stats[i]);
		}

		list_del_init(&blkg->alloc_node);
	}

	empty = list_empty(&alloc_list);

	spin_unlock(&alloc_list_lock);
	spin_unlock_irq(&blkio_list_lock);

	if (!empty)
		goto alloc_stats;
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkg_policy_data *pd = blkg->pd[i];

		if (pd) {
			free_percpu(pd->stats_cpu);
			kfree(pd);
		}
	}

	kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
				      struct request_queue *q)
{
	struct blkio_group *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
	if (!blkg)
		return NULL;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	INIT_LIST_HEAD(&blkg->alloc_node);
	blkg->blkcg = blkcg;
	blkg->refcnt = 1;
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd;

		if (!pol)
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
				  q->node);
		if (!pd) {
			blkg_free(blkg);
			return NULL;
		}

		blkg->pd[i] = pd;
		pd->blkg = blkg;
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];

		if (pol)
			pol->ops.blkio_init_group_fn(blkg);
	}

	return blkg;
}

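/*
 * Look up the group for @blkcg + @q, allocating and linking a new one if
 * none exists yet.  Expects the queue_lock and an RCU read lock to be held
 * and returns an ERR_PTR() value on failure.
 */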
struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
				       struct request_queue *q,
				       enum blkio_policy_id plid,
				       bool for_root)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct blkio_group *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 * The following can be removed if blkg lookup is guaranteed to
	 * fail on a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)) && !for_root)
		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css))
		return ERR_PTR(-EINVAL);

	/*
	 * Allocate and initialize.
	 */
	blkg = blkg_alloc(blkcg, q);

	/* did alloc fail? */
	if (unlikely(!blkg)) {
		blkg = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* insert */
	spin_lock(&blkcg->lock);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	list_add(&blkg->q_node, &q->blkg_list);
	spin_unlock(&blkcg->lock);

	spin_lock(&alloc_list_lock);
	list_add(&blkg->alloc_node, &alloc_list);
	/* Queue per cpu stat allocation from worker thread. */
	queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
	spin_unlock(&alloc_list_lock);
out:
	return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);

/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
				struct request_queue *q)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q)
			return blkg;
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup);

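/*
 * Unlink @blkg from its queue and cgroup and drop the reference taken at
 * creation time.  Both the queue_lock and the blkcg lock must be held.
 */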
static void blkg_destroy(struct blkio_group *blkg)
{
	struct request_queue *q = blkg->q;
	struct blkio_cgroup *blkcg = blkg->blkcg;

	lockdep_assert_held(q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	spin_lock(&alloc_list_lock);
	list_del_init(&blkg->alloc_node);
	spin_unlock(&alloc_list_lock);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	blkg_put(blkg);
}

/*
 * XXX: This updates blkg policy data in-place for root blkg, which is
 * necessary across elevator switch and policy registration as root blkgs
 * aren't shot down. This broken and racy implementation is temporary.
 * Eventually, blkg shoot down will be replaced by proper in-place update.
 */
void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
{
	struct blkio_policy_type *pol = blkio_policy[plid];
	struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
	struct blkg_policy_data *pd;

	if (!blkg)
		return;

	kfree(blkg->pd[plid]);
	blkg->pd[plid] = NULL;

	if (!pol)
		return;

	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
	WARN_ON_ONCE(!pd);

	pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
	WARN_ON_ONCE(!pd->stats_cpu);

	blkg->pd[plid] = pd;
	pd->blkg = blkg;
	pol->ops.blkio_init_group_fn(blkg);
}
EXPORT_SYMBOL_GPL(update_root_blkg_pd);

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 * @destroy_root: whether to destroy root blkg or not
 *
 * Destroy blkgs associated with @q. If @destroy_root is %true, all are
 * destroyed; otherwise, root blkg is left alone.
 */
void blkg_destroy_all(struct request_queue *q, bool destroy_root)
{
	struct blkio_group *blkg, *n;

	spin_lock_irq(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkio_cgroup *blkcg = blkg->blkcg;

		/* skip root? */
		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
			continue;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blkg_destroy_all);

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
	blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}

void __blkg_release(struct blkio_group *blkg)
{
	/* release the extra blkcg reference this blkg has been holding */
	css_put(&blkg->blkcg->css);

	/*
	 * A group is freed in rcu manner. But having an rcu lock does not
	 * mean that one can access all the fields of blkg and assume these
	 * are valid. For example, don't try to follow throtl_data and
	 * request queue links.
	 *
	 * Having a reference to blkg under an rcu allows access to only
	 * values local to groups like group stats and group rate limits
	 */
	call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);

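/* Clear the per cpu statistics of policy @plid on @blkg. */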
static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
{
	struct blkg_policy_data *pd = blkg->pd[plid];
	int cpu;

	if (pd->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct blkio_group_stats_cpu *sc =
			per_cpu_ptr(pd->stats_cpu, cpu);

		sc->sectors = 0;
		memset(sc->stat_arr_cpu, 0, sizeof(sc->stat_arr_cpu));
	}
}

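/*
 * Handler for the reset_stats cgroup file; clears the stats of every
 * group in the cgroup.
 */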
static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct blkio_group *blkg;
	struct hlist_node *n;
	int i;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates. This is a debug feature which shouldn't exist
	 * anyway. If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		struct blkio_policy_type *pol;

		list_for_each_entry(pol, &blkio_list, list) {
			struct blkg_policy_data *pd = blkg->pd[pol->plid];
			struct blkio_group_stats *stats = &pd->stats;

			/* queued stats shouldn't be cleared */
			for (i = 0; i < ARRAY_SIZE(stats->stat_arr); i++)
				if (i != BLKIO_STAT_QUEUED)
					memset(stats->stat_arr[i], 0,
					       sizeof(stats->stat_arr[i]));
			stats->time = 0;
#ifdef CONFIG_DEBUG_BLK_CGROUP
			memset((void *)stats + BLKG_STATS_DEBUG_CLEAR_START, 0,
			       BLKG_STATS_DEBUG_CLEAR_SIZE);
#endif
			blkio_reset_stats_cpu(blkg, pol->plid);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

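/*
 * Build the "<device> Read|Write|Sync|Async|Total" key used when
 * emitting statistics.
 */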
static void blkio_get_key_name(enum stat_sub_type type, const char *dname,
			       char *str, int chars_left, bool diskname_only)
{
	snprintf(str, chars_left, "%s", dname);
	chars_left -= strlen(str);
	if (chars_left <= 0) {
		printk(KERN_WARNING
			"Possibly incorrect cgroup stat display format");
		return;
	}
	if (diskname_only)
		return;
	switch (type) {
	case BLKIO_STAT_READ:
		strlcat(str, " Read", chars_left);
		break;
	case BLKIO_STAT_WRITE:
		strlcat(str, " Write", chars_left);
		break;
	case BLKIO_STAT_SYNC:
		strlcat(str, " Sync", chars_left);
		break;
	case BLKIO_STAT_ASYNC:
		strlcat(str, " Async", chars_left);
		break;
	case BLKIO_STAT_TOTAL:
		strlcat(str, " Total", chars_left);
		break;
	default:
		strlcat(str, " Invalid", chars_left);
	}
}

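/*
 * Sum one per cpu statistic over all possible CPUs, using the u64_stats
 * seqcount to get a consistent value on 32bit.
 */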
static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, int plid,
			enum stat_type_cpu type, enum stat_sub_type sub_type)
{
	struct blkg_policy_data *pd = blkg->pd[plid];
	int cpu;
	struct blkio_group_stats_cpu *stats_cpu;
	u64 val = 0, tval;

	if (pd->stats_cpu == NULL)
		return val;

	for_each_possible_cpu(cpu) {
		unsigned int start;
		stats_cpu = per_cpu_ptr(pd->stats_cpu, cpu);

		do {
			start = u64_stats_fetch_begin(&stats_cpu->syncp);
			if (type == BLKIO_STAT_CPU_SECTORS)
				tval = stats_cpu->sectors;
			else
				tval = stats_cpu->stat_arr_cpu[type][sub_type];
		} while(u64_stats_fetch_retry(&stats_cpu->syncp, start));

		val += tval;
	}

	return val;
}

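/*
 * Emit every sub-type of a per cpu statistic through @cb and return the
 * Read + Write total.
 */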
static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, int plid,
				   struct cgroup_map_cb *cb, const char *dname,
				   enum stat_type_cpu type)
{
	uint64_t disk_total, val;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_CPU_SECTORS) {
		val = blkio_read_stat_cpu(blkg, plid, type, 0);
		blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true);
		cb->fill(cb, key_str, val);
		return val;
	}

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
			sub_type++) {
		blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
				   false);
		val = blkio_read_stat_cpu(blkg, plid, type, sub_type);
		cb->fill(cb, key_str, val);
	}

	disk_total = blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_READ) +
		blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_WRITE);

	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
			   false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}

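/*
 * Emit a non per cpu statistic.  Scalar types are printed as a single
 * value, array types per sub-type followed by a Total line.
 */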
static uint64_t blkio_get_stat(struct blkio_group *blkg, int plid,
			       struct cgroup_map_cb *cb, const char *dname,
			       enum stat_type type)
{
	struct blkio_group_stats *stats = &blkg->pd[plid]->stats;
	uint64_t v = 0, disk_total = 0;
	char key_str[MAX_KEY_LEN];
	unsigned int sync_start;
	int st;

	if (type >= BLKIO_STAT_ARR_NR) {
		do {
			sync_start = u64_stats_fetch_begin(&stats->syncp);
			switch (type) {
			case BLKIO_STAT_TIME:
				v = stats->time;
				break;
#ifdef CONFIG_DEBUG_BLK_CGROUP
			case BLKIO_STAT_UNACCOUNTED_TIME:
				v = stats->unaccounted_time;
				break;
			case BLKIO_STAT_AVG_QUEUE_SIZE: {
				uint64_t samples = stats->avg_queue_size_samples;

				if (samples) {
					v = stats->avg_queue_size_sum;
					do_div(v, samples);
				}
				break;
			}
			case BLKIO_STAT_IDLE_TIME:
				v = stats->idle_time;
				break;
			case BLKIO_STAT_EMPTY_TIME:
				v = stats->empty_time;
				break;
			case BLKIO_STAT_DEQUEUE:
				v = stats->dequeue;
				break;
			case BLKIO_STAT_GROUP_WAIT_TIME:
				v = stats->group_wait_time;
				break;
#endif
			default:
				WARN_ON_ONCE(1);
			}
		} while (u64_stats_fetch_retry(&stats->syncp, sync_start));

		blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true);
		cb->fill(cb, key_str, v);
		return v;
	}

	for (st = BLKIO_STAT_READ; st < BLKIO_STAT_TOTAL; st++) {
		do {
			sync_start = u64_stats_fetch_begin(&stats->syncp);
			v = stats->stat_arr[type][st];
		} while (u64_stats_fetch_retry(&stats->syncp, sync_start));

		blkio_get_key_name(st, dname, key_str, MAX_KEY_LEN, false);
		cb->fill(cb, key_str, v);
		if (st == BLKIO_STAT_READ || st == BLKIO_STAT_WRITE)
			disk_total += v;
	}

	blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
			   false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}

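/*
 * Parse a "major:minor value" rule written to a per-device cgroup file
 * and apply it to the matching group.
 */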
static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
				      int fileid, struct blkio_cgroup *blkcg)
{
	struct gendisk *disk = NULL;
	struct blkio_group *blkg = NULL;
	struct blkg_policy_data *pd;
	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
	unsigned long major, minor;
	int i = 0, ret = -EINVAL;
	int part;
	dev_t dev;
	u64 temp;

	memset(s, 0, sizeof(s));

	while ((p = strsep(&buf, " ")) != NULL) {
		if (!*p)
			continue;

		s[i++] = p;

		/* Prevent from inputting too many things */
		if (i == 3)
			break;
	}

	if (i != 2)
		goto out;

	p = strsep(&s[0], ":");
	if (p != NULL)
		major_s = p;
	else
		goto out;

	minor_s = s[0];
	if (!minor_s)
		goto out;

	if (strict_strtoul(major_s, 10, &major))
		goto out;

	if (strict_strtoul(minor_s, 10, &minor))
		goto out;

	dev = MKDEV(major, minor);

	if (strict_strtoull(s[1], 10, &temp))
		goto out;

	disk = get_gendisk(dev, &part);
	if (!disk || part)
		goto out;

	rcu_read_lock();

	spin_lock_irq(disk->queue->queue_lock);
	blkg = blkg_lookup_create(blkcg, disk->queue, plid, false);
	spin_unlock_irq(disk->queue->queue_lock);

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto out_unlock;
	}

	pd = blkg->pd[plid];

	switch (plid) {
	case BLKIO_POLICY_PROP:
		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
		     temp > BLKIO_WEIGHT_MAX)
			goto out_unlock;

		pd->conf.weight = temp;
		blkio_update_group_weight(blkg, plid, temp ?: blkcg->weight);
		break;
	case BLKIO_POLICY_THROTL:
		switch(fileid) {
		case BLKIO_THROTL_read_bps_device:
			pd->conf.bps[READ] = temp;
			blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_write_bps_device:
			pd->conf.bps[WRITE] = temp;
			blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_read_iops_device:
			if (temp > THROTL_IOPS_MAX)
				goto out_unlock;
			pd->conf.iops[READ] = temp;
			blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
			break;
		case BLKIO_THROTL_write_iops_device:
			if (temp > THROTL_IOPS_MAX)
				goto out_unlock;
			pd->conf.iops[WRITE] = temp;
			blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
			break;
		}
		break;
	default:
		BUG();
	}
	ret = 0;
out_unlock:
	rcu_read_unlock();
out:
	put_disk(disk);

	/*
	 * If queue was bypassing, we should retry. Do so after a short
	 * msleep(). It isn't strictly necessary but queue can be
	 * bypassing for some time and it's always nice to avoid busy
	 * looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		return restart_syscall();
	}
	return ret;
}

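/* write_string handler shared by all per-device configuration files. */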
static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
			      const char *buffer)
{
	int ret = 0;
	char *buf;
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int fileid = BLKIOFILE_ATTR(cft->private);

	buf = kstrdup(buffer, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	ret = blkio_policy_parse_and_set(buf, plid, fileid, blkcg);
	kfree(buf);
	return ret;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info.dev)
		return dev_name(blkg->q->backing_dev_info.dev);
	return NULL;
}

static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
				   struct seq_file *m)
{
	int plid = BLKIOFILE_POLICY(cft->private);
	int fileid = BLKIOFILE_ATTR(cft->private);
	struct blkg_policy_data *pd = blkg->pd[plid];
	const char *dname = blkg_dev_name(blkg);
	int rw = WRITE;

	if (!dname)
		return;

	switch (plid) {
	case BLKIO_POLICY_PROP:
		if (pd->conf.weight)
			seq_printf(m, "%s\t%u\n",
				   dname, pd->conf.weight);
		break;
	case BLKIO_POLICY_THROTL:
		switch (fileid) {
		case BLKIO_THROTL_read_bps_device:
			rw = READ;
		case BLKIO_THROTL_write_bps_device:
			if (pd->conf.bps[rw])
				seq_printf(m, "%s\t%llu\n",
					   dname, pd->conf.bps[rw]);
			break;
		case BLKIO_THROTL_read_iops_device:
			rw = READ;
		case BLKIO_THROTL_write_iops_device:
			if (pd->conf.iops[rw])
				seq_printf(m, "%s\t%u\n",
					   dname, pd->conf.iops[rw]);
			break;
		}
		break;
	default:
		BUG();
	}
}

/* cgroup files which read their data from policy nodes end up here */
static void blkio_read_conf(struct cftype *cft, struct blkio_cgroup *blkcg,
			    struct seq_file *m)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		blkio_print_group_conf(cft, blkg, m);
	spin_unlock_irq(&blkcg->lock);
}

static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
			     struct seq_file *m)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch(plid) {
	case BLKIO_POLICY_PROP:
		switch(name) {
		case BLKIO_PROP_weight_device:
			blkio_read_conf(cft, blkcg, m);
			return 0;
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch(name){
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			blkio_read_conf(cft, blkcg, m);
			return 0;
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}

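/*
 * Walk every group of @blkcg and emit the requested statistic, optionally
 * followed by a cgroup wide Total.
 */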
static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
		struct cftype *cft, struct cgroup_map_cb *cb,
		enum stat_type type, bool show_total, bool pcpu)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	uint64_t cgroup_total = 0;

	spin_lock_irq(&blkcg->lock);

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		const char *dname = blkg_dev_name(blkg);
		int plid = BLKIOFILE_POLICY(cft->private);

		if (!dname)
			continue;
		if (pcpu)
			cgroup_total += blkio_get_stat_cpu(blkg, plid,
							   cb, dname, type);
		else
			cgroup_total += blkio_get_stat(blkg, plid,
						       cb, dname, type);
	}
	if (show_total)
		cb->fill(cb, "Total", cgroup_total);

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

/* All map kind of cgroup file get serviced by this function */
static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
				 struct cgroup_map_cb *cb)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch(plid) {
	case BLKIO_POLICY_PROP:
		switch(name) {
		case BLKIO_PROP_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_TIME, 0, 0);
		case BLKIO_PROP_sectors:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SECTORS, 0, 1);
		case BLKIO_PROP_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
		case BLKIO_PROP_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICED, 1, 1);
		case BLKIO_PROP_io_service_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_SERVICE_TIME, 1, 0);
		case BLKIO_PROP_io_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_WAIT_TIME, 1, 0);
		case BLKIO_PROP_io_merged:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_MERGED, 1, 0);
		case BLKIO_PROP_io_queued:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_QUEUED, 1, 0);
#ifdef CONFIG_DEBUG_BLK_CGROUP
		case BLKIO_PROP_unaccounted_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
		case BLKIO_PROP_dequeue:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_DEQUEUE, 0, 0);
		case BLKIO_PROP_avg_queue_size:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
		case BLKIO_PROP_group_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
		case BLKIO_PROP_idle_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_IDLE_TIME, 0, 0);
		case BLKIO_PROP_empty_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_EMPTY_TIME, 0, 0);
#endif
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch(name){
		case BLKIO_THROTL_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
		case BLKIO_THROTL_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICED, 1, 1);
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}

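/*
 * Update the cgroup default weight and propagate it to groups that have
 * no per-device weight configured.
 */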
static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);
	blkcg->weight = (unsigned int)val;

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		struct blkg_policy_data *pd = blkg->pd[plid];

		if (!pd->conf.weight)
			blkio_update_group_weight(blkg, plid, blkcg->weight);
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch(plid) {
	case BLKIO_POLICY_PROP:
		switch(name) {
		case BLKIO_PROP_weight:
			return (u64)blkcg->weight;
		}
		break;
	default:
		BUG();
	}
	return 0;
}

static int
blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	switch(plid) {
	case BLKIO_POLICY_PROP:
		switch(name) {
		case BLKIO_PROP_weight:
			return blkio_weight_write(blkcg, plid, val);
		}
		break;
	default:
		BUG();
	}

	return 0;
}

struct cftype blkio_files[] = {
	{
		.name = "weight_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "weight",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight),
		.read_u64 = blkiocg_file_read_u64,
		.write_u64 = blkiocg_file_write_u64,
	},
	{
		.name = "time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "sectors",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_sectors),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_merged",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_merged),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_queued",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_queued),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
#ifdef CONFIG_BLK_DEV_THROTTLING
	{
		.name = "throttle.read_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.write_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.read_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},

	{
		.name = "throttle.write_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "throttle.io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
#endif /* CONFIG_BLK_DEV_THROTTLING */

#ifdef CONFIG_DEBUG_BLK_CGROUP
	{
		.name = "avg_queue_size",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_avg_queue_size),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "group_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_group_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "idle_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_idle_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "empty_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_empty_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "dequeue",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_dequeue),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "unaccounted_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_unaccounted_time),
		.read_map = blkiocg_file_read_map,
	},
#endif
};

static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	return cgroup_add_files(cgroup, subsys, blkio_files,
				ARRAY_SIZE(blkio_files));
}

/**
 * blkiocg_pre_destroy - cgroup pre_destroy callback
 * @subsys: cgroup subsys
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and responsible
 * for shooting down all blkgs associated with @cgroup. blkgs should be
 * removed while holding both q and blkcg locks. As blkcg lock is nested
 * inside q lock, this function performs reverse double lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
			       struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
						struct blkio_group, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	int ret;

	might_sleep();

	ret = blk_throtl_init(q);
	if (ret)
		return ret;

	mutex_lock(&all_q_mutex);
	INIT_LIST_HEAD(&q->all_q_node);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return 0;
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue(). Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue(). Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);

	blkg_destroy_all(q, true);

	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no mean to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures. For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			      struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, cgrp, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			   struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;

	cgroup_taskset_for_each(task, cgrp, tset) {
		/* we don't lose anything even if ioc allocation fails */
		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
		if (ioc) {
			ioc_cgroup_changed(ioc);
			put_io_context(ioc);
		}
	}
}

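/*
 * Put all queues into bypass mode and shoot down their non-root groups
 * while policies are being registered or unregistered.
 */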
static void blkcg_bypass_start(void)
	__acquires(&all_q_mutex)
{
	struct request_queue *q;

	mutex_lock(&all_q_mutex);

	list_for_each_entry(q, &all_q_list, all_q_node) {
		blk_queue_bypass_start(q);
		blkg_destroy_all(q, false);
	}
}

static void blkcg_bypass_end(void)
	__releases(&all_q_mutex)
{
	struct request_queue *q;

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_queue_bypass_end(q);

	mutex_unlock(&all_q_mutex);
}

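/* Register @blkiop and refresh root group policy data on all queues. */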
void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid]);
	blkio_policy[blkiop->plid] = blkiop;
	list_add_tail(&blkiop->list, &blkio_list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid] != blkiop);
	blkio_policy[blkiop->plid] = NULL;
	list_del_init(&blkiop->list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);