blkcg: let blkcg core handle policy private data allocation
31e4c28d
VG
1/*
2 * Common Block IO controller cgroup interface
3 *
4 * Based on ideas and code from CFQ, CFS and BFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 * Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11 * Nauman Rafique <nauman@google.com>
12 */
13#include <linux/ioprio.h>
22084190
VG
14#include <linux/seq_file.h>
15#include <linux/kdev_t.h>
9d6a986c 16#include <linux/module.h>
accee785 17#include <linux/err.h>
9195291e 18#include <linux/blkdev.h>
5a0e3ad6 19#include <linux/slab.h>
34d0f179 20#include <linux/genhd.h>
72e06c25
TH
21#include <linux/delay.h>
22#include "blk-cgroup.h"
5efd6113 23#include "blk.h"
3e252066 24
84c124da
DS
25#define MAX_KEY_LEN 100
26
3e252066
VG
27static DEFINE_SPINLOCK(blkio_list_lock);
28static LIST_HEAD(blkio_list);
b1c35769 29
923adde1
TH
30static DEFINE_MUTEX(all_q_mutex);
31static LIST_HEAD(all_q_list);
32
31e4c28d 33struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
9d6a986c
VG
34EXPORT_SYMBOL_GPL(blkio_root_cgroup);
35
035d10b2
TH
36static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];
37
67523c48
BB
38static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
39 struct cgroup *);
bb9d97b6
TH
40static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
41 struct cgroup_taskset *);
42static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
43 struct cgroup_taskset *);
7ee9c562 44static int blkiocg_pre_destroy(struct cgroup_subsys *, struct cgroup *);
67523c48
BB
45static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
46static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
47
062a644d
VG
48/* for encoding cft->private value on file */
49#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val))
50/* What policy owns the file, proportional or throttle */
51#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff)
52#define BLKIOFILE_ATTR(val) ((val) & 0xffff)
53
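/*
 * For example, the "throttle.read_bps_device" cftype below uses
 *	.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
 *				     BLKIO_THROTL_read_bps_device)
 * so BLKIOFILE_POLICY() recovers the owning policy from the upper 16 bits
 * and BLKIOFILE_ATTR() recovers the per-policy file id from the lower 16.
 */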
67523c48
BB
54struct cgroup_subsys blkio_subsys = {
55 .name = "blkio",
56 .create = blkiocg_create,
bb9d97b6
TH
57 .can_attach = blkiocg_can_attach,
58 .attach = blkiocg_attach,
7ee9c562 59 .pre_destroy = blkiocg_pre_destroy,
67523c48
BB
60 .destroy = blkiocg_destroy,
61 .populate = blkiocg_populate,
67523c48 62 .subsys_id = blkio_subsys_id,
67523c48
BB
63 .module = THIS_MODULE,
64};
65EXPORT_SYMBOL_GPL(blkio_subsys);
66
31e4c28d
VG
67struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
68{
69 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
70 struct blkio_cgroup, css);
71}
9d6a986c 72EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
31e4c28d 73
70087dc3
VG
74struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
75{
76 return container_of(task_subsys_state(tsk, blkio_subsys_id),
77 struct blkio_cgroup, css);
78}
79EXPORT_SYMBOL_GPL(task_blkio_cgroup);
80
062a644d
VG
81static inline void
82blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
83{
84 struct blkio_policy_type *blkiop;
85
86 list_for_each_entry(blkiop, &blkio_list, list) {
87 /* If this policy does not own the blkg, do not send updates */
88 if (blkiop->plid != blkg->plid)
89 continue;
90 if (blkiop->ops.blkio_update_group_weight_fn)
ca32aefc 91 blkiop->ops.blkio_update_group_weight_fn(blkg->q,
fe071437 92 blkg, weight);
062a644d
VG
93 }
94}
95
4c9eefa1
VG
96static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
97 int fileid)
98{
99 struct blkio_policy_type *blkiop;
100
101 list_for_each_entry(blkiop, &blkio_list, list) {
102
103 /* If this policy does not own the blkg, do not send updates */
104 if (blkiop->plid != blkg->plid)
105 continue;
106
107 if (fileid == BLKIO_THROTL_read_bps_device
108 && blkiop->ops.blkio_update_group_read_bps_fn)
ca32aefc 109 blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
fe071437 110 blkg, bps);
4c9eefa1
VG
111
112 if (fileid == BLKIO_THROTL_write_bps_device
113 && blkiop->ops.blkio_update_group_write_bps_fn)
ca32aefc 114 blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
fe071437 115 blkg, bps);
4c9eefa1
VG
116 }
117}
118
7702e8f4
VG
119static inline void blkio_update_group_iops(struct blkio_group *blkg,
120 unsigned int iops, int fileid)
121{
122 struct blkio_policy_type *blkiop;
123
124 list_for_each_entry(blkiop, &blkio_list, list) {
125
126 /* If this policy does not own the blkg, do not send updates */
127 if (blkiop->plid != blkg->plid)
128 continue;
129
130 if (fileid == BLKIO_THROTL_read_iops_device
131 && blkiop->ops.blkio_update_group_read_iops_fn)
ca32aefc 132 blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
fe071437 133 blkg, iops);
7702e8f4
VG
134
135 if (fileid == BLKIO_THROTL_write_iops_device
136 && blkiop->ops.blkio_update_group_write_iops_fn)
ca32aefc 137 blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
fe071437 138 blkg, iops);
7702e8f4
VG
139 }
140}
141
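/*
 * The three helpers above only notify the policy that owns the blkg
 * (blkiop->plid == blkg->plid): weight updates go to the proportional
 * policy, bps/iops updates go to the throttling policy. A policy that
 * does not implement the corresponding *_fn callback is simply skipped.
 */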
9195291e
DS
142/*
143 * Add to the appropriate stat variable depending on the request type.
144 * This should be called with the blkg->stats_lock held.
145 */
84c124da
DS
146static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
147 bool sync)
9195291e 148{
84c124da
DS
149 if (direction)
150 stat[BLKIO_STAT_WRITE] += add;
9195291e 151 else
84c124da
DS
152 stat[BLKIO_STAT_READ] += add;
153 if (sync)
154 stat[BLKIO_STAT_SYNC] += add;
9195291e 155 else
84c124da 156 stat[BLKIO_STAT_ASYNC] += add;
9195291e
DS
157}
158
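/*
 * Example: blkio_add_stat(stat, 1, true, false) bumps both the
 * BLKIO_STAT_WRITE and BLKIO_STAT_ASYNC buckets by one; the READ/WRITE
 * and SYNC/ASYNC pairs are accounted independently of each other.
 */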
cdc1184c
DS
159/*
160 * Decrements the appropriate stat variable depending on the request
161 * type; BUGs if the value is already zero.
162 * This should be called with the blkg->stats_lock held.
163 */
164static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
165{
166 if (direction) {
167 BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
168 stat[BLKIO_STAT_WRITE]--;
169 } else {
170 BUG_ON(stat[BLKIO_STAT_READ] == 0);
171 stat[BLKIO_STAT_READ]--;
172 }
173 if (sync) {
174 BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
175 stat[BLKIO_STAT_SYNC]--;
176 } else {
177 BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
178 stat[BLKIO_STAT_ASYNC]--;
179 }
180}
181
182#ifdef CONFIG_DEBUG_BLK_CGROUP
812df48d
DS
183/* This should be called with the blkg->stats_lock held. */
184static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
185 struct blkio_group *curr_blkg)
186{
187 if (blkio_blkg_waiting(&blkg->stats))
188 return;
189 if (blkg == curr_blkg)
190 return;
191 blkg->stats.start_group_wait_time = sched_clock();
192 blkio_mark_blkg_waiting(&blkg->stats);
193}
194
195/* This should be called with the blkg->stats_lock held. */
196static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
197{
198 unsigned long long now;
199
200 if (!blkio_blkg_waiting(stats))
201 return;
202
203 now = sched_clock();
204 if (time_after64(now, stats->start_group_wait_time))
205 stats->group_wait_time += now - stats->start_group_wait_time;
206 blkio_clear_blkg_waiting(stats);
207}
208
209/* This should be called with the blkg->stats_lock held. */
210static void blkio_end_empty_time(struct blkio_group_stats *stats)
211{
212 unsigned long long now;
213
214 if (!blkio_blkg_empty(stats))
215 return;
216
217 now = sched_clock();
218 if (time_after64(now, stats->start_empty_time))
219 stats->empty_time += now - stats->start_empty_time;
220 blkio_clear_blkg_empty(stats);
221}
222
223void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
224{
225 unsigned long flags;
226
227 spin_lock_irqsave(&blkg->stats_lock, flags);
228 BUG_ON(blkio_blkg_idling(&blkg->stats));
229 blkg->stats.start_idle_time = sched_clock();
230 blkio_mark_blkg_idling(&blkg->stats);
231 spin_unlock_irqrestore(&blkg->stats_lock, flags);
232}
233EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
234
235void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
236{
237 unsigned long flags;
238 unsigned long long now;
239 struct blkio_group_stats *stats;
240
241 spin_lock_irqsave(&blkg->stats_lock, flags);
242 stats = &blkg->stats;
243 if (blkio_blkg_idling(stats)) {
244 now = sched_clock();
245 if (time_after64(now, stats->start_idle_time))
246 stats->idle_time += now - stats->start_idle_time;
247 blkio_clear_blkg_idling(stats);
248 }
249 spin_unlock_irqrestore(&blkg->stats_lock, flags);
250}
251EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
252
a11cdaa7 253void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
cdc1184c
DS
254{
255 unsigned long flags;
256 struct blkio_group_stats *stats;
257
258 spin_lock_irqsave(&blkg->stats_lock, flags);
259 stats = &blkg->stats;
260 stats->avg_queue_size_sum +=
261 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
262 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
263 stats->avg_queue_size_samples++;
812df48d 264 blkio_update_group_wait_time(stats);
cdc1184c
DS
265 spin_unlock_irqrestore(&blkg->stats_lock, flags);
266}
a11cdaa7
DS
267EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
268
e5ff082e 269void blkiocg_set_start_empty_time(struct blkio_group *blkg)
28baf442
DS
270{
271 unsigned long flags;
272 struct blkio_group_stats *stats;
273
274 spin_lock_irqsave(&blkg->stats_lock, flags);
275 stats = &blkg->stats;
276
277 if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
278 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
279 spin_unlock_irqrestore(&blkg->stats_lock, flags);
280 return;
281 }
282
283 /*
e5ff082e
VG
284 * group is already marked empty. This can happen if cfqq got a new
285 * request in the parent group and moved to this group while being
286 * added to the service tree. Just ignore the event and move on.
28baf442 287 */
e5ff082e
VG
288 if (blkio_blkg_empty(stats)) {
289 spin_unlock_irqrestore(&blkg->stats_lock, flags);
290 return;
291 }
292
28baf442
DS
293 stats->start_empty_time = sched_clock();
294 blkio_mark_blkg_empty(stats);
295 spin_unlock_irqrestore(&blkg->stats_lock, flags);
296}
297EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
298
a11cdaa7
DS
299void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
300 unsigned long dequeue)
301{
302 blkg->stats.dequeue += dequeue;
303}
304EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
812df48d
DS
305#else
306static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
307 struct blkio_group *curr_blkg) {}
308static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
cdc1184c
DS
309#endif
310
a11cdaa7 311void blkiocg_update_io_add_stats(struct blkio_group *blkg,
cdc1184c
DS
312 struct blkio_group *curr_blkg, bool direction,
313 bool sync)
314{
315 unsigned long flags;
316
317 spin_lock_irqsave(&blkg->stats_lock, flags);
318 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
319 sync);
812df48d
DS
320 blkio_end_empty_time(&blkg->stats);
321 blkio_set_start_group_wait_time(blkg, curr_blkg);
cdc1184c
DS
322 spin_unlock_irqrestore(&blkg->stats_lock, flags);
323}
a11cdaa7 324EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
cdc1184c 325
a11cdaa7 326void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
cdc1184c
DS
327 bool direction, bool sync)
328{
329 unsigned long flags;
330
331 spin_lock_irqsave(&blkg->stats_lock, flags);
332 blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
333 direction, sync);
334 spin_unlock_irqrestore(&blkg->stats_lock, flags);
335}
a11cdaa7 336EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
cdc1184c 337
167400d3
JT
338void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
339 unsigned long unaccounted_time)
22084190 340{
303a3acb
DS
341 unsigned long flags;
342
343 spin_lock_irqsave(&blkg->stats_lock, flags);
344 blkg->stats.time += time;
a23e6869 345#ifdef CONFIG_DEBUG_BLK_CGROUP
167400d3 346 blkg->stats.unaccounted_time += unaccounted_time;
a23e6869 347#endif
303a3acb 348 spin_unlock_irqrestore(&blkg->stats_lock, flags);
22084190 349}
303a3acb 350EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
22084190 351
5624a4e4
VG
352/*
353 * Should be called under rcu read lock or queue lock to make sure the blkg
354 * pointer is valid.
355 */
84c124da
DS
356void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
357 uint64_t bytes, bool direction, bool sync)
9195291e 358{
5624a4e4 359 struct blkio_group_stats_cpu *stats_cpu;
575969a0
VG
360 unsigned long flags;
361
362 /*
363 * Disabling interrupts to provide mutual exclusion between two
364 * writes on the same CPU. It probably is not needed on 64-bit; that
365 * case is not optimized yet.
366 */
367 local_irq_save(flags);
9195291e 368
5624a4e4
VG
369 stats_cpu = this_cpu_ptr(blkg->stats_cpu);
370
575969a0 371 u64_stats_update_begin(&stats_cpu->syncp);
5624a4e4
VG
372 stats_cpu->sectors += bytes >> 9;
373 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
374 1, direction, sync);
375 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
376 bytes, direction, sync);
575969a0
VG
377 u64_stats_update_end(&stats_cpu->syncp);
378 local_irq_restore(flags);
9195291e 379}
84c124da 380EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
9195291e 381
84c124da
DS
382void blkiocg_update_completion_stats(struct blkio_group *blkg,
383 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
9195291e
DS
384{
385 struct blkio_group_stats *stats;
386 unsigned long flags;
387 unsigned long long now = sched_clock();
388
389 spin_lock_irqsave(&blkg->stats_lock, flags);
390 stats = &blkg->stats;
84c124da
DS
391 if (time_after64(now, io_start_time))
392 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
393 now - io_start_time, direction, sync);
394 if (time_after64(io_start_time, start_time))
395 blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
396 io_start_time - start_time, direction, sync);
9195291e
DS
397 spin_unlock_irqrestore(&blkg->stats_lock, flags);
398}
84c124da 399EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
9195291e 400
317389a7 401/* Merged stats are per cpu. */
812d4026
DS
402void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
403 bool sync)
404{
317389a7 405 struct blkio_group_stats_cpu *stats_cpu;
812d4026
DS
406 unsigned long flags;
407
317389a7
VG
408 /*
409 * Disabling interrupts to provide mutual exclusion between two
410 * writes on the same CPU. It probably is not needed on 64-bit; that
411 * case is not optimized yet.
412 */
413 local_irq_save(flags);
414
415 stats_cpu = this_cpu_ptr(blkg->stats_cpu);
416
417 u64_stats_update_begin(&stats_cpu->syncp);
418 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
419 direction, sync);
420 u64_stats_update_end(&stats_cpu->syncp);
421 local_irq_restore(flags);
812d4026
DS
422}
423EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
424
0381411e
TH
425/**
426 * blkg_free - free a blkg
427 * @blkg: blkg to free
428 *
429 * Free @blkg which may be partially allocated.
430 */
431static void blkg_free(struct blkio_group *blkg)
432{
433 if (blkg) {
434 free_percpu(blkg->stats_cpu);
435 kfree(blkg->pd);
436 kfree(blkg);
437 }
438}
439
440/**
441 * blkg_alloc - allocate a blkg
442 * @blkcg: block cgroup the new blkg is associated with
443 * @q: request_queue the new blkg is associated with
444 * @pol: policy the new blkg is associated with
445 *
446 * Allocate a new blkg associating @blkcg and @q for @pol.
447 *
448 * FIXME: Should be called with queue locked but currently isn't due to
449 * percpu stat breakage.
450 */
451static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
452 struct request_queue *q,
453 struct blkio_policy_type *pol)
454{
455 struct blkio_group *blkg;
456
457 /* alloc and init base part */
458 blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
459 if (!blkg)
460 return NULL;
461
462 spin_lock_init(&blkg->stats_lock);
463 rcu_assign_pointer(blkg->q, q);
464 blkg->blkcg = blkcg;
465 blkg->plid = pol->plid;
466 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
467
468 /* alloc per-policy data */
469 blkg->pd = kzalloc_node(sizeof(*blkg->pd) + pol->pdata_size, GFP_ATOMIC,
470 q->node);
471 if (!blkg->pd) {
472 blkg_free(blkg);
473 return NULL;
474 }
475
476 /* broken, read comment in the callsite */
477 blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
478 if (!blkg->stats_cpu) {
479 blkg_free(blkg);
480 return NULL;
481 }
482
483 /* attach pd to blkg and invoke per-policy init */
484 blkg->pd->blkg = blkg;
485 pol->ops.blkio_init_group_fn(blkg);
486 return blkg;
487}
488
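/*
 * Note: the policy's private data is allocated inline, right behind the
 * core part of blkg->pd (hence the "+ pol->pdata_size" above), so one
 * allocation covers both; blkio_init_group_fn() then initializes the
 * policy-specific portion.
 */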
cd1604fa
TH
489struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
490 struct request_queue *q,
491 enum blkio_policy_id plid,
492 bool for_root)
493 __releases(q->queue_lock) __acquires(q->queue_lock)
5624a4e4 494{
cd1604fa
TH
495 struct blkio_policy_type *pol = blkio_policy[plid];
496 struct blkio_group *blkg, *new_blkg;
5624a4e4 497
cd1604fa
TH
498 WARN_ON_ONCE(!rcu_read_lock_held());
499 lockdep_assert_held(q->queue_lock);
500
501 /*
502 * This could be the first entry point of blkcg implementation and
503 * we shouldn't allow anything to go through for a bypassing queue.
504 * The following can be removed if blkg lookup is guaranteed to
505 * fail on a bypassing queue.
506 */
507 if (unlikely(blk_queue_bypass(q)) && !for_root)
508 return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
509
510 blkg = blkg_lookup(blkcg, q, plid);
511 if (blkg)
512 return blkg;
513
7ee9c562 514 /* blkg holds a reference to blkcg */
cd1604fa
TH
515 if (!css_tryget(&blkcg->css))
516 return ERR_PTR(-EINVAL);
517
518 /*
519 * Allocate and initialize.
520 *
521 * FIXME: The following is broken. Percpu memory allocation
522 * requires %GFP_KERNEL context and can't be performed from IO
523 * path. Allocation here should inherently be atomic and the
524 * following lock dancing can be removed once the broken percpu
525 * allocation is fixed.
526 */
527 spin_unlock_irq(q->queue_lock);
528 rcu_read_unlock();
529
0381411e 530 new_blkg = blkg_alloc(blkcg, q, pol);
cd1604fa
TH
531
532 rcu_read_lock();
533 spin_lock_irq(q->queue_lock);
31e4c28d 534
cd1604fa
TH
535 /* did bypass get turned on in between? */
536 if (unlikely(blk_queue_bypass(q)) && !for_root) {
537 blkg = ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
538 goto out;
539 }
540
541 /* did someone beat us to it? */
542 blkg = blkg_lookup(blkcg, q, plid);
543 if (unlikely(blkg))
544 goto out;
545
546 /* did alloc fail? */
0381411e 547 if (unlikely(!new_blkg)) {
cd1604fa
TH
548 blkg = ERR_PTR(-ENOMEM);
549 goto out;
550 }
551
552 /* insert */
553 spin_lock(&blkcg->lock);
554 swap(blkg, new_blkg);
31e4c28d 555 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
cd1604fa
TH
556 pol->ops.blkio_link_group_fn(q, blkg);
557 spin_unlock(&blkcg->lock);
558out:
0381411e 559 blkg_free(new_blkg);
cd1604fa 560 return blkg;
31e4c28d 561}
cd1604fa 562EXPORT_SYMBOL_GPL(blkg_lookup_create);
31e4c28d 563
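/*
 * Typical caller pattern (see blkio_policy_parse_and_set() below): hold
 * rcu_read_lock() and the queue lock, call blkg_lookup_create(), and be
 * prepared for an ERR_PTR() return -- -EBUSY in particular means the
 * queue was bypassing and the operation should be retried.
 */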
b1c35769
VG
564static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
565{
566 hlist_del_init_rcu(&blkg->blkcg_node);
b1c35769
VG
567}
568
569/*
570 * returns 0 if blkio_group was still on the cgroup list. Otherwise returns 1,
571 * indicating that the blkio_group was unhashed by the time we got to it.
572 */
31e4c28d
VG
573int blkiocg_del_blkio_group(struct blkio_group *blkg)
574{
7ee9c562 575 struct blkio_cgroup *blkcg = blkg->blkcg;
b1c35769 576 unsigned long flags;
b1c35769
VG
577 int ret = 1;
578
7ee9c562
TH
579 spin_lock_irqsave(&blkcg->lock, flags);
580 if (!hlist_unhashed(&blkg->blkcg_node)) {
581 __blkiocg_del_blkio_group(blkg);
582 ret = 0;
b1c35769 583 }
7ee9c562 584 spin_unlock_irqrestore(&blkcg->lock, flags);
0f3942a3 585
b1c35769 586 return ret;
31e4c28d 587}
9d6a986c 588EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
31e4c28d
VG
589
590/* called under rcu_read_lock(). */
cd1604fa
TH
591struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
592 struct request_queue *q,
593 enum blkio_policy_id plid)
31e4c28d
VG
594{
595 struct blkio_group *blkg;
596 struct hlist_node *n;
31e4c28d 597
ca32aefc
TH
598 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
599 if (blkg->q == q && blkg->plid == plid)
31e4c28d 600 return blkg;
31e4c28d
VG
601 return NULL;
602}
cd1604fa 603EXPORT_SYMBOL_GPL(blkg_lookup);
31e4c28d 604
72e06c25
TH
605void blkg_destroy_all(struct request_queue *q)
606{
607 struct blkio_policy_type *pol;
608
609 while (true) {
610 bool done = true;
611
612 spin_lock(&blkio_list_lock);
613 spin_lock_irq(q->queue_lock);
614
615 /*
616 * clear_queue_fn() might return with a non-empty group list
617 * if it raced against cgroup removal and lost. cgroup removal is
618 * guaranteed to make forward progress, so retrying after a
619 * while is enough. This ugliness is scheduled to be
620 * removed after the locking update.
621 */
622 list_for_each_entry(pol, &blkio_list, list)
623 if (!pol->ops.blkio_clear_queue_fn(q))
624 done = false;
625
626 spin_unlock_irq(q->queue_lock);
627 spin_unlock(&blkio_list_lock);
628
629 if (done)
630 break;
631
632 msleep(10); /* just some random duration I like */
633 }
634}
635
f0bdc8cd
VG
636static void blkio_reset_stats_cpu(struct blkio_group *blkg)
637{
638 struct blkio_group_stats_cpu *stats_cpu;
639 int i, j, k;
640 /*
641 * Note: On a 64-bit arch this should not be an issue. On a 32-bit arch
642 * this may return an inconsistent value, as a 64-bit update on 32-bit
643 * is not atomic. Taking care of this corner case makes the code very
644 * complicated, like sending IPIs to cpus, taking care of stats of
645 * offline cpus etc.
646 *
647 * Reset stats is anyway more of a debug feature and this sounds like a
648 * corner case. So I am not complicating the code until and unless this
649 * becomes a real issue.
650 */
651 for_each_possible_cpu(i) {
652 stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
653 stats_cpu->sectors = 0;
654 for (j = 0; j < BLKIO_STAT_CPU_NR; j++)
655 for (k = 0; k < BLKIO_STAT_TOTAL; k++)
656 stats_cpu->stat_arr_cpu[j][k] = 0;
657 }
658}
659
303a3acb 660static int
84c124da 661blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
303a3acb
DS
662{
663 struct blkio_cgroup *blkcg;
664 struct blkio_group *blkg;
812df48d 665 struct blkio_group_stats *stats;
303a3acb 666 struct hlist_node *n;
cdc1184c
DS
667 uint64_t queued[BLKIO_STAT_TOTAL];
668 int i;
812df48d
DS
669#ifdef CONFIG_DEBUG_BLK_CGROUP
670 bool idling, waiting, empty;
671 unsigned long long now = sched_clock();
672#endif
303a3acb
DS
673
674 blkcg = cgroup_to_blkio_cgroup(cgroup);
675 spin_lock_irq(&blkcg->lock);
676 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
677 spin_lock(&blkg->stats_lock);
812df48d
DS
678 stats = &blkg->stats;
679#ifdef CONFIG_DEBUG_BLK_CGROUP
680 idling = blkio_blkg_idling(stats);
681 waiting = blkio_blkg_waiting(stats);
682 empty = blkio_blkg_empty(stats);
683#endif
cdc1184c 684 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
812df48d
DS
685 queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
686 memset(stats, 0, sizeof(struct blkio_group_stats));
cdc1184c 687 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
812df48d
DS
688 stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
689#ifdef CONFIG_DEBUG_BLK_CGROUP
690 if (idling) {
691 blkio_mark_blkg_idling(stats);
692 stats->start_idle_time = now;
693 }
694 if (waiting) {
695 blkio_mark_blkg_waiting(stats);
696 stats->start_group_wait_time = now;
697 }
698 if (empty) {
699 blkio_mark_blkg_empty(stats);
700 stats->start_empty_time = now;
701 }
702#endif
303a3acb 703 spin_unlock(&blkg->stats_lock);
f0bdc8cd
VG
704
705 /* Reset Per cpu stats which don't take blkg->stats_lock */
706 blkio_reset_stats_cpu(blkg);
303a3acb 707 }
f0bdc8cd 708
303a3acb
DS
709 spin_unlock_irq(&blkcg->lock);
710 return 0;
711}
712
7a4dd281
TH
713static void blkio_get_key_name(enum stat_sub_type type, const char *dname,
714 char *str, int chars_left, bool diskname_only)
303a3acb 715{
7a4dd281 716 snprintf(str, chars_left, "%s", dname);
303a3acb
DS
717 chars_left -= strlen(str);
718 if (chars_left <= 0) {
719 printk(KERN_WARNING
720 "Possibly incorrect cgroup stat display format");
721 return;
722 }
84c124da
DS
723 if (diskname_only)
724 return;
303a3acb 725 switch (type) {
84c124da 726 case BLKIO_STAT_READ:
303a3acb
DS
727 strlcat(str, " Read", chars_left);
728 break;
84c124da 729 case BLKIO_STAT_WRITE:
303a3acb
DS
730 strlcat(str, " Write", chars_left);
731 break;
84c124da 732 case BLKIO_STAT_SYNC:
303a3acb
DS
733 strlcat(str, " Sync", chars_left);
734 break;
84c124da 735 case BLKIO_STAT_ASYNC:
303a3acb
DS
736 strlcat(str, " Async", chars_left);
737 break;
84c124da 738 case BLKIO_STAT_TOTAL:
303a3acb
DS
739 strlcat(str, " Total", chars_left);
740 break;
741 default:
742 strlcat(str, " Invalid", chars_left);
743 }
744}
745
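/*
 * Example of the resulting keys: for a device whose bdi is named "8:16",
 * a per-device stat file ends up containing lines such as
 *	8:16 Read 4096
 *	8:16 Write 0
 *	8:16 Sync 4096
 *	8:16 Async 0
 *	8:16 Total 4096
 * while diskname_only users (e.g. "time") emit just "8:16 <value>".
 */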
84c124da 746static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
7a4dd281 747 struct cgroup_map_cb *cb, const char *dname)
84c124da 748{
7a4dd281 749 blkio_get_key_name(0, dname, str, chars_left, true);
84c124da
DS
750 cb->fill(cb, str, val);
751 return val;
752}
303a3acb 753
5624a4e4
VG
754
755static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
756 enum stat_type_cpu type, enum stat_sub_type sub_type)
757{
758 int cpu;
759 struct blkio_group_stats_cpu *stats_cpu;
575969a0 760 u64 val = 0, tval;
5624a4e4
VG
761
762 for_each_possible_cpu(cpu) {
575969a0 763 unsigned int start;
5624a4e4
VG
764 stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu);
765
575969a0
VG
766 do {
767 start = u64_stats_fetch_begin(&stats_cpu->syncp);
768 if (type == BLKIO_STAT_CPU_SECTORS)
769 tval = stats_cpu->sectors;
770 else
771 tval = stats_cpu->stat_arr_cpu[type][sub_type];
772 } while (u64_stats_fetch_retry(&stats_cpu->syncp, start));
773
774 val += tval;
5624a4e4
VG
775 }
776
777 return val;
778}
779
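/*
 * The fetch_begin/fetch_retry pair above is the standard u64_stats_sync
 * reader: on 32-bit SMP it retries until it sees a consistent snapshot of
 * the 64-bit counters; on 64-bit it compiles away to plain loads.
 */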
780static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
7a4dd281
TH
781 struct cgroup_map_cb *cb, const char *dname,
782 enum stat_type_cpu type)
5624a4e4
VG
783{
784 uint64_t disk_total, val;
785 char key_str[MAX_KEY_LEN];
786 enum stat_sub_type sub_type;
787
788 if (type == BLKIO_STAT_CPU_SECTORS) {
789 val = blkio_read_stat_cpu(blkg, type, 0);
7a4dd281
TH
790 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb,
791 dname);
5624a4e4
VG
792 }
793
794 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
795 sub_type++) {
7a4dd281
TH
796 blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
797 false);
5624a4e4
VG
798 val = blkio_read_stat_cpu(blkg, type, sub_type);
799 cb->fill(cb, key_str, val);
800 }
801
802 disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
803 blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);
804
7a4dd281
TH
805 blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
806 false);
5624a4e4
VG
807 cb->fill(cb, key_str, disk_total);
808 return disk_total;
809}
810
84c124da
DS
811/* This should be called with blkg->stats_lock held */
812static uint64_t blkio_get_stat(struct blkio_group *blkg,
7a4dd281
TH
813 struct cgroup_map_cb *cb, const char *dname,
814 enum stat_type type)
303a3acb
DS
815{
816 uint64_t disk_total;
817 char key_str[MAX_KEY_LEN];
84c124da
DS
818 enum stat_sub_type sub_type;
819
820 if (type == BLKIO_STAT_TIME)
821 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
7a4dd281 822 blkg->stats.time, cb, dname);
9026e521 823#ifdef CONFIG_DEBUG_BLK_CGROUP
167400d3
JT
824 if (type == BLKIO_STAT_UNACCOUNTED_TIME)
825 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
7a4dd281 826 blkg->stats.unaccounted_time, cb, dname);
cdc1184c
DS
827 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
828 uint64_t sum = blkg->stats.avg_queue_size_sum;
829 uint64_t samples = blkg->stats.avg_queue_size_samples;
830 if (samples)
831 do_div(sum, samples);
832 else
833 sum = 0;
7a4dd281
TH
834 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
835 sum, cb, dname);
cdc1184c 836 }
812df48d
DS
837 if (type == BLKIO_STAT_GROUP_WAIT_TIME)
838 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
7a4dd281 839 blkg->stats.group_wait_time, cb, dname);
812df48d
DS
840 if (type == BLKIO_STAT_IDLE_TIME)
841 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
7a4dd281 842 blkg->stats.idle_time, cb, dname);
812df48d
DS
843 if (type == BLKIO_STAT_EMPTY_TIME)
844 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
7a4dd281 845 blkg->stats.empty_time, cb, dname);
84c124da
DS
846 if (type == BLKIO_STAT_DEQUEUE)
847 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
7a4dd281 848 blkg->stats.dequeue, cb, dname);
84c124da 849#endif
303a3acb 850
84c124da
DS
851 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
852 sub_type++) {
7a4dd281
TH
853 blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
854 false);
84c124da 855 cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
303a3acb 856 }
84c124da
DS
857 disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
858 blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
7a4dd281
TH
859 blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
860 false);
303a3acb
DS
861 cb->fill(cb, key_str, disk_total);
862 return disk_total;
863}
864
4bfd482e
TH
865static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
866 int fileid, struct blkio_cgroup *blkcg)
34d0f179 867{
ece84241 868 struct gendisk *disk = NULL;
e56da7e2 869 struct blkio_group *blkg = NULL;
34d0f179 870 char *s[4], *p, *major_s = NULL, *minor_s = NULL;
d11bb446 871 unsigned long major, minor;
ece84241
TH
872 int i = 0, ret = -EINVAL;
873 int part;
34d0f179 874 dev_t dev;
d11bb446 875 u64 temp;
34d0f179
GJ
876
877 memset(s, 0, sizeof(s));
878
879 while ((p = strsep(&buf, " ")) != NULL) {
880 if (!*p)
881 continue;
882
883 s[i++] = p;
884
885 /* Prevent inputting too many things */
886 if (i == 3)
887 break;
888 }
889
890 if (i != 2)
ece84241 891 goto out;
34d0f179
GJ
892
893 p = strsep(&s[0], ":");
894 if (p != NULL)
895 major_s = p;
896 else
ece84241 897 goto out;
34d0f179
GJ
898
899 minor_s = s[0];
900 if (!minor_s)
ece84241 901 goto out;
34d0f179 902
ece84241
TH
903 if (strict_strtoul(major_s, 10, &major))
904 goto out;
34d0f179 905
ece84241
TH
906 if (strict_strtoul(minor_s, 10, &minor))
907 goto out;
34d0f179
GJ
908
909 dev = MKDEV(major, minor);
910
ece84241
TH
911 if (strict_strtoull(s[1], 10, &temp))
912 goto out;
34d0f179 913
e56da7e2 914 disk = get_gendisk(dev, &part);
4bfd482e 915 if (!disk || part)
e56da7e2 916 goto out;
e56da7e2
TH
917
918 rcu_read_lock();
919
4bfd482e
TH
920 spin_lock_irq(disk->queue->queue_lock);
921 blkg = blkg_lookup_create(blkcg, disk->queue, plid, false);
922 spin_unlock_irq(disk->queue->queue_lock);
e56da7e2 923
4bfd482e
TH
924 if (IS_ERR(blkg)) {
925 ret = PTR_ERR(blkg);
926 goto out_unlock;
d11bb446 927 }
34d0f179 928
062a644d
VG
929 switch (plid) {
930 case BLKIO_POLICY_PROP:
d11bb446
WG
931 if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
932 temp > BLKIO_WEIGHT_MAX)
e56da7e2 933 goto out_unlock;
34d0f179 934
4bfd482e
TH
935 blkg->conf.weight = temp;
936 blkio_update_group_weight(blkg, temp ?: blkcg->weight);
4c9eefa1
VG
937 break;
938 case BLKIO_POLICY_THROTL:
7702e8f4
VG
939 switch(fileid) {
940 case BLKIO_THROTL_read_bps_device:
4bfd482e
TH
941 blkg->conf.bps[READ] = temp;
942 blkio_update_group_bps(blkg, temp ?: -1, fileid);
e56da7e2 943 break;
7702e8f4 944 case BLKIO_THROTL_write_bps_device:
4bfd482e
TH
945 blkg->conf.bps[WRITE] = temp;
946 blkio_update_group_bps(blkg, temp ?: -1, fileid);
7702e8f4
VG
947 break;
948 case BLKIO_THROTL_read_iops_device:
e56da7e2
TH
949 if (temp > THROTL_IOPS_MAX)
950 goto out_unlock;
4bfd482e
TH
951 blkg->conf.iops[READ] = temp;
952 blkio_update_group_iops(blkg, temp ?: -1, fileid);
e56da7e2 953 break;
7702e8f4 954 case BLKIO_THROTL_write_iops_device:
d11bb446 955 if (temp > THROTL_IOPS_MAX)
e56da7e2 956 goto out_unlock;
4bfd482e
TH
957 blkg->conf.iops[WRITE] = temp;
958 blkio_update_group_iops(blkg, temp ?: -1, fileid);
7702e8f4
VG
959 break;
960 }
062a644d
VG
961 break;
962 default:
963 BUG();
964 }
ece84241 965 ret = 0;
e56da7e2
TH
966out_unlock:
967 rcu_read_unlock();
ece84241
TH
968out:
969 put_disk(disk);
e56da7e2
TH
970
971 /*
972 * If the queue was bypassing, we should retry. Do so after a short
973 * msleep(). It isn't strictly necessary but the queue can be
974 * bypassing for some time and it's always nice to avoid busy
975 * looping.
976 */
977 if (ret == -EBUSY) {
978 msleep(10);
979 return restart_syscall();
980 }
ece84241 981 return ret;
34d0f179
GJ
982}
983
062a644d
VG
984static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
985 const char *buffer)
34d0f179
GJ
986{
987 int ret = 0;
988 char *buf;
e56da7e2 989 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
062a644d
VG
990 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
991 int fileid = BLKIOFILE_ATTR(cft->private);
34d0f179
GJ
992
993 buf = kstrdup(buffer, GFP_KERNEL);
994 if (!buf)
995 return -ENOMEM;
996
4bfd482e 997 ret = blkio_policy_parse_and_set(buf, plid, fileid, blkcg);
34d0f179
GJ
998 kfree(buf);
999 return ret;
1000}
1001
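/*
 * Example usage from userspace (hypothetical device 8:16):
 *
 *	echo "8:16 500" > blkio.weight_device
 *	echo "8:16 1048576" > blkio.throttle.read_bps_device
 *
 * The parser above accepts "major:minor value"; writing a value of 0
 * clears the per-device setting (weight falls back to blkio.weight,
 * throttling limits are removed).
 */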
92616b5b
VG
1002static const char *blkg_dev_name(struct blkio_group *blkg)
1003{
1004 /* some drivers (floppy) instantiate a queue w/o disk registered */
1005 if (blkg->q->backing_dev_info.dev)
1006 return dev_name(blkg->q->backing_dev_info.dev);
1007 return NULL;
1008}
1009
4bfd482e
TH
1010static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
1011 struct seq_file *m)
34d0f179 1012{
92616b5b 1013 const char *dname = blkg_dev_name(blkg);
4bfd482e
TH
1014 int fileid = BLKIOFILE_ATTR(cft->private);
1015 int rw = WRITE;
1016
92616b5b
VG
1017 if (!dname)
1018 return;
1019
4bfd482e 1020 switch (blkg->plid) {
062a644d 1021 case BLKIO_POLICY_PROP:
4bfd482e 1022 if (blkg->conf.weight)
7a4dd281
TH
1023 seq_printf(m, "%s\t%u\n",
1024 dname, blkg->conf.weight);
4c9eefa1
VG
1025 break;
1026 case BLKIO_POLICY_THROTL:
4bfd482e 1027 switch (fileid) {
7702e8f4 1028 case BLKIO_THROTL_read_bps_device:
4bfd482e 1029 rw = READ;
7702e8f4 1030 case BLKIO_THROTL_write_bps_device:
4bfd482e 1031 if (blkg->conf.bps[rw])
7a4dd281
TH
1032 seq_printf(m, "%s\t%llu\n",
1033 dname, blkg->conf.bps[rw]);
7702e8f4
VG
1034 break;
1035 case BLKIO_THROTL_read_iops_device:
4bfd482e 1036 rw = READ;
7702e8f4 1037 case BLKIO_THROTL_write_iops_device:
4bfd482e 1038 if (blkg->conf.iops[rw])
7a4dd281
TH
1039 seq_printf(m, "%s\t%u\n",
1040 dname, blkg->conf.iops[rw]);
7702e8f4
VG
1041 break;
1042 }
062a644d
VG
1043 break;
1044 default:
1045 BUG();
1046 }
1047}
34d0f179 1048
062a644d 1049/* cgroup files which read their data from policy nodes end up here */
4bfd482e
TH
1050static void blkio_read_conf(struct cftype *cft, struct blkio_cgroup *blkcg,
1051 struct seq_file *m)
34d0f179 1052{
4bfd482e
TH
1053 struct blkio_group *blkg;
1054 struct hlist_node *n;
34d0f179 1055
4bfd482e
TH
1056 spin_lock_irq(&blkcg->lock);
1057 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
1058 if (BLKIOFILE_POLICY(cft->private) == blkg->plid)
1059 blkio_print_group_conf(cft, blkg, m);
1060 spin_unlock_irq(&blkcg->lock);
062a644d
VG
1061}
1062
1063static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1064 struct seq_file *m)
1065{
1066 struct blkio_cgroup *blkcg;
1067 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1068 int name = BLKIOFILE_ATTR(cft->private);
1069
1070 blkcg = cgroup_to_blkio_cgroup(cgrp);
1071
1072 switch(plid) {
1073 case BLKIO_POLICY_PROP:
1074 switch(name) {
1075 case BLKIO_PROP_weight_device:
4bfd482e 1076 blkio_read_conf(cft, blkcg, m);
062a644d
VG
1077 return 0;
1078 default:
1079 BUG();
1080 }
1081 break;
4c9eefa1
VG
1082 case BLKIO_POLICY_THROTL:
1083 switch(name){
1084 case BLKIO_THROTL_read_bps_device:
1085 case BLKIO_THROTL_write_bps_device:
7702e8f4
VG
1086 case BLKIO_THROTL_read_iops_device:
1087 case BLKIO_THROTL_write_iops_device:
4bfd482e 1088 blkio_read_conf(cft, blkcg, m);
4c9eefa1
VG
1089 return 0;
1090 default:
1091 BUG();
1092 }
1093 break;
062a644d
VG
1094 default:
1095 BUG();
1096 }
1097
1098 return 0;
1099}
1100
1101static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
5624a4e4
VG
1102 struct cftype *cft, struct cgroup_map_cb *cb,
1103 enum stat_type type, bool show_total, bool pcpu)
062a644d
VG
1104{
1105 struct blkio_group *blkg;
1106 struct hlist_node *n;
1107 uint64_t cgroup_total = 0;
1108
1109 rcu_read_lock();
1110 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
92616b5b 1111 const char *dname = blkg_dev_name(blkg);
7a4dd281 1112
92616b5b 1113 if (!dname || BLKIOFILE_POLICY(cft->private) != blkg->plid)
7a4dd281
TH
1114 continue;
1115 if (pcpu)
1116 cgroup_total += blkio_get_stat_cpu(blkg, cb, dname,
1117 type);
1118 else {
1119 spin_lock_irq(&blkg->stats_lock);
1120 cgroup_total += blkio_get_stat(blkg, cb, dname, type);
1121 spin_unlock_irq(&blkg->stats_lock);
062a644d
VG
1122 }
1123 }
1124 if (show_total)
1125 cb->fill(cb, "Total", cgroup_total);
1126 rcu_read_unlock();
1127 return 0;
1128}
1129
1130/* All map kind of cgroup file get serviced by this function */
1131static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1132 struct cgroup_map_cb *cb)
1133{
1134 struct blkio_cgroup *blkcg;
1135 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1136 int name = BLKIOFILE_ATTR(cft->private);
1137
1138 blkcg = cgroup_to_blkio_cgroup(cgrp);
1139
1140 switch(plid) {
1141 case BLKIO_POLICY_PROP:
1142 switch(name) {
1143 case BLKIO_PROP_time:
1144 return blkio_read_blkg_stats(blkcg, cft, cb,
5624a4e4 1145 BLKIO_STAT_TIME, 0, 0);
062a644d
VG
1146 case BLKIO_PROP_sectors:
1147 return blkio_read_blkg_stats(blkcg, cft, cb,
5624a4e4 1148 BLKIO_STAT_CPU_SECTORS, 0, 1);
062a644d
VG
1149 case BLKIO_PROP_io_service_bytes:
1150 return blkio_read_blkg_stats(blkcg, cft, cb,
5624a4e4 1151 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
062a644d
VG
1152 case BLKIO_PROP_io_serviced:
1153 return blkio_read_blkg_stats(blkcg, cft, cb,
5624a4e4 1154 BLKIO_STAT_CPU_SERVICED, 1, 1);
062a644d
VG
1155 case BLKIO_PROP_io_service_time:
1156 return blkio_read_blkg_stats(blkcg, cft, cb,
5624a4e4 1157 BLKIO_STAT_SERVICE_TIME, 1, 0);
062a644d
VG
1158 case BLKIO_PROP_io_wait_time:
1159 return blkio_read_blkg_stats(blkcg, cft, cb,
5624a4e4 1160 BLKIO_STAT_WAIT_TIME, 1, 0);
062a644d
VG
1161 case BLKIO_PROP_io_merged:
1162 return blkio_read_blkg_stats(blkcg, cft, cb,
317389a7 1163 BLKIO_STAT_CPU_MERGED, 1, 1);
062a644d
VG
1164 case BLKIO_PROP_io_queued:
1165 return blkio_read_blkg_stats(blkcg, cft, cb,
5624a4e4 1166 BLKIO_STAT_QUEUED, 1, 0);
062a644d 1167#ifdef CONFIG_DEBUG_BLK_CGROUP
9026e521
JT
1168 case BLKIO_PROP_unaccounted_time:
1169 return blkio_read_blkg_stats(blkcg, cft, cb,
5624a4e4 1170 BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
062a644d
VG
1171 case BLKIO_PROP_dequeue:
1172 return blkio_read_blkg_stats(blkcg, cft, cb,
5624a4e4 1173 BLKIO_STAT_DEQUEUE, 0, 0);
062a644d
VG
1174 case BLKIO_PROP_avg_queue_size:
1175 return blkio_read_blkg_stats(blkcg, cft, cb,
5624a4e4 1176 BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
062a644d
VG
1177 case BLKIO_PROP_group_wait_time:
1178 return blkio_read_blkg_stats(blkcg, cft, cb,
5624a4e4 1179 BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
062a644d
VG
1180 case BLKIO_PROP_idle_time:
1181 return blkio_read_blkg_stats(blkcg, cft, cb,
5624a4e4 1182 BLKIO_STAT_IDLE_TIME, 0, 0);
062a644d
VG
1183 case BLKIO_PROP_empty_time:
1184 return blkio_read_blkg_stats(blkcg, cft, cb,
5624a4e4 1185 BLKIO_STAT_EMPTY_TIME, 0, 0);
062a644d
VG
1186#endif
1187 default:
1188 BUG();
1189 }
1190 break;
4c9eefa1
VG
1191 case BLKIO_POLICY_THROTL:
1192 switch(name){
1193 case BLKIO_THROTL_io_service_bytes:
1194 return blkio_read_blkg_stats(blkcg, cft, cb,
5624a4e4 1195 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
4c9eefa1
VG
1196 case BLKIO_THROTL_io_serviced:
1197 return blkio_read_blkg_stats(blkcg, cft, cb,
5624a4e4 1198 BLKIO_STAT_CPU_SERVICED, 1, 1);
4c9eefa1
VG
1199 default:
1200 BUG();
1201 }
1202 break;
062a644d
VG
1203 default:
1204 BUG();
1205 }
1206
1207 return 0;
1208}
1209
4bfd482e 1210static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
062a644d
VG
1211{
1212 struct blkio_group *blkg;
1213 struct hlist_node *n;
062a644d
VG
1214
1215 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
1216 return -EINVAL;
1217
1218 spin_lock(&blkio_list_lock);
1219 spin_lock_irq(&blkcg->lock);
1220 blkcg->weight = (unsigned int)val;
1221
4bfd482e
TH
1222 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
1223 if (blkg->plid == plid && !blkg->conf.weight)
1224 blkio_update_group_weight(blkg, blkcg->weight);
062a644d 1225
062a644d
VG
1226 spin_unlock_irq(&blkcg->lock);
1227 spin_unlock(&blkio_list_lock);
1228 return 0;
1229}
1230
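/*
 * Note: blkio.weight only supplies the default; groups that have a
 * per-device weight set via blkio.weight_device (conf.weight != 0) keep
 * that value, which is why the loop above skips them.
 */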
1231static u64 blkiocg_file_read_u64(struct cgroup *cgrp, struct cftype *cft) {
1232 struct blkio_cgroup *blkcg;
1233 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1234 int name = BLKIOFILE_ATTR(cft->private);
1235
1236 blkcg = cgroup_to_blkio_cgroup(cgrp);
1237
1238 switch(plid) {
1239 case BLKIO_POLICY_PROP:
1240 switch(name) {
1241 case BLKIO_PROP_weight:
1242 return (u64)blkcg->weight;
1243 }
1244 break;
1245 default:
1246 BUG();
1247 }
1248 return 0;
1249}
1250
1251static int
1252blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1253{
1254 struct blkio_cgroup *blkcg;
1255 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1256 int name = BLKIOFILE_ATTR(cft->private);
1257
1258 blkcg = cgroup_to_blkio_cgroup(cgrp);
1259
1260 switch(plid) {
1261 case BLKIO_POLICY_PROP:
1262 switch(name) {
1263 case BLKIO_PROP_weight:
4bfd482e 1264 return blkio_weight_write(blkcg, plid, val);
062a644d
VG
1265 }
1266 break;
1267 default:
1268 BUG();
1269 }
34d0f179 1270
34d0f179
GJ
1271 return 0;
1272}
1273
31e4c28d 1274struct cftype blkio_files[] = {
34d0f179
GJ
1275 {
1276 .name = "weight_device",
062a644d
VG
1277 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1278 BLKIO_PROP_weight_device),
1279 .read_seq_string = blkiocg_file_read,
1280 .write_string = blkiocg_file_write,
34d0f179
GJ
1281 .max_write_len = 256,
1282 },
31e4c28d
VG
1283 {
1284 .name = "weight",
062a644d
VG
1285 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1286 BLKIO_PROP_weight),
1287 .read_u64 = blkiocg_file_read_u64,
1288 .write_u64 = blkiocg_file_write_u64,
31e4c28d 1289 },
22084190
VG
1290 {
1291 .name = "time",
062a644d
VG
1292 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1293 BLKIO_PROP_time),
1294 .read_map = blkiocg_file_read_map,
22084190
VG
1295 },
1296 {
1297 .name = "sectors",
062a644d
VG
1298 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1299 BLKIO_PROP_sectors),
1300 .read_map = blkiocg_file_read_map,
303a3acb
DS
1301 },
1302 {
1303 .name = "io_service_bytes",
062a644d
VG
1304 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1305 BLKIO_PROP_io_service_bytes),
1306 .read_map = blkiocg_file_read_map,
303a3acb
DS
1307 },
1308 {
1309 .name = "io_serviced",
062a644d
VG
1310 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1311 BLKIO_PROP_io_serviced),
1312 .read_map = blkiocg_file_read_map,
303a3acb
DS
1313 },
1314 {
1315 .name = "io_service_time",
062a644d
VG
1316 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1317 BLKIO_PROP_io_service_time),
1318 .read_map = blkiocg_file_read_map,
303a3acb
DS
1319 },
1320 {
1321 .name = "io_wait_time",
062a644d
VG
1322 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1323 BLKIO_PROP_io_wait_time),
1324 .read_map = blkiocg_file_read_map,
84c124da 1325 },
812d4026
DS
1326 {
1327 .name = "io_merged",
062a644d
VG
1328 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1329 BLKIO_PROP_io_merged),
1330 .read_map = blkiocg_file_read_map,
812d4026 1331 },
cdc1184c
DS
1332 {
1333 .name = "io_queued",
062a644d
VG
1334 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1335 BLKIO_PROP_io_queued),
1336 .read_map = blkiocg_file_read_map,
cdc1184c 1337 },
84c124da
DS
1338 {
1339 .name = "reset_stats",
1340 .write_u64 = blkiocg_reset_stats,
22084190 1341 },
13f98250
VG
1342#ifdef CONFIG_BLK_DEV_THROTTLING
1343 {
1344 .name = "throttle.read_bps_device",
1345 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1346 BLKIO_THROTL_read_bps_device),
1347 .read_seq_string = blkiocg_file_read,
1348 .write_string = blkiocg_file_write,
1349 .max_write_len = 256,
1350 },
1351
1352 {
1353 .name = "throttle.write_bps_device",
1354 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1355 BLKIO_THROTL_write_bps_device),
1356 .read_seq_string = blkiocg_file_read,
1357 .write_string = blkiocg_file_write,
1358 .max_write_len = 256,
1359 },
1360
1361 {
1362 .name = "throttle.read_iops_device",
1363 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1364 BLKIO_THROTL_read_iops_device),
1365 .read_seq_string = blkiocg_file_read,
1366 .write_string = blkiocg_file_write,
1367 .max_write_len = 256,
1368 },
1369
1370 {
1371 .name = "throttle.write_iops_device",
1372 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1373 BLKIO_THROTL_write_iops_device),
1374 .read_seq_string = blkiocg_file_read,
1375 .write_string = blkiocg_file_write,
1376 .max_write_len = 256,
1377 },
1378 {
1379 .name = "throttle.io_service_bytes",
1380 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1381 BLKIO_THROTL_io_service_bytes),
1382 .read_map = blkiocg_file_read_map,
1383 },
1384 {
1385 .name = "throttle.io_serviced",
1386 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1387 BLKIO_THROTL_io_serviced),
1388 .read_map = blkiocg_file_read_map,
1389 },
1390#endif /* CONFIG_BLK_DEV_THROTTLING */
1391
22084190 1392#ifdef CONFIG_DEBUG_BLK_CGROUP
cdc1184c
DS
1393 {
1394 .name = "avg_queue_size",
062a644d
VG
1395 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1396 BLKIO_PROP_avg_queue_size),
1397 .read_map = blkiocg_file_read_map,
cdc1184c 1398 },
812df48d
DS
1399 {
1400 .name = "group_wait_time",
062a644d
VG
1401 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1402 BLKIO_PROP_group_wait_time),
1403 .read_map = blkiocg_file_read_map,
812df48d
DS
1404 },
1405 {
1406 .name = "idle_time",
062a644d
VG
1407 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1408 BLKIO_PROP_idle_time),
1409 .read_map = blkiocg_file_read_map,
812df48d
DS
1410 },
1411 {
1412 .name = "empty_time",
062a644d
VG
1413 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1414 BLKIO_PROP_empty_time),
1415 .read_map = blkiocg_file_read_map,
812df48d 1416 },
cdc1184c 1417 {
22084190 1418 .name = "dequeue",
062a644d
VG
1419 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1420 BLKIO_PROP_dequeue),
1421 .read_map = blkiocg_file_read_map,
cdc1184c 1422 },
9026e521
JT
1423 {
1424 .name = "unaccounted_time",
1425 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1426 BLKIO_PROP_unaccounted_time),
1427 .read_map = blkiocg_file_read_map,
1428 },
22084190 1429#endif
31e4c28d
VG
1430};
1431
1432static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1433{
1434 return cgroup_add_files(cgroup, subsys, blkio_files,
1435 ARRAY_SIZE(blkio_files));
1436}
1437
7ee9c562
TH
1438static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
1439 struct cgroup *cgroup)
31e4c28d
VG
1440{
1441 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
b1c35769
VG
1442 unsigned long flags;
1443 struct blkio_group *blkg;
ca32aefc 1444 struct request_queue *q;
3e252066 1445 struct blkio_policy_type *blkiop;
b1c35769
VG
1446
1447 rcu_read_lock();
7ee9c562 1448
0f3942a3
JA
1449 do {
1450 spin_lock_irqsave(&blkcg->lock, flags);
b1c35769 1451
0f3942a3
JA
1452 if (hlist_empty(&blkcg->blkg_list)) {
1453 spin_unlock_irqrestore(&blkcg->lock, flags);
1454 break;
1455 }
b1c35769 1456
0f3942a3
JA
1457 blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
1458 blkcg_node);
ca32aefc 1459 q = rcu_dereference(blkg->q);
0f3942a3 1460 __blkiocg_del_blkio_group(blkg);
31e4c28d 1461
0f3942a3 1462 spin_unlock_irqrestore(&blkcg->lock, flags);
b1c35769 1463
0f3942a3
JA
1464 /*
1465 * This blkio_group is being unlinked as associated cgroup is
1466 * going away. Let all the IO controlling policies know about
61014e96 1467 * this event.
0f3942a3
JA
1468 */
1469 spin_lock(&blkio_list_lock);
61014e96
VG
1470 list_for_each_entry(blkiop, &blkio_list, list) {
1471 if (blkiop->plid != blkg->plid)
1472 continue;
ca32aefc 1473 blkiop->ops.blkio_unlink_group_fn(q, blkg);
61014e96 1474 }
0f3942a3
JA
1475 spin_unlock(&blkio_list_lock);
1476 } while (1);
34d0f179 1477
b1c35769 1478 rcu_read_unlock();
7ee9c562
TH
1479
1480 return 0;
1481}
1482
1483static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1484{
1485 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1486
67523c48
BB
1487 if (blkcg != &blkio_root_cgroup)
1488 kfree(blkcg);
31e4c28d
VG
1489}
1490
1491static struct cgroup_subsys_state *
1492blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1493{
0341509f
LZ
1494 struct blkio_cgroup *blkcg;
1495 struct cgroup *parent = cgroup->parent;
31e4c28d 1496
0341509f 1497 if (!parent) {
31e4c28d
VG
1498 blkcg = &blkio_root_cgroup;
1499 goto done;
1500 }
1501
31e4c28d
VG
1502 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1503 if (!blkcg)
1504 return ERR_PTR(-ENOMEM);
1505
1506 blkcg->weight = BLKIO_WEIGHT_DEFAULT;
1507done:
1508 spin_lock_init(&blkcg->lock);
1509 INIT_HLIST_HEAD(&blkcg->blkg_list);
1510
1511 return &blkcg->css;
1512}
1513
5efd6113
TH
1514/**
1515 * blkcg_init_queue - initialize blkcg part of request queue
1516 * @q: request_queue to initialize
1517 *
1518 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
1519 * part of new request_queue @q.
1520 *
1521 * RETURNS:
1522 * 0 on success, -errno on failure.
1523 */
1524int blkcg_init_queue(struct request_queue *q)
1525{
923adde1
TH
1526 int ret;
1527
5efd6113
TH
1528 might_sleep();
1529
923adde1
TH
1530 ret = blk_throtl_init(q);
1531 if (ret)
1532 return ret;
1533
1534 mutex_lock(&all_q_mutex);
1535 INIT_LIST_HEAD(&q->all_q_node);
1536 list_add_tail(&q->all_q_node, &all_q_list);
1537 mutex_unlock(&all_q_mutex);
1538
1539 return 0;
5efd6113
TH
1540}
1541
1542/**
1543 * blkcg_drain_queue - drain blkcg part of request_queue
1544 * @q: request_queue to drain
1545 *
1546 * Called from blk_drain_queue(). Responsible for draining blkcg part.
1547 */
1548void blkcg_drain_queue(struct request_queue *q)
1549{
1550 lockdep_assert_held(q->queue_lock);
1551
1552 blk_throtl_drain(q);
1553}
1554
1555/**
1556 * blkcg_exit_queue - exit and release blkcg part of request_queue
1557 * @q: request_queue being released
1558 *
1559 * Called from blk_release_queue(). Responsible for exiting blkcg part.
1560 */
1561void blkcg_exit_queue(struct request_queue *q)
1562{
923adde1
TH
1563 mutex_lock(&all_q_mutex);
1564 list_del_init(&q->all_q_node);
1565 mutex_unlock(&all_q_mutex);
1566
5efd6113
TH
1567 blk_throtl_exit(q);
1568}
1569
31e4c28d
VG
1570/*
1571 * We cannot support shared io contexts, as we have no means to support
1572 * two tasks with the same ioc in two different groups without major rework
1573 * of the main cic data structures. For now we allow a task to change
1574 * its cgroup only if it's the only owner of its ioc.
1575 */
bb9d97b6
TH
1576static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1577 struct cgroup_taskset *tset)
31e4c28d 1578{
bb9d97b6 1579 struct task_struct *task;
31e4c28d
VG
1580 struct io_context *ioc;
1581 int ret = 0;
1582
1583 /* task_lock() is needed to avoid races with exit_io_context() */
bb9d97b6
TH
1584 cgroup_taskset_for_each(task, cgrp, tset) {
1585 task_lock(task);
1586 ioc = task->io_context;
1587 if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1588 ret = -EINVAL;
1589 task_unlock(task);
1590 if (ret)
1591 break;
1592 }
31e4c28d
VG
1593 return ret;
1594}
1595
bb9d97b6
TH
1596static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1597 struct cgroup_taskset *tset)
31e4c28d 1598{
bb9d97b6 1599 struct task_struct *task;
31e4c28d
VG
1600 struct io_context *ioc;
1601
bb9d97b6 1602 cgroup_taskset_for_each(task, cgrp, tset) {
b3c9dd18
LT
1603 /* we don't lose anything even if ioc allocation fails */
1604 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1605 if (ioc) {
1606 ioc_cgroup_changed(ioc);
11a3122f 1607 put_io_context(ioc);
b3c9dd18 1608 }
bb9d97b6 1609 }
31e4c28d
VG
1610}
1611
923adde1
TH
1612static void blkcg_bypass_start(void)
1613 __acquires(&all_q_mutex)
1614{
1615 struct request_queue *q;
1616
1617 mutex_lock(&all_q_mutex);
1618
1619 list_for_each_entry(q, &all_q_list, all_q_node) {
1620 blk_queue_bypass_start(q);
1621 blkg_destroy_all(q);
1622 }
1623}
1624
1625static void blkcg_bypass_end(void)
1626 __releases(&all_q_mutex)
1627{
1628 struct request_queue *q;
1629
1630 list_for_each_entry(q, &all_q_list, all_q_node)
1631 blk_queue_bypass_end(q);
1632
1633 mutex_unlock(&all_q_mutex);
1634}
1635
3e252066
VG
1636void blkio_policy_register(struct blkio_policy_type *blkiop)
1637{
923adde1 1638 blkcg_bypass_start();
3e252066 1639 spin_lock(&blkio_list_lock);
035d10b2
TH
1640
1641 BUG_ON(blkio_policy[blkiop->plid]);
1642 blkio_policy[blkiop->plid] = blkiop;
3e252066 1643 list_add_tail(&blkiop->list, &blkio_list);
035d10b2 1644
3e252066 1645 spin_unlock(&blkio_list_lock);
923adde1 1646 blkcg_bypass_end();
3e252066
VG
1647}
1648EXPORT_SYMBOL_GPL(blkio_policy_register);
1649
1650void blkio_policy_unregister(struct blkio_policy_type *blkiop)
1651{
923adde1 1652 blkcg_bypass_start();
3e252066 1653 spin_lock(&blkio_list_lock);
035d10b2
TH
1654
1655 BUG_ON(blkio_policy[blkiop->plid] != blkiop);
1656 blkio_policy[blkiop->plid] = NULL;
3e252066 1657 list_del_init(&blkiop->list);
035d10b2 1658
3e252066 1659 spin_unlock(&blkio_list_lock);
923adde1 1660 blkcg_bypass_end();
3e252066
VG
1661}
1662EXPORT_SYMBOL_GPL(blkio_policy_unregister);