block/blk-cgroup.c
1 /*
2 * Common Block IO controller cgroup interface
3 *
4 * Based on ideas and code from CFQ, CFS and BFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 * Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11 * Nauman Rafique <nauman@google.com>
12 */
13 #include <linux/ioprio.h>
14 #include <linux/seq_file.h>
15 #include <linux/kdev_t.h>
16 #include <linux/module.h>
17 #include <linux/err.h>
18 #include <linux/blkdev.h>
19 #include <linux/slab.h>
20 #include <linux/genhd.h>
21 #include <linux/delay.h>
22 #include <linux/atomic.h>
23 #include "blk-cgroup.h"
24 #include "blk.h"
25
26 #define MAX_KEY_LEN 100
27
28 static DEFINE_SPINLOCK(blkio_list_lock);
29 static LIST_HEAD(blkio_list);
30
31 static DEFINE_MUTEX(all_q_mutex);
32 static LIST_HEAD(all_q_list);
33
34 /* List of groups pending per cpu stats allocation */
35 static DEFINE_SPINLOCK(alloc_list_lock);
36 static LIST_HEAD(alloc_list);
37
38 static void blkio_stat_alloc_fn(struct work_struct *);
39 static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);
40
41 struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
42 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
43
44 static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];
45
46 static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
47 struct cgroup *);
48 static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
49 struct cgroup_taskset *);
50 static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
51 struct cgroup_taskset *);
52 static int blkiocg_pre_destroy(struct cgroup_subsys *, struct cgroup *);
53 static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
54 static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
55
56 /* for encoding cft->private value on file */
57 #define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val))
58 /* What policy owns the file, proportional or throttle */
59 #define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff)
60 #define BLKIOFILE_ATTR(val) ((val) & 0xffff)
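/*
 * Example: BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, BLKIO_THROTL_read_bps_device)
 * packs the policy id into the upper 16 bits and the attribute into the
 * lower 16 bits of cft->private; BLKIOFILE_POLICY() and BLKIOFILE_ATTR()
 * recover the two halves.
 */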
61
62 struct cgroup_subsys blkio_subsys = {
63 .name = "blkio",
64 .create = blkiocg_create,
65 .can_attach = blkiocg_can_attach,
66 .attach = blkiocg_attach,
67 .pre_destroy = blkiocg_pre_destroy,
68 .destroy = blkiocg_destroy,
69 .populate = blkiocg_populate,
70 .subsys_id = blkio_subsys_id,
71 .module = THIS_MODULE,
72 };
73 EXPORT_SYMBOL_GPL(blkio_subsys);
74
75 struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
76 {
77 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
78 struct blkio_cgroup, css);
79 }
80 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
81
82 static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
83 {
84 return container_of(task_subsys_state(tsk, blkio_subsys_id),
85 struct blkio_cgroup, css);
86 }
87
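/*
 * Resolve the blkcg a bio is accounted to: an explicit bio->bi_css
 * association takes precedence, otherwise fall back to the cgroup of the
 * issuing task.
 */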
88 struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
89 {
90 if (bio && bio->bi_css)
91 return container_of(bio->bi_css, struct blkio_cgroup, css);
92 return task_blkio_cgroup(current);
93 }
94 EXPORT_SYMBOL_GPL(bio_blkio_cgroup);
95
96 static inline void blkio_update_group_weight(struct blkio_group *blkg,
97 int plid, unsigned int weight)
98 {
99 struct blkio_policy_type *blkiop;
100
101 list_for_each_entry(blkiop, &blkio_list, list) {
102 /* If this policy does not own the blkg, do not send updates */
103 if (blkiop->plid != plid)
104 continue;
105 if (blkiop->ops.blkio_update_group_weight_fn)
106 blkiop->ops.blkio_update_group_weight_fn(blkg->q,
107 blkg, weight);
108 }
109 }
110
111 static inline void blkio_update_group_bps(struct blkio_group *blkg, int plid,
112 u64 bps, int fileid)
113 {
114 struct blkio_policy_type *blkiop;
115
116 list_for_each_entry(blkiop, &blkio_list, list) {
117
118 /* If this policy does not own the blkg, do not send updates */
119 if (blkiop->plid != plid)
120 continue;
121
122 if (fileid == BLKIO_THROTL_read_bps_device
123 && blkiop->ops.blkio_update_group_read_bps_fn)
124 blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
125 blkg, bps);
126
127 if (fileid == BLKIO_THROTL_write_bps_device
128 && blkiop->ops.blkio_update_group_write_bps_fn)
129 blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
130 blkg, bps);
131 }
132 }
133
134 static inline void blkio_update_group_iops(struct blkio_group *blkg,
135 int plid, unsigned int iops,
136 int fileid)
137 {
138 struct blkio_policy_type *blkiop;
139
140 list_for_each_entry(blkiop, &blkio_list, list) {
141
142 /* If this policy does not own the blkg, do not send updates */
143 if (blkiop->plid != plid)
144 continue;
145
146 if (fileid == BLKIO_THROTL_read_iops_device
147 && blkiop->ops.blkio_update_group_read_iops_fn)
148 blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
149 blkg, iops);
150
151 if (fileid == BLKIO_THROTL_write_iops_device
152 && blkiop->ops.blkio_update_group_write_iops_fn)
153 blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
154 blkg,iops);
155 }
156 }
157
158 /*
159 * Add to the appropriate stat variable depending on the request type.
160 * This should be called with queue_lock held.
161 */
162 static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
163 bool sync)
164 {
165 if (direction)
166 stat[BLKIO_STAT_WRITE] += add;
167 else
168 stat[BLKIO_STAT_READ] += add;
169 if (sync)
170 stat[BLKIO_STAT_SYNC] += add;
171 else
172 stat[BLKIO_STAT_ASYNC] += add;
173 }
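/*
 * For example, accounting one synchronous write above increments both
 * stat[BLKIO_STAT_WRITE] and stat[BLKIO_STAT_SYNC] by @add.
 */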
174
175 /*
176 * Decrements the appropriate stat variable if non-zero depending on the
177 * request type. Panics on value being zero.
178 * This should be called with the queue_lock held.
179 */
180 static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
181 {
182 if (direction) {
183 BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
184 stat[BLKIO_STAT_WRITE]--;
185 } else {
186 BUG_ON(stat[BLKIO_STAT_READ] == 0);
187 stat[BLKIO_STAT_READ]--;
188 }
189 if (sync) {
190 BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
191 stat[BLKIO_STAT_SYNC]--;
192 } else {
193 BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
194 stat[BLKIO_STAT_ASYNC]--;
195 }
196 }
197
198 #ifdef CONFIG_DEBUG_BLK_CGROUP
199 /* This should be called with the queue_lock held. */
200 static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
201 struct blkio_policy_type *pol,
202 struct blkio_group *curr_blkg)
203 {
204 struct blkg_policy_data *pd = blkg->pd[pol->plid];
205
206 if (blkio_blkg_waiting(&pd->stats))
207 return;
208 if (blkg == curr_blkg)
209 return;
210 pd->stats.start_group_wait_time = sched_clock();
211 blkio_mark_blkg_waiting(&pd->stats);
212 }
213
214 /* This should be called with the queue_lock held. */
215 static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
216 {
217 unsigned long long now;
218
219 if (!blkio_blkg_waiting(stats))
220 return;
221
222 now = sched_clock();
223 if (time_after64(now, stats->start_group_wait_time))
224 stats->group_wait_time += now - stats->start_group_wait_time;
225 blkio_clear_blkg_waiting(stats);
226 }
227
228 /* This should be called with the queue_lock held. */
229 static void blkio_end_empty_time(struct blkio_group_stats *stats)
230 {
231 unsigned long long now;
232
233 if (!blkio_blkg_empty(stats))
234 return;
235
236 now = sched_clock();
237 if (time_after64(now, stats->start_empty_time))
238 stats->empty_time += now - stats->start_empty_time;
239 blkio_clear_blkg_empty(stats);
240 }
241
242 void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
243 struct blkio_policy_type *pol)
244 {
245 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
246
247 lockdep_assert_held(blkg->q->queue_lock);
248 BUG_ON(blkio_blkg_idling(stats));
249
250 stats->start_idle_time = sched_clock();
251 blkio_mark_blkg_idling(stats);
252 }
253 EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
254
255 void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
256 struct blkio_policy_type *pol)
257 {
258 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
259
260 lockdep_assert_held(blkg->q->queue_lock);
261
262 if (blkio_blkg_idling(stats)) {
263 unsigned long long now = sched_clock();
264
265 if (time_after64(now, stats->start_idle_time)) {
266 u64_stats_update_begin(&stats->syncp);
267 stats->idle_time += now - stats->start_idle_time;
268 u64_stats_update_end(&stats->syncp);
269 }
270 blkio_clear_blkg_idling(stats);
271 }
272 }
273 EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
274
275 void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
276 struct blkio_policy_type *pol)
277 {
278 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
279
280 lockdep_assert_held(blkg->q->queue_lock);
281
282 u64_stats_update_begin(&stats->syncp);
283 stats->avg_queue_size_sum +=
284 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
285 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
286 stats->avg_queue_size_samples++;
287 blkio_update_group_wait_time(stats);
288 u64_stats_update_end(&stats->syncp);
289 }
290 EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
291
292 void blkiocg_set_start_empty_time(struct blkio_group *blkg,
293 struct blkio_policy_type *pol)
294 {
295 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
296
297 lockdep_assert_held(blkg->q->queue_lock);
298
299 if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
300 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE])
301 return;
302
303 /*
304 	 * The group is already marked empty. This can happen if a cfqq got a
305 	 * new request in the parent group and moved to this group while being
306 	 * added to the service tree. Just ignore the event and move on.
307 */
308 if (blkio_blkg_empty(stats))
309 return;
310
311 stats->start_empty_time = sched_clock();
312 blkio_mark_blkg_empty(stats);
313 }
314 EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
315
316 void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
317 struct blkio_policy_type *pol,
318 unsigned long dequeue)
319 {
320 struct blkg_policy_data *pd = blkg->pd[pol->plid];
321
322 lockdep_assert_held(blkg->q->queue_lock);
323
324 pd->stats.dequeue += dequeue;
325 }
326 EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
327 #else
328 static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
329 struct blkio_policy_type *pol,
330 struct blkio_group *curr_blkg) { }
331 static inline void blkio_end_empty_time(struct blkio_group_stats *stats) { }
332 #endif
333
334 void blkiocg_update_io_add_stats(struct blkio_group *blkg,
335 struct blkio_policy_type *pol,
336 struct blkio_group *curr_blkg, bool direction,
337 bool sync)
338 {
339 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
340
341 lockdep_assert_held(blkg->q->queue_lock);
342
343 u64_stats_update_begin(&stats->syncp);
344 blkio_add_stat(stats->stat_arr[BLKIO_STAT_QUEUED], 1, direction, sync);
345 blkio_end_empty_time(stats);
346 u64_stats_update_end(&stats->syncp);
347
348 blkio_set_start_group_wait_time(blkg, pol, curr_blkg);
349 }
350 EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
351
352 void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
353 struct blkio_policy_type *pol,
354 bool direction, bool sync)
355 {
356 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
357
358 lockdep_assert_held(blkg->q->queue_lock);
359
360 u64_stats_update_begin(&stats->syncp);
361 blkio_check_and_dec_stat(stats->stat_arr[BLKIO_STAT_QUEUED], direction,
362 sync);
363 u64_stats_update_end(&stats->syncp);
364 }
365 EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
366
367 void blkiocg_update_timeslice_used(struct blkio_group *blkg,
368 struct blkio_policy_type *pol,
369 unsigned long time,
370 unsigned long unaccounted_time)
371 {
372 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
373
374 lockdep_assert_held(blkg->q->queue_lock);
375
376 u64_stats_update_begin(&stats->syncp);
377 stats->time += time;
378 #ifdef CONFIG_DEBUG_BLK_CGROUP
379 stats->unaccounted_time += unaccounted_time;
380 #endif
381 u64_stats_update_end(&stats->syncp);
382 }
383 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
384
385 /*
386 * should be called under rcu read lock or queue lock to make sure blkg pointer
387 * is valid.
388 */
389 void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
390 struct blkio_policy_type *pol,
391 uint64_t bytes, bool direction, bool sync)
392 {
393 struct blkg_policy_data *pd = blkg->pd[pol->plid];
394 struct blkio_group_stats_cpu *stats_cpu;
395 unsigned long flags;
396
397 /* If per cpu stats are not allocated yet, don't do any accounting. */
398 if (pd->stats_cpu == NULL)
399 return;
400
401 /*
402 * Disabling interrupts to provide mutual exclusion between two
403 * writes on same cpu. It probably is not needed for 64bit. Not
404 * optimizing that case yet.
405 */
406 local_irq_save(flags);
407
408 stats_cpu = this_cpu_ptr(pd->stats_cpu);
409
410 u64_stats_update_begin(&stats_cpu->syncp);
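	/* accounted in 512-byte sectors */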
411 stats_cpu->sectors += bytes >> 9;
412 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
413 1, direction, sync);
414 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
415 bytes, direction, sync);
416 u64_stats_update_end(&stats_cpu->syncp);
417 local_irq_restore(flags);
418 }
419 EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
420
421 void blkiocg_update_completion_stats(struct blkio_group *blkg,
422 struct blkio_policy_type *pol,
423 uint64_t start_time,
424 uint64_t io_start_time, bool direction,
425 bool sync)
426 {
427 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
428 unsigned long long now = sched_clock();
429
430 lockdep_assert_held(blkg->q->queue_lock);
431
432 u64_stats_update_begin(&stats->syncp);
433 if (time_after64(now, io_start_time))
434 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
435 now - io_start_time, direction, sync);
436 if (time_after64(io_start_time, start_time))
437 blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
438 io_start_time - start_time, direction, sync);
439 u64_stats_update_end(&stats->syncp);
440 }
441 EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
442
443 /* Merged stats are protected by the queue_lock; they are not per cpu. */
444 void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
445 struct blkio_policy_type *pol,
446 bool direction, bool sync)
447 {
448 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
449
450 lockdep_assert_held(blkg->q->queue_lock);
451
452 u64_stats_update_begin(&stats->syncp);
453 blkio_add_stat(stats->stat_arr[BLKIO_STAT_MERGED], 1, direction, sync);
454 u64_stats_update_end(&stats->syncp);
455 }
456 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
457
458 /*
459 * Worker for allocating per cpu stat for blk groups. This is scheduled on
460 * the system_nrt_wq once there are some groups on the alloc_list waiting
461 * for allocation.
462 */
463 static void blkio_stat_alloc_fn(struct work_struct *work)
464 {
465 static void *pcpu_stats[BLKIO_NR_POLICIES];
466 struct delayed_work *dwork = to_delayed_work(work);
467 struct blkio_group *blkg;
468 int i;
469 bool empty = false;
470
471 alloc_stats:
472 for (i = 0; i < BLKIO_NR_POLICIES; i++) {
473 if (pcpu_stats[i] != NULL)
474 continue;
475
476 pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);
477
478 /* Allocation failed. Try again after some time. */
479 if (pcpu_stats[i] == NULL) {
480 queue_delayed_work(system_nrt_wq, dwork,
481 msecs_to_jiffies(10));
482 return;
483 }
484 }
485
486 spin_lock_irq(&blkio_list_lock);
487 spin_lock(&alloc_list_lock);
488
489 	/* The list may be empty if the cgroup got deleted or the queue exited. */
490 if (!list_empty(&alloc_list)) {
491 blkg = list_first_entry(&alloc_list, struct blkio_group,
492 alloc_node);
493 for (i = 0; i < BLKIO_NR_POLICIES; i++) {
494 struct blkg_policy_data *pd = blkg->pd[i];
495
496 if (blkio_policy[i] && pd && !pd->stats_cpu)
497 swap(pd->stats_cpu, pcpu_stats[i]);
498 }
499
500 list_del_init(&blkg->alloc_node);
501 }
502
503 empty = list_empty(&alloc_list);
504
505 spin_unlock(&alloc_list_lock);
506 spin_unlock_irq(&blkio_list_lock);
507
508 if (!empty)
509 goto alloc_stats;
510 }
511
512 /**
513 * blkg_free - free a blkg
514 * @blkg: blkg to free
515 *
516 * Free @blkg which may be partially allocated.
517 */
518 static void blkg_free(struct blkio_group *blkg)
519 {
520 int i;
521
522 if (!blkg)
523 return;
524
525 for (i = 0; i < BLKIO_NR_POLICIES; i++) {
526 struct blkg_policy_data *pd = blkg->pd[i];
527
528 if (pd) {
529 free_percpu(pd->stats_cpu);
530 kfree(pd);
531 }
532 }
533
534 kfree(blkg);
535 }
536
537 /**
538 * blkg_alloc - allocate a blkg
539 * @blkcg: block cgroup the new blkg is associated with
540 * @q: request_queue the new blkg is associated with
541 *
542  * Allocate a new blkg associating @blkcg and @q.
543 */
544 static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
545 struct request_queue *q)
546 {
547 struct blkio_group *blkg;
548 int i;
549
550 /* alloc and init base part */
551 blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
552 if (!blkg)
553 return NULL;
554
555 blkg->q = q;
556 INIT_LIST_HEAD(&blkg->q_node);
557 INIT_LIST_HEAD(&blkg->alloc_node);
558 blkg->blkcg = blkcg;
559 blkg->refcnt = 1;
560 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
561
562 for (i = 0; i < BLKIO_NR_POLICIES; i++) {
563 struct blkio_policy_type *pol = blkio_policy[i];
564 struct blkg_policy_data *pd;
565
566 if (!pol)
567 continue;
568
569 /* alloc per-policy data and attach it to blkg */
570 pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
571 q->node);
572 if (!pd) {
573 blkg_free(blkg);
574 return NULL;
575 }
576
577 blkg->pd[i] = pd;
578 pd->blkg = blkg;
579 }
580
581 /* invoke per-policy init */
582 for (i = 0; i < BLKIO_NR_POLICIES; i++) {
583 struct blkio_policy_type *pol = blkio_policy[i];
584
585 if (pol)
586 pol->ops.blkio_init_group_fn(blkg);
587 }
588
589 return blkg;
590 }
591
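/*
 * blkg_lookup_create - look up the blkg for @blkcg on @q, creating it if it
 * does not exist yet. The caller must hold rcu_read_lock() and
 * @q->queue_lock. Returns an existing or newly allocated blkg, or an
 * ERR_PTR() value for bypassing/dead queues and on allocation failure; see
 * blkio_policy_parse_and_set() for a typical calling sequence.
 */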
592 struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
593 struct request_queue *q,
594 enum blkio_policy_id plid,
595 bool for_root)
596 __releases(q->queue_lock) __acquires(q->queue_lock)
597 {
598 struct blkio_group *blkg;
599
600 WARN_ON_ONCE(!rcu_read_lock_held());
601 lockdep_assert_held(q->queue_lock);
602
603 /*
604 * This could be the first entry point of blkcg implementation and
605 * we shouldn't allow anything to go through for a bypassing queue.
606 * The following can be removed if blkg lookup is guaranteed to
607 * fail on a bypassing queue.
608 */
609 if (unlikely(blk_queue_bypass(q)) && !for_root)
610 return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
611
612 blkg = blkg_lookup(blkcg, q);
613 if (blkg)
614 return blkg;
615
616 /* blkg holds a reference to blkcg */
617 if (!css_tryget(&blkcg->css))
618 return ERR_PTR(-EINVAL);
619
620 /*
621 * Allocate and initialize.
622 */
623 blkg = blkg_alloc(blkcg, q);
624
625 /* did alloc fail? */
626 if (unlikely(!blkg)) {
627 blkg = ERR_PTR(-ENOMEM);
628 goto out;
629 }
630
631 /* insert */
632 spin_lock(&blkcg->lock);
633 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
634 list_add(&blkg->q_node, &q->blkg_list);
635 spin_unlock(&blkcg->lock);
636
637 spin_lock(&alloc_list_lock);
638 list_add(&blkg->alloc_node, &alloc_list);
639 /* Queue per cpu stat allocation from worker thread. */
640 queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
641 spin_unlock(&alloc_list_lock);
642 out:
643 return blkg;
644 }
645 EXPORT_SYMBOL_GPL(blkg_lookup_create);
646
647 /* called under rcu_read_lock(). */
648 struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
649 struct request_queue *q)
650 {
651 struct blkio_group *blkg;
652 struct hlist_node *n;
653
654 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
655 if (blkg->q == q)
656 return blkg;
657 return NULL;
658 }
659 EXPORT_SYMBOL_GPL(blkg_lookup);
660
661 static void blkg_destroy(struct blkio_group *blkg)
662 {
663 struct request_queue *q = blkg->q;
664 struct blkio_cgroup *blkcg = blkg->blkcg;
665
666 lockdep_assert_held(q->queue_lock);
667 lockdep_assert_held(&blkcg->lock);
668
669 	/* Something is wrong if we are trying to remove the same group twice */
670 WARN_ON_ONCE(list_empty(&blkg->q_node));
671 WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
672 list_del_init(&blkg->q_node);
673 hlist_del_init_rcu(&blkg->blkcg_node);
674
675 spin_lock(&alloc_list_lock);
676 list_del_init(&blkg->alloc_node);
677 spin_unlock(&alloc_list_lock);
678
679 /*
680 * Put the reference taken at the time of creation so that when all
681 * queues are gone, group can be destroyed.
682 */
683 blkg_put(blkg);
684 }
685
686 /*
687 * XXX: This updates blkg policy data in-place for root blkg, which is
688 * necessary across elevator switch and policy registration as root blkgs
689 * aren't shot down. This broken and racy implementation is temporary.
690 * Eventually, blkg shoot down will be replaced by proper in-place update.
691 */
692 void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
693 {
694 struct blkio_policy_type *pol = blkio_policy[plid];
695 struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
696 struct blkg_policy_data *pd;
697
698 if (!blkg)
699 return;
700
701 kfree(blkg->pd[plid]);
702 blkg->pd[plid] = NULL;
703
704 if (!pol)
705 return;
706
707 pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
708 WARN_ON_ONCE(!pd);
709
710 pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
711 WARN_ON_ONCE(!pd->stats_cpu);
712
713 blkg->pd[plid] = pd;
714 pd->blkg = blkg;
715 pol->ops.blkio_init_group_fn(blkg);
716 }
717 EXPORT_SYMBOL_GPL(update_root_blkg_pd);
718
719 /**
720 * blkg_destroy_all - destroy all blkgs associated with a request_queue
721 * @q: request_queue of interest
722 * @destroy_root: whether to destroy root blkg or not
723 *
724 * Destroy blkgs associated with @q. If @destroy_root is %true, all are
725 * destroyed; otherwise, root blkg is left alone.
726 */
727 void blkg_destroy_all(struct request_queue *q, bool destroy_root)
728 {
729 struct blkio_group *blkg, *n;
730
731 spin_lock_irq(q->queue_lock);
732
733 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
734 struct blkio_cgroup *blkcg = blkg->blkcg;
735
736 /* skip root? */
737 if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
738 continue;
739
740 spin_lock(&blkcg->lock);
741 blkg_destroy(blkg);
742 spin_unlock(&blkcg->lock);
743 }
744
745 spin_unlock_irq(q->queue_lock);
746 }
747 EXPORT_SYMBOL_GPL(blkg_destroy_all);
748
749 static void blkg_rcu_free(struct rcu_head *rcu_head)
750 {
751 blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
752 }
753
754 void __blkg_release(struct blkio_group *blkg)
755 {
756 /* release the extra blkcg reference this blkg has been holding */
757 css_put(&blkg->blkcg->css);
758
759 /*
760 	 * A group is freed via RCU. But holding an RCU read lock does not
761 	 * mean that one can access all the fields of blkg and assume they
762 	 * are valid. For example, don't try to follow throtl_data and
763 	 * request queue links.
764 	 *
765 	 * Holding a reference to a blkg under RCU only allows access to
766 	 * values local to the group, like group stats and group rate limits.
767 */
768 call_rcu(&blkg->rcu_head, blkg_rcu_free);
769 }
770 EXPORT_SYMBOL_GPL(__blkg_release);
771
772 static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
773 {
774 struct blkg_policy_data *pd = blkg->pd[plid];
775 int cpu;
776
777 if (pd->stats_cpu == NULL)
778 return;
779
780 for_each_possible_cpu(cpu) {
781 struct blkio_group_stats_cpu *sc =
782 per_cpu_ptr(pd->stats_cpu, cpu);
783
784 sc->sectors = 0;
785 memset(sc->stat_arr_cpu, 0, sizeof(sc->stat_arr_cpu));
786 }
787 }
788
789 static int
790 blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
791 {
792 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
793 struct blkio_group *blkg;
794 struct hlist_node *n;
795 int i;
796
797 spin_lock(&blkio_list_lock);
798 spin_lock_irq(&blkcg->lock);
799
800 /*
801 * Note that stat reset is racy - it doesn't synchronize against
802 * stat updates. This is a debug feature which shouldn't exist
803 * anyway. If you get hit by a race, retry.
804 */
805 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
806 struct blkio_policy_type *pol;
807
808 list_for_each_entry(pol, &blkio_list, list) {
809 struct blkg_policy_data *pd = blkg->pd[pol->plid];
810 struct blkio_group_stats *stats = &pd->stats;
811
812 /* queued stats shouldn't be cleared */
813 for (i = 0; i < ARRAY_SIZE(stats->stat_arr); i++)
814 if (i != BLKIO_STAT_QUEUED)
815 memset(stats->stat_arr[i], 0,
816 sizeof(stats->stat_arr[i]));
817 stats->time = 0;
818 #ifdef CONFIG_DEBUG_BLK_CGROUP
819 memset((void *)stats + BLKG_STATS_DEBUG_CLEAR_START, 0,
820 BLKG_STATS_DEBUG_CLEAR_SIZE);
821 #endif
822 blkio_reset_stats_cpu(blkg, pol->plid);
823 }
824 }
825
826 spin_unlock_irq(&blkcg->lock);
827 spin_unlock(&blkio_list_lock);
828 return 0;
829 }
830
831 static void blkio_get_key_name(enum stat_sub_type type, const char *dname,
832 char *str, int chars_left, bool diskname_only)
833 {
834 snprintf(str, chars_left, "%s", dname);
835 chars_left -= strlen(str);
836 if (chars_left <= 0) {
837 printk(KERN_WARNING
838 		       "Possibly incorrect cgroup stat display format\n");
839 return;
840 }
841 if (diskname_only)
842 return;
843 switch (type) {
844 case BLKIO_STAT_READ:
845 strlcat(str, " Read", chars_left);
846 break;
847 case BLKIO_STAT_WRITE:
848 strlcat(str, " Write", chars_left);
849 break;
850 case BLKIO_STAT_SYNC:
851 strlcat(str, " Sync", chars_left);
852 break;
853 case BLKIO_STAT_ASYNC:
854 strlcat(str, " Async", chars_left);
855 break;
856 case BLKIO_STAT_TOTAL:
857 strlcat(str, " Total", chars_left);
858 break;
859 default:
860 strlcat(str, " Invalid", chars_left);
861 }
862 }
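/*
 * Example: for dname "8:16" and type BLKIO_STAT_READ the resulting key is
 * "8:16 Read"; with diskname_only set, the key is just "8:16".
 */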
863
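/*
 * Sum one per-cpu counter over all possible CPUs. The u64_stats
 * fetch_begin/fetch_retry pair yields a consistent 64-bit snapshot even on
 * 32-bit SMP, where the writer cannot update the counter atomically.
 */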
864 static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, int plid,
865 enum stat_type_cpu type, enum stat_sub_type sub_type)
866 {
867 struct blkg_policy_data *pd = blkg->pd[plid];
868 int cpu;
869 struct blkio_group_stats_cpu *stats_cpu;
870 u64 val = 0, tval;
871
872 if (pd->stats_cpu == NULL)
873 return val;
874
875 for_each_possible_cpu(cpu) {
876 unsigned int start;
877 stats_cpu = per_cpu_ptr(pd->stats_cpu, cpu);
878
879 do {
880 start = u64_stats_fetch_begin(&stats_cpu->syncp);
881 if (type == BLKIO_STAT_CPU_SECTORS)
882 tval = stats_cpu->sectors;
883 else
884 tval = stats_cpu->stat_arr_cpu[type][sub_type];
885 } while(u64_stats_fetch_retry(&stats_cpu->syncp, start));
886
887 val += tval;
888 }
889
890 return val;
891 }
892
893 static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, int plid,
894 struct cgroup_map_cb *cb, const char *dname,
895 enum stat_type_cpu type)
896 {
897 uint64_t disk_total, val;
898 char key_str[MAX_KEY_LEN];
899 enum stat_sub_type sub_type;
900
901 if (type == BLKIO_STAT_CPU_SECTORS) {
902 val = blkio_read_stat_cpu(blkg, plid, type, 0);
903 blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true);
904 cb->fill(cb, key_str, val);
905 return val;
906 }
907
908 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
909 sub_type++) {
910 blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
911 false);
912 val = blkio_read_stat_cpu(blkg, plid, type, sub_type);
913 cb->fill(cb, key_str, val);
914 }
915
916 disk_total = blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_READ) +
917 blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_WRITE);
918
919 blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
920 false);
921 cb->fill(cb, key_str, disk_total);
922 return disk_total;
923 }
924
925 static uint64_t blkio_get_stat(struct blkio_group *blkg, int plid,
926 struct cgroup_map_cb *cb, const char *dname,
927 enum stat_type type)
928 {
929 struct blkio_group_stats *stats = &blkg->pd[plid]->stats;
930 uint64_t v = 0, disk_total = 0;
931 char key_str[MAX_KEY_LEN];
932 unsigned int sync_start;
933 int st;
934
935 if (type >= BLKIO_STAT_ARR_NR) {
936 do {
937 sync_start = u64_stats_fetch_begin(&stats->syncp);
938 switch (type) {
939 case BLKIO_STAT_TIME:
940 v = stats->time;
941 break;
942 #ifdef CONFIG_DEBUG_BLK_CGROUP
943 case BLKIO_STAT_UNACCOUNTED_TIME:
944 v = stats->unaccounted_time;
945 break;
946 case BLKIO_STAT_AVG_QUEUE_SIZE: {
947 uint64_t samples = stats->avg_queue_size_samples;
948
949 if (samples) {
950 v = stats->avg_queue_size_sum;
951 do_div(v, samples);
952 }
953 break;
954 }
955 case BLKIO_STAT_IDLE_TIME:
956 v = stats->idle_time;
957 break;
958 case BLKIO_STAT_EMPTY_TIME:
959 v = stats->empty_time;
960 break;
961 case BLKIO_STAT_DEQUEUE:
962 v = stats->dequeue;
963 break;
964 case BLKIO_STAT_GROUP_WAIT_TIME:
965 v = stats->group_wait_time;
966 break;
967 #endif
968 default:
969 WARN_ON_ONCE(1);
970 }
971 } while (u64_stats_fetch_retry(&stats->syncp, sync_start));
972
973 blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true);
974 cb->fill(cb, key_str, v);
975 return v;
976 }
977
978 for (st = BLKIO_STAT_READ; st < BLKIO_STAT_TOTAL; st++) {
979 do {
980 sync_start = u64_stats_fetch_begin(&stats->syncp);
981 v = stats->stat_arr[type][st];
982 } while (u64_stats_fetch_retry(&stats->syncp, sync_start));
983
984 blkio_get_key_name(st, dname, key_str, MAX_KEY_LEN, false);
985 cb->fill(cb, key_str, v);
986 if (st == BLKIO_STAT_READ || st == BLKIO_STAT_WRITE)
987 disk_total += v;
988 }
989
990 blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
991 false);
992 cb->fill(cb, key_str, disk_total);
993 return disk_total;
994 }
995
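/*
 * Parse a "major:minor value" rule written to one of the blkio cgroup files
 * and apply it. For example (device numbers illustrative), writing
 * "8:16 1048576" to blkio.throttle.read_bps_device caps reads on device
 * 8:16 at 1 MiB/s.
 */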
996 static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
997 int fileid, struct blkio_cgroup *blkcg)
998 {
999 struct gendisk *disk = NULL;
1000 struct blkio_group *blkg = NULL;
1001 struct blkg_policy_data *pd;
1002 char *s[4], *p, *major_s = NULL, *minor_s = NULL;
1003 unsigned long major, minor;
1004 int i = 0, ret = -EINVAL;
1005 int part;
1006 dev_t dev;
1007 u64 temp;
1008
1009 memset(s, 0, sizeof(s));
1010
1011 while ((p = strsep(&buf, " ")) != NULL) {
1012 if (!*p)
1013 continue;
1014
1015 s[i++] = p;
1016
1017 		/* Prevent too many fields from being input */
1018 if (i == 3)
1019 break;
1020 }
1021
1022 if (i != 2)
1023 goto out;
1024
1025 p = strsep(&s[0], ":");
1026 if (p != NULL)
1027 major_s = p;
1028 else
1029 goto out;
1030
1031 minor_s = s[0];
1032 if (!minor_s)
1033 goto out;
1034
1035 if (strict_strtoul(major_s, 10, &major))
1036 goto out;
1037
1038 if (strict_strtoul(minor_s, 10, &minor))
1039 goto out;
1040
1041 dev = MKDEV(major, minor);
1042
1043 if (strict_strtoull(s[1], 10, &temp))
1044 goto out;
1045
1046 disk = get_gendisk(dev, &part);
1047 if (!disk || part)
1048 goto out;
1049
1050 rcu_read_lock();
1051
1052 spin_lock_irq(disk->queue->queue_lock);
1053 blkg = blkg_lookup_create(blkcg, disk->queue, plid, false);
1054 spin_unlock_irq(disk->queue->queue_lock);
1055
1056 if (IS_ERR(blkg)) {
1057 ret = PTR_ERR(blkg);
1058 goto out_unlock;
1059 }
1060
1061 pd = blkg->pd[plid];
1062
1063 switch (plid) {
1064 case BLKIO_POLICY_PROP:
1065 if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
1066 temp > BLKIO_WEIGHT_MAX)
1067 goto out_unlock;
1068
1069 pd->conf.weight = temp;
1070 blkio_update_group_weight(blkg, plid, temp ?: blkcg->weight);
1071 break;
1072 case BLKIO_POLICY_THROTL:
1073 switch(fileid) {
1074 case BLKIO_THROTL_read_bps_device:
1075 pd->conf.bps[READ] = temp;
1076 blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
1077 break;
1078 case BLKIO_THROTL_write_bps_device:
1079 pd->conf.bps[WRITE] = temp;
1080 blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
1081 break;
1082 case BLKIO_THROTL_read_iops_device:
1083 if (temp > THROTL_IOPS_MAX)
1084 goto out_unlock;
1085 pd->conf.iops[READ] = temp;
1086 blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
1087 break;
1088 case BLKIO_THROTL_write_iops_device:
1089 if (temp > THROTL_IOPS_MAX)
1090 goto out_unlock;
1091 pd->conf.iops[WRITE] = temp;
1092 blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
1093 break;
1094 }
1095 break;
1096 default:
1097 BUG();
1098 }
1099 ret = 0;
1100 out_unlock:
1101 rcu_read_unlock();
1102 out:
1103 put_disk(disk);
1104
1105 /*
1106 	 * If the queue was bypassing, we should retry. Do so after a short
1107 	 * msleep(). It isn't strictly necessary, but the queue can be
1108 	 * bypassing for some time and it's always nice to avoid busy
1109 	 * looping.
1110 */
1111 if (ret == -EBUSY) {
1112 msleep(10);
1113 return restart_syscall();
1114 }
1115 return ret;
1116 }
1117
1118 static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
1119 const char *buffer)
1120 {
1121 int ret = 0;
1122 char *buf;
1123 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
1124 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1125 int fileid = BLKIOFILE_ATTR(cft->private);
1126
1127 buf = kstrdup(buffer, GFP_KERNEL);
1128 if (!buf)
1129 return -ENOMEM;
1130
1131 ret = blkio_policy_parse_and_set(buf, plid, fileid, blkcg);
1132 kfree(buf);
1133 return ret;
1134 }
1135
1136 static const char *blkg_dev_name(struct blkio_group *blkg)
1137 {
1138 /* some drivers (floppy) instantiate a queue w/o disk registered */
1139 if (blkg->q->backing_dev_info.dev)
1140 return dev_name(blkg->q->backing_dev_info.dev);
1141 return NULL;
1142 }
1143
1144 static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
1145 struct seq_file *m)
1146 {
1147 int plid = BLKIOFILE_POLICY(cft->private);
1148 int fileid = BLKIOFILE_ATTR(cft->private);
1149 struct blkg_policy_data *pd = blkg->pd[plid];
1150 const char *dname = blkg_dev_name(blkg);
1151 int rw = WRITE;
1152
1153 if (!dname)
1154 return;
1155
1156 switch (plid) {
1157 case BLKIO_POLICY_PROP:
1158 if (pd->conf.weight)
1159 seq_printf(m, "%s\t%u\n",
1160 dname, pd->conf.weight);
1161 break;
1162 case BLKIO_POLICY_THROTL:
1163 switch (fileid) {
1164 case BLKIO_THROTL_read_bps_device:
1165 rw = READ;
1166 case BLKIO_THROTL_write_bps_device:
1167 if (pd->conf.bps[rw])
1168 seq_printf(m, "%s\t%llu\n",
1169 dname, pd->conf.bps[rw]);
1170 break;
1171 case BLKIO_THROTL_read_iops_device:
1172 rw = READ;
1173 case BLKIO_THROTL_write_iops_device:
1174 if (pd->conf.iops[rw])
1175 seq_printf(m, "%s\t%u\n",
1176 dname, pd->conf.iops[rw]);
1177 break;
1178 }
1179 break;
1180 default:
1181 BUG();
1182 }
1183 }
1184
1185 /* cgroup files which read their data from policy nodes end up here */
1186 static void blkio_read_conf(struct cftype *cft, struct blkio_cgroup *blkcg,
1187 struct seq_file *m)
1188 {
1189 struct blkio_group *blkg;
1190 struct hlist_node *n;
1191
1192 spin_lock_irq(&blkcg->lock);
1193 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
1194 blkio_print_group_conf(cft, blkg, m);
1195 spin_unlock_irq(&blkcg->lock);
1196 }
1197
1198 static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1199 struct seq_file *m)
1200 {
1201 struct blkio_cgroup *blkcg;
1202 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1203 int name = BLKIOFILE_ATTR(cft->private);
1204
1205 blkcg = cgroup_to_blkio_cgroup(cgrp);
1206
1207 switch(plid) {
1208 case BLKIO_POLICY_PROP:
1209 switch(name) {
1210 case BLKIO_PROP_weight_device:
1211 blkio_read_conf(cft, blkcg, m);
1212 return 0;
1213 default:
1214 BUG();
1215 }
1216 break;
1217 case BLKIO_POLICY_THROTL:
1218 switch(name){
1219 case BLKIO_THROTL_read_bps_device:
1220 case BLKIO_THROTL_write_bps_device:
1221 case BLKIO_THROTL_read_iops_device:
1222 case BLKIO_THROTL_write_iops_device:
1223 blkio_read_conf(cft, blkcg, m);
1224 return 0;
1225 default:
1226 BUG();
1227 }
1228 break;
1229 default:
1230 BUG();
1231 }
1232
1233 return 0;
1234 }
1235
1236 static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1237 struct cftype *cft, struct cgroup_map_cb *cb,
1238 enum stat_type type, bool show_total, bool pcpu)
1239 {
1240 struct blkio_group *blkg;
1241 struct hlist_node *n;
1242 uint64_t cgroup_total = 0;
1243
1244 spin_lock_irq(&blkcg->lock);
1245
1246 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1247 const char *dname = blkg_dev_name(blkg);
1248 int plid = BLKIOFILE_POLICY(cft->private);
1249
1250 if (!dname)
1251 continue;
1252 if (pcpu)
1253 cgroup_total += blkio_get_stat_cpu(blkg, plid,
1254 cb, dname, type);
1255 else
1256 cgroup_total += blkio_get_stat(blkg, plid,
1257 cb, dname, type);
1258 }
1259 if (show_total)
1260 cb->fill(cb, "Total", cgroup_total);
1261
1262 spin_unlock_irq(&blkcg->lock);
1263 return 0;
1264 }
1265
1266 /* All map kind of cgroup file get serviced by this function */
1267 static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1268 struct cgroup_map_cb *cb)
1269 {
1270 struct blkio_cgroup *blkcg;
1271 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1272 int name = BLKIOFILE_ATTR(cft->private);
1273
1274 blkcg = cgroup_to_blkio_cgroup(cgrp);
1275
1276 switch(plid) {
1277 case BLKIO_POLICY_PROP:
1278 switch(name) {
1279 case BLKIO_PROP_time:
1280 return blkio_read_blkg_stats(blkcg, cft, cb,
1281 BLKIO_STAT_TIME, 0, 0);
1282 case BLKIO_PROP_sectors:
1283 return blkio_read_blkg_stats(blkcg, cft, cb,
1284 BLKIO_STAT_CPU_SECTORS, 0, 1);
1285 case BLKIO_PROP_io_service_bytes:
1286 return blkio_read_blkg_stats(blkcg, cft, cb,
1287 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1288 case BLKIO_PROP_io_serviced:
1289 return blkio_read_blkg_stats(blkcg, cft, cb,
1290 BLKIO_STAT_CPU_SERVICED, 1, 1);
1291 case BLKIO_PROP_io_service_time:
1292 return blkio_read_blkg_stats(blkcg, cft, cb,
1293 BLKIO_STAT_SERVICE_TIME, 1, 0);
1294 case BLKIO_PROP_io_wait_time:
1295 return blkio_read_blkg_stats(blkcg, cft, cb,
1296 BLKIO_STAT_WAIT_TIME, 1, 0);
1297 case BLKIO_PROP_io_merged:
1298 return blkio_read_blkg_stats(blkcg, cft, cb,
1299 BLKIO_STAT_MERGED, 1, 0);
1300 case BLKIO_PROP_io_queued:
1301 return blkio_read_blkg_stats(blkcg, cft, cb,
1302 BLKIO_STAT_QUEUED, 1, 0);
1303 #ifdef CONFIG_DEBUG_BLK_CGROUP
1304 case BLKIO_PROP_unaccounted_time:
1305 return blkio_read_blkg_stats(blkcg, cft, cb,
1306 BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
1307 case BLKIO_PROP_dequeue:
1308 return blkio_read_blkg_stats(blkcg, cft, cb,
1309 BLKIO_STAT_DEQUEUE, 0, 0);
1310 case BLKIO_PROP_avg_queue_size:
1311 return blkio_read_blkg_stats(blkcg, cft, cb,
1312 BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
1313 case BLKIO_PROP_group_wait_time:
1314 return blkio_read_blkg_stats(blkcg, cft, cb,
1315 BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
1316 case BLKIO_PROP_idle_time:
1317 return blkio_read_blkg_stats(blkcg, cft, cb,
1318 BLKIO_STAT_IDLE_TIME, 0, 0);
1319 case BLKIO_PROP_empty_time:
1320 return blkio_read_blkg_stats(blkcg, cft, cb,
1321 BLKIO_STAT_EMPTY_TIME, 0, 0);
1322 #endif
1323 default:
1324 BUG();
1325 }
1326 break;
1327 case BLKIO_POLICY_THROTL:
1328 switch(name){
1329 case BLKIO_THROTL_io_service_bytes:
1330 return blkio_read_blkg_stats(blkcg, cft, cb,
1331 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1332 case BLKIO_THROTL_io_serviced:
1333 return blkio_read_blkg_stats(blkcg, cft, cb,
1334 BLKIO_STAT_CPU_SERVICED, 1, 1);
1335 default:
1336 BUG();
1337 }
1338 break;
1339 default:
1340 BUG();
1341 }
1342
1343 return 0;
1344 }
1345
1346 static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
1347 {
1348 struct blkio_group *blkg;
1349 struct hlist_node *n;
1350
1351 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
1352 return -EINVAL;
1353
1354 spin_lock(&blkio_list_lock);
1355 spin_lock_irq(&blkcg->lock);
1356 blkcg->weight = (unsigned int)val;
1357
1358 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1359 struct blkg_policy_data *pd = blkg->pd[plid];
1360
1361 if (!pd->conf.weight)
1362 blkio_update_group_weight(blkg, plid, blkcg->weight);
1363 }
1364
1365 spin_unlock_irq(&blkcg->lock);
1366 spin_unlock(&blkio_list_lock);
1367 return 0;
1368 }
1369
1370 static u64 blkiocg_file_read_u64(struct cgroup *cgrp, struct cftype *cft) {
1371 struct blkio_cgroup *blkcg;
1372 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1373 int name = BLKIOFILE_ATTR(cft->private);
1374
1375 blkcg = cgroup_to_blkio_cgroup(cgrp);
1376
1377 switch(plid) {
1378 case BLKIO_POLICY_PROP:
1379 switch(name) {
1380 case BLKIO_PROP_weight:
1381 return (u64)blkcg->weight;
1382 }
1383 break;
1384 default:
1385 BUG();
1386 }
1387 return 0;
1388 }
1389
1390 static int
1391 blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1392 {
1393 struct blkio_cgroup *blkcg;
1394 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1395 int name = BLKIOFILE_ATTR(cft->private);
1396
1397 blkcg = cgroup_to_blkio_cgroup(cgrp);
1398
1399 switch(plid) {
1400 case BLKIO_POLICY_PROP:
1401 switch(name) {
1402 case BLKIO_PROP_weight:
1403 return blkio_weight_write(blkcg, plid, val);
1404 }
1405 break;
1406 default:
1407 BUG();
1408 }
1409
1410 return 0;
1411 }
1412
1413 struct cftype blkio_files[] = {
1414 {
1415 .name = "weight_device",
1416 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1417 BLKIO_PROP_weight_device),
1418 .read_seq_string = blkiocg_file_read,
1419 .write_string = blkiocg_file_write,
1420 .max_write_len = 256,
1421 },
1422 {
1423 .name = "weight",
1424 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1425 BLKIO_PROP_weight),
1426 .read_u64 = blkiocg_file_read_u64,
1427 .write_u64 = blkiocg_file_write_u64,
1428 },
1429 {
1430 .name = "time",
1431 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1432 BLKIO_PROP_time),
1433 .read_map = blkiocg_file_read_map,
1434 },
1435 {
1436 .name = "sectors",
1437 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1438 BLKIO_PROP_sectors),
1439 .read_map = blkiocg_file_read_map,
1440 },
1441 {
1442 .name = "io_service_bytes",
1443 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1444 BLKIO_PROP_io_service_bytes),
1445 .read_map = blkiocg_file_read_map,
1446 },
1447 {
1448 .name = "io_serviced",
1449 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1450 BLKIO_PROP_io_serviced),
1451 .read_map = blkiocg_file_read_map,
1452 },
1453 {
1454 .name = "io_service_time",
1455 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1456 BLKIO_PROP_io_service_time),
1457 .read_map = blkiocg_file_read_map,
1458 },
1459 {
1460 .name = "io_wait_time",
1461 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1462 BLKIO_PROP_io_wait_time),
1463 .read_map = blkiocg_file_read_map,
1464 },
1465 {
1466 .name = "io_merged",
1467 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1468 BLKIO_PROP_io_merged),
1469 .read_map = blkiocg_file_read_map,
1470 },
1471 {
1472 .name = "io_queued",
1473 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1474 BLKIO_PROP_io_queued),
1475 .read_map = blkiocg_file_read_map,
1476 },
1477 {
1478 .name = "reset_stats",
1479 .write_u64 = blkiocg_reset_stats,
1480 },
1481 #ifdef CONFIG_BLK_DEV_THROTTLING
1482 {
1483 .name = "throttle.read_bps_device",
1484 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1485 BLKIO_THROTL_read_bps_device),
1486 .read_seq_string = blkiocg_file_read,
1487 .write_string = blkiocg_file_write,
1488 .max_write_len = 256,
1489 },
1490
1491 {
1492 .name = "throttle.write_bps_device",
1493 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1494 BLKIO_THROTL_write_bps_device),
1495 .read_seq_string = blkiocg_file_read,
1496 .write_string = blkiocg_file_write,
1497 .max_write_len = 256,
1498 },
1499
1500 {
1501 .name = "throttle.read_iops_device",
1502 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1503 BLKIO_THROTL_read_iops_device),
1504 .read_seq_string = blkiocg_file_read,
1505 .write_string = blkiocg_file_write,
1506 .max_write_len = 256,
1507 },
1508
1509 {
1510 .name = "throttle.write_iops_device",
1511 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1512 BLKIO_THROTL_write_iops_device),
1513 .read_seq_string = blkiocg_file_read,
1514 .write_string = blkiocg_file_write,
1515 .max_write_len = 256,
1516 },
1517 {
1518 .name = "throttle.io_service_bytes",
1519 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1520 BLKIO_THROTL_io_service_bytes),
1521 .read_map = blkiocg_file_read_map,
1522 },
1523 {
1524 .name = "throttle.io_serviced",
1525 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1526 BLKIO_THROTL_io_serviced),
1527 .read_map = blkiocg_file_read_map,
1528 },
1529 #endif /* CONFIG_BLK_DEV_THROTTLING */
1530
1531 #ifdef CONFIG_DEBUG_BLK_CGROUP
1532 {
1533 .name = "avg_queue_size",
1534 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1535 BLKIO_PROP_avg_queue_size),
1536 .read_map = blkiocg_file_read_map,
1537 },
1538 {
1539 .name = "group_wait_time",
1540 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1541 BLKIO_PROP_group_wait_time),
1542 .read_map = blkiocg_file_read_map,
1543 },
1544 {
1545 .name = "idle_time",
1546 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1547 BLKIO_PROP_idle_time),
1548 .read_map = blkiocg_file_read_map,
1549 },
1550 {
1551 .name = "empty_time",
1552 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1553 BLKIO_PROP_empty_time),
1554 .read_map = blkiocg_file_read_map,
1555 },
1556 {
1557 .name = "dequeue",
1558 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1559 BLKIO_PROP_dequeue),
1560 .read_map = blkiocg_file_read_map,
1561 },
1562 {
1563 .name = "unaccounted_time",
1564 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1565 BLKIO_PROP_unaccounted_time),
1566 .read_map = blkiocg_file_read_map,
1567 },
1568 #endif
1569 };
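/*
 * Illustrative usage from userspace (group name and mount point are only an
 * example, assuming the blkio controller is mounted at /sys/fs/cgroup/blkio):
 *
 *   mkdir /sys/fs/cgroup/blkio/grp
 *   echo 500 > /sys/fs/cgroup/blkio/grp/blkio.weight
 *   echo "8:0 1048576" > /sys/fs/cgroup/blkio/grp/blkio.throttle.read_bps_device
 *
 * The files above are created from this table by blkiocg_populate().
 */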
1570
1571 static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1572 {
1573 return cgroup_add_files(cgroup, subsys, blkio_files,
1574 ARRAY_SIZE(blkio_files));
1575 }
1576
1577 /**
1578 * blkiocg_pre_destroy - cgroup pre_destroy callback
1579 * @subsys: cgroup subsys
1580 * @cgroup: cgroup of interest
1581 *
1582 * This function is called when @cgroup is about to go away and responsible
1583 * for shooting down all blkgs associated with @cgroup. blkgs should be
1584 * removed while holding both q and blkcg locks. As blkcg lock is nested
1585 * inside q lock, this function performs reverse double lock dancing.
1586 *
1587 * This is the blkcg counterpart of ioc_release_fn().
1588 */
1589 static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
1590 struct cgroup *cgroup)
1591 {
1592 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1593
1594 spin_lock_irq(&blkcg->lock);
1595
1596 while (!hlist_empty(&blkcg->blkg_list)) {
1597 struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
1598 struct blkio_group, blkcg_node);
1599 struct request_queue *q = blkg->q;
1600
1601 if (spin_trylock(q->queue_lock)) {
1602 blkg_destroy(blkg);
1603 spin_unlock(q->queue_lock);
1604 } else {
1605 spin_unlock_irq(&blkcg->lock);
1606 cpu_relax();
1607 spin_lock(&blkcg->lock);
1608 }
1609 }
1610
1611 spin_unlock_irq(&blkcg->lock);
1612 return 0;
1613 }
1614
1615 static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1616 {
1617 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1618
1619 if (blkcg != &blkio_root_cgroup)
1620 kfree(blkcg);
1621 }
1622
1623 static struct cgroup_subsys_state *
1624 blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1625 {
1626 static atomic64_t id_seq = ATOMIC64_INIT(0);
1627 struct blkio_cgroup *blkcg;
1628 struct cgroup *parent = cgroup->parent;
1629
1630 if (!parent) {
1631 blkcg = &blkio_root_cgroup;
1632 goto done;
1633 }
1634
1635 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1636 if (!blkcg)
1637 return ERR_PTR(-ENOMEM);
1638
1639 blkcg->weight = BLKIO_WEIGHT_DEFAULT;
1640 blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
1641 done:
1642 spin_lock_init(&blkcg->lock);
1643 INIT_HLIST_HEAD(&blkcg->blkg_list);
1644
1645 return &blkcg->css;
1646 }
1647
1648 /**
1649 * blkcg_init_queue - initialize blkcg part of request queue
1650 * @q: request_queue to initialize
1651 *
1652 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
1653 * part of new request_queue @q.
1654 *
1655 * RETURNS:
1656 * 0 on success, -errno on failure.
1657 */
1658 int blkcg_init_queue(struct request_queue *q)
1659 {
1660 int ret;
1661
1662 might_sleep();
1663
1664 ret = blk_throtl_init(q);
1665 if (ret)
1666 return ret;
1667
1668 mutex_lock(&all_q_mutex);
1669 INIT_LIST_HEAD(&q->all_q_node);
1670 list_add_tail(&q->all_q_node, &all_q_list);
1671 mutex_unlock(&all_q_mutex);
1672
1673 return 0;
1674 }
1675
1676 /**
1677 * blkcg_drain_queue - drain blkcg part of request_queue
1678 * @q: request_queue to drain
1679 *
1680 * Called from blk_drain_queue(). Responsible for draining blkcg part.
1681 */
1682 void blkcg_drain_queue(struct request_queue *q)
1683 {
1684 lockdep_assert_held(q->queue_lock);
1685
1686 blk_throtl_drain(q);
1687 }
1688
1689 /**
1690 * blkcg_exit_queue - exit and release blkcg part of request_queue
1691 * @q: request_queue being released
1692 *
1693 * Called from blk_release_queue(). Responsible for exiting blkcg part.
1694 */
1695 void blkcg_exit_queue(struct request_queue *q)
1696 {
1697 mutex_lock(&all_q_mutex);
1698 list_del_init(&q->all_q_node);
1699 mutex_unlock(&all_q_mutex);
1700
1701 blkg_destroy_all(q, true);
1702
1703 blk_throtl_exit(q);
1704 }
1705
1706 /*
1707  * We cannot support shared io contexts, as we have no means to support
1708 * two tasks with the same ioc in two different groups without major rework
1709 * of the main cic data structures. For now we allow a task to change
1710 * its cgroup only if it's the only owner of its ioc.
1711 */
1712 static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1713 struct cgroup_taskset *tset)
1714 {
1715 struct task_struct *task;
1716 struct io_context *ioc;
1717 int ret = 0;
1718
1719 /* task_lock() is needed to avoid races with exit_io_context() */
1720 cgroup_taskset_for_each(task, cgrp, tset) {
1721 task_lock(task);
1722 ioc = task->io_context;
1723 if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1724 ret = -EINVAL;
1725 task_unlock(task);
1726 if (ret)
1727 break;
1728 }
1729 return ret;
1730 }
1731
1732 static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1733 struct cgroup_taskset *tset)
1734 {
1735 struct task_struct *task;
1736 struct io_context *ioc;
1737
1738 cgroup_taskset_for_each(task, cgrp, tset) {
1739 /* we don't lose anything even if ioc allocation fails */
1740 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1741 if (ioc) {
1742 ioc_cgroup_changed(ioc);
1743 put_io_context(ioc);
1744 }
1745 }
1746 }
1747
1748 static void blkcg_bypass_start(void)
1749 __acquires(&all_q_mutex)
1750 {
1751 struct request_queue *q;
1752
1753 mutex_lock(&all_q_mutex);
1754
1755 list_for_each_entry(q, &all_q_list, all_q_node) {
1756 blk_queue_bypass_start(q);
1757 blkg_destroy_all(q, false);
1758 }
1759 }
1760
1761 static void blkcg_bypass_end(void)
1762 __releases(&all_q_mutex)
1763 {
1764 struct request_queue *q;
1765
1766 list_for_each_entry(q, &all_q_list, all_q_node)
1767 blk_queue_bypass_end(q);
1768
1769 mutex_unlock(&all_q_mutex);
1770 }
1771
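/*
 * Sketch of how a policy hooks in (illustrative): blk-throttle and CFQ each
 * fill in a struct blkio_policy_type with their plid, pdata_size and ops,
 * then call blkio_policy_register() from their init path and
 * blkio_policy_unregister() on exit. Registration temporarily bypasses all
 * queues so root blkg policy data can be rebuilt via update_root_blkg_pd().
 */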
1772 void blkio_policy_register(struct blkio_policy_type *blkiop)
1773 {
1774 struct request_queue *q;
1775
1776 blkcg_bypass_start();
1777 spin_lock(&blkio_list_lock);
1778
1779 BUG_ON(blkio_policy[blkiop->plid]);
1780 blkio_policy[blkiop->plid] = blkiop;
1781 list_add_tail(&blkiop->list, &blkio_list);
1782
1783 spin_unlock(&blkio_list_lock);
1784 list_for_each_entry(q, &all_q_list, all_q_node)
1785 update_root_blkg_pd(q, blkiop->plid);
1786 blkcg_bypass_end();
1787 }
1788 EXPORT_SYMBOL_GPL(blkio_policy_register);
1789
1790 void blkio_policy_unregister(struct blkio_policy_type *blkiop)
1791 {
1792 struct request_queue *q;
1793
1794 blkcg_bypass_start();
1795 spin_lock(&blkio_list_lock);
1796
1797 BUG_ON(blkio_policy[blkiop->plid] != blkiop);
1798 blkio_policy[blkiop->plid] = NULL;
1799 list_del_init(&blkiop->list);
1800
1801 spin_unlock(&blkio_list_lock);
1802 list_for_each_entry(q, &all_q_list, all_q_node)
1803 update_root_blkg_pd(q, blkiop->plid);
1804 blkcg_bypass_end();
1805 }
1806 EXPORT_SYMBOL_GPL(blkio_policy_unregister);