block/blk-cgroup.c
1 /*
2 * Common Block IO controller cgroup interface
3 *
4 * Based on ideas and code from CFQ, CFS and BFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6 *
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 * Paolo Valente <paolo.valente@unimore.it>
9 *
10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11 * Nauman Rafique <nauman@google.com>
12 */
13 #include <linux/ioprio.h>
14 #include <linux/seq_file.h>
15 #include <linux/kdev_t.h>
16 #include <linux/module.h>
17 #include <linux/err.h>
18 #include <linux/blkdev.h>
19 #include <linux/slab.h>
20 #include <linux/genhd.h>
21 #include <linux/delay.h>
22 #include <linux/atomic.h>
23 #include "blk-cgroup.h"
24 #include "blk.h"
25
26 #define MAX_KEY_LEN 100
27
28 static DEFINE_SPINLOCK(blkio_list_lock);
29 static LIST_HEAD(blkio_list);
30
31 static DEFINE_MUTEX(all_q_mutex);
32 static LIST_HEAD(all_q_list);
33
34 /* List of groups pending per cpu stats allocation */
35 static DEFINE_SPINLOCK(alloc_list_lock);
36 static LIST_HEAD(alloc_list);
37
38 static void blkio_stat_alloc_fn(struct work_struct *);
39 static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);
40
41 struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
42 EXPORT_SYMBOL_GPL(blkio_root_cgroup);
43
44 static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];
45
46 static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
47 struct cgroup *);
48 static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
49 struct cgroup_taskset *);
50 static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
51 struct cgroup_taskset *);
52 static int blkiocg_pre_destroy(struct cgroup_subsys *, struct cgroup *);
53 static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
54 static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
55
56 /* for encoding cft->private value on file */
57 #define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val))
58 /* What policy owns the file, proportional or throttle */
59 #define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff)
60 #define BLKIOFILE_ATTR(val) ((val) & 0xffff)
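/*
 * Example: BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, BLKIO_THROTL_read_bps_device)
 * packs the policy id into the upper 16 bits and the attribute into the
 * lower 16 bits of cft->private; BLKIOFILE_POLICY() and BLKIOFILE_ATTR()
 * recover the two halves.
 */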
61
62 struct cgroup_subsys blkio_subsys = {
63 .name = "blkio",
64 .create = blkiocg_create,
65 .can_attach = blkiocg_can_attach,
66 .attach = blkiocg_attach,
67 .pre_destroy = blkiocg_pre_destroy,
68 .destroy = blkiocg_destroy,
69 .populate = blkiocg_populate,
70 .subsys_id = blkio_subsys_id,
71 .module = THIS_MODULE,
72 };
73 EXPORT_SYMBOL_GPL(blkio_subsys);
74
75 struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
76 {
77 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
78 struct blkio_cgroup, css);
79 }
80 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
81
82 static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
83 {
84 return container_of(task_subsys_state(tsk, blkio_subsys_id),
85 struct blkio_cgroup, css);
86 }
87
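/*
 * Resolve the blkcg a bio is accounted to: an explicit bio->bi_css
 * association takes precedence, otherwise fall back to the cgroup of the
 * issuing task.
 */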
88 struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
89 {
90 if (bio && bio->bi_css)
91 return container_of(bio->bi_css, struct blkio_cgroup, css);
92 return task_blkio_cgroup(current);
93 }
94 EXPORT_SYMBOL_GPL(bio_blkio_cgroup);
95
96 static inline void blkio_update_group_weight(struct blkio_group *blkg,
97 int plid, unsigned int weight)
98 {
99 struct blkio_policy_type *blkiop;
100
101 list_for_each_entry(blkiop, &blkio_list, list) {
102 /* If this policy does not own the blkg, do not send updates */
103 if (blkiop->plid != plid)
104 continue;
105 if (blkiop->ops.blkio_update_group_weight_fn)
106 blkiop->ops.blkio_update_group_weight_fn(blkg->q,
107 blkg, weight);
108 }
109 }
110
111 static inline void blkio_update_group_bps(struct blkio_group *blkg, int plid,
112 u64 bps, int fileid)
113 {
114 struct blkio_policy_type *blkiop;
115
116 list_for_each_entry(blkiop, &blkio_list, list) {
117
118 /* If this policy does not own the blkg, do not send updates */
119 if (blkiop->plid != plid)
120 continue;
121
122 if (fileid == BLKIO_THROTL_read_bps_device
123 && blkiop->ops.blkio_update_group_read_bps_fn)
124 blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
125 blkg, bps);
126
127 if (fileid == BLKIO_THROTL_write_bps_device
128 && blkiop->ops.blkio_update_group_write_bps_fn)
129 blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
130 blkg, bps);
131 }
132 }
133
134 static inline void blkio_update_group_iops(struct blkio_group *blkg,
135 int plid, unsigned int iops,
136 int fileid)
137 {
138 struct blkio_policy_type *blkiop;
139
140 list_for_each_entry(blkiop, &blkio_list, list) {
141
142 /* If this policy does not own the blkg, do not send updates */
143 if (blkiop->plid != plid)
144 continue;
145
146 if (fileid == BLKIO_THROTL_read_iops_device
147 && blkiop->ops.blkio_update_group_read_iops_fn)
148 blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
149 blkg, iops);
150
151 if (fileid == BLKIO_THROTL_write_iops_device
152 && blkiop->ops.blkio_update_group_write_iops_fn)
153 blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
154 blkg,iops);
155 }
156 }
157
158 /*
159 * Add to the appropriate stat variable depending on the request type.
160 * This should be called with queue_lock held.
161 */
162 static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
163 bool sync)
164 {
165 if (direction)
166 stat[BLKIO_STAT_WRITE] += add;
167 else
168 stat[BLKIO_STAT_READ] += add;
169 if (sync)
170 stat[BLKIO_STAT_SYNC] += add;
171 else
172 stat[BLKIO_STAT_ASYNC] += add;
173 }
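/*
 * For example, accounting one synchronous write above increments both
 * stat[BLKIO_STAT_WRITE] and stat[BLKIO_STAT_SYNC] by @add.
 */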
174
175 /*
176 * Decrements the appropriate stat variable if non-zero depending on the
177 * request type. Panics on value being zero.
178 * This should be called with the queue_lock held.
179 */
180 static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
181 {
182 if (direction) {
183 BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
184 stat[BLKIO_STAT_WRITE]--;
185 } else {
186 BUG_ON(stat[BLKIO_STAT_READ] == 0);
187 stat[BLKIO_STAT_READ]--;
188 }
189 if (sync) {
190 BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
191 stat[BLKIO_STAT_SYNC]--;
192 } else {
193 BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
194 stat[BLKIO_STAT_ASYNC]--;
195 }
196 }
197
198 #ifdef CONFIG_DEBUG_BLK_CGROUP
199 /* This should be called with the queue_lock held. */
200 static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
201 struct blkio_policy_type *pol,
202 struct blkio_group *curr_blkg)
203 {
204 struct blkg_policy_data *pd = blkg->pd[pol->plid];
205
206 if (blkio_blkg_waiting(&pd->stats))
207 return;
208 if (blkg == curr_blkg)
209 return;
210 pd->stats.start_group_wait_time = sched_clock();
211 blkio_mark_blkg_waiting(&pd->stats);
212 }
213
214 /* This should be called with the queue_lock held. */
215 static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
216 {
217 unsigned long long now;
218
219 if (!blkio_blkg_waiting(stats))
220 return;
221
222 now = sched_clock();
223 if (time_after64(now, stats->start_group_wait_time))
224 stats->group_wait_time += now - stats->start_group_wait_time;
225 blkio_clear_blkg_waiting(stats);
226 }
227
228 /* This should be called with the queue_lock held. */
229 static void blkio_end_empty_time(struct blkio_group_stats *stats)
230 {
231 unsigned long long now;
232
233 if (!blkio_blkg_empty(stats))
234 return;
235
236 now = sched_clock();
237 if (time_after64(now, stats->start_empty_time))
238 stats->empty_time += now - stats->start_empty_time;
239 blkio_clear_blkg_empty(stats);
240 }
241
242 void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
243 struct blkio_policy_type *pol)
244 {
245 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
246
247 lockdep_assert_held(blkg->q->queue_lock);
248 BUG_ON(blkio_blkg_idling(stats));
249
250 stats->start_idle_time = sched_clock();
251 blkio_mark_blkg_idling(stats);
252 }
253 EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
254
255 void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
256 struct blkio_policy_type *pol)
257 {
258 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
259
260 lockdep_assert_held(blkg->q->queue_lock);
261
262 if (blkio_blkg_idling(stats)) {
263 unsigned long long now = sched_clock();
264
265 if (time_after64(now, stats->start_idle_time)) {
266 u64_stats_update_begin(&stats->syncp);
267 stats->idle_time += now - stats->start_idle_time;
268 u64_stats_update_end(&stats->syncp);
269 }
270 blkio_clear_blkg_idling(stats);
271 }
272 }
273 EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
274
275 void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
276 struct blkio_policy_type *pol)
277 {
278 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
279
280 lockdep_assert_held(blkg->q->queue_lock);
281
282 u64_stats_update_begin(&stats->syncp);
283 stats->avg_queue_size_sum +=
284 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
285 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
286 stats->avg_queue_size_samples++;
287 blkio_update_group_wait_time(stats);
288 u64_stats_update_end(&stats->syncp);
289 }
290 EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
291
292 void blkiocg_set_start_empty_time(struct blkio_group *blkg,
293 struct blkio_policy_type *pol)
294 {
295 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
296
297 lockdep_assert_held(blkg->q->queue_lock);
298
299 if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
300 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE])
301 return;
302
303 /*
304 	 * The group is already marked empty. This can happen if a cfqq got a
305 	 * new request in the parent group and moved to this group while being
306 	 * added to the service tree. Just ignore the event and move on.
307 */
308 if (blkio_blkg_empty(stats))
309 return;
310
311 stats->start_empty_time = sched_clock();
312 blkio_mark_blkg_empty(stats);
313 }
314 EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
315
316 void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
317 struct blkio_policy_type *pol,
318 unsigned long dequeue)
319 {
320 struct blkg_policy_data *pd = blkg->pd[pol->plid];
321
322 lockdep_assert_held(blkg->q->queue_lock);
323
324 pd->stats.dequeue += dequeue;
325 }
326 EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
327 #else
328 static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
329 struct blkio_policy_type *pol,
330 struct blkio_group *curr_blkg) { }
331 static inline void blkio_end_empty_time(struct blkio_group_stats *stats) { }
332 #endif
333
334 void blkiocg_update_io_add_stats(struct blkio_group *blkg,
335 struct blkio_policy_type *pol,
336 struct blkio_group *curr_blkg, bool direction,
337 bool sync)
338 {
339 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
340
341 lockdep_assert_held(blkg->q->queue_lock);
342
343 u64_stats_update_begin(&stats->syncp);
344 blkio_add_stat(stats->stat_arr[BLKIO_STAT_QUEUED], 1, direction, sync);
345 blkio_end_empty_time(stats);
346 u64_stats_update_end(&stats->syncp);
347
348 blkio_set_start_group_wait_time(blkg, pol, curr_blkg);
349 }
350 EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
351
352 void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
353 struct blkio_policy_type *pol,
354 bool direction, bool sync)
355 {
356 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
357
358 lockdep_assert_held(blkg->q->queue_lock);
359
360 u64_stats_update_begin(&stats->syncp);
361 blkio_check_and_dec_stat(stats->stat_arr[BLKIO_STAT_QUEUED], direction,
362 sync);
363 u64_stats_update_end(&stats->syncp);
364 }
365 EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
366
367 void blkiocg_update_timeslice_used(struct blkio_group *blkg,
368 struct blkio_policy_type *pol,
369 unsigned long time,
370 unsigned long unaccounted_time)
371 {
372 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
373
374 lockdep_assert_held(blkg->q->queue_lock);
375
376 u64_stats_update_begin(&stats->syncp);
377 stats->time += time;
378 #ifdef CONFIG_DEBUG_BLK_CGROUP
379 stats->unaccounted_time += unaccounted_time;
380 #endif
381 u64_stats_update_end(&stats->syncp);
382 }
383 EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
384
385 /*
386 * should be called under rcu read lock or queue lock to make sure blkg pointer
387 * is valid.
388 */
389 void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
390 struct blkio_policy_type *pol,
391 uint64_t bytes, bool direction, bool sync)
392 {
393 struct blkg_policy_data *pd = blkg->pd[pol->plid];
394 struct blkio_group_stats_cpu *stats_cpu;
395 unsigned long flags;
396
397 /* If per cpu stats are not allocated yet, don't do any accounting. */
398 if (pd->stats_cpu == NULL)
399 return;
400
401 /*
402 * Disabling interrupts to provide mutual exclusion between two
403 * writes on same cpu. It probably is not needed for 64bit. Not
404 * optimizing that case yet.
405 */
406 local_irq_save(flags);
407
408 stats_cpu = this_cpu_ptr(pd->stats_cpu);
409
410 u64_stats_update_begin(&stats_cpu->syncp);
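	/* accounted in 512-byte sectors */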
411 stats_cpu->sectors += bytes >> 9;
412 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
413 1, direction, sync);
414 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
415 bytes, direction, sync);
416 u64_stats_update_end(&stats_cpu->syncp);
417 local_irq_restore(flags);
418 }
419 EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
420
421 void blkiocg_update_completion_stats(struct blkio_group *blkg,
422 struct blkio_policy_type *pol,
423 uint64_t start_time,
424 uint64_t io_start_time, bool direction,
425 bool sync)
426 {
427 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
428 unsigned long long now = sched_clock();
429
430 lockdep_assert_held(blkg->q->queue_lock);
431
432 u64_stats_update_begin(&stats->syncp);
433 if (time_after64(now, io_start_time))
434 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
435 now - io_start_time, direction, sync);
436 if (time_after64(io_start_time, start_time))
437 blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
438 io_start_time - start_time, direction, sync);
439 u64_stats_update_end(&stats->syncp);
440 }
441 EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
442
443 /* Merged stats are protected by the queue_lock; they are not per cpu. */
444 void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
445 struct blkio_policy_type *pol,
446 bool direction, bool sync)
447 {
448 struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
449
450 lockdep_assert_held(blkg->q->queue_lock);
451
452 u64_stats_update_begin(&stats->syncp);
453 blkio_add_stat(stats->stat_arr[BLKIO_STAT_MERGED], 1, direction, sync);
454 u64_stats_update_end(&stats->syncp);
455 }
456 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
457
458 /*
459 * Worker for allocating per cpu stat for blk groups. This is scheduled on
460 * the system_nrt_wq once there are some groups on the alloc_list waiting
461 * for allocation.
462 */
463 static void blkio_stat_alloc_fn(struct work_struct *work)
464 {
465 static void *pcpu_stats[BLKIO_NR_POLICIES];
466 struct delayed_work *dwork = to_delayed_work(work);
467 struct blkio_group *blkg;
468 int i;
469 bool empty = false;
470
471 alloc_stats:
472 for (i = 0; i < BLKIO_NR_POLICIES; i++) {
473 if (pcpu_stats[i] != NULL)
474 continue;
475
476 pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);
477
478 /* Allocation failed. Try again after some time. */
479 if (pcpu_stats[i] == NULL) {
480 queue_delayed_work(system_nrt_wq, dwork,
481 msecs_to_jiffies(10));
482 return;
483 }
484 }
485
486 spin_lock_irq(&blkio_list_lock);
487 spin_lock(&alloc_list_lock);
488
489 	/* The list may be empty if the cgroup got deleted or the queue exited. */
490 if (!list_empty(&alloc_list)) {
491 blkg = list_first_entry(&alloc_list, struct blkio_group,
492 alloc_node);
493 for (i = 0; i < BLKIO_NR_POLICIES; i++) {
494 struct blkg_policy_data *pd = blkg->pd[i];
495
496 if (blkio_policy[i] && pd && !pd->stats_cpu)
497 swap(pd->stats_cpu, pcpu_stats[i]);
498 }
499
500 list_del_init(&blkg->alloc_node);
501 }
502
503 empty = list_empty(&alloc_list);
504
505 spin_unlock(&alloc_list_lock);
506 spin_unlock_irq(&blkio_list_lock);
507
508 if (!empty)
509 goto alloc_stats;
510 }
511
512 /**
513 * blkg_free - free a blkg
514 * @blkg: blkg to free
515 *
516 * Free @blkg which may be partially allocated.
517 */
518 static void blkg_free(struct blkio_group *blkg)
519 {
520 int i;
521
522 if (!blkg)
523 return;
524
525 for (i = 0; i < BLKIO_NR_POLICIES; i++) {
526 struct blkg_policy_data *pd = blkg->pd[i];
527
528 if (pd) {
529 free_percpu(pd->stats_cpu);
530 kfree(pd);
531 }
532 }
533
534 kfree(blkg);
535 }
536
537 /**
538 * blkg_alloc - allocate a blkg
539 * @blkcg: block cgroup the new blkg is associated with
540 * @q: request_queue the new blkg is associated with
541 *
542  * Allocate a new blkg associating @blkcg and @q.
543 */
544 static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
545 struct request_queue *q)
546 {
547 struct blkio_group *blkg;
548 int i;
549
550 /* alloc and init base part */
551 blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
552 if (!blkg)
553 return NULL;
554
555 blkg->q = q;
556 INIT_LIST_HEAD(&blkg->q_node);
557 INIT_LIST_HEAD(&blkg->alloc_node);
558 blkg->blkcg = blkcg;
559 blkg->refcnt = 1;
560 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
561
562 for (i = 0; i < BLKIO_NR_POLICIES; i++) {
563 struct blkio_policy_type *pol = blkio_policy[i];
564 struct blkg_policy_data *pd;
565
566 if (!pol)
567 continue;
568
569 /* alloc per-policy data and attach it to blkg */
570 pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
571 q->node);
572 if (!pd) {
573 blkg_free(blkg);
574 return NULL;
575 }
576
577 blkg->pd[i] = pd;
578 pd->blkg = blkg;
579 }
580
581 /* invoke per-policy init */
582 for (i = 0; i < BLKIO_NR_POLICIES; i++) {
583 struct blkio_policy_type *pol = blkio_policy[i];
584
585 if (pol)
586 pol->ops.blkio_init_group_fn(blkg);
587 }
588
589 return blkg;
590 }
591
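/*
 * blkg_lookup_create - look up the blkg for @blkcg on @q, creating it if it
 * does not exist yet. The caller must hold rcu_read_lock() and
 * @q->queue_lock. Returns an existing or newly allocated blkg, or an
 * ERR_PTR() value for bypassing/dead queues and on allocation failure; see
 * blkio_policy_parse_and_set() for a typical calling sequence.
 */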
592 struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
593 struct request_queue *q,
594 enum blkio_policy_id plid,
595 bool for_root)
596 __releases(q->queue_lock) __acquires(q->queue_lock)
597 {
598 struct blkio_group *blkg;
599
600 WARN_ON_ONCE(!rcu_read_lock_held());
601 lockdep_assert_held(q->queue_lock);
602
603 /*
604 * This could be the first entry point of blkcg implementation and
605 * we shouldn't allow anything to go through for a bypassing queue.
606 * The following can be removed if blkg lookup is guaranteed to
607 * fail on a bypassing queue.
608 */
609 if (unlikely(blk_queue_bypass(q)) && !for_root)
610 return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
611
612 blkg = blkg_lookup(blkcg, q);
613 if (blkg)
614 return blkg;
615
616 /* blkg holds a reference to blkcg */
617 if (!css_tryget(&blkcg->css))
618 return ERR_PTR(-EINVAL);
619
620 /*
621 * Allocate and initialize.
622 */
623 blkg = blkg_alloc(blkcg, q);
624
625 /* did alloc fail? */
626 if (unlikely(!blkg)) {
627 blkg = ERR_PTR(-ENOMEM);
628 goto out;
629 }
630
631 /* insert */
632 spin_lock(&blkcg->lock);
633 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
634 list_add(&blkg->q_node, &q->blkg_list);
635 spin_unlock(&blkcg->lock);
636
637 spin_lock(&alloc_list_lock);
638 list_add(&blkg->alloc_node, &alloc_list);
639 /* Queue per cpu stat allocation from worker thread. */
640 queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
641 spin_unlock(&alloc_list_lock);
642 out:
643 return blkg;
644 }
645 EXPORT_SYMBOL_GPL(blkg_lookup_create);
646
647 /* called under rcu_read_lock(). */
648 struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
649 struct request_queue *q)
650 {
651 struct blkio_group *blkg;
652 struct hlist_node *n;
653
654 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
655 if (blkg->q == q)
656 return blkg;
657 return NULL;
658 }
659 EXPORT_SYMBOL_GPL(blkg_lookup);
660
661 static void blkg_destroy(struct blkio_group *blkg)
662 {
663 struct request_queue *q = blkg->q;
664 struct blkio_cgroup *blkcg = blkg->blkcg;
665
666 lockdep_assert_held(q->queue_lock);
667 lockdep_assert_held(&blkcg->lock);
668
669 	/* Something is wrong if we are trying to remove the same group twice */
670 WARN_ON_ONCE(list_empty(&blkg->q_node));
671 WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
672 list_del_init(&blkg->q_node);
673 hlist_del_init_rcu(&blkg->blkcg_node);
674
675 spin_lock(&alloc_list_lock);
676 list_del_init(&blkg->alloc_node);
677 spin_unlock(&alloc_list_lock);
678
679 /*
680 * Put the reference taken at the time of creation so that when all
681 * queues are gone, group can be destroyed.
682 */
683 blkg_put(blkg);
684 }
685
686 /*
687 * XXX: This updates blkg policy data in-place for root blkg, which is
688 * necessary across elevator switch and policy registration as root blkgs
689 * aren't shot down. This broken and racy implementation is temporary.
690 * Eventually, blkg shoot down will be replaced by proper in-place update.
691 */
692 void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
693 {
694 struct blkio_policy_type *pol = blkio_policy[plid];
695 struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
696 struct blkg_policy_data *pd;
697
698 if (!blkg)
699 return;
700
701 kfree(blkg->pd[plid]);
702 blkg->pd[plid] = NULL;
703
704 if (!pol)
705 return;
706
707 pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
708 WARN_ON_ONCE(!pd);
709
710 pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
711 WARN_ON_ONCE(!pd->stats_cpu);
712
713 blkg->pd[plid] = pd;
714 pd->blkg = blkg;
715 pol->ops.blkio_init_group_fn(blkg);
716 }
717 EXPORT_SYMBOL_GPL(update_root_blkg_pd);
718
719 /**
720 * blkg_destroy_all - destroy all blkgs associated with a request_queue
721 * @q: request_queue of interest
722 * @destroy_root: whether to destroy root blkg or not
723 *
724 * Destroy blkgs associated with @q. If @destroy_root is %true, all are
725 * destroyed; otherwise, root blkg is left alone.
726 */
727 void blkg_destroy_all(struct request_queue *q, bool destroy_root)
728 {
729 struct blkio_group *blkg, *n;
730
731 spin_lock_irq(q->queue_lock);
732
733 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
734 struct blkio_cgroup *blkcg = blkg->blkcg;
735
736 /* skip root? */
737 if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
738 continue;
739
740 spin_lock(&blkcg->lock);
741 blkg_destroy(blkg);
742 spin_unlock(&blkcg->lock);
743 }
744
745 spin_unlock_irq(q->queue_lock);
746 }
747 EXPORT_SYMBOL_GPL(blkg_destroy_all);
748
749 static void blkg_rcu_free(struct rcu_head *rcu_head)
750 {
751 blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
752 }
753
754 void __blkg_release(struct blkio_group *blkg)
755 {
756 /* release the extra blkcg reference this blkg has been holding */
757 css_put(&blkg->blkcg->css);
758
759 /*
760 	 * A group is freed via RCU. But holding an RCU read lock does not
761 	 * mean that one can access all the fields of blkg and assume they
762 	 * are valid. For example, don't try to follow throtl_data and
763 	 * request queue links.
764 	 *
765 	 * Holding a reference to a blkg under RCU only allows access to
766 	 * values local to the group, like group stats and group rate limits.
767 */
768 call_rcu(&blkg->rcu_head, blkg_rcu_free);
769 }
770 EXPORT_SYMBOL_GPL(__blkg_release);
771
772 static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
773 {
774 struct blkg_policy_data *pd = blkg->pd[plid];
775 int cpu;
776
777 if (pd->stats_cpu == NULL)
778 return;
779
780 for_each_possible_cpu(cpu) {
781 struct blkio_group_stats_cpu *sc =
782 per_cpu_ptr(pd->stats_cpu, cpu);
783
784 sc->sectors = 0;
785 memset(sc->stat_arr_cpu, 0, sizeof(sc->stat_arr_cpu));
786 }
787 }
788
789 static int
790 blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
791 {
792 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
793 struct blkio_group *blkg;
794 struct hlist_node *n;
795 int i;
796
797 spin_lock(&blkio_list_lock);
798 spin_lock_irq(&blkcg->lock);
799
800 /*
801 * Note that stat reset is racy - it doesn't synchronize against
802 * stat updates. This is a debug feature which shouldn't exist
803 * anyway. If you get hit by a race, retry.
804 */
805 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
806 struct blkio_policy_type *pol;
807
808 list_for_each_entry(pol, &blkio_list, list) {
809 struct blkg_policy_data *pd = blkg->pd[pol->plid];
810 struct blkio_group_stats *stats = &pd->stats;
811
812 /* queued stats shouldn't be cleared */
813 for (i = 0; i < ARRAY_SIZE(stats->stat_arr); i++)
814 if (i != BLKIO_STAT_QUEUED)
815 memset(stats->stat_arr[i], 0,
816 sizeof(stats->stat_arr[i]));
817 stats->time = 0;
818 #ifdef CONFIG_DEBUG_BLK_CGROUP
819 memset((void *)stats + BLKG_STATS_DEBUG_CLEAR_START, 0,
820 BLKG_STATS_DEBUG_CLEAR_SIZE);
821 #endif
822 blkio_reset_stats_cpu(blkg, pol->plid);
823 }
824 }
825
826 spin_unlock_irq(&blkcg->lock);
827 spin_unlock(&blkio_list_lock);
828 return 0;
829 }
830
831 static void blkio_get_key_name(enum stat_sub_type type, const char *dname,
832 char *str, int chars_left, bool diskname_only)
833 {
834 snprintf(str, chars_left, "%s", dname);
835 chars_left -= strlen(str);
836 if (chars_left <= 0) {
837 printk(KERN_WARNING
838 		       "Possibly incorrect cgroup stat display format\n");
839 return;
840 }
841 if (diskname_only)
842 return;
843 switch (type) {
844 case BLKIO_STAT_READ:
845 strlcat(str, " Read", chars_left);
846 break;
847 case BLKIO_STAT_WRITE:
848 strlcat(str, " Write", chars_left);
849 break;
850 case BLKIO_STAT_SYNC:
851 strlcat(str, " Sync", chars_left);
852 break;
853 case BLKIO_STAT_ASYNC:
854 strlcat(str, " Async", chars_left);
855 break;
856 case BLKIO_STAT_TOTAL:
857 strlcat(str, " Total", chars_left);
858 break;
859 default:
860 strlcat(str, " Invalid", chars_left);
861 }
862 }
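/*
 * Example: for dname "8:16" and type BLKIO_STAT_READ the resulting key is
 * "8:16 Read"; with diskname_only set, the key is just "8:16".
 */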
863
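/*
 * Sum one per-cpu counter over all possible CPUs. The u64_stats
 * fetch_begin/fetch_retry pair yields a consistent 64-bit snapshot even on
 * 32-bit SMP, where the writer cannot update the counter atomically.
 */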
864 static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, int plid,
865 enum stat_type_cpu type, enum stat_sub_type sub_type)
866 {
867 struct blkg_policy_data *pd = blkg->pd[plid];
868 int cpu;
869 struct blkio_group_stats_cpu *stats_cpu;
870 u64 val = 0, tval;
871
872 if (pd->stats_cpu == NULL)
873 return val;
874
875 for_each_possible_cpu(cpu) {
876 unsigned int start;
877 stats_cpu = per_cpu_ptr(pd->stats_cpu, cpu);
878
879 do {
880 start = u64_stats_fetch_begin(&stats_cpu->syncp);
881 if (type == BLKIO_STAT_CPU_SECTORS)
882 tval = stats_cpu->sectors;
883 else
884 tval = stats_cpu->stat_arr_cpu[type][sub_type];
885 } while(u64_stats_fetch_retry(&stats_cpu->syncp, start));
886
887 val += tval;
888 }
889
890 return val;
891 }
892
893 static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, int plid,
894 struct cgroup_map_cb *cb, const char *dname,
895 enum stat_type_cpu type)
896 {
897 uint64_t disk_total, val;
898 char key_str[MAX_KEY_LEN];
899 enum stat_sub_type sub_type;
900
901 if (type == BLKIO_STAT_CPU_SECTORS) {
902 val = blkio_read_stat_cpu(blkg, plid, type, 0);
903 blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true);
904 cb->fill(cb, key_str, val);
905 return val;
906 }
907
908 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
909 sub_type++) {
910 blkio_get_key_name(sub_type, dname, key_str, MAX_KEY_LEN,
911 false);
912 val = blkio_read_stat_cpu(blkg, plid, type, sub_type);
913 cb->fill(cb, key_str, val);
914 }
915
916 disk_total = blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_READ) +
917 blkio_read_stat_cpu(blkg, plid, type, BLKIO_STAT_WRITE);
918
919 blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
920 false);
921 cb->fill(cb, key_str, disk_total);
922 return disk_total;
923 }
924
925 static uint64_t blkio_get_stat(struct blkio_group *blkg, int plid,
926 struct cgroup_map_cb *cb, const char *dname,
927 enum stat_type type)
928 {
929 struct blkio_group_stats *stats = &blkg->pd[plid]->stats;
930 uint64_t v = 0, disk_total = 0;
931 char key_str[MAX_KEY_LEN];
932 unsigned int sync_start;
933 int st;
934
935 if (type >= BLKIO_STAT_ARR_NR) {
936 do {
937 sync_start = u64_stats_fetch_begin(&stats->syncp);
938 switch (type) {
939 case BLKIO_STAT_TIME:
940 v = stats->time;
941 break;
942 #ifdef CONFIG_DEBUG_BLK_CGROUP
943 case BLKIO_STAT_UNACCOUNTED_TIME:
944 v = stats->unaccounted_time;
945 break;
946 case BLKIO_STAT_AVG_QUEUE_SIZE: {
947 uint64_t samples = stats->avg_queue_size_samples;
948
949 if (samples) {
950 v = stats->avg_queue_size_sum;
951 do_div(v, samples);
952 }
953 break;
954 }
955 case BLKIO_STAT_IDLE_TIME:
956 v = stats->idle_time;
957 break;
958 case BLKIO_STAT_EMPTY_TIME:
959 v = stats->empty_time;
960 break;
961 case BLKIO_STAT_DEQUEUE:
962 v = stats->dequeue;
963 break;
964 case BLKIO_STAT_GROUP_WAIT_TIME:
965 v = stats->group_wait_time;
966 break;
967 #endif
968 default:
969 WARN_ON_ONCE(1);
970 }
971 } while (u64_stats_fetch_retry(&stats->syncp, sync_start));
972
973 blkio_get_key_name(0, dname, key_str, MAX_KEY_LEN, true);
974 cb->fill(cb, key_str, v);
975 return v;
976 }
977
978 for (st = BLKIO_STAT_READ; st < BLKIO_STAT_TOTAL; st++) {
979 do {
980 sync_start = u64_stats_fetch_begin(&stats->syncp);
981 v = stats->stat_arr[type][st];
982 } while (u64_stats_fetch_retry(&stats->syncp, sync_start));
983
984 blkio_get_key_name(st, dname, key_str, MAX_KEY_LEN, false);
985 cb->fill(cb, key_str, v);
986 if (st == BLKIO_STAT_READ || st == BLKIO_STAT_WRITE)
987 disk_total += v;
988 }
989
990 blkio_get_key_name(BLKIO_STAT_TOTAL, dname, key_str, MAX_KEY_LEN,
991 false);
992 cb->fill(cb, key_str, disk_total);
993 return disk_total;
994 }
995
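/*
 * Parse a "major:minor value" rule written to one of the blkio cgroup files
 * and apply it. For example (device numbers illustrative), writing
 * "8:16 1048576" to blkio.throttle.read_bps_device caps reads on device
 * 8:16 at 1 MiB/s.
 */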
996 static int blkio_policy_parse_and_set(char *buf, enum blkio_policy_id plid,
997 int fileid, struct blkio_cgroup *blkcg)
998 {
999 struct gendisk *disk = NULL;
1000 struct blkio_group *blkg = NULL;
1001 struct blkg_policy_data *pd;
1002 char *s[4], *p, *major_s = NULL, *minor_s = NULL;
1003 unsigned long major, minor;
1004 int i = 0, ret = -EINVAL;
1005 int part;
1006 dev_t dev;
1007 u64 temp;
1008
1009 memset(s, 0, sizeof(s));
1010
1011 while ((p = strsep(&buf, " ")) != NULL) {
1012 if (!*p)
1013 continue;
1014
1015 s[i++] = p;
1016
1017 		/* Prevent too many fields from being input */
1018 if (i == 3)
1019 break;
1020 }
1021
1022 if (i != 2)
1023 goto out;
1024
1025 p = strsep(&s[0], ":");
1026 if (p != NULL)
1027 major_s = p;
1028 else
1029 goto out;
1030
1031 minor_s = s[0];
1032 if (!minor_s)
1033 goto out;
1034
1035 if (strict_strtoul(major_s, 10, &major))
1036 goto out;
1037
1038 if (strict_strtoul(minor_s, 10, &minor))
1039 goto out;
1040
1041 dev = MKDEV(major, minor);
1042
1043 if (strict_strtoull(s[1], 10, &temp))
1044 goto out;
1045
1046 disk = get_gendisk(dev, &part);
1047 if (!disk || part)
1048 goto out;
1049
1050 rcu_read_lock();
1051
1052 spin_lock_irq(disk->queue->queue_lock);
1053 blkg = blkg_lookup_create(blkcg, disk->queue, plid, false);
1054 spin_unlock_irq(disk->queue->queue_lock);
1055
1056 if (IS_ERR(blkg)) {
1057 ret = PTR_ERR(blkg);
1058 goto out_unlock;
1059 }
1060
1061 pd = blkg->pd[plid];
1062
1063 switch (plid) {
1064 case BLKIO_POLICY_PROP:
1065 if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
1066 temp > BLKIO_WEIGHT_MAX)
1067 goto out_unlock;
1068
1069 pd->conf.weight = temp;
1070 blkio_update_group_weight(blkg, plid, temp ?: blkcg->weight);
1071 break;
1072 case BLKIO_POLICY_THROTL:
1073 switch(fileid) {
1074 case BLKIO_THROTL_read_bps_device:
1075 pd->conf.bps[READ] = temp;
1076 blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
1077 break;
1078 case BLKIO_THROTL_write_bps_device:
1079 pd->conf.bps[WRITE] = temp;
1080 blkio_update_group_bps(blkg, plid, temp ?: -1, fileid);
1081 break;
1082 case BLKIO_THROTL_read_iops_device:
1083 if (temp > THROTL_IOPS_MAX)
1084 goto out_unlock;
1085 pd->conf.iops[READ] = temp;
1086 blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
1087 break;
1088 case BLKIO_THROTL_write_iops_device:
1089 if (temp > THROTL_IOPS_MAX)
1090 goto out_unlock;
1091 pd->conf.iops[WRITE] = temp;
1092 blkio_update_group_iops(blkg, plid, temp ?: -1, fileid);
1093 break;
1094 }
1095 break;
1096 default:
1097 BUG();
1098 }
1099 ret = 0;
1100 out_unlock:
1101 rcu_read_unlock();
1102 out:
1103 put_disk(disk);
1104
1105 /*
1106 	 * If the queue was bypassing, we should retry. Do so after a short
1107 	 * msleep(). It isn't strictly necessary, but the queue can be
1108 	 * bypassing for some time and it's always nice to avoid busy
1109 	 * looping.
1110 */
1111 if (ret == -EBUSY) {
1112 msleep(10);
1113 return restart_syscall();
1114 }
1115 return ret;
1116 }
1117
1118 static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
1119 const char *buffer)
1120 {
1121 int ret = 0;
1122 char *buf;
1123 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
1124 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1125 int fileid = BLKIOFILE_ATTR(cft->private);
1126
1127 buf = kstrdup(buffer, GFP_KERNEL);
1128 if (!buf)
1129 return -ENOMEM;
1130
1131 ret = blkio_policy_parse_and_set(buf, plid, fileid, blkcg);
1132 kfree(buf);
1133 return ret;
1134 }
1135
1136 static const char *blkg_dev_name(struct blkio_group *blkg)
1137 {
1138 /* some drivers (floppy) instantiate a queue w/o disk registered */
1139 if (blkg->q->backing_dev_info.dev)
1140 return dev_name(blkg->q->backing_dev_info.dev);
1141 return NULL;
1142 }
1143
1144 static void blkio_print_group_conf(struct cftype *cft, struct blkio_group *blkg,
1145 struct seq_file *m)
1146 {
1147 int plid = BLKIOFILE_POLICY(cft->private);
1148 int fileid = BLKIOFILE_ATTR(cft->private);
1149 struct blkg_policy_data *pd = blkg->pd[plid];
1150 const char *dname = blkg_dev_name(blkg);
1151 int rw = WRITE;
1152
1153 if (!dname)
1154 return;
1155
1156 switch (plid) {
1157 case BLKIO_POLICY_PROP:
1158 if (pd->conf.weight)
1159 seq_printf(m, "%s\t%u\n",
1160 dname, pd->conf.weight);
1161 break;
1162 case BLKIO_POLICY_THROTL:
1163 switch (fileid) {
1164 case BLKIO_THROTL_read_bps_device:
1165 rw = READ;
1166 case BLKIO_THROTL_write_bps_device:
1167 if (pd->conf.bps[rw])
1168 seq_printf(m, "%s\t%llu\n",
1169 dname, pd->conf.bps[rw]);
1170 break;
1171 case BLKIO_THROTL_read_iops_device:
1172 rw = READ;
1173 case BLKIO_THROTL_write_iops_device:
1174 if (pd->conf.iops[rw])
1175 seq_printf(m, "%s\t%u\n",
1176 dname, pd->conf.iops[rw]);
1177 break;
1178 }
1179 break;
1180 default:
1181 BUG();
1182 }
1183 }
1184
1185 /* cgroup files which read their data from policy nodes end up here */
1186 static void blkio_read_conf(struct cftype *cft, struct blkio_cgroup *blkcg,
1187 struct seq_file *m)
1188 {
1189 struct blkio_group *blkg;
1190 struct hlist_node *n;
1191
1192 spin_lock_irq(&blkcg->lock);
1193 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
1194 blkio_print_group_conf(cft, blkg, m);
1195 spin_unlock_irq(&blkcg->lock);
1196 }
1197
1198 static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1199 struct seq_file *m)
1200 {
1201 struct blkio_cgroup *blkcg;
1202 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1203 int name = BLKIOFILE_ATTR(cft->private);
1204
1205 blkcg = cgroup_to_blkio_cgroup(cgrp);
1206
1207 switch(plid) {
1208 case BLKIO_POLICY_PROP:
1209 switch(name) {
1210 case BLKIO_PROP_weight_device:
1211 blkio_read_conf(cft, blkcg, m);
1212 return 0;
1213 default:
1214 BUG();
1215 }
1216 break;
1217 case BLKIO_POLICY_THROTL:
1218 switch(name){
1219 case BLKIO_THROTL_read_bps_device:
1220 case BLKIO_THROTL_write_bps_device:
1221 case BLKIO_THROTL_read_iops_device:
1222 case BLKIO_THROTL_write_iops_device:
1223 blkio_read_conf(cft, blkcg, m);
1224 return 0;
1225 default:
1226 BUG();
1227 }
1228 break;
1229 default:
1230 BUG();
1231 }
1232
1233 return 0;
1234 }
1235
1236 static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1237 struct cftype *cft, struct cgroup_map_cb *cb,
1238 enum stat_type type, bool show_total, bool pcpu)
1239 {
1240 struct blkio_group *blkg;
1241 struct hlist_node *n;
1242 uint64_t cgroup_total = 0;
1243
1244 spin_lock_irq(&blkcg->lock);
1245
1246 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1247 const char *dname = blkg_dev_name(blkg);
1248 int plid = BLKIOFILE_POLICY(cft->private);
1249
1250 if (!dname)
1251 continue;
1252 if (pcpu)
1253 cgroup_total += blkio_get_stat_cpu(blkg, plid,
1254 cb, dname, type);
1255 else
1256 cgroup_total += blkio_get_stat(blkg, plid,
1257 cb, dname, type);
1258 }
1259 if (show_total)
1260 cb->fill(cb, "Total", cgroup_total);
1261
1262 spin_unlock_irq(&blkcg->lock);
1263 return 0;
1264 }
1265
1266 /* All map kind of cgroup file get serviced by this function */
1267 static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1268 struct cgroup_map_cb *cb)
1269 {
1270 struct blkio_cgroup *blkcg;
1271 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1272 int name = BLKIOFILE_ATTR(cft->private);
1273
1274 blkcg = cgroup_to_blkio_cgroup(cgrp);
1275
1276 switch(plid) {
1277 case BLKIO_POLICY_PROP:
1278 switch(name) {
1279 case BLKIO_PROP_time:
1280 return blkio_read_blkg_stats(blkcg, cft, cb,
1281 BLKIO_STAT_TIME, 0, 0);
1282 case BLKIO_PROP_sectors:
1283 return blkio_read_blkg_stats(blkcg, cft, cb,
1284 BLKIO_STAT_CPU_SECTORS, 0, 1);
1285 case BLKIO_PROP_io_service_bytes:
1286 return blkio_read_blkg_stats(blkcg, cft, cb,
1287 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1288 case BLKIO_PROP_io_serviced:
1289 return blkio_read_blkg_stats(blkcg, cft, cb,
1290 BLKIO_STAT_CPU_SERVICED, 1, 1);
1291 case BLKIO_PROP_io_service_time:
1292 return blkio_read_blkg_stats(blkcg, cft, cb,
1293 BLKIO_STAT_SERVICE_TIME, 1, 0);
1294 case BLKIO_PROP_io_wait_time:
1295 return blkio_read_blkg_stats(blkcg, cft, cb,
1296 BLKIO_STAT_WAIT_TIME, 1, 0);
1297 case BLKIO_PROP_io_merged:
1298 return blkio_read_blkg_stats(blkcg, cft, cb,
1299 BLKIO_STAT_MERGED, 1, 0);
1300 case BLKIO_PROP_io_queued:
1301 return blkio_read_blkg_stats(blkcg, cft, cb,
1302 BLKIO_STAT_QUEUED, 1, 0);
1303 #ifdef CONFIG_DEBUG_BLK_CGROUP
1304 case BLKIO_PROP_unaccounted_time:
1305 return blkio_read_blkg_stats(blkcg, cft, cb,
1306 BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
1307 case BLKIO_PROP_dequeue:
1308 return blkio_read_blkg_stats(blkcg, cft, cb,
1309 BLKIO_STAT_DEQUEUE, 0, 0);
1310 case BLKIO_PROP_avg_queue_size:
1311 return blkio_read_blkg_stats(blkcg, cft, cb,
1312 BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
1313 case BLKIO_PROP_group_wait_time:
1314 return blkio_read_blkg_stats(blkcg, cft, cb,
1315 BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
1316 case BLKIO_PROP_idle_time:
1317 return blkio_read_blkg_stats(blkcg, cft, cb,
1318 BLKIO_STAT_IDLE_TIME, 0, 0);
1319 case BLKIO_PROP_empty_time:
1320 return blkio_read_blkg_stats(blkcg, cft, cb,
1321 BLKIO_STAT_EMPTY_TIME, 0, 0);
1322 #endif
1323 default:
1324 BUG();
1325 }
1326 break;
1327 case BLKIO_POLICY_THROTL:
1328 switch(name){
1329 case BLKIO_THROTL_io_service_bytes:
1330 return blkio_read_blkg_stats(blkcg, cft, cb,
1331 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1332 case BLKIO_THROTL_io_serviced:
1333 return blkio_read_blkg_stats(blkcg, cft, cb,
1334 BLKIO_STAT_CPU_SERVICED, 1, 1);
1335 default:
1336 BUG();
1337 }
1338 break;
1339 default:
1340 BUG();
1341 }
1342
1343 return 0;
1344 }
1345
1346 static int blkio_weight_write(struct blkio_cgroup *blkcg, int plid, u64 val)
1347 {
1348 struct blkio_group *blkg;
1349 struct hlist_node *n;
1350
1351 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
1352 return -EINVAL;
1353
1354 spin_lock(&blkio_list_lock);
1355 spin_lock_irq(&blkcg->lock);
1356 blkcg->weight = (unsigned int)val;
1357
1358 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1359 struct blkg_policy_data *pd = blkg->pd[plid];
1360
1361 if (!pd->conf.weight)
1362 blkio_update_group_weight(blkg, plid, blkcg->weight);
1363 }
1364
1365 spin_unlock_irq(&blkcg->lock);
1366 spin_unlock(&blkio_list_lock);
1367 return 0;
1368 }
1369
1370 static u64 blkiocg_file_read_u64(struct cgroup *cgrp, struct cftype *cft) {
1371 struct blkio_cgroup *blkcg;
1372 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1373 int name = BLKIOFILE_ATTR(cft->private);
1374
1375 blkcg = cgroup_to_blkio_cgroup(cgrp);
1376
1377 switch(plid) {
1378 case BLKIO_POLICY_PROP:
1379 switch(name) {
1380 case BLKIO_PROP_weight:
1381 return (u64)blkcg->weight;
1382 }
1383 break;
1384 default:
1385 BUG();
1386 }
1387 return 0;
1388 }
1389
1390 static int
1391 blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1392 {
1393 struct blkio_cgroup *blkcg;
1394 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1395 int name = BLKIOFILE_ATTR(cft->private);
1396
1397 blkcg = cgroup_to_blkio_cgroup(cgrp);
1398
1399 switch(plid) {
1400 case BLKIO_POLICY_PROP:
1401 switch(name) {
1402 case BLKIO_PROP_weight:
1403 return blkio_weight_write(blkcg, plid, val);
1404 }
1405 break;
1406 default:
1407 BUG();
1408 }
1409
1410 return 0;
1411 }
1412
1413 struct cftype blkio_files[] = {
1414 {
1415 .name = "weight_device",
1416 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1417 BLKIO_PROP_weight_device),
1418 .read_seq_string = blkiocg_file_read,
1419 .write_string = blkiocg_file_write,
1420 .max_write_len = 256,
1421 },
1422 {
1423 .name = "weight",
1424 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1425 BLKIO_PROP_weight),
1426 .read_u64 = blkiocg_file_read_u64,
1427 .write_u64 = blkiocg_file_write_u64,
1428 },
1429 {
1430 .name = "time",
1431 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1432 BLKIO_PROP_time),
1433 .read_map = blkiocg_file_read_map,
1434 },
1435 {
1436 .name = "sectors",
1437 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1438 BLKIO_PROP_sectors),
1439 .read_map = blkiocg_file_read_map,
1440 },
1441 {
1442 .name = "io_service_bytes",
1443 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1444 BLKIO_PROP_io_service_bytes),
1445 .read_map = blkiocg_file_read_map,
1446 },
1447 {
1448 .name = "io_serviced",
1449 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1450 BLKIO_PROP_io_serviced),
1451 .read_map = blkiocg_file_read_map,
1452 },
1453 {
1454 .name = "io_service_time",
1455 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1456 BLKIO_PROP_io_service_time),
1457 .read_map = blkiocg_file_read_map,
1458 },
1459 {
1460 .name = "io_wait_time",
1461 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1462 BLKIO_PROP_io_wait_time),
1463 .read_map = blkiocg_file_read_map,
1464 },
1465 {
1466 .name = "io_merged",
1467 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1468 BLKIO_PROP_io_merged),
1469 .read_map = blkiocg_file_read_map,
1470 },
1471 {
1472 .name = "io_queued",
1473 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1474 BLKIO_PROP_io_queued),
1475 .read_map = blkiocg_file_read_map,
1476 },
1477 {
1478 .name = "reset_stats",
1479 .write_u64 = blkiocg_reset_stats,
1480 },
1481 #ifdef CONFIG_BLK_DEV_THROTTLING
1482 {
1483 .name = "throttle.read_bps_device",
1484 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1485 BLKIO_THROTL_read_bps_device),
1486 .read_seq_string = blkiocg_file_read,
1487 .write_string = blkiocg_file_write,
1488 .max_write_len = 256,
1489 },
1490
1491 {
1492 .name = "throttle.write_bps_device",
1493 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1494 BLKIO_THROTL_write_bps_device),
1495 .read_seq_string = blkiocg_file_read,
1496 .write_string = blkiocg_file_write,
1497 .max_write_len = 256,
1498 },
1499
1500 {
1501 .name = "throttle.read_iops_device",
1502 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1503 BLKIO_THROTL_read_iops_device),
1504 .read_seq_string = blkiocg_file_read,
1505 .write_string = blkiocg_file_write,
1506 .max_write_len = 256,
1507 },
1508
1509 {
1510 .name = "throttle.write_iops_device",
1511 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1512 BLKIO_THROTL_write_iops_device),
1513 .read_seq_string = blkiocg_file_read,
1514 .write_string = blkiocg_file_write,
1515 .max_write_len = 256,
1516 },
1517 {
1518 .name = "throttle.io_service_bytes",
1519 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1520 BLKIO_THROTL_io_service_bytes),
1521 .read_map = blkiocg_file_read_map,
1522 },
1523 {
1524 .name = "throttle.io_serviced",
1525 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1526 BLKIO_THROTL_io_serviced),
1527 .read_map = blkiocg_file_read_map,
1528 },
1529 #endif /* CONFIG_BLK_DEV_THROTTLING */
1530
1531 #ifdef CONFIG_DEBUG_BLK_CGROUP
1532 {
1533 .name = "avg_queue_size",
1534 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1535 BLKIO_PROP_avg_queue_size),
1536 .read_map = blkiocg_file_read_map,
1537 },
1538 {
1539 .name = "group_wait_time",
1540 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1541 BLKIO_PROP_group_wait_time),
1542 .read_map = blkiocg_file_read_map,
1543 },
1544 {
1545 .name = "idle_time",
1546 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1547 BLKIO_PROP_idle_time),
1548 .read_map = blkiocg_file_read_map,
1549 },
1550 {
1551 .name = "empty_time",
1552 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1553 BLKIO_PROP_empty_time),
1554 .read_map = blkiocg_file_read_map,
1555 },
1556 {
1557 .name = "dequeue",
1558 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1559 BLKIO_PROP_dequeue),
1560 .read_map = blkiocg_file_read_map,
1561 },
1562 {
1563 .name = "unaccounted_time",
1564 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1565 BLKIO_PROP_unaccounted_time),
1566 .read_map = blkiocg_file_read_map,
1567 },
1568 #endif
1569 };
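/*
 * Illustrative usage from userspace (group name and mount point are only an
 * example, assuming the blkio controller is mounted at /sys/fs/cgroup/blkio):
 *
 *   mkdir /sys/fs/cgroup/blkio/grp
 *   echo 500 > /sys/fs/cgroup/blkio/grp/blkio.weight
 *   echo "8:0 1048576" > /sys/fs/cgroup/blkio/grp/blkio.throttle.read_bps_device
 *
 * The files above are created from this table by blkiocg_populate().
 */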
1570
1571 static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1572 {
1573 return cgroup_add_files(cgroup, subsys, blkio_files,
1574 ARRAY_SIZE(blkio_files));
1575 }
1576
1577 /**
1578 * blkiocg_pre_destroy - cgroup pre_destroy callback
1579 * @subsys: cgroup subsys
1580 * @cgroup: cgroup of interest
1581 *
1582 * This function is called when @cgroup is about to go away and responsible
1583 * for shooting down all blkgs associated with @cgroup. blkgs should be
1584 * removed while holding both q and blkcg locks. As blkcg lock is nested
1585 * inside q lock, this function performs reverse double lock dancing.
1586 *
1587 * This is the blkcg counterpart of ioc_release_fn().
1588 */
1589 static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
1590 struct cgroup *cgroup)
1591 {
1592 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1593
1594 spin_lock_irq(&blkcg->lock);
1595
1596 while (!hlist_empty(&blkcg->blkg_list)) {
1597 struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
1598 struct blkio_group, blkcg_node);
1599 struct request_queue *q = blkg->q;
1600
1601 if (spin_trylock(q->queue_lock)) {
1602 blkg_destroy(blkg);
1603 spin_unlock(q->queue_lock);
1604 } else {
1605 spin_unlock_irq(&blkcg->lock);
1606 cpu_relax();
1607 spin_lock(&blkcg->lock);
1608 }
1609 }
1610
1611 spin_unlock_irq(&blkcg->lock);
1612 return 0;
1613 }
1614
1615 static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1616 {
1617 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1618
1619 if (blkcg != &blkio_root_cgroup)
1620 kfree(blkcg);
1621 }
1622
1623 static struct cgroup_subsys_state *
1624 blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1625 {
1626 static atomic64_t id_seq = ATOMIC64_INIT(0);
1627 struct blkio_cgroup *blkcg;
1628 struct cgroup *parent = cgroup->parent;
1629
1630 if (!parent) {
1631 blkcg = &blkio_root_cgroup;
1632 goto done;
1633 }
1634
1635 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1636 if (!blkcg)
1637 return ERR_PTR(-ENOMEM);
1638
1639 blkcg->weight = BLKIO_WEIGHT_DEFAULT;
1640 blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
1641 done:
1642 spin_lock_init(&blkcg->lock);
1643 INIT_HLIST_HEAD(&blkcg->blkg_list);
1644
1645 return &blkcg->css;
1646 }
1647
1648 /**
1649 * blkcg_init_queue - initialize blkcg part of request queue
1650 * @q: request_queue to initialize
1651 *
1652 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
1653 * part of new request_queue @q.
1654 *
1655 * RETURNS:
1656 * 0 on success, -errno on failure.
1657 */
1658 int blkcg_init_queue(struct request_queue *q)
1659 {
1660 int ret;
1661
1662 might_sleep();
1663
1664 ret = blk_throtl_init(q);
1665 if (ret)
1666 return ret;
1667
1668 mutex_lock(&all_q_mutex);
1669 INIT_LIST_HEAD(&q->all_q_node);
1670 list_add_tail(&q->all_q_node, &all_q_list);
1671 mutex_unlock(&all_q_mutex);
1672
1673 return 0;
1674 }
1675
1676 /**
1677 * blkcg_drain_queue - drain blkcg part of request_queue
1678 * @q: request_queue to drain
1679 *
1680 * Called from blk_drain_queue(). Responsible for draining blkcg part.
1681 */
1682 void blkcg_drain_queue(struct request_queue *q)
1683 {
1684 lockdep_assert_held(q->queue_lock);
1685
1686 blk_throtl_drain(q);
1687 }
1688
1689 /**
1690 * blkcg_exit_queue - exit and release blkcg part of request_queue
1691 * @q: request_queue being released
1692 *
1693 * Called from blk_release_queue(). Responsible for exiting blkcg part.
1694 */
1695 void blkcg_exit_queue(struct request_queue *q)
1696 {
1697 mutex_lock(&all_q_mutex);
1698 list_del_init(&q->all_q_node);
1699 mutex_unlock(&all_q_mutex);
1700
1701 blkg_destroy_all(q, true);
1702
1703 blk_throtl_exit(q);
1704 }
1705
1706 /*
1707  * We cannot support shared io contexts, as we have no means to support
1708 * two tasks with the same ioc in two different groups without major rework
1709 * of the main cic data structures. For now we allow a task to change
1710 * its cgroup only if it's the only owner of its ioc.
1711 */
1712 static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1713 struct cgroup_taskset *tset)
1714 {
1715 struct task_struct *task;
1716 struct io_context *ioc;
1717 int ret = 0;
1718
1719 /* task_lock() is needed to avoid races with exit_io_context() */
1720 cgroup_taskset_for_each(task, cgrp, tset) {
1721 task_lock(task);
1722 ioc = task->io_context;
1723 if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1724 ret = -EINVAL;
1725 task_unlock(task);
1726 if (ret)
1727 break;
1728 }
1729 return ret;
1730 }
1731
1732 static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1733 struct cgroup_taskset *tset)
1734 {
1735 struct task_struct *task;
1736 struct io_context *ioc;
1737
1738 cgroup_taskset_for_each(task, cgrp, tset) {
1739 /* we don't lose anything even if ioc allocation fails */
1740 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1741 if (ioc) {
1742 ioc_cgroup_changed(ioc);
1743 put_io_context(ioc);
1744 }
1745 }
1746 }
1747
1748 static void blkcg_bypass_start(void)
1749 __acquires(&all_q_mutex)
1750 {
1751 struct request_queue *q;
1752
1753 mutex_lock(&all_q_mutex);
1754
1755 list_for_each_entry(q, &all_q_list, all_q_node) {
1756 blk_queue_bypass_start(q);
1757 blkg_destroy_all(q, false);
1758 }
1759 }
1760
1761 static void blkcg_bypass_end(void)
1762 __releases(&all_q_mutex)
1763 {
1764 struct request_queue *q;
1765
1766 list_for_each_entry(q, &all_q_list, all_q_node)
1767 blk_queue_bypass_end(q);
1768
1769 mutex_unlock(&all_q_mutex);
1770 }
1771
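/*
 * Sketch of how a policy hooks in (illustrative): blk-throttle and CFQ each
 * fill in a struct blkio_policy_type with their plid, pdata_size and ops,
 * then call blkio_policy_register() from their init path and
 * blkio_policy_unregister() on exit. Registration temporarily bypasses all
 * queues so root blkg policy data can be rebuilt via update_root_blkg_pd().
 */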
1772 void blkio_policy_register(struct blkio_policy_type *blkiop)
1773 {
1774 struct request_queue *q;
1775
1776 blkcg_bypass_start();
1777 spin_lock(&blkio_list_lock);
1778
1779 BUG_ON(blkio_policy[blkiop->plid]);
1780 blkio_policy[blkiop->plid] = blkiop;
1781 list_add_tail(&blkiop->list, &blkio_list);
1782
1783 spin_unlock(&blkio_list_lock);
1784 list_for_each_entry(q, &all_q_list, all_q_node)
1785 update_root_blkg_pd(q, blkiop->plid);
1786 blkcg_bypass_end();
1787 }
1788 EXPORT_SYMBOL_GPL(blkio_policy_register);
1789
1790 void blkio_policy_unregister(struct blkio_policy_type *blkiop)
1791 {
1792 struct request_queue *q;
1793
1794 blkcg_bypass_start();
1795 spin_lock(&blkio_list_lock);
1796
1797 BUG_ON(blkio_policy[blkiop->plid] != blkiop);
1798 blkio_policy[blkiop->plid] = NULL;
1799 list_del_init(&blkiop->list);
1800
1801 spin_unlock(&blkio_list_lock);
1802 list_for_each_entry(q, &all_q_list, all_q_node)
1803 update_root_blkg_pd(q, blkiop->plid);
1804 blkcg_bypass_end();
1805 }
1806 EXPORT_SYMBOL_GPL(blkio_policy_unregister);