blk-throttle: add throtl_grp->service_queue
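
For orientation: the hunks below replace the old throtl_rb_root with a throtl_service_queue, embed one such queue (plus a throtl_data back-pointer) in every throtl_grp, reduce the THROTL_TG_FNS flag machinery to a plain THROTL_TG_PENDING bit, and rename the dispatch work item. A minimal sketch of the resulting layout, abridged from the hunks that follow (illustration only, not part of the patch):

/* Abridged post-patch layout; the full definitions are in the diff below. */
struct throtl_service_queue {
	struct rb_root	pending_tree;		/* RB tree of active tgs */
	struct rb_node	*first_pending;		/* cached leftmost node */
	unsigned int	nr_pending;		/* # of tgs queued in the tree */
	unsigned long	first_pending_disptime;	/* disptime of the first tg */
};

struct throtl_grp {
	struct rb_node			rb_node;	/* node in the parent's pending_tree */
	struct throtl_data		*td;		/* replaces passing td to every helper */
	struct throtl_service_queue	service_queue;	/* this group's own service queue */
	unsigned long			disptime;	/* sort key in the pending_tree */
	unsigned int			flags;		/* THROTL_TG_PENDING while queued */
	/* ... */
};

struct throtl_data {
	struct throtl_service_queue	service_queue;	/* replaces tg_service_tree */
	struct delayed_work		dispatch_work;	/* replaces throtl_work */
	/* ... */
};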
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 31146225f3d078fb5326e30195a0fa138f1f3e5a..7340440ccfb5857428f5fa7ea83420738f33d378 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -25,18 +25,17 @@ static struct blkcg_policy blkcg_policy_throtl;
 
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
-static void throtl_schedule_delayed_work(struct throtl_data *td,
-                               unsigned long delay);
 
-struct throtl_rb_root {
-       struct rb_root rb;
-       struct rb_node *left;
-       unsigned int count;
-       unsigned long min_disptime;
+struct throtl_service_queue {
+       struct rb_root          pending_tree;   /* RB tree of active tgs */
+       struct rb_node          *first_pending; /* first node in the tree */
+       unsigned int            nr_pending;     /* # queued in the tree */
+       unsigned long           first_pending_disptime; /* disptime of the first tg */
 };
 
-#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
-                       .count = 0, .min_disptime = 0}
+enum tg_state_flags {
+       THROTL_TG_PENDING       = 1 << 0,       /* on parent's pending tree */
+};
 
 #define rb_entry_tg(node)      rb_entry((node), struct throtl_grp, rb_node)
 
@@ -52,9 +51,15 @@ struct throtl_grp {
        /* must be the first member */
        struct blkg_policy_data pd;
 
-       /* active throtl group service_tree member */
+       /* active throtl group service_queue member */
        struct rb_node rb_node;
 
+       /* throtl_data this group belongs to */
+       struct throtl_data *td;
+
+       /* this group's service queue */
+       struct throtl_service_queue service_queue;
+
        /*
         * Dispatch time in jiffies. This is the estimated time when group
         * will unthrottle and is ready to dispatch more bio. It is used as
@@ -85,9 +90,6 @@ struct throtl_grp {
        unsigned long slice_start[2];
        unsigned long slice_end[2];
 
-       /* Some throttle limits got updated for the group */
-       int limits_changed;
-
        /* Per cpu stats pointer */
        struct tg_stats_cpu __percpu *stats_cpu;
 
@@ -98,7 +100,7 @@ struct throtl_grp {
 struct throtl_data
 {
        /* service tree for active throtl groups */
-       struct throtl_rb_root tg_service_tree;
+       struct throtl_service_queue service_queue;
 
        struct request_queue *queue;
 
@@ -111,9 +113,7 @@ struct throtl_data
        unsigned int nr_undestroyed_grps;
 
        /* Work for dispatching throttled bios */
-       struct delayed_work throtl_work;
-
-       int limits_changed;
+       struct delayed_work dispatch_work;
 };
 
 /* list and work item to allocate percpu group stats */
@@ -143,41 +143,16 @@ static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
        return blkg_to_tg(td->queue->root_blkg);
 }
 
-enum tg_state_flags {
-       THROTL_TG_FLAG_on_rr = 0,       /* on round-robin busy list */
-};
-
-#define THROTL_TG_FNS(name)                                            \
-static inline void throtl_mark_tg_##name(struct throtl_grp *tg)                \
-{                                                                      \
-       (tg)->flags |= (1 << THROTL_TG_FLAG_##name);                    \
-}                                                                      \
-static inline void throtl_clear_tg_##name(struct throtl_grp *tg)       \
-{                                                                      \
-       (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);                   \
-}                                                                      \
-static inline int throtl_tg_##name(const struct throtl_grp *tg)                \
-{                                                                      \
-       return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;       \
-}
-
-THROTL_TG_FNS(on_rr);
-
-#define throtl_log_tg(td, tg, fmt, args...)    do {                    \
+#define throtl_log_tg(tg, fmt, args...)        do {                            \
        char __pbuf[128];                                               \
                                                                        \
        blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf));              \
-       blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \
+       blk_add_trace_msg((tg)->td->queue, "throtl %s " fmt, __pbuf, ##args); \
 } while (0)
 
 #define throtl_log(td, fmt, args...)   \
        blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
 
-static inline unsigned int total_nr_queued(struct throtl_data *td)
-{
-       return td->nr_queued[0] + td->nr_queued[1];
-}
-
 /*
  * Worker for allocating per cpu stat for tgs. This is scheduled on the
  * system_wq once there are some groups on the alloc_list waiting for
@@ -215,15 +190,22 @@ alloc_stats:
                goto alloc_stats;
 }
 
+/* init a service_queue, assumes the caller zeroed it */
+static void throtl_service_queue_init(struct throtl_service_queue *sq)
+{
+       sq->pending_tree = RB_ROOT;
+}
+
 static void throtl_pd_init(struct blkcg_gq *blkg)
 {
        struct throtl_grp *tg = blkg_to_tg(blkg);
        unsigned long flags;
 
+       throtl_service_queue_init(&tg->service_queue);
        RB_CLEAR_NODE(&tg->rb_node);
+       tg->td = blkg->q->td;
        bio_list_init(&tg->bio_lists[0]);
        bio_list_init(&tg->bio_lists[1]);
-       tg->limits_changed = false;
 
        tg->bps[READ] = -1;
        tg->bps[WRITE] = -1;
@@ -309,17 +291,18 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
        return tg;
 }
 
-static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
+static struct throtl_grp *
+throtl_rb_first(struct throtl_service_queue *parent_sq)
 {
        /* Service tree is empty */
-       if (!root->count)
+       if (!parent_sq->nr_pending)
                return NULL;
 
-       if (!root->left)
-               root->left = rb_first(&root->rb);
+       if (!parent_sq->first_pending)
+               parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
 
-       if (root->left)
-               return rb_entry_tg(root->left);
+       if (parent_sq->first_pending)
+               return rb_entry_tg(parent_sq->first_pending);
 
        return NULL;
 }
@@ -330,29 +313,30 @@ static void rb_erase_init(struct rb_node *n, struct rb_root *root)
        RB_CLEAR_NODE(n);
 }
 
-static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
+static void throtl_rb_erase(struct rb_node *n,
+                           struct throtl_service_queue *parent_sq)
 {
-       if (root->left == n)
-               root->left = NULL;
-       rb_erase_init(n, &root->rb);
-       --root->count;
+       if (parent_sq->first_pending == n)
+               parent_sq->first_pending = NULL;
+       rb_erase_init(n, &parent_sq->pending_tree);
+       --parent_sq->nr_pending;
 }
 
-static void update_min_dispatch_time(struct throtl_rb_root *st)
+static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
 {
        struct throtl_grp *tg;
 
-       tg = throtl_rb_first(st);
+       tg = throtl_rb_first(parent_sq);
        if (!tg)
                return;
 
-       st->min_disptime = tg->disptime;
+       parent_sq->first_pending_disptime = tg->disptime;
 }
 
-static void
-tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
+static void tg_service_queue_add(struct throtl_grp *tg,
+                                struct throtl_service_queue *parent_sq)
 {
-       struct rb_node **node = &st->rb.rb_node;
+       struct rb_node **node = &parent_sq->pending_tree.rb_node;
        struct rb_node *parent = NULL;
        struct throtl_grp *__tg;
        unsigned long key = tg->disptime;
@@ -371,89 +355,95 @@ tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
        }
 
        if (left)
-               st->left = &tg->rb_node;
+               parent_sq->first_pending = &tg->rb_node;
 
        rb_link_node(&tg->rb_node, parent, node);
-       rb_insert_color(&tg->rb_node, &st->rb);
+       rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
 }
 
-static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
+static void __throtl_enqueue_tg(struct throtl_grp *tg,
+                               struct throtl_service_queue *parent_sq)
 {
-       struct throtl_rb_root *st = &td->tg_service_tree;
+       tg_service_queue_add(tg, parent_sq);
+       tg->flags |= THROTL_TG_PENDING;
+       parent_sq->nr_pending++;
+}
 
-       tg_service_tree_add(st, tg);
-       throtl_mark_tg_on_rr(tg);
-       st->count++;
+static void throtl_enqueue_tg(struct throtl_grp *tg,
+                             struct throtl_service_queue *parent_sq)
+{
+       if (!(tg->flags & THROTL_TG_PENDING))
+               __throtl_enqueue_tg(tg, parent_sq);
 }
 
-static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
+static void __throtl_dequeue_tg(struct throtl_grp *tg,
+                               struct throtl_service_queue *parent_sq)
 {
-       if (!throtl_tg_on_rr(tg))
-               __throtl_enqueue_tg(td, tg);
+       throtl_rb_erase(&tg->rb_node, parent_sq);
+       tg->flags &= ~THROTL_TG_PENDING;
 }
 
-static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
+static void throtl_dequeue_tg(struct throtl_grp *tg,
+                             struct throtl_service_queue *parent_sq)
 {
-       throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
-       throtl_clear_tg_on_rr(tg);
+       if (tg->flags & THROTL_TG_PENDING)
+               __throtl_dequeue_tg(tg, parent_sq);
 }
 
-static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
+/* Call with queue lock held */
+static void throtl_schedule_delayed_work(struct throtl_data *td,
+                                        unsigned long delay)
 {
-       if (throtl_tg_on_rr(tg))
-               __throtl_dequeue_tg(td, tg);
+       struct delayed_work *dwork = &td->dispatch_work;
+
+       mod_delayed_work(kthrotld_workqueue, dwork, delay);
+       throtl_log(td, "schedule work. delay=%lu jiffies=%lu", delay, jiffies);
 }
 
 static void throtl_schedule_next_dispatch(struct throtl_data *td)
 {
-       struct throtl_rb_root *st = &td->tg_service_tree;
+       struct throtl_service_queue *sq = &td->service_queue;
 
-       /*
-        * If there are more bios pending, schedule more work.
-        */
-       if (!total_nr_queued(td))
+       /* any pending children left? */
+       if (!sq->nr_pending)
                return;
 
-       BUG_ON(!st->count);
+       update_min_dispatch_time(sq);
 
-       update_min_dispatch_time(st);
-
-       if (time_before_eq(st->min_disptime, jiffies))
+       if (time_before_eq(sq->first_pending_disptime, jiffies))
                throtl_schedule_delayed_work(td, 0);
        else
-               throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
+               throtl_schedule_delayed_work(td, sq->first_pending_disptime - jiffies);
 }
 
-static inline void
-throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
 {
        tg->bytes_disp[rw] = 0;
        tg->io_disp[rw] = 0;
        tg->slice_start[rw] = jiffies;
        tg->slice_end[rw] = jiffies + throtl_slice;
-       throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
+       throtl_log_tg(tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
                        rw == READ ? 'R' : 'W', tg->slice_start[rw],
                        tg->slice_end[rw], jiffies);
 }
 
-static inline void throtl_set_slice_end(struct throtl_data *td,
-               struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
+static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
+                                       unsigned long jiffy_end)
 {
        tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
 }
 
-static inline void throtl_extend_slice(struct throtl_data *td,
-               struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
+static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
+                                      unsigned long jiffy_end)
 {
        tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
-       throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
+       throtl_log_tg(tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
                        rw == READ ? 'R' : 'W', tg->slice_start[rw],
                        tg->slice_end[rw], jiffies);
 }
 
 /* Determine if previously allocated or extended slice is complete or not */
-static bool
-throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
 {
        if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
                return 0;
@@ -462,8 +452,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 }
 
 /* Trim the used slices and adjust slice start accordingly */
-static inline void
-throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
 {
        unsigned long nr_slices, time_elapsed, io_trim;
        u64 bytes_trim, tmp;
@@ -475,7 +464,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
         * renewed. Don't try to trim the slice if slice is used. A new
         * slice will start when appropriate.
         */
-       if (throtl_slice_used(td, tg, rw))
+       if (throtl_slice_used(tg, rw))
                return;
 
        /*
@@ -486,7 +475,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
         * is bad because it does not allow new slice to start.
         */
 
-       throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);
+       throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
 
        time_elapsed = jiffies - tg->slice_start[rw];
 
@@ -515,14 +504,14 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 
        tg->slice_start[rw] += nr_slices * throtl_slice;
 
-       throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
+       throtl_log_tg(tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
                        " start=%lu end=%lu jiffies=%lu",
                        rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
                        tg->slice_start[rw], tg->slice_end[rw], jiffies);
 }
 
-static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
-               struct bio *bio, unsigned long *wait)
+static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
+                                 unsigned long *wait)
 {
        bool rw = bio_data_dir(bio);
        unsigned int io_allowed;
@@ -571,8 +560,8 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
        return 0;
 }
 
-static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
-               struct bio *bio, unsigned long *wait)
+static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
+                                unsigned long *wait)
 {
        bool rw = bio_data_dir(bio);
        u64 bytes_allowed, extra_bytes, tmp;
@@ -623,8 +612,8 @@ static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
  * Returns whether one can dispatch a bio or not. Also returns approx number
  * of jiffies to wait before this bio is with-in IO rate and can be dispatched
  */
-static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
-                               struct bio *bio, unsigned long *wait)
+static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
+                           unsigned long *wait)
 {
        bool rw = bio_data_dir(bio);
        unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
@@ -649,15 +638,15 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
         * existing slice to make sure it is at least throtl_slice interval
         * long since now.
         */
-       if (throtl_slice_used(td, tg, rw))
-               throtl_start_new_slice(td, tg, rw);
+       if (throtl_slice_used(tg, rw))
+               throtl_start_new_slice(tg, rw);
        else {
                if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
-                       throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
+                       throtl_extend_slice(tg, rw, jiffies + throtl_slice);
        }
 
-       if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
-           && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
+       if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
+           tg_with_in_iops_limit(tg, bio, &iops_wait)) {
                if (wait)
                        *wait = 0;
                return 1;
@@ -669,7 +658,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
                *wait = max_wait;
 
        if (time_before(tg->slice_end[rw], jiffies + max_wait))
-               throtl_extend_slice(td, tg, rw, jiffies + max_wait);
+               throtl_extend_slice(tg, rw, jiffies + max_wait);
 
        return 0;
 }
@@ -711,8 +700,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
        throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
 }
 
-static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
-                       struct bio *bio)
+static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg,
+                             struct throtl_service_queue *parent_sq)
 {
        bool rw = bio_data_dir(bio);
 
@@ -720,32 +709,33 @@ static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
        /* Take a bio reference on tg */
        blkg_get(tg_to_blkg(tg));
        tg->nr_queued[rw]++;
-       td->nr_queued[rw]++;
-       throtl_enqueue_tg(td, tg);
+       tg->td->nr_queued[rw]++;
+       throtl_enqueue_tg(tg, parent_sq);
 }
 
-static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
+static void tg_update_disptime(struct throtl_grp *tg,
+                              struct throtl_service_queue *parent_sq)
 {
        unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
        struct bio *bio;
 
        if ((bio = bio_list_peek(&tg->bio_lists[READ])))
-               tg_may_dispatch(td, tg, bio, &read_wait);
+               tg_may_dispatch(tg, bio, &read_wait);
 
        if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
-               tg_may_dispatch(td, tg, bio, &write_wait);
+               tg_may_dispatch(tg, bio, &write_wait);
 
        min_wait = min(read_wait, write_wait);
        disptime = jiffies + min_wait;
 
        /* Update dispatch time */
-       throtl_dequeue_tg(td, tg);
+       throtl_dequeue_tg(tg, parent_sq);
        tg->disptime = disptime;
-       throtl_enqueue_tg(td, tg);
+       throtl_enqueue_tg(tg, parent_sq);
 }
 
-static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
-                               bool rw, struct bio_list *bl)
+static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw,
+                               struct bio_list *bl)
 {
        struct bio *bio;
 
@@ -754,18 +744,17 @@ static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
        /* Drop bio reference on blkg */
        blkg_put(tg_to_blkg(tg));
 
-       BUG_ON(td->nr_queued[rw] <= 0);
-       td->nr_queued[rw]--;
+       BUG_ON(tg->td->nr_queued[rw] <= 0);
+       tg->td->nr_queued[rw]--;
 
        throtl_charge_bio(tg, bio);
        bio_list_add(bl, bio);
        bio->bi_rw |= REQ_THROTTLED;
 
-       throtl_trim_slice(td, tg, rw);
+       throtl_trim_slice(tg, rw);
 }
 
-static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
-                               struct bio_list *bl)
+static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl)
 {
        unsigned int nr_reads = 0, nr_writes = 0;
        unsigned int max_nr_reads = throtl_grp_quantum*3/4;
@@ -774,20 +763,20 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
 
        /* Try to dispatch 75% READS and 25% WRITES */
 
-       while ((bio = bio_list_peek(&tg->bio_lists[READ]))
-               && tg_may_dispatch(td, tg, bio, NULL)) {
+       while ((bio = bio_list_peek(&tg->bio_lists[READ])) &&
+              tg_may_dispatch(tg, bio, NULL)) {
 
-               tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
+               tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
                nr_reads++;
 
                if (nr_reads >= max_nr_reads)
                        break;
        }
 
-       while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
-               && tg_may_dispatch(td, tg, bio, NULL)) {
+       while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) &&
+              tg_may_dispatch(tg, bio, NULL)) {
 
-               tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
+               tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
                nr_writes++;
 
                if (nr_writes >= max_nr_writes)
@@ -797,14 +786,14 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
        return nr_reads + nr_writes;
 }
 
-static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
+static int throtl_select_dispatch(struct throtl_service_queue *parent_sq,
+                                 struct bio_list *bl)
 {
        unsigned int nr_disp = 0;
        struct throtl_grp *tg;
-       struct throtl_rb_root *st = &td->tg_service_tree;
 
        while (1) {
-               tg = throtl_rb_first(st);
+               tg = throtl_rb_first(parent_sq);
 
                if (!tg)
                        break;
@@ -812,14 +801,12 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
                if (time_before(jiffies, tg->disptime))
                        break;
 
-               throtl_dequeue_tg(td, tg);
+               throtl_dequeue_tg(tg, parent_sq);
 
-               nr_disp += throtl_dispatch_tg(td, tg, bl);
+               nr_disp += throtl_dispatch_tg(tg, bl);
 
-               if (tg->nr_queued[0] || tg->nr_queued[1]) {
-                       tg_update_disptime(td, tg);
-                       throtl_enqueue_tg(td, tg);
-               }
+               if (tg->nr_queued[0] || tg->nr_queued[1])
+                       tg_update_disptime(tg, parent_sq);
 
                if (nr_disp >= throtl_quantum)
                        break;
@@ -828,49 +815,12 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
        return nr_disp;
 }
 
-static void throtl_process_limit_change(struct throtl_data *td)
+/* work function to dispatch throttled bios */
+void blk_throtl_dispatch_work_fn(struct work_struct *work)
 {
+       struct throtl_data *td = container_of(to_delayed_work(work),
+                                             struct throtl_data, dispatch_work);
        struct request_queue *q = td->queue;
-       struct blkcg_gq *blkg, *n;
-
-       if (!td->limits_changed)
-               return;
-
-       xchg(&td->limits_changed, false);
-
-       throtl_log(td, "limits changed");
-
-       list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
-               struct throtl_grp *tg = blkg_to_tg(blkg);
-
-               if (!tg->limits_changed)
-                       continue;
-
-               if (!xchg(&tg->limits_changed, false))
-                       continue;
-
-               throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
-                       " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
-                       tg->iops[READ], tg->iops[WRITE]);
-
-               /*
-                * Restart the slices for both READ and WRITES. It
-                * might happen that a group's limit are dropped
-                * suddenly and we don't want to account recently
-                * dispatched IO with new low rate
-                */
-               throtl_start_new_slice(td, tg, 0);
-               throtl_start_new_slice(td, tg, 1);
-
-               if (throtl_tg_on_rr(tg))
-                       tg_update_disptime(td, tg);
-       }
-}
-
-/* Dispatch throttled bios. Should be called without queue lock held. */
-static int throtl_dispatch(struct request_queue *q)
-{
-       struct throtl_data *td = q->td;
        unsigned int nr_disp = 0;
        struct bio_list bio_list_on_stack;
        struct bio *bio;
@@ -878,24 +828,19 @@ static int throtl_dispatch(struct request_queue *q)
 
        spin_lock_irq(q->queue_lock);
 
-       throtl_process_limit_change(td);
-
-       if (!total_nr_queued(td))
-               goto out;
-
        bio_list_init(&bio_list_on_stack);
 
        throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
-                       total_nr_queued(td), td->nr_queued[READ],
-                       td->nr_queued[WRITE]);
+                  td->nr_queued[READ] + td->nr_queued[WRITE],
+                  td->nr_queued[READ], td->nr_queued[WRITE]);
 
-       nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
+       nr_disp = throtl_select_dispatch(&td->service_queue, &bio_list_on_stack);
 
        if (nr_disp)
                throtl_log(td, "bios disp=%u", nr_disp);
 
        throtl_schedule_next_dispatch(td);
-out:
+
        spin_unlock_irq(q->queue_lock);
 
        /*
@@ -908,31 +853,6 @@ out:
                        generic_make_request(bio);
                blk_finish_plug(&plug);
        }
-       return nr_disp;
-}
-
-void blk_throtl_work(struct work_struct *work)
-{
-       struct throtl_data *td = container_of(work, struct throtl_data,
-                                       throtl_work.work);
-       struct request_queue *q = td->queue;
-
-       throtl_dispatch(q);
-}
-
-/* Call with queue lock held */
-static void
-throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
-{
-
-       struct delayed_work *dwork = &td->throtl_work;
-
-       /* schedule work if limits changed even if no bio is queued */
-       if (total_nr_queued(td) || td->limits_changed) {
-               mod_delayed_work(kthrotld_workqueue, dwork, delay);
-               throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
-                               delay, jiffies);
-       }
 }
 
 static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
@@ -1025,10 +945,25 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
        else
                *(unsigned int *)((void *)tg + cft->private) = ctx.v;
 
-       /* XXX: we don't need the following deferred processing */
-       xchg(&tg->limits_changed, true);
-       xchg(&td->limits_changed, true);
-       throtl_schedule_delayed_work(td, 0);
+       throtl_log_tg(tg, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
+                     tg->bps[READ], tg->bps[WRITE],
+                     tg->iops[READ], tg->iops[WRITE]);
+
+       /*
+        * We're already holding queue_lock and know @tg is valid.  Let's
+        * apply the new config directly.
+        *
+        * Restart the slices for both READ and WRITES. It might happen
+        * that a group's limit are dropped suddenly and we don't want to
+        * account recently dispatched IO with new low rate.
+        */
+       throtl_start_new_slice(tg, 0);
+       throtl_start_new_slice(tg, 1);
+
+       if (tg->flags & THROTL_TG_PENDING) {
+               tg_update_disptime(tg, &td->service_queue);
+               throtl_schedule_next_dispatch(td);
+       }
 
        blkg_conf_finish(&ctx);
        return 0;
@@ -1092,7 +1027,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
 {
        struct throtl_data *td = q->td;
 
-       cancel_delayed_work_sync(&td->throtl_work);
+       cancel_delayed_work_sync(&td->dispatch_work);
 }
 
 static struct blkcg_policy blkcg_policy_throtl = {
@@ -1153,7 +1088,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
        }
 
        /* Bio is with-in rate limit of group */
-       if (tg_may_dispatch(td, tg, bio, NULL)) {
+       if (tg_may_dispatch(tg, bio, NULL)) {
                throtl_charge_bio(tg, bio);
 
                /*
@@ -1167,12 +1102,12 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
                 *
                 * So keep on trimming slice even if bio is not queued.
                 */
-               throtl_trim_slice(td, tg, rw);
+               throtl_trim_slice(tg, rw);
                goto out_unlock;
        }
 
 queue_bio:
-       throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
+       throtl_log_tg(tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
                        " iodisp=%u iops=%u queued=%d/%d",
                        rw == READ ? 'R' : 'W',
                        tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
@@ -1180,11 +1115,11 @@ queue_bio:
                        tg->nr_queued[READ], tg->nr_queued[WRITE]);
 
        bio_associate_current(bio);
-       throtl_add_bio_tg(q->td, tg, bio);
+       throtl_add_bio_tg(bio, tg, &q->td->service_queue);
        throttled = true;
 
        if (update_disptime) {
-               tg_update_disptime(td, tg);
+               tg_update_disptime(tg, &td->service_queue);
                throtl_schedule_next_dispatch(td);
        }
 
@@ -1206,7 +1141,7 @@ void blk_throtl_drain(struct request_queue *q)
        __releases(q->queue_lock) __acquires(q->queue_lock)
 {
        struct throtl_data *td = q->td;
-       struct throtl_rb_root *st = &td->tg_service_tree;
+       struct throtl_service_queue *parent_sq = &td->service_queue;
        struct throtl_grp *tg;
        struct bio_list bl;
        struct bio *bio;
@@ -1215,13 +1150,13 @@ void blk_throtl_drain(struct request_queue *q)
 
        bio_list_init(&bl);
 
-       while ((tg = throtl_rb_first(st))) {
-               throtl_dequeue_tg(td, tg);
+       while ((tg = throtl_rb_first(parent_sq))) {
+               throtl_dequeue_tg(tg, parent_sq);
 
                while ((bio = bio_list_peek(&tg->bio_lists[READ])))
-                       tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
+                       tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
                while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
-                       tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
+                       tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
        }
        spin_unlock_irq(q->queue_lock);
 
@@ -1240,9 +1175,8 @@ int blk_throtl_init(struct request_queue *q)
        if (!td)
                return -ENOMEM;
 
-       td->tg_service_tree = THROTL_RB_ROOT;
-       td->limits_changed = false;
-       INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
+       INIT_DELAYED_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
+       throtl_service_queue_init(&td->service_queue);
 
        q->td = td;
        td->queue = q;
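
Putting the hunks together, the queuing and dispatch paths after this change can be summarized as below. This is an illustrative sketch only (it assumes kernel context, this file's declarations, and that q->queue_lock is held as in blk_throtl_bio()); the real paths are blk_throtl_bio() and blk_throtl_dispatch_work_fn() above.

/*
 * Illustration only, not part of the patch: how the reworked helpers fit
 * together.  Assumes the declarations from this file and that the caller
 * holds q->queue_lock, as blk_throtl_bio() does.
 */
static void example_throttle_and_dispatch(struct throtl_data *td,
					  struct throtl_grp *tg,
					  struct bio *bio)
{
	struct bio_list bl;

	bio_list_init(&bl);

	/* queuing side: the bio is queued on the tg, and the tg is queued
	 * on the queue-wide service queue, keyed by tg->disptime */
	throtl_add_bio_tg(bio, tg, &td->service_queue);
	tg_update_disptime(tg, &td->service_queue);
	throtl_schedule_next_dispatch(td);		/* arms td->dispatch_work */

	/* dispatch side, normally run from blk_throtl_dispatch_work_fn():
	 * walk the pending tree, dispatch expired groups into a bio list,
	 * then re-arm the work for the new first_pending_disptime */
	throtl_select_dispatch(&td->service_queue, &bl);
	throtl_schedule_next_dispatch(td);
}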