blk-throttle: add throtl_grp->service_queue
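
For orientation: the hunks below replace the old throtl_rb_root with a throtl_service_queue, embed one such queue (plus a throtl_data back-pointer) in every throtl_grp, reduce the THROTL_TG_FNS flag machinery to a plain THROTL_TG_PENDING bit, and rename the dispatch work item. A minimal sketch of the resulting layout, abridged from the hunks that follow (illustration only, not part of the patch):

/* Abridged post-patch layout; the full definitions are in the diff below. */
struct throtl_service_queue {
	struct rb_root	pending_tree;		/* RB tree of active tgs */
	struct rb_node	*first_pending;		/* cached leftmost node */
	unsigned int	nr_pending;		/* # of tgs queued in the tree */
	unsigned long	first_pending_disptime;	/* disptime of the first tg */
};

struct throtl_grp {
	struct rb_node			rb_node;	/* node in the parent's pending_tree */
	struct throtl_data		*td;		/* replaces passing td to every helper */
	struct throtl_service_queue	service_queue;	/* this group's own service queue */
	unsigned long			disptime;	/* sort key in the pending_tree */
	unsigned int			flags;		/* THROTL_TG_PENDING while queued */
	/* ... */
};

struct throtl_data {
	struct throtl_service_queue	service_queue;	/* replaces tg_service_tree */
	struct delayed_work		dispatch_work;	/* replaces throtl_work */
	/* ... */
};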
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 31146225f3d078fb5326e30195a0fa138f1f3e5a..7340440ccfb5857428f5fa7ea83420738f33d378 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -25,18 +25,17 @@ static struct blkcg_policy blkcg_policy_throtl;
 
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
-static void throtl_schedule_delayed_work(struct throtl_data *td,
-                               unsigned long delay);
 
-struct throtl_rb_root {
-       struct rb_root rb;
-       struct rb_node *left;
-       unsigned int count;
-       unsigned long min_disptime;
+struct throtl_service_queue {
+       struct rb_root          pending_tree;   /* RB tree of active tgs */
+       struct rb_node          *first_pending; /* first node in the tree */
+       unsigned int            nr_pending;     /* # queued in the tree */
+       unsigned long           first_pending_disptime; /* disptime of the first tg */
 };
 
-#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
-                       .count = 0, .min_disptime = 0}
+enum tg_state_flags {
+       THROTL_TG_PENDING       = 1 << 0,       /* on parent's pending tree */
+};
 
 #define rb_entry_tg(node)      rb_entry((node), struct throtl_grp, rb_node)
 
@@ -52,9 +51,15 @@ struct throtl_grp {
        /* must be the first member */
        struct blkg_policy_data pd;
 
-       /* active throtl group service_tree member */
+       /* active throtl group service_queue member */
        struct rb_node rb_node;
 
+       /* throtl_data this group belongs to */
+       struct throtl_data *td;
+
+       /* this group's service queue */
+       struct throtl_service_queue service_queue;
+
        /*
         * Dispatch time in jiffies. This is the estimated time when group
         * will unthrottle and is ready to dispatch more bio. It is used as
@@ -85,9 +90,6 @@ struct throtl_grp {
        unsigned long slice_start[2];
        unsigned long slice_end[2];
 
-       /* Some throttle limits got updated for the group */
-       int limits_changed;
-
        /* Per cpu stats pointer */
        struct tg_stats_cpu __percpu *stats_cpu;
 
@@ -98,7 +100,7 @@ struct throtl_grp {
 struct throtl_data
 {
        /* service tree for active throtl groups */
-       struct throtl_rb_root tg_service_tree;
+       struct throtl_service_queue service_queue;
 
        struct request_queue *queue;
 
@@ -111,9 +113,7 @@ struct throtl_data
        unsigned int nr_undestroyed_grps;
 
        /* Work for dispatching throttled bios */
-       struct delayed_work throtl_work;
-
-       int limits_changed;
+       struct delayed_work dispatch_work;
 };
 
 /* list and work item to allocate percpu group stats */
@@ -143,41 +143,16 @@ static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
        return blkg_to_tg(td->queue->root_blkg);
 }
 
-enum tg_state_flags {
-       THROTL_TG_FLAG_on_rr = 0,       /* on round-robin busy list */
-};
-
-#define THROTL_TG_FNS(name)                                            \
-static inline void throtl_mark_tg_##name(struct throtl_grp *tg)                \
-{                                                                      \
-       (tg)->flags |= (1 << THROTL_TG_FLAG_##name);                    \
-}                                                                      \
-static inline void throtl_clear_tg_##name(struct throtl_grp *tg)       \
-{                                                                      \
-       (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);                   \
-}                                                                      \
-static inline int throtl_tg_##name(const struct throtl_grp *tg)                \
-{                                                                      \
-       return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;       \
-}
-
-THROTL_TG_FNS(on_rr);
-
-#define throtl_log_tg(td, tg, fmt, args...)    do {                    \
+#define throtl_log_tg(tg, fmt, args...)        do {                            \
        char __pbuf[128];                                               \
                                                                        \
        blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf));              \
-       blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \
+       blk_add_trace_msg((tg)->td->queue, "throtl %s " fmt, __pbuf, ##args); \
 } while (0)
 
 #define throtl_log(td, fmt, args...)   \
        blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
 
-static inline unsigned int total_nr_queued(struct throtl_data *td)
-{
-       return td->nr_queued[0] + td->nr_queued[1];
-}
-
 /*
  * Worker for allocating per cpu stat for tgs. This is scheduled on the
  * system_wq once there are some groups on the alloc_list waiting for
@@ -215,15 +190,22 @@ alloc_stats:
                goto alloc_stats;
 }
 
+/* init a service_queue, assumes the caller zeroed it */
+static void throtl_service_queue_init(struct throtl_service_queue *sq)
+{
+       sq->pending_tree = RB_ROOT;
+}
+
 static void throtl_pd_init(struct blkcg_gq *blkg)
 {
        struct throtl_grp *tg = blkg_to_tg(blkg);
        unsigned long flags;
 
+       throtl_service_queue_init(&tg->service_queue);
        RB_CLEAR_NODE(&tg->rb_node);
+       tg->td = blkg->q->td;
        bio_list_init(&tg->bio_lists[0]);
        bio_list_init(&tg->bio_lists[1]);
-       tg->limits_changed = false;
 
        tg->bps[READ] = -1;
        tg->bps[WRITE] = -1;
@@ -309,17 +291,18 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
        return tg;
 }
 
-static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
+static struct throtl_grp *
+throtl_rb_first(struct throtl_service_queue *parent_sq)
 {
        /* Service tree is empty */
-       if (!root->count)
+       if (!parent_sq->nr_pending)
                return NULL;
 
-       if (!root->left)
-               root->left = rb_first(&root->rb);
+       if (!parent_sq->first_pending)
+               parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
 
-       if (root->left)
-               return rb_entry_tg(root->left);
+       if (parent_sq->first_pending)
+               return rb_entry_tg(parent_sq->first_pending);
 
        return NULL;
 }
@@ -330,29 +313,30 @@ static void rb_erase_init(struct rb_node *n, struct rb_root *root)
        RB_CLEAR_NODE(n);
 }
 
-static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
+static void throtl_rb_erase(struct rb_node *n,
+                           struct throtl_service_queue *parent_sq)
 {
-       if (root->left == n)
-               root->left = NULL;
-       rb_erase_init(n, &root->rb);
-       --root->count;
+       if (parent_sq->first_pending == n)
+               parent_sq->first_pending = NULL;
+       rb_erase_init(n, &parent_sq->pending_tree);
+       --parent_sq->nr_pending;
 }
 
-static void update_min_dispatch_time(struct throtl_rb_root *st)
+static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
 {
        struct throtl_grp *tg;
 
-       tg = throtl_rb_first(st);
+       tg = throtl_rb_first(parent_sq);
        if (!tg)
                return;
 
-       st->min_disptime = tg->disptime;
+       parent_sq->first_pending_disptime = tg->disptime;
 }
 
-static void
-tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
+static void tg_service_queue_add(struct throtl_grp *tg,
+                                struct throtl_service_queue *parent_sq)
 {
-       struct rb_node **node = &st->rb.rb_node;
+       struct rb_node **node = &parent_sq->pending_tree.rb_node;
        struct rb_node *parent = NULL;
        struct throtl_grp *__tg;
        unsigned long key = tg->disptime;
@@ -371,89 +355,95 @@ tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
        }
 
        if (left)
-               st->left = &tg->rb_node;
+               parent_sq->first_pending = &tg->rb_node;
 
        rb_link_node(&tg->rb_node, parent, node);
-       rb_insert_color(&tg->rb_node, &st->rb);
+       rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
 }
 
-static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
+static void __throtl_enqueue_tg(struct throtl_grp *tg,
+                               struct throtl_service_queue *parent_sq)
 {
-       struct throtl_rb_root *st = &td->tg_service_tree;
+       tg_service_queue_add(tg, parent_sq);
+       tg->flags |= THROTL_TG_PENDING;
+       parent_sq->nr_pending++;
+}
 
-       tg_service_tree_add(st, tg);
-       throtl_mark_tg_on_rr(tg);
-       st->count++;
+static void throtl_enqueue_tg(struct throtl_grp *tg,
+                             struct throtl_service_queue *parent_sq)
+{
+       if (!(tg->flags & THROTL_TG_PENDING))
+               __throtl_enqueue_tg(tg, parent_sq);
 }
 
-static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
+static void __throtl_dequeue_tg(struct throtl_grp *tg,
+                               struct throtl_service_queue *parent_sq)
 {
-       if (!throtl_tg_on_rr(tg))
-               __throtl_enqueue_tg(td, tg);
+       throtl_rb_erase(&tg->rb_node, parent_sq);
+       tg->flags &= ~THROTL_TG_PENDING;
 }
 
-static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
+static void throtl_dequeue_tg(struct throtl_grp *tg,
+                             struct throtl_service_queue *parent_sq)
 {
-       throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
-       throtl_clear_tg_on_rr(tg);
+       if (tg->flags & THROTL_TG_PENDING)
+               __throtl_dequeue_tg(tg, parent_sq);
 }
 
-static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
+/* Call with queue lock held */
+static void throtl_schedule_delayed_work(struct throtl_data *td,
+                                        unsigned long delay)
 {
-       if (throtl_tg_on_rr(tg))
-               __throtl_dequeue_tg(td, tg);
+       struct delayed_work *dwork = &td->dispatch_work;
+
+       mod_delayed_work(kthrotld_workqueue, dwork, delay);
+       throtl_log(td, "schedule work. delay=%lu jiffies=%lu", delay, jiffies);
 }
 
 static void throtl_schedule_next_dispatch(struct throtl_data *td)
 {
-       struct throtl_rb_root *st = &td->tg_service_tree;
+       struct throtl_service_queue *sq = &td->service_queue;
 
-       /*
-        * If there are more bios pending, schedule more work.
-        */
-       if (!total_nr_queued(td))
+       /* any pending children left? */
+       if (!sq->nr_pending)
                return;
 
-       BUG_ON(!st->count);
+       update_min_dispatch_time(sq);
 
-       update_min_dispatch_time(st);
-
-       if (time_before_eq(st->min_disptime, jiffies))
+       if (time_before_eq(sq->first_pending_disptime, jiffies))
                throtl_schedule_delayed_work(td, 0);
        else
-               throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
+               throtl_schedule_delayed_work(td, sq->first_pending_disptime - jiffies);
 }
 
-static inline void
-throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
 {
        tg->bytes_disp[rw] = 0;
        tg->io_disp[rw] = 0;
        tg->slice_start[rw] = jiffies;
        tg->slice_end[rw] = jiffies + throtl_slice;
-       throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
+       throtl_log_tg(tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
                        rw == READ ? 'R' : 'W', tg->slice_start[rw],
                        tg->slice_end[rw], jiffies);
 }
 
-static inline void throtl_set_slice_end(struct throtl_data *td,
-               struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
+static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
+                                       unsigned long jiffy_end)
 {
        tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
 }
 
-static inline void throtl_extend_slice(struct throtl_data *td,
-               struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
+static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
+                                      unsigned long jiffy_end)
 {
        tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
-       throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
+       throtl_log_tg(tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
                        rw == READ ? 'R' : 'W', tg->slice_start[rw],
                        tg->slice_end[rw], jiffies);
 }
 
 /* Determine if previously allocated or extended slice is complete or not */
-static bool
-throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
 {
        if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
                return 0;
@@ -462,8 +452,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 }
 
 /* Trim the used slices and adjust slice start accordingly */
-static inline void
-throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
 {
        unsigned long nr_slices, time_elapsed, io_trim;
        u64 bytes_trim, tmp;
@@ -475,7 +464,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
         * renewed. Don't try to trim the slice if slice is used. A new
         * slice will start when appropriate.
         */
-       if (throtl_slice_used(td, tg, rw))
+       if (throtl_slice_used(tg, rw))
                return;
 
        /*
@@ -486,7 +475,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
         * is bad because it does not allow new slice to start.
         */
 
-       throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);
+       throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
 
        time_elapsed = jiffies - tg->slice_start[rw];
 
@@ -515,14 +504,14 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 
        tg->slice_start[rw] += nr_slices * throtl_slice;
 
-       throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
+       throtl_log_tg(tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
                        " start=%lu end=%lu jiffies=%lu",
                        rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
                        tg->slice_start[rw], tg->slice_end[rw], jiffies);
 }
 
-static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
-               struct bio *bio, unsigned long *wait)
+static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
+                                 unsigned long *wait)
 {
        bool rw = bio_data_dir(bio);
        unsigned int io_allowed;
@@ -571,8 +560,8 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
        return 0;
 }
 
-static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
-               struct bio *bio, unsigned long *wait)
+static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
+                                unsigned long *wait)
 {
        bool rw = bio_data_dir(bio);
        u64 bytes_allowed, extra_bytes, tmp;
@@ -623,8 +612,8 @@ static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
  * Returns whether one can dispatch a bio or not. Also returns approx number
  * of jiffies to wait before this bio is with-in IO rate and can be dispatched
  */
-static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
-                               struct bio *bio, unsigned long *wait)
+static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
+                           unsigned long *wait)
 {
        bool rw = bio_data_dir(bio);
        unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
@@ -649,15 +638,15 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
         * existing slice to make sure it is at least throtl_slice interval
         * long since now.
         */
-       if (throtl_slice_used(td, tg, rw))
-               throtl_start_new_slice(td, tg, rw);
+       if (throtl_slice_used(tg, rw))
+               throtl_start_new_slice(tg, rw);
        else {
                if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
-                       throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
+                       throtl_extend_slice(tg, rw, jiffies + throtl_slice);
        }
 
-       if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
-           && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
+       if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
+           tg_with_in_iops_limit(tg, bio, &iops_wait)) {
                if (wait)
                        *wait = 0;
                return 1;
@@ -669,7 +658,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
                *wait = max_wait;
 
        if (time_before(tg->slice_end[rw], jiffies + max_wait))
-               throtl_extend_slice(td, tg, rw, jiffies + max_wait);
+               throtl_extend_slice(tg, rw, jiffies + max_wait);
 
        return 0;
 }
@@ -711,8 +700,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
        throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
 }
 
-static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
-                       struct bio *bio)
+static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg,
+                             struct throtl_service_queue *parent_sq)
 {
        bool rw = bio_data_dir(bio);
 
@@ -720,32 +709,33 @@ static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
        /* Take a bio reference on tg */
        blkg_get(tg_to_blkg(tg));
        tg->nr_queued[rw]++;
-       td->nr_queued[rw]++;
-       throtl_enqueue_tg(td, tg);
+       tg->td->nr_queued[rw]++;
+       throtl_enqueue_tg(tg, parent_sq);
 }
 
-static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
+static void tg_update_disptime(struct throtl_grp *tg,
+                              struct throtl_service_queue *parent_sq)
 {
        unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
        struct bio *bio;
 
        if ((bio = bio_list_peek(&tg->bio_lists[READ])))
-               tg_may_dispatch(td, tg, bio, &read_wait);
+               tg_may_dispatch(tg, bio, &read_wait);
 
        if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
-               tg_may_dispatch(td, tg, bio, &write_wait);
+               tg_may_dispatch(tg, bio, &write_wait);
 
        min_wait = min(read_wait, write_wait);
        disptime = jiffies + min_wait;
 
        /* Update dispatch time */
-       throtl_dequeue_tg(td, tg);
+       throtl_dequeue_tg(tg, parent_sq);
        tg->disptime = disptime;
-       throtl_enqueue_tg(td, tg);
+       throtl_enqueue_tg(tg, parent_sq);
 }
 
-static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
-                               bool rw, struct bio_list *bl)
+static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw,
+                               struct bio_list *bl)
 {
        struct bio *bio;
 
@@ -754,18 +744,17 @@ static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
        /* Drop bio reference on blkg */
        blkg_put(tg_to_blkg(tg));
 
-       BUG_ON(td->nr_queued[rw] <= 0);
-       td->nr_queued[rw]--;
+       BUG_ON(tg->td->nr_queued[rw] <= 0);
+       tg->td->nr_queued[rw]--;
 
        throtl_charge_bio(tg, bio);
        bio_list_add(bl, bio);
        bio->bi_rw |= REQ_THROTTLED;
 
-       throtl_trim_slice(td, tg, rw);
+       throtl_trim_slice(tg, rw);
 }
 
-static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
-                               struct bio_list *bl)
+static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl)
 {
        unsigned int nr_reads = 0, nr_writes = 0;
        unsigned int max_nr_reads = throtl_grp_quantum*3/4;
@@ -774,20 +763,20 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
 
        /* Try to dispatch 75% READS and 25% WRITES */
 
-       while ((bio = bio_list_peek(&tg->bio_lists[READ]))
-               && tg_may_dispatch(td, tg, bio, NULL)) {
+       while ((bio = bio_list_peek(&tg->bio_lists[READ])) &&
+              tg_may_dispatch(tg, bio, NULL)) {
 
-               tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
+               tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
                nr_reads++;
 
                if (nr_reads >= max_nr_reads)
                        break;
        }
 
-       while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
-               && tg_may_dispatch(td, tg, bio, NULL)) {
+       while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) &&
+              tg_may_dispatch(tg, bio, NULL)) {
 
-               tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
+               tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
                nr_writes++;
 
                if (nr_writes >= max_nr_writes)
@@ -797,14 +786,14 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
        return nr_reads + nr_writes;
 }
 
-static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
+static int throtl_select_dispatch(struct throtl_service_queue *parent_sq,
+                                 struct bio_list *bl)
 {
        unsigned int nr_disp = 0;
        struct throtl_grp *tg;
-       struct throtl_rb_root *st = &td->tg_service_tree;
 
        while (1) {
-               tg = throtl_rb_first(st);
+               tg = throtl_rb_first(parent_sq);
 
                if (!tg)
                        break;
@@ -812,14 +801,12 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
                if (time_before(jiffies, tg->disptime))
                        break;
 
-               throtl_dequeue_tg(td, tg);
+               throtl_dequeue_tg(tg, parent_sq);
 
-               nr_disp += throtl_dispatch_tg(td, tg, bl);
+               nr_disp += throtl_dispatch_tg(tg, bl);
 
-               if (tg->nr_queued[0] || tg->nr_queued[1]) {
-                       tg_update_disptime(td, tg);
-                       throtl_enqueue_tg(td, tg);
-               }
+               if (tg->nr_queued[0] || tg->nr_queued[1])
+                       tg_update_disptime(tg, parent_sq);
 
                if (nr_disp >= throtl_quantum)
                        break;
@@ -828,49 +815,12 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
        return nr_disp;
 }
 
-static void throtl_process_limit_change(struct throtl_data *td)
+/* work function to dispatch throttled bios */
+void blk_throtl_dispatch_work_fn(struct work_struct *work)
 {
+       struct throtl_data *td = container_of(to_delayed_work(work),
+                                             struct throtl_data, dispatch_work);
        struct request_queue *q = td->queue;
-       struct blkcg_gq *blkg, *n;
-
-       if (!td->limits_changed)
-               return;
-
-       xchg(&td->limits_changed, false);
-
-       throtl_log(td, "limits changed");
-
-       list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
-               struct throtl_grp *tg = blkg_to_tg(blkg);
-
-               if (!tg->limits_changed)
-                       continue;
-
-               if (!xchg(&tg->limits_changed, false))
-                       continue;
-
-               throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
-                       " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
-                       tg->iops[READ], tg->iops[WRITE]);
-
-               /*
-                * Restart the slices for both READ and WRITES. It
-                * might happen that a group's limit are dropped
-                * suddenly and we don't want to account recently
-                * dispatched IO with new low rate
-                */
-               throtl_start_new_slice(td, tg, 0);
-               throtl_start_new_slice(td, tg, 1);
-
-               if (throtl_tg_on_rr(tg))
-                       tg_update_disptime(td, tg);
-       }
-}
-
-/* Dispatch throttled bios. Should be called without queue lock held. */
-static int throtl_dispatch(struct request_queue *q)
-{
-       struct throtl_data *td = q->td;
        unsigned int nr_disp = 0;
        struct bio_list bio_list_on_stack;
        struct bio *bio;
@@ -878,24 +828,19 @@ static int throtl_dispatch(struct request_queue *q)
 
        spin_lock_irq(q->queue_lock);
 
-       throtl_process_limit_change(td);
-
-       if (!total_nr_queued(td))
-               goto out;
-
        bio_list_init(&bio_list_on_stack);
 
        throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
-                       total_nr_queued(td), td->nr_queued[READ],
-                       td->nr_queued[WRITE]);
+                  td->nr_queued[READ] + td->nr_queued[WRITE],
+                  td->nr_queued[READ], td->nr_queued[WRITE]);
 
-       nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
+       nr_disp = throtl_select_dispatch(&td->service_queue, &bio_list_on_stack);
 
        if (nr_disp)
                throtl_log(td, "bios disp=%u", nr_disp);
 
        throtl_schedule_next_dispatch(td);
-out:
+
        spin_unlock_irq(q->queue_lock);
 
        /*
@@ -908,31 +853,6 @@ out:
                        generic_make_request(bio);
                blk_finish_plug(&plug);
        }
-       return nr_disp;
-}
-
-void blk_throtl_work(struct work_struct *work)
-{
-       struct throtl_data *td = container_of(work, struct throtl_data,
-                                       throtl_work.work);
-       struct request_queue *q = td->queue;
-
-       throtl_dispatch(q);
-}
-
-/* Call with queue lock held */
-static void
-throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
-{
-
-       struct delayed_work *dwork = &td->throtl_work;
-
-       /* schedule work if limits changed even if no bio is queued */
-       if (total_nr_queued(td) || td->limits_changed) {
-               mod_delayed_work(kthrotld_workqueue, dwork, delay);
-               throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
-                               delay, jiffies);
-       }
 }
 
 static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
@@ -1025,10 +945,25 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
        else
                *(unsigned int *)((void *)tg + cft->private) = ctx.v;
 
-       /* XXX: we don't need the following deferred processing */
-       xchg(&tg->limits_changed, true);
-       xchg(&td->limits_changed, true);
-       throtl_schedule_delayed_work(td, 0);
+       throtl_log_tg(tg, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
+                     tg->bps[READ], tg->bps[WRITE],
+                     tg->iops[READ], tg->iops[WRITE]);
+
+       /*
+        * We're already holding queue_lock and know @tg is valid.  Let's
+        * apply the new config directly.
+        *
+        * Restart the slices for both READ and WRITES. It might happen
+        * that a group's limit are dropped suddenly and we don't want to
+        * account recently dispatched IO with new low rate.
+        */
+       throtl_start_new_slice(tg, 0);
+       throtl_start_new_slice(tg, 1);
+
+       if (tg->flags & THROTL_TG_PENDING) {
+               tg_update_disptime(tg, &td->service_queue);
+               throtl_schedule_next_dispatch(td);
+       }
 
        blkg_conf_finish(&ctx);
        return 0;
@@ -1092,7 +1027,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
 {
        struct throtl_data *td = q->td;
 
-       cancel_delayed_work_sync(&td->throtl_work);
+       cancel_delayed_work_sync(&td->dispatch_work);
 }
 
 static struct blkcg_policy blkcg_policy_throtl = {
@@ -1153,7 +1088,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
        }
 
        /* Bio is with-in rate limit of group */
-       if (tg_may_dispatch(td, tg, bio, NULL)) {
+       if (tg_may_dispatch(tg, bio, NULL)) {
                throtl_charge_bio(tg, bio);
 
                /*
@@ -1167,12 +1102,12 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
                 *
                 * So keep on trimming slice even if bio is not queued.
                 */
-               throtl_trim_slice(td, tg, rw);
+               throtl_trim_slice(tg, rw);
                goto out_unlock;
        }
 
 queue_bio:
-       throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
+       throtl_log_tg(tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
                        " iodisp=%u iops=%u queued=%d/%d",
                        rw == READ ? 'R' : 'W',
                        tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
@@ -1180,11 +1115,11 @@ queue_bio:
                        tg->nr_queued[READ], tg->nr_queued[WRITE]);
 
        bio_associate_current(bio);
-       throtl_add_bio_tg(q->td, tg, bio);
+       throtl_add_bio_tg(bio, tg, &q->td->service_queue);
        throttled = true;
 
        if (update_disptime) {
-               tg_update_disptime(td, tg);
+               tg_update_disptime(tg, &td->service_queue);
                throtl_schedule_next_dispatch(td);
        }
 
@@ -1206,7 +1141,7 @@ void blk_throtl_drain(struct request_queue *q)
        __releases(q->queue_lock) __acquires(q->queue_lock)
 {
        struct throtl_data *td = q->td;
-       struct throtl_rb_root *st = &td->tg_service_tree;
+       struct throtl_service_queue *parent_sq = &td->service_queue;
        struct throtl_grp *tg;
        struct bio_list bl;
        struct bio *bio;
@@ -1215,13 +1150,13 @@ void blk_throtl_drain(struct request_queue *q)
 
        bio_list_init(&bl);
 
-       while ((tg = throtl_rb_first(st))) {
-               throtl_dequeue_tg(td, tg);
+       while ((tg = throtl_rb_first(parent_sq))) {
+               throtl_dequeue_tg(tg, parent_sq);
 
                while ((bio = bio_list_peek(&tg->bio_lists[READ])))
-                       tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
+                       tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
                while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
-                       tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
+                       tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
        }
        spin_unlock_irq(q->queue_lock);
 
@@ -1240,9 +1175,8 @@ int blk_throtl_init(struct request_queue *q)
        if (!td)
                return -ENOMEM;
 
-       td->tg_service_tree = THROTL_RB_ROOT;
-       td->limits_changed = false;
-       INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
+       INIT_DELAYED_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
+       throtl_service_queue_init(&td->service_queue);
 
        q->td = td;
        td->queue = q;
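
Putting the hunks together, the queuing and dispatch paths after this change can be summarized as below. This is an illustrative sketch only (it assumes kernel context, this file's declarations, and that q->queue_lock is held as in blk_throtl_bio()); the real paths are blk_throtl_bio() and blk_throtl_dispatch_work_fn() above.

/*
 * Illustration only, not part of the patch: how the reworked helpers fit
 * together.  Assumes the declarations from this file and that the caller
 * holds q->queue_lock, as blk_throtl_bio() does.
 */
static void example_throttle_and_dispatch(struct throtl_data *td,
					  struct throtl_grp *tg,
					  struct bio *bio)
{
	struct bio_list bl;

	bio_list_init(&bl);

	/* queuing side: the bio is queued on the tg, and the tg is queued
	 * on the queue-wide service queue, keyed by tg->disptime */
	throtl_add_bio_tg(bio, tg, &td->service_queue);
	tg_update_disptime(tg, &td->service_queue);
	throtl_schedule_next_dispatch(td);		/* arms td->dispatch_work */

	/* dispatch side, normally run from blk_throtl_dispatch_work_fn():
	 * walk the pending tree, dispatch expired groups into a bio list,
	 * then re-arm the work for the new first_pending_disptime */
	throtl_select_dispatch(&td->service_queue, &bl);
	throtl_schedule_next_dispatch(td);
}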