sched/core: move IO scheduling accounting from io_schedule_timeout() into scheduler

[mirror_ubuntu-bionic-kernel.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index c56fb57f2991ef4f2a68395c534b32d3053ae208..9fd37169b302c1ae2e87e0754cec8685a4a939fe 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -102,9 +102,12 @@ void update_rq_clock(struct rq *rq)
  
         lockdep_assert_held(&rq->lock);
  
-       if (rq->clock_skip_update & RQCF_ACT_SKIP)
+       if (rq->clock_update_flags & RQCF_ACT_SKIP)
                 return;
  
+#ifdef CONFIG_SCHED_DEBUG
+       rq->clock_update_flags |= RQCF_UPDATED;
+#endif
         delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
         if (delta < 0)
                 return;
@@ -185,7 +188,7 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
                 rq = task_rq(p);
                 raw_spin_lock(&rq->lock);
                 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
-                       rf->cookie = lockdep_pin_lock(&rq->lock);
+                       rq_pin_lock(rq, rf);
                         return rq;
                 }
                 raw_spin_unlock(&rq->lock);
@@ -225,7 +228,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
                  * pair with the WMB to ensure we must then also see migrating.
                  */
                 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
-                       rf->cookie = lockdep_pin_lock(&rq->lock);
+                       rq_pin_lock(rq, rf);
                         return rq;
                 }
                 raw_spin_unlock(&rq->lock);
@@ -1195,9 +1198,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
                  * OK, since we're going to drop the lock immediately
                  * afterwards anyway.
                  */
-               lockdep_unpin_lock(&rq->lock, rf.cookie);
+               rq_unpin_lock(rq, &rf);
                 rq = move_queued_task(rq, p, dest_cpu);
-               lockdep_repin_lock(&rq->lock, rf.cookie);
+               rq_repin_lock(rq, &rf);
         }
  out:
         task_rq_unlock(rq, p, &rf);
@@ -1690,7 +1693,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
   * Mark the task runnable and perform wakeup-preemption.
   */
  static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
-                          struct pin_cookie cookie)
+                          struct rq_flags *rf)
  {
         check_preempt_curr(rq, p, wake_flags);
         p->state = TASK_RUNNING;
@@ -1702,9 +1705,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
                  * Our task @p is fully woken up and running; so its safe to
                  * drop the rq->lock, hereafter rq is only used for statistics.
                  */
-               lockdep_unpin_lock(&rq->lock, cookie);
+               rq_unpin_lock(rq, rf);
                 p->sched_class->task_woken(rq, p);
-               lockdep_repin_lock(&rq->lock, cookie);
+               rq_repin_lock(rq, rf);
         }
  
         if (rq->idle_stamp) {
@@ -1723,7 +1726,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
  
  static void
  ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
-                struct pin_cookie cookie)
+                struct rq_flags *rf)
  {
         int en_flags = ENQUEUE_WAKEUP;
  
@@ -1738,7 +1741,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
  #endif
  
         ttwu_activate(rq, p, en_flags);
-       ttwu_do_wakeup(rq, p, wake_flags, cookie);
+       ttwu_do_wakeup(rq, p, wake_flags, rf);
  }
  
  /*
@@ -1757,7 +1760,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
         if (task_on_rq_queued(p)) {
                 /* check_preempt_curr() may use rq clock */
                 update_rq_clock(rq);
-               ttwu_do_wakeup(rq, p, wake_flags, rf.cookie);
+               ttwu_do_wakeup(rq, p, wake_flags, &rf);
                 ret = 1;
         }
         __task_rq_unlock(rq, &rf);
@@ -1770,15 +1773,15 @@ void sched_ttwu_pending(void)
  {
         struct rq *rq = this_rq();
         struct llist_node *llist = llist_del_all(&rq->wake_list);
-       struct pin_cookie cookie;
         struct task_struct *p;
         unsigned long flags;
+       struct rq_flags rf;
  
         if (!llist)
                 return;
  
         raw_spin_lock_irqsave(&rq->lock, flags);
-       cookie = lockdep_pin_lock(&rq->lock);
+       rq_pin_lock(rq, &rf);
  
         while (llist) {
                 int wake_flags = 0;
@@ -1789,10 +1792,10 @@ void sched_ttwu_pending(void)
                 if (p->sched_remote_wakeup)
                         wake_flags = WF_MIGRATED;
  
-               ttwu_do_activate(rq, p, wake_flags, cookie);
+               ttwu_do_activate(rq, p, wake_flags, &rf);
         }
  
-       lockdep_unpin_lock(&rq->lock, cookie);
+       rq_unpin_lock(rq, &rf);
         raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  
@@ -1881,7 +1884,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
  static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  {
         struct rq *rq = cpu_rq(cpu);
-       struct pin_cookie cookie;
+       struct rq_flags rf;
  
  #if defined(CONFIG_SMP)
         if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
@@ -1892,9 +1895,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  #endif
  
         raw_spin_lock(&rq->lock);
-       cookie = lockdep_pin_lock(&rq->lock);
-       ttwu_do_activate(rq, p, wake_flags, cookie);
-       lockdep_unpin_lock(&rq->lock, cookie);
+       rq_pin_lock(rq, &rf);
+       ttwu_do_activate(rq, p, wake_flags, &rf);
+       rq_unpin_lock(rq, &rf);
         raw_spin_unlock(&rq->lock);
  }
  
@@ -2086,11 +2089,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         p->sched_contributes_to_load = !!task_contributes_to_load(p);
         p->state = TASK_WAKING;
  
+       if (p->in_iowait) {
+               delayacct_blkio_end();
+               atomic_dec(&task_rq(p)->nr_iowait);
+       }
+
         cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
         if (task_cpu(p) != cpu) {
                 wake_flags |= WF_MIGRATED;
                 set_task_cpu(p, cpu);
         }
+
+#else /* CONFIG_SMP */
+
+       if (p->in_iowait) {
+               delayacct_blkio_end();
+               atomic_dec(&task_rq(p)->nr_iowait);
+       }
+
  #endif /* CONFIG_SMP */
  
         ttwu_queue(p, cpu, wake_flags);
@@ -2111,7 +2127,7 @@ out:
   * ensure that this_rq() is locked, @p is bound to this_rq() and not
   * the current task.
   */
-static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
+static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
  {
         struct rq *rq = task_rq(p);
  
@@ -2128,11 +2144,11 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
                  * disabled avoiding further scheduler activity on it and we've
                  * not yet picked a replacement task.
                  */
-               lockdep_unpin_lock(&rq->lock, cookie);
+               rq_unpin_lock(rq, rf);
                 raw_spin_unlock(&rq->lock);
                 raw_spin_lock(&p->pi_lock);
                 raw_spin_lock(&rq->lock);
-               lockdep_repin_lock(&rq->lock, cookie);
+               rq_repin_lock(rq, rf);
         }
  
         if (!(p->state & TASK_NORMAL))
@@ -2140,10 +2156,15 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
  
         trace_sched_waking(p);
  
-       if (!task_on_rq_queued(p))
+       if (!task_on_rq_queued(p)) {
+               if (p->in_iowait) {
+                       delayacct_blkio_end();
+                       atomic_dec(&rq->nr_iowait);
+               }
                 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+       }
  
-       ttwu_do_wakeup(rq, p, 0, cookie);
+       ttwu_do_wakeup(rq, p, 0, rf);
         ttwu_stat(p, smp_processor_id(), 0);
  out:
         raw_spin_unlock(&p->pi_lock);
@@ -2578,6 +2599,7 @@ void wake_up_new_task(struct task_struct *p)
         __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
  #endif
         rq = __task_rq_lock(p, &rf);
+       update_rq_clock(rq);
         post_init_entity_util_avg(&p->se);
  
         activate_task(rq, p, 0);
@@ -2590,9 +2612,9 @@ void wake_up_new_task(struct task_struct *p)
                  * Nothing relies on rq->lock after this, so its fine to
                  * drop it.
                  */
-               lockdep_unpin_lock(&rq->lock, rf.cookie);
+               rq_unpin_lock(rq, &rf);
                 p->sched_class->task_woken(rq, p);
-               lockdep_repin_lock(&rq->lock, rf.cookie);
+               rq_repin_lock(rq, &rf);
         }
  #endif
         task_rq_unlock(rq, p, &rf);
@@ -2861,7 +2883,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
   */
  static __always_inline struct rq *
  context_switch(struct rq *rq, struct task_struct *prev,
-              struct task_struct *next, struct pin_cookie cookie)
+              struct task_struct *next, struct rq_flags *rf)
  {
         struct mm_struct *mm, *oldmm;
  
@@ -2887,13 +2909,16 @@ context_switch(struct rq *rq, struct task_struct *prev,
                 prev->active_mm = NULL;
                 rq->prev_mm = oldmm;
         }
+
+       rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+
         /*
          * Since the runqueue lock will be released by the next
          * task (which is an invalid locking op but in the case
          * of the scheduler it's an obvious special-case), so we
          * do an early lockdep release here:
          */
-       lockdep_unpin_lock(&rq->lock, cookie);
+       rq_unpin_lock(rq, rf);
         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
  
         /* Here we just switch the register state and the stack. */
@@ -2949,6 +2974,36 @@ unsigned long long nr_context_switches(void)
         return sum;
  }
  
+/*
+ * IO-wait accounting, and how it's mostly bollocks (on SMP).
+ *
+ * The idea behind IO-wait accounting is to account the idle time that we could
+ * have spent running if it were not for IO. That is, if we were to improve the
+ * storage performance, we'd have a proportional reduction in IO-wait time.
+ *
+ * This all works nicely on UP, where, when a task blocks on IO, we account
+ * idle time as IO-wait, because if the storage were faster, it could've been
+ * running and we'd not be idle.
+ *
+ * This has been extended to SMP, by doing the same for each CPU. This however
+ * is broken.
+ *
+ * Imagine for instance the case where two tasks block on one CPU: only that
+ * one CPU will have IO-wait accounted, while the other has regular idle, even
+ * though, if the storage were faster, both could've run at the same time,
+ * utilising both CPUs.
+ *
+ * This means, that when looking globally, the current IO-wait accounting on
+ * SMP is a lower bound, by reason of under accounting.
+ *
+ * Worse, since the numbers are provided per CPU, they are sometimes
+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
+ * associated with any one particular CPU; it can wake on a different CPU than
+ * the one it blocked on. This means the per-CPU IO-wait number is meaningless.
+ *
+ * Task CPU affinities can make all that even more 'interesting'.
+ */
+
  unsigned long nr_iowait(void)
  {
         unsigned long i, sum = 0;
@@ -2959,6 +3014,13 @@ unsigned long nr_iowait(void)
         return sum;
  }
  
+/*
+ * Consumers of these two interfaces, like for example the cpufreq menu
+ * governor, are using nonsensical data: they boost the frequency of a CPU
+ * that has IO-wait, even though that CPU might not end up running the task
+ * when it does become runnable.
+ */
+
  unsigned long nr_iowait_cpu(int cpu)
  {
         struct rq *this = cpu_rq(cpu);
@@ -3257,7 +3319,7 @@ static inline void schedule_debug(struct task_struct *prev)
   * Pick up the highest-prio task:
   */
  static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  {
         const struct sched_class *class = &fair_sched_class;
         struct task_struct *p;
@@ -3268,20 +3330,20 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie
          */
         if (likely(prev->sched_class == class &&
                    rq->nr_running == rq->cfs.h_nr_running)) {
-               p = fair_sched_class.pick_next_task(rq, prev, cookie);
+               p = fair_sched_class.pick_next_task(rq, prev, rf);
                 if (unlikely(p == RETRY_TASK))
                         goto again;
  
                 /* assumes fair_sched_class->next == idle_sched_class */
                 if (unlikely(!p))
-                       p = idle_sched_class.pick_next_task(rq, prev, cookie);
+                       p = idle_sched_class.pick_next_task(rq, prev, rf);
  
                 return p;
         }
  
  again:
         for_each_class(class) {
-               p = class->pick_next_task(rq, prev, cookie);
+               p = class->pick_next_task(rq, prev, rf);
                 if (p) {
                         if (unlikely(p == RETRY_TASK))
                                 goto again;
@@ -3335,7 +3397,7 @@ static void __sched notrace __schedule(bool preempt)
  {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
-       struct pin_cookie cookie;
+       struct rq_flags rf;
         struct rq *rq;
         int cpu;
  
@@ -3358,9 +3420,9 @@ static void __sched notrace __schedule(bool preempt)
          */
         smp_mb__before_spinlock();
         raw_spin_lock(&rq->lock);
-       cookie = lockdep_pin_lock(&rq->lock);
+       rq_pin_lock(rq, &rf);
  
-       rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+       rq->clock_update_flags <<= 1; /* promote REQ to ACT */
  
         switch_count = &prev->nivcsw;
         if (!preempt && prev->state) {
@@ -3370,6 +3432,11 @@ static void __sched notrace __schedule(bool preempt)
                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
                         prev->on_rq = 0;
  
+                       if (prev->in_iowait) {
+                               atomic_inc(&rq->nr_iowait);
+                               delayacct_blkio_start();
+                       }
+
                         /*
                          * If a worker went to sleep, notify and ask workqueue
                          * whether it wants to wake up a task to maintain
@@ -3380,7 +3447,7 @@ static void __sched notrace __schedule(bool preempt)
  
                                 to_wakeup = wq_worker_sleeping(prev);
                                 if (to_wakeup)
-                                       try_to_wake_up_local(to_wakeup, cookie);
+                                       try_to_wake_up_local(to_wakeup, &rf);
                         }
                 }
                 switch_count = &prev->nvcsw;
@@ -3389,10 +3456,9 @@ static void __sched notrace __schedule(bool preempt)
         if (task_on_rq_queued(prev))
                 update_rq_clock(rq);
  
-       next = pick_next_task(rq, prev, cookie);
+       next = pick_next_task(rq, prev, &rf);
         clear_tsk_need_resched(prev);
         clear_preempt_need_resched();
-       rq->clock_skip_update = 0;
  
         if (likely(prev != next)) {
                 rq->nr_switches++;
@@ -3400,9 +3466,10 @@ static void __sched notrace __schedule(bool preempt)
                 ++*switch_count;
  
                 trace_sched_switch(preempt, prev, next);
-               rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */
+               rq = context_switch(rq, prev, next, &rf); /* unlocks the rq */
         } else {
-               lockdep_unpin_lock(&rq->lock, cookie);
+               rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+               rq_unpin_lock(rq, &rf);
                 raw_spin_unlock_irq(&rq->lock);
         }
  
@@ -3651,6 +3718,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         BUG_ON(prio > MAX_PRIO);
  
         rq = __task_rq_lock(p, &rf);
+       update_rq_clock(rq);
  
         /*
          * Idle task boosting is a nono in general. There is one
@@ -3747,6 +3815,8 @@ void set_user_nice(struct task_struct *p, long nice)
          * the task might be in the middle of scheduling on another CPU.
          */
         rq = task_rq_lock(p, &rf);
+       update_rq_clock(rq);
+
         /*
          * The RT priorities are set via sched_setscheduler(), but we still
          * allow the 'normal' nice value to be set - but as expected
@@ -4179,6 +4249,7 @@ recheck:
          * runqueue lock must be held.
          */
         rq = task_rq_lock(p, &rf);
+       update_rq_clock(rq);
  
         /*
          * Changing the policy of the stop threads its a very bad idea
@@ -5064,19 +5135,13 @@ EXPORT_SYMBOL_GPL(yield_to);
  long __sched io_schedule_timeout(long timeout)
  {
         int old_iowait = current->in_iowait;
-       struct rq *rq;
         long ret;
  
         current->in_iowait = 1;
         blk_schedule_flush_plug(current);
  
-       delayacct_blkio_start();
-       rq = raw_rq();
-       atomic_inc(&rq->nr_iowait);
         ret = schedule_timeout(timeout);
         current->in_iowait = old_iowait;
-       atomic_dec(&rq->nr_iowait);
-       delayacct_blkio_end();
  
         return ret;
  }
@@ -5521,7 +5586,7 @@ static void migrate_tasks(struct rq *dead_rq)
  {
         struct rq *rq = dead_rq;
         struct task_struct *next, *stop = rq->stop;
-       struct pin_cookie cookie;
+       struct rq_flags rf;
         int dest_cpu;
  
         /*
@@ -5553,8 +5618,8 @@ static void migrate_tasks(struct rq *dead_rq)
                 /*
                  * pick_next_task assumes pinned rq->lock.
                  */
-               cookie = lockdep_pin_lock(&rq->lock);
-               next = pick_next_task(rq, &fake_task, cookie);
+               rq_pin_lock(rq, &rf);
+               next = pick_next_task(rq, &fake_task, &rf);
                 BUG_ON(!next);
                 next->sched_class->put_prev_task(rq, next);
  
@@ -5567,7 +5632,7 @@ static void migrate_tasks(struct rq *dead_rq)
                  * because !cpu_active at this point, which means load-balance
                  * will not interfere. Also, stop-machine.
                  */
-               lockdep_unpin_lock(&rq->lock, cookie);
+               rq_unpin_lock(rq, &rf);
                 raw_spin_unlock(&rq->lock);
                 raw_spin_lock(&next->pi_lock);
                 raw_spin_lock(&rq->lock);
@@ -7487,6 +7552,7 @@ void __init sched_init_smp(void)
         init_sched_dl_class();
  
         sched_init_smt();
+       sched_clock_init_late();
  
         sched_smp_initialized = true;
  }
@@ -7502,6 +7568,7 @@ early_initcall(migration_init);
  void __init sched_init_smp(void)
  {
         sched_init_granularity();
+       sched_clock_init_late();
  }
  #endif /* CONFIG_SMP */
  
@@ -7545,6 +7612,8 @@ void __init sched_init(void)
         int i, j;
         unsigned long alloc_size = 0, ptr;
  
+       sched_clock_init();
+
         for (i = 0; i < WAIT_TABLE_SIZE; i++)
                 init_waitqueue_head(bit_wait_table + i);
  
@@ -8431,6 +8500,7 @@ static void cpu_cgroup_fork(struct task_struct *task)
  
         rq = task_rq_lock(task, &rf);
  
+       update_rq_clock(rq);
         sched_change_group(task, TASK_SET_GROUP);
  
         task_rq_unlock(rq, task, &rf);