sched: Fix sleep time double accounting in enqueue entity

[mirror_ubuntu-bionic-kernel.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index c61a614465c8ebf13b5f71aabf23a78c9e8533a6..9bbc303598ea848453efbb647fffe9ac08ca5029 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -113,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
  unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
  #endif
  
+static inline void update_load_add(struct load_weight *lw, unsigned long inc)
+{
+       lw->weight += inc;
+       lw->inv_weight = 0;
+}
+
+static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
+{
+       lw->weight -= dec;
+       lw->inv_weight = 0;
+}
+
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+       lw->weight = w;
+       lw->inv_weight = 0;
+}
+
  /*
   * Increase the granularity value when there are more CPUs,
   * because with more CPUs the 'effective latency' as visible
@@ -662,6 +680,26 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
         return calc_delta_fair(sched_slice(cfs_rq, se), se);
  }
  
+#ifdef CONFIG_SMP
+static inline void __update_task_entity_contrib(struct sched_entity *se);
+
+/* Give new task start runnable values to heavy its load in infant time */
+void init_task_runnable_average(struct task_struct *p)
+{
+       u32 slice;
+
+       p->se.avg.decay_count = 0;
+       slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
+       p->se.avg.runnable_avg_sum = slice;
+       p->se.avg.runnable_avg_period = slice;
+       __update_task_entity_contrib(&p->se);
+}
+#else
+void init_task_runnable_average(struct task_struct *p)
+{
+}
+#endif
+
  /*
   * Update the current task's runtime statistics. Skip current tasks that
   * are not in our scheduling class.
@@ -686,7 +724,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
  static void update_curr(struct cfs_rq *cfs_rq)
  {
         struct sched_entity *curr = cfs_rq->curr;
-       u64 now = rq_of(cfs_rq)->clock_task;
+       u64 now = rq_clock_task(rq_of(cfs_rq));
         unsigned long delta_exec;
  
         if (unlikely(!curr))
@@ -718,7 +756,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
  static inline void
  update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
+       schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
  }
  
  /*
@@ -738,14 +776,14 @@ static void
  update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
         schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
-                       rq_of(cfs_rq)->clock - se->statistics.wait_start));
+                       rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
         schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
         schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
-                       rq_of(cfs_rq)->clock - se->statistics.wait_start);
+                       rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
  #ifdef CONFIG_SCHEDSTATS
         if (entity_is_task(se)) {
                 trace_sched_stat_wait(task_of(se),
-                       rq_of(cfs_rq)->clock - se->statistics.wait_start);
+                       rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
         }
  #endif
         schedstat_set(se->statistics.wait_start, 0);
@@ -771,7 +809,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
         /*
          * We are starting a new run period:
          */
-       se->exec_start = rq_of(cfs_rq)->clock_task;
+       se->exec_start = rq_clock_task(rq_of(cfs_rq));
  }
  
  /**************************************************
@@ -1110,8 +1148,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
  }
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
-/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
  /*
   * We choose a half-life close to 1 scheduling period.
   * Note: The tables below are dependent on this value.
@@ -1497,7 +1534,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
  
  static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
  {
-       __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+       __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
         __update_tg_runnable_avg(&rq->avg, &rq->cfs);
  }
  
@@ -1510,9 +1547,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
          * We track migrations using entity decay_count <= 0, on a wake-up
          * migration we use a negative decay count to track the remote decays
          * accumulated while sleeping.
+        *
+        * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
+        * are seen by enqueue_entity_load_avg() as a migration with an already
+        * constructed load_avg_contrib.
          */
         if (unlikely(se->avg.decay_count <= 0)) {
-               se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+               se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
                 if (se->avg.decay_count) {
                         /*
                          * In a wake-up migration we have to approximate the
@@ -1530,7 +1571,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
                 }
                 wakeup = 0;
         } else {
-               __synchronize_entity_decay(se);
+               /*
+                * Task re-woke on same cpu (or else migrate_task_rq_fair()
+                * would have made count negative); we must be careful to avoid
+                * double-accounting blocked time after synchronizing decays.
+                */
+               se->avg.last_runnable_update += __synchronize_entity_decay(se)
+                                                       << 20;
         }
  
         /* migrated tasks did not contribute to our blocked load */
@@ -1607,7 +1654,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 tsk = task_of(se);
  
         if (se->statistics.sleep_start) {
-               u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
+               u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
  
                 if ((s64)delta < 0)
                         delta = 0;
@@ -1624,7 +1671,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 }
         }
         if (se->statistics.block_start) {
-               u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
+               u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
  
                 if ((s64)delta < 0)
                         delta = 0;
@@ -1805,9 +1852,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
                         struct task_struct *tsk = task_of(se);
  
                         if (tsk->state & TASK_INTERRUPTIBLE)
-                               se->statistics.sleep_start = rq_of(cfs_rq)->clock;
+                               se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
                         if (tsk->state & TASK_UNINTERRUPTIBLE)
-                               se->statistics.block_start = rq_of(cfs_rq)->clock;
+                               se->statistics.block_start = rq_clock(rq_of(cfs_rq));
                 }
  #endif
         }
@@ -2082,7 +2129,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
         if (unlikely(cfs_rq->throttle_count))
                 return cfs_rq->throttled_clock_task;
  
-       return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
+       return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
  }
  
  /* returns 0 on failure to allocate runtime */
@@ -2138,10 +2185,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  {
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-       struct rq *rq = rq_of(cfs_rq);
  
         /* if the deadline is ahead of our clock, nothing to do */
-       if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
+       if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
                 return;
  
         if (cfs_rq->runtime_remaining < 0)
@@ -2230,7 +2276,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
  #ifdef CONFIG_SMP
         if (!cfs_rq->throttle_count) {
                 /* adjust cfs_rq_clock_task() */
-               cfs_rq->throttled_clock_task_time += rq->clock_task -
+               cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
                                              cfs_rq->throttled_clock_task;
         }
  #endif
@@ -2245,7 +2291,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
  
         /* group is entering throttled state, stop time */
         if (!cfs_rq->throttle_count)
-               cfs_rq->throttled_clock_task = rq->clock_task;
+               cfs_rq->throttled_clock_task = rq_clock_task(rq);
         cfs_rq->throttle_count++;
  
         return 0;
@@ -2284,7 +2330,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
                 rq->nr_running -= task_delta;
  
         cfs_rq->throttled = 1;
-       cfs_rq->throttled_clock = rq->clock;
+       cfs_rq->throttled_clock = rq_clock(rq);
         raw_spin_lock(&cfs_b->lock);
         list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
         raw_spin_unlock(&cfs_b->lock);
@@ -2298,15 +2344,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
         int enqueue = 1;
         long task_delta;
  
-       se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+       se = cfs_rq->tg->se[cpu_of(rq)];
  
         cfs_rq->throttled = 0;
+
+       update_rq_clock(rq);
+
         raw_spin_lock(&cfs_b->lock);
-       cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
+       cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
         list_del_rcu(&cfs_rq->throttled_list);
         raw_spin_unlock(&cfs_b->lock);
  
-       update_rq_clock(rq);
         /* update hierarchical throttle state */
         walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
  
@@ -2599,10 +2647,6 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
         throttle_cfs_rq(cfs_rq);
  }
  
-static inline u64 default_cfs_period(void);
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
-static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
-
  static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
  {
         struct cfs_bandwidth *cfs_b =
@@ -2706,7 +2750,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
  #else /* CONFIG_CFS_BANDWIDTH */
  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
  {
-       return rq_of(cfs_rq)->clock_task;
+       return rq_clock_task(rq_of(cfs_rq));
  }
  
  static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
@@ -3415,12 +3459,6 @@ unlock:
         return new_cpu;
  }
  
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#ifdef CONFIG_FAIR_GROUP_SCHED
  /*
   * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
   * cfs_rq_of(p) references at time of call are still valid and identify the
@@ -3444,7 +3482,6 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
                 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
         }
  }
-#endif
  #endif /* CONFIG_SMP */
  
  static unsigned long
@@ -3946,7 +3983,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
          * 2) too many balance attempts have failed.
          */
  
-       tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
+       tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
         if (!tsk_cache_hot ||
                 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
  
@@ -4302,7 +4339,7 @@ static unsigned long scale_rt_power(int cpu)
         age_stamp = ACCESS_ONCE(rq->age_stamp);
         avg = ACCESS_ONCE(rq->rt_avg);
  
-       total = sched_avg_period() + (rq->clock - age_stamp);
+       total = sched_avg_period() + (rq_clock(rq) - age_stamp);
  
         if (unlikely(total < avg)) {
                 /* Ensures that power won't end up being negative */
@@ -5241,7 +5278,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
         int pulled_task = 0;
         unsigned long next_balance = jiffies + HZ;
  
-       this_rq->idle_stamp = this_rq->clock;
+       this_rq->idle_stamp = rq_clock(this_rq);
  
         if (this_rq->avg_idle < sysctl_sched_migration_cost)
                 return;
@@ -5418,10 +5455,9 @@ static inline void nohz_balance_exit_idle(int cpu)
  static inline void set_cpu_sd_state_busy(void)
  {
         struct sched_domain *sd;
-       int cpu = smp_processor_id();
  
         rcu_read_lock();
-       sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+       sd = rcu_dereference_check_sched_domain(this_rq()->sd);
  
         if (!sd || !sd->nohz_idle)
                 goto unlock;
@@ -5436,10 +5472,9 @@ unlock:
  void set_cpu_sd_state_idle(void)
  {
         struct sched_domain *sd;
-       int cpu = smp_processor_id();
  
         rcu_read_lock();
-       sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+       sd = rcu_dereference_check_sched_domain(this_rq()->sd);
  
         if (!sd || sd->nohz_idle)
                 goto unlock;
@@ -5848,7 +5883,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
                 se->vruntime -= cfs_rq->min_vruntime;
         }
  
-#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
         /*
         * Remove our load from contribution when we leave sched_fair
         * and ensure we don't carry in an old decay_count if we
@@ -5907,7 +5942,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
  #ifndef CONFIG_64BIT
         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
  #endif
-#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
         atomic64_set(&cfs_rq->decay_counter, 1);
         atomic64_set(&cfs_rq->removed_load, 0);
  #endif
@@ -6091,6 +6126,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
                 se = tg->se[i];
                 /* Propagate contribution to hierarchy */
                 raw_spin_lock_irqsave(&rq->lock, flags);
+
+               /* Possible calls to update_curr() need rq clock */
+               update_rq_clock(rq);
                 for_each_sched_entity(se)
                         update_cfs_shares(group_cfs_rq(se));
                 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -6146,9 +6184,8 @@ const struct sched_class fair_sched_class = {
  
  #ifdef CONFIG_SMP
         .select_task_rq         = select_task_rq_fair,
-#ifdef CONFIG_FAIR_GROUP_SCHED
         .migrate_task_rq        = migrate_task_rq_fair,
-#endif
+
         .rq_online              = rq_online_fair,
         .rq_offline             = rq_offline_fair,