sched: Fix avg_load computation

[mirror_ubuntu-kernels.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index d3427a8f254bff904bf819555699666c25aeed80..eb87229ed4af5e3d78c0a6754d60ab4aed5b58a9 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -665,6 +665,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  }
  
  #ifdef CONFIG_SMP
+static int select_idle_sibling(struct task_struct *p, int cpu);
  static unsigned long task_h_load(struct task_struct *p);
  
  static inline void __update_task_entity_contrib(struct sched_entity *se);
@@ -1257,6 +1258,13 @@ balance:
         if (load_too_imbalanced(src_load, dst_load, env))
                 goto unlock;
  
+       /*
+        * One idle CPU per node is evaluated for a task numa move.
+        * Call select_idle_sibling to maybe find a better one.
+        */
+       if (!cur)
+               env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+
  assign:
         task_numa_assign(env, cur, imp);
  unlock:
@@ -1780,7 +1788,7 @@ void task_numa_free(struct task_struct *p)
                 list_del(&p->numa_entry);
                 grp->nr_tasks--;
                 spin_unlock_irqrestore(&grp->lock, flags);
-               rcu_assign_pointer(p->numa_group, NULL);
+               RCU_INIT_POINTER(p->numa_group, NULL);
                 put_numa_group(grp);
         }
  
@@ -2382,6 +2390,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
         tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
         tg_contrib -= cfs_rq->tg_load_contrib;
  
+       if (!tg_contrib)
+               return;
+
         if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
                 atomic_long_add(tg_contrib, &tg->load_avg);
                 cfs_rq->tg_load_contrib += tg_contrib;
@@ -3897,14 +3908,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
                                 resched_curr(rq);
                         return;
                 }
-
-               /*
-                * Don't schedule slices shorter than 10000ns, that just
-                * doesn't make sense. Rely on vruntime for fairness.
-                */
-               if (rq->curr != p)
-                       delta = max_t(s64, 10000LL, delta);
-
                 hrtick_start(rq, delta);
         }
  }
@@ -4092,7 +4095,7 @@ static unsigned long capacity_of(int cpu)
  static unsigned long cpu_avg_load_per_task(int cpu)
  {
         struct rq *rq = cpu_rq(cpu);
-       unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
+       unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
         unsigned long load_avg = rq->cfs.runnable_load_avg;
  
         if (nr_running)
@@ -4282,7 +4285,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
  {
         s64 this_load, load;
         int idx, this_cpu, prev_cpu;
-       unsigned long tl_per_task;
         struct task_group *tg;
         unsigned long weight;
         int balanced;
@@ -4340,32 +4342,15 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
                 balanced = this_eff_load <= prev_eff_load;
         } else
                 balanced = true;
-
-       /*
-        * If the currently running task will sleep within
-        * a reasonable amount of time then attract this newly
-        * woken task:
-        */
-       if (sync && balanced)
-               return 1;
-
         schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
-       tl_per_task = cpu_avg_load_per_task(this_cpu);
  
-       if (balanced ||
-           (this_load <= load &&
-            this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
-               /*
-                * This domain has SD_WAKE_AFFINE and
-                * p is cache cold in this domain, and
-                * there is no bad imbalance.
-                */
-               schedstat_inc(sd, ttwu_move_affine);
-               schedstat_inc(p, se.statistics.nr_wakeups_affine);
+       if (!balanced)
+               return 0;
  
-               return 1;
-       }
-       return 0;
+       schedstat_inc(sd, ttwu_move_affine);
+       schedstat_inc(p, se.statistics.nr_wakeups_affine);
+
+       return 1;
  }
  
  /*
@@ -4518,11 +4503,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
         if (p->nr_cpus_allowed == 1)
                 return prev_cpu;
  
-       if (sd_flag & SD_BALANCE_WAKE) {
-               if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
-                       want_affine = 1;
-               new_cpu = prev_cpu;
-       }
+       if (sd_flag & SD_BALANCE_WAKE)
+               want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
  
         rcu_read_lock();
         for_each_domain(cpu, tmp) {
@@ -6003,7 +5985,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                         load = source_load(i, load_idx);
  
                 sgs->group_load += load;
-               sgs->sum_nr_running += rq->nr_running;
+               sgs->sum_nr_running += rq->cfs.h_nr_running;
  
                 if (rq->nr_running > 1)
                         *overload = true;
@@ -6765,10 +6747,8 @@ more_balance:
                 if (sd_parent) {
                         int *group_imbalance = &sd_parent->groups->sgc->imbalance;
  
-                       if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+                       if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
                                 *group_imbalance = 1;
-                       } else if (*group_imbalance)
-                               *group_imbalance = 0;
                 }
  
                 /* All tasks on this runqueue were pinned by CPU affinity */
@@ -6779,7 +6759,7 @@ more_balance:
                                 env.loop_break = sched_nr_migrate_break;
                                 goto redo;
                         }
-                       goto out_balanced;
+                       goto out_all_pinned;
                 }
         }
  
@@ -6853,6 +6833,23 @@ more_balance:
         goto out;
  
  out_balanced:
+       /*
+        * We reach balance although we may have faced some affinity
+        * constraints. Clear the imbalance flag if it was set.
+        */
+       if (sd_parent) {
+               int *group_imbalance = &sd_parent->groups->sgc->imbalance;
+
+               if (*group_imbalance)
+                       *group_imbalance = 0;
+       }
+
+out_all_pinned:
+       /*
+        * We reach balance because all tasks are pinned at this level so
+        * we can't migrate them. Leave the imbalance flag set so the parent
+        * level can try to migrate them.
+        */
         schedstat_inc(sd, lb_balanced[idle]);
  
         sd->nr_balance_failed = 0;