sched/numa: Avoid migrating task to CPU-less node
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 095b0aa378df0903c6f34cbd34dfcca945d87758..11a72e1b3b2c0e6da31964390e1d069fe34e4b70 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1259,10 +1259,10 @@ static bool numa_is_active_node(int nid, struct numa_group *ng)
 
 /* Handle placement on systems where not all nodes are directly connected. */
 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
-                                       int maxdist, bool task)
+                                       int lim_dist, bool task)
 {
        unsigned long score = 0;
-       int node;
+       int node, max_dist;
 
        /*
         * All nodes are directly connected, and the same distance
@@ -1271,6 +1271,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
        if (sched_numa_topology_type == NUMA_DIRECT)
                return 0;
 
+       /* sched_max_numa_distance may be changed in parallel. */
+       max_dist = READ_ONCE(sched_max_numa_distance);
        /*
         * This code is called for each node, introducing N^2 complexity,
         * which should be ok given the number of nodes rarely exceeds 8.
@@ -1283,7 +1285,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
                 * The furthest away nodes in the system are not interesting
                 * for placement; nid was already counted.
                 */
-               if (dist == sched_max_numa_distance || node == nid)
+               if (dist >= max_dist || node == nid)
                        continue;
 
                /*
@@ -1293,8 +1295,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
                 * "hoplimit", only nodes closer by than "hoplimit" are part
                 * of each group. Skip other nodes.
                 */
-               if (sched_numa_topology_type == NUMA_BACKPLANE &&
-                                       dist >= maxdist)
+               if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
                        continue;
 
                /* Add up the faults from nearby nodes. */
@@ -1312,8 +1313,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
                 * This seems to result in good task placement.
                 */
                if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
-                       faults *= (sched_max_numa_distance - dist);
-                       faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+                       faults *= (max_dist - dist);
+                       faults /= (max_dist - LOCAL_DISTANCE);
                }
 
                score += faults;
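
As a standalone illustration of the NUMA_GLUELESS_MESH scaling above (distances and fault counts here are invented, not taken from the patch): faults from a nearby node are weighted by (max_dist - dist) / (max_dist - LOCAL_DISTANCE), so closer nodes contribute a larger share of their faults to the score. Reading max_dist once via READ_ONCE() keeps the numerator and denominator consistent even if sched_max_numa_distance changes concurrently.

#include <stdio.h>

#define LOCAL_DISTANCE 10	/* conventional SLIT distance of a node to itself */

/* Mirrors the scaling applied under NUMA_GLUELESS_MESH in the hunk above. */
static unsigned long scale_faults(unsigned long faults, int dist, int max_dist)
{
	faults *= (max_dist - dist);
	faults /= (max_dist - LOCAL_DISTANCE);
	return faults;
}

int main(void)
{
	const int max_dist = 40;		/* hypothetical sched_max_numa_distance */
	const int dists[] = { 15, 20, 30 };	/* hypothetical node distances */

	for (int i = 0; i < 3; i++)
		printf("dist %2d -> %lu of 1200 faults counted\n",
		       dists[i], scale_faults(1200, dists[i], max_dist));
	return 0;
}

With max_dist = 40, a node at distance 15 contributes 1000 of its 1200 faults, while one at distance 30 contributes only 400.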
@@ -1489,6 +1490,7 @@ struct task_numa_env {
 
        int src_cpu, src_nid;
        int dst_cpu, dst_nid;
+       int imb_numa_nr;
 
        struct numa_stats src_stats, dst_stats;
 
@@ -1503,7 +1505,7 @@ struct task_numa_env {
 static unsigned long cpu_load(struct rq *rq);
 static unsigned long cpu_runnable(struct rq *rq);
 static inline long adjust_numa_imbalance(int imbalance,
-                                       int dst_running, int dst_weight);
+                                       int dst_running, int imb_numa_nr);
 
 static inline enum
 numa_type numa_classify(unsigned int imbalance_pct,
@@ -1884,7 +1886,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
                dst_running = env->dst_stats.nr_running + 1;
                imbalance = max(0, dst_running - src_running);
                imbalance = adjust_numa_imbalance(imbalance, dst_running,
-                                                       env->dst_stats.weight);
+                                                 env->imb_numa_nr);
 
                /* Use idle CPU if there is no imbalance */
                if (!imbalance) {
@@ -1949,8 +1951,10 @@ static int task_numa_migrate(struct task_struct *p)
         */
        rcu_read_lock();
        sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
-       if (sd)
+       if (sd) {
                env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+               env.imb_numa_nr = sd->imb_numa_nr;
+       }
        rcu_read_unlock();
 
        /*
@@ -1985,7 +1989,7 @@ static int task_numa_migrate(struct task_struct *p)
         */
        ng = deref_curr_numa_group(p);
        if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
-               for_each_online_node(nid) {
+               for_each_node_state(nid, N_CPU) {
                        if (nid == env.src_nid || nid == p->numa_preferred_nid)
                                continue;
 
@@ -2083,13 +2087,13 @@ static void numa_group_count_active_nodes(struct numa_group *numa_group)
        unsigned long faults, max_faults = 0;
        int nid, active_nodes = 0;
 
-       for_each_online_node(nid) {
+       for_each_node_state(nid, N_CPU) {
                faults = group_faults_cpu(numa_group, nid);
                if (faults > max_faults)
                        max_faults = faults;
        }
 
-       for_each_online_node(nid) {
+       for_each_node_state(nid, N_CPU) {
                faults = group_faults_cpu(numa_group, nid);
                if (faults * ACTIVE_NODE_FRACTION > max_faults)
                        active_nodes++;
@@ -2243,7 +2247,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
 
                dist = sched_max_numa_distance;
 
-               for_each_online_node(node) {
+               for_each_node_state(node, N_CPU) {
                        score = group_weight(p, node, dist);
                        if (score > max_score) {
                                max_score = score;
@@ -2262,7 +2266,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
         * inside the highest scoring group of nodes. The nodemask tricks
         * keep the complexity of the search down.
         */
-       nodes = node_online_map;
+       nodes = node_states[N_CPU];
        for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
                unsigned long max_faults = 0;
                nodemask_t max_group = NODE_MASK_NONE;
@@ -2401,6 +2405,21 @@ static void task_numa_placement(struct task_struct *p)
                }
        }
 
+       /* Cannot migrate task to CPU-less node */
+       if (!node_state(max_nid, N_CPU)) {
+               int near_nid = max_nid;
+               int distance, near_distance = INT_MAX;
+
+               for_each_node_state(nid, N_CPU) {
+                       distance = node_distance(max_nid, nid);
+                       if (distance < near_distance) {
+                               near_nid = nid;
+                               near_distance = distance;
+                       }
+               }
+               max_nid = near_nid;
+       }
+
        if (ng) {
                numa_group_count_active_nodes(ng);
                spin_unlock_irq(group_lock);
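
The new block above redirects a CPU-less preferred node to the nearest node that does have CPUs, so the task ends up running as close as possible to the memory it is faulting on. A rough standalone sketch of that search, with an invented distance table in place of node_distance():

#include <limits.h>
#include <stdio.h>

#define NR_NODES 3

/* Hypothetical SLIT-style distances; node 2 is CPU-less. */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 17 },
	{ 20, 10, 28 },
	{ 17, 28, 10 },
};
static const int has_cpu[NR_NODES] = { 1, 1, 0 };

static int nearest_cpu_node(int max_nid)
{
	int nid, near_nid = max_nid, near_distance = INT_MAX;

	for (nid = 0; nid < NR_NODES; nid++) {
		if (!has_cpu[nid])
			continue;	/* only nodes in N_CPU state qualify */
		if (dist[max_nid][nid] < near_distance) {
			near_nid = nid;
			near_distance = dist[max_nid][nid];
		}
	}
	return near_nid;
}

int main(void)
{
	/* Faults peaked on CPU-less node 2; fall back to its closest CPU node. */
	printf("max_nid 2 -> steer the task toward node %d\n", nearest_cpu_node(2));
	return 0;
}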
@@ -2825,6 +2844,8 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
        /* Protect against double add, see task_tick_numa and task_numa_work */
        p->numa_work.next               = &p->numa_work;
        p->numa_faults                  = NULL;
+       p->numa_pages_migrated          = 0;
+       p->total_numa_faults            = 0;
        RCU_INIT_POINTER(p->numa_group, NULL);
        p->last_task_numa_placement     = 0;
        p->last_sum_exec_runtime        = 0;
@@ -9003,9 +9024,9 @@ static bool update_pick_idlest(struct sched_group *idlest,
  * This is an approximation as the number of running tasks may not be
  * related to the number of busy CPUs due to sched_setaffinity.
  */
-static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
+static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
 {
-       return (dst_running < (dst_weight >> 2));
+       return running <= imb_numa_nr;
 }
 
 /*
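
The threshold change is easiest to see with numbers (all figures below are hypothetical): the old check allowed an imbalance while fewer than a quarter of the destination domain's CPUs were busy, while the new check compares the running count against the sched domain's imb_numa_nr.

#include <stdbool.h>
#include <stdio.h>

/* Old heuristic: imbalance allowed while under 25% of the domain span. */
static bool old_allow(int dst_running, int dst_weight)
{
	return dst_running < (dst_weight >> 2);
}

/* New heuristic: imbalance allowed up to the precomputed imb_numa_nr. */
static bool new_allow(int running, int imb_numa_nr)
{
	return running <= imb_numa_nr;
}

int main(void)
{
	const int span_weight = 16;	/* hypothetical CPUs in the NUMA domain */
	const int imb_numa_nr = 4;	/* hypothetical per-domain threshold */

	for (int running = 1; running <= 6; running++)
		printf("running=%d old=%d new=%d\n", running,
		       old_allow(running, span_weight),
		       new_allow(running, imb_numa_nr));
	return 0;
}

In find_idlest_group() the caller now passes local_sgs.sum_nr_running + 1, so the task being placed is counted against the threshold as well.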
@@ -9139,12 +9160,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                                return idlest;
 #endif
                        /*
-                        * Otherwise, keep the task on this node to stay close
-                        * its wakeup source and improve locality. If there is
-                        * a real need of migration, periodic load balance will
-                        * take care of it.
+                        * Otherwise, keep the task close to the wakeup source
+                        * and improve locality if the number of running tasks
+                        * would remain below threshold where an imbalance is
+                        * allowed. If there is a real need of migration,
+                        * periodic load balance will take care of it.
                         */
-                       if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
+                       if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
                                return NULL;
                }
 
@@ -9236,9 +9258,9 @@ next_group:
 #define NUMA_IMBALANCE_MIN 2
 
 static inline long adjust_numa_imbalance(int imbalance,
-                               int dst_running, int dst_weight)
+                               int dst_running, int imb_numa_nr)
 {
-       if (!allow_numa_imbalance(dst_running, dst_weight))
+       if (!allow_numa_imbalance(dst_running, imb_numa_nr))
                return imbalance;
 
        /*
@@ -9350,7 +9372,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                /* Consider allowing a small imbalance between NUMA groups */
                if (env->sd->flags & SD_NUMA) {
                        env->imbalance = adjust_numa_imbalance(env->imbalance,
-                               busiest->sum_nr_running, busiest->group_weight);
+                               local->sum_nr_running + 1, env->sd->imb_numa_nr);
                }
 
                return;
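
On the load-balance side the inputs change in the same spirit: adjust_numa_imbalance() is now driven by the local group's running count plus one (for a task moving to it), compared against the same imb_numa_nr, rather than by the busiest group's running count and weight. With a hypothetical imb_numa_nr of 4, a local group already running 2 tasks gives 2 + 1 = 3 <= 4, so a small NUMA imbalance may be tolerated; with 5 tasks already running, 6 > 4 and the computed imbalance is applied unchanged.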