sched/numa: Avoid migrating task to CPU-less node
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 095b0aa378df0903c6f34cbd34dfcca945d87758..11a72e1b3b2c0e6da31964390e1d069fe34e4b70 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1259,10 +1259,10 @@ static bool numa_is_active_node(int nid, struct numa_group *ng)
 
 /* Handle placement on systems where not all nodes are directly connected. */
 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
-                                       int maxdist, bool task)
+                                       int lim_dist, bool task)
 {
        unsigned long score = 0;
-       int node;
+       int node, max_dist;
 
        /*
         * All nodes are directly connected, and the same distance
@@ -1271,6 +1271,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
        if (sched_numa_topology_type == NUMA_DIRECT)
                return 0;
 
+       /* sched_max_numa_distance may be changed in parallel. */
+       max_dist = READ_ONCE(sched_max_numa_distance);
        /*
         * This code is called for each node, introducing N^2 complexity,
         * which should be ok given the number of nodes rarely exceeds 8.
@@ -1283,7 +1285,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
                 * The furthest away nodes in the system are not interesting
                 * for placement; nid was already counted.
                 */
-               if (dist == sched_max_numa_distance || node == nid)
+               if (dist >= max_dist || node == nid)
                        continue;
 
                /*
@@ -1293,8 +1295,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
                 * "hoplimit", only nodes closer by than "hoplimit" are part
                 * of each group. Skip other nodes.
                 */
-               if (sched_numa_topology_type == NUMA_BACKPLANE &&
-                                       dist >= maxdist)
+               if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
                        continue;
 
                /* Add up the faults from nearby nodes. */
@@ -1312,8 +1313,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
                 * This seems to result in good task placement.
                 */
                if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
-                       faults *= (sched_max_numa_distance - dist);
-                       faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+                       faults *= (max_dist - dist);
+                       faults /= (max_dist - LOCAL_DISTANCE);
                }
 
                score += faults;
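
As a standalone illustration of the NUMA_GLUELESS_MESH scaling above (distances and fault counts here are invented, not taken from the patch): faults from a nearby node are weighted by (max_dist - dist) / (max_dist - LOCAL_DISTANCE), so closer nodes contribute a larger share of their faults to the score. Reading max_dist once via READ_ONCE() keeps the numerator and denominator consistent even if sched_max_numa_distance changes concurrently.

#include <stdio.h>

#define LOCAL_DISTANCE 10	/* conventional SLIT distance of a node to itself */

/* Mirrors the scaling applied under NUMA_GLUELESS_MESH in the hunk above. */
static unsigned long scale_faults(unsigned long faults, int dist, int max_dist)
{
	faults *= (max_dist - dist);
	faults /= (max_dist - LOCAL_DISTANCE);
	return faults;
}

int main(void)
{
	const int max_dist = 40;		/* hypothetical sched_max_numa_distance */
	const int dists[] = { 15, 20, 30 };	/* hypothetical node distances */

	for (int i = 0; i < 3; i++)
		printf("dist %2d -> %lu of 1200 faults counted\n",
		       dists[i], scale_faults(1200, dists[i], max_dist));
	return 0;
}

With max_dist = 40, a node at distance 15 contributes 1000 of its 1200 faults, while one at distance 30 contributes only 400.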
@@ -1489,6 +1490,7 @@ struct task_numa_env {
 
        int src_cpu, src_nid;
        int dst_cpu, dst_nid;
+       int imb_numa_nr;
 
        struct numa_stats src_stats, dst_stats;
 
@@ -1503,7 +1505,7 @@ struct task_numa_env {
 static unsigned long cpu_load(struct rq *rq);
 static unsigned long cpu_runnable(struct rq *rq);
 static inline long adjust_numa_imbalance(int imbalance,
-                                       int dst_running, int dst_weight);
+                                       int dst_running, int imb_numa_nr);
 
 static inline enum
 numa_type numa_classify(unsigned int imbalance_pct,
@@ -1884,7 +1886,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
                dst_running = env->dst_stats.nr_running + 1;
                imbalance = max(0, dst_running - src_running);
                imbalance = adjust_numa_imbalance(imbalance, dst_running,
-                                                       env->dst_stats.weight);
+                                                 env->imb_numa_nr);
 
                /* Use idle CPU if there is no imbalance */
                if (!imbalance) {
@@ -1949,8 +1951,10 @@ static int task_numa_migrate(struct task_struct *p)
         */
        rcu_read_lock();
        sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
-       if (sd)
+       if (sd) {
                env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+               env.imb_numa_nr = sd->imb_numa_nr;
+       }
        rcu_read_unlock();
 
        /*
@@ -1985,7 +1989,7 @@ static int task_numa_migrate(struct task_struct *p)
         */
        ng = deref_curr_numa_group(p);
        if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
-               for_each_online_node(nid) {
+               for_each_node_state(nid, N_CPU) {
                        if (nid == env.src_nid || nid == p->numa_preferred_nid)
                                continue;
 
@@ -2083,13 +2087,13 @@ static void numa_group_count_active_nodes(struct numa_group *numa_group)
        unsigned long faults, max_faults = 0;
        int nid, active_nodes = 0;
 
-       for_each_online_node(nid) {
+       for_each_node_state(nid, N_CPU) {
                faults = group_faults_cpu(numa_group, nid);
                if (faults > max_faults)
                        max_faults = faults;
        }
 
-       for_each_online_node(nid) {
+       for_each_node_state(nid, N_CPU) {
                faults = group_faults_cpu(numa_group, nid);
                if (faults * ACTIVE_NODE_FRACTION > max_faults)
                        active_nodes++;
@@ -2243,7 +2247,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
 
                dist = sched_max_numa_distance;
 
-               for_each_online_node(node) {
+               for_each_node_state(node, N_CPU) {
                        score = group_weight(p, node, dist);
                        if (score > max_score) {
                                max_score = score;
@@ -2262,7 +2266,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
         * inside the highest scoring group of nodes. The nodemask tricks
         * keep the complexity of the search down.
         */
-       nodes = node_online_map;
+       nodes = node_states[N_CPU];
        for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
                unsigned long max_faults = 0;
                nodemask_t max_group = NODE_MASK_NONE;
@@ -2401,6 +2405,21 @@ static void task_numa_placement(struct task_struct *p)
                }
        }
 
+       /* Cannot migrate task to CPU-less node */
+       if (!node_state(max_nid, N_CPU)) {
+               int near_nid = max_nid;
+               int distance, near_distance = INT_MAX;
+
+               for_each_node_state(nid, N_CPU) {
+                       distance = node_distance(max_nid, nid);
+                       if (distance < near_distance) {
+                               near_nid = nid;
+                               near_distance = distance;
+                       }
+               }
+               max_nid = near_nid;
+       }
+
        if (ng) {
                numa_group_count_active_nodes(ng);
                spin_unlock_irq(group_lock);
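
The new block above redirects a CPU-less preferred node to the nearest node that does have CPUs, so the task ends up running as close as possible to the memory it is faulting on. A rough standalone sketch of that search, with an invented distance table in place of node_distance():

#include <limits.h>
#include <stdio.h>

#define NR_NODES 3

/* Hypothetical SLIT-style distances; node 2 is CPU-less. */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 17 },
	{ 20, 10, 28 },
	{ 17, 28, 10 },
};
static const int has_cpu[NR_NODES] = { 1, 1, 0 };

static int nearest_cpu_node(int max_nid)
{
	int nid, near_nid = max_nid, near_distance = INT_MAX;

	for (nid = 0; nid < NR_NODES; nid++) {
		if (!has_cpu[nid])
			continue;	/* only nodes in N_CPU state qualify */
		if (dist[max_nid][nid] < near_distance) {
			near_nid = nid;
			near_distance = dist[max_nid][nid];
		}
	}
	return near_nid;
}

int main(void)
{
	/* Faults peaked on CPU-less node 2; fall back to its closest CPU node. */
	printf("max_nid 2 -> steer the task toward node %d\n", nearest_cpu_node(2));
	return 0;
}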
@@ -2825,6 +2844,8 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
        /* Protect against double add, see task_tick_numa and task_numa_work */
        p->numa_work.next               = &p->numa_work;
        p->numa_faults                  = NULL;
+       p->numa_pages_migrated          = 0;
+       p->total_numa_faults            = 0;
        RCU_INIT_POINTER(p->numa_group, NULL);
        p->last_task_numa_placement     = 0;
        p->last_sum_exec_runtime        = 0;
@@ -9003,9 +9024,9 @@ static bool update_pick_idlest(struct sched_group *idlest,
  * This is an approximation as the number of running tasks may not be
  * related to the number of busy CPUs due to sched_setaffinity.
  */
-static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
+static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
 {
-       return (dst_running < (dst_weight >> 2));
+       return running <= imb_numa_nr;
 }
 
 /*
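
The threshold change is easiest to see with numbers (all figures below are hypothetical): the old check allowed an imbalance while fewer than a quarter of the destination domain's CPUs were busy, while the new check compares the running count against the sched domain's imb_numa_nr.

#include <stdbool.h>
#include <stdio.h>

/* Old heuristic: imbalance allowed while under 25% of the domain span. */
static bool old_allow(int dst_running, int dst_weight)
{
	return dst_running < (dst_weight >> 2);
}

/* New heuristic: imbalance allowed up to the precomputed imb_numa_nr. */
static bool new_allow(int running, int imb_numa_nr)
{
	return running <= imb_numa_nr;
}

int main(void)
{
	const int span_weight = 16;	/* hypothetical CPUs in the NUMA domain */
	const int imb_numa_nr = 4;	/* hypothetical per-domain threshold */

	for (int running = 1; running <= 6; running++)
		printf("running=%d old=%d new=%d\n", running,
		       old_allow(running, span_weight),
		       new_allow(running, imb_numa_nr));
	return 0;
}

In find_idlest_group() the caller now passes local_sgs.sum_nr_running + 1, so the task being placed is counted against the threshold as well.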
@@ -9139,12 +9160,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                                return idlest;
 #endif
                        /*
-                        * Otherwise, keep the task on this node to stay close
-                        * its wakeup source and improve locality. If there is
-                        * a real need of migration, periodic load balance will
-                        * take care of it.
+                        * Otherwise, keep the task close to the wakeup source
+                        * and improve locality if the number of running tasks
+                        * would remain below threshold where an imbalance is
+                        * allowed. If there is a real need of migration,
+                        * periodic load balance will take care of it.
                         */
-                       if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
+                       if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
                                return NULL;
                }
 
@@ -9236,9 +9258,9 @@ next_group:
 #define NUMA_IMBALANCE_MIN 2
 
 static inline long adjust_numa_imbalance(int imbalance,
-                               int dst_running, int dst_weight)
+                               int dst_running, int imb_numa_nr)
 {
-       if (!allow_numa_imbalance(dst_running, dst_weight))
+       if (!allow_numa_imbalance(dst_running, imb_numa_nr))
                return imbalance;
 
        /*
@@ -9350,7 +9372,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                /* Consider allowing a small imbalance between NUMA groups */
                if (env->sd->flags & SD_NUMA) {
                        env->imbalance = adjust_numa_imbalance(env->imbalance,
-                               busiest->sum_nr_running, busiest->group_weight);
+                               local->sum_nr_running + 1, env->sd->imb_numa_nr);
                }
 
                return;
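
On the load-balance side the inputs change in the same spirit: adjust_numa_imbalance() is now driven by the local group's running count plus one (for a task moving to it), compared against the same imb_numa_nr, rather than by the busiest group's running count and weight. With a hypothetical imb_numa_nr of 4, a local group already running 2 tasks gives 2 + 1 = 3 <= 4, so a small NUMA imbalance may be tolerated; with 5 tasks already running, 6 > 4 and the computed imbalance is applied unchanged.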