#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
+#include <linux/cpuidle.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
}
#ifdef CONFIG_SMP
+static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);
static inline void __update_task_entity_contrib(struct sched_entity *se);
static unsigned int task_scan_min(struct task_struct *p)
{
+ unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;
- if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
- windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+ if (scan_size < MAX_SCAN_WINDOW)
+ windows = MAX_SCAN_WINDOW / scan_size;
floor = 1000 / windows;
scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
group->faults_cpu[task_faults_idx(nid, 1)];
}
+/* Handle placement on systems where not all nodes are directly connected. */
+static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
+ int maxdist, bool task)
+{
+ unsigned long score = 0;
+ int node;
+
+ /*
+ * All nodes are directly connected, and the same distance
+ * from each other. No need for fancy placement algorithms.
+ */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return 0;
+
+ /*
+ * This code is called for each node, introducing N^2 complexity,
+ * which should be ok given the number of nodes rarely exceeds 8.
+ */
+ for_each_online_node(node) {
+ unsigned long faults;
+ int dist = node_distance(nid, node);
+
+ /*
+ * The furthest away nodes in the system are not interesting
+ * for placement; nid was already counted.
+ */
+ if (dist == sched_max_numa_distance || node == nid)
+ continue;
+
+ /*
+ * On systems with a backplane NUMA topology, compare groups
+ * of nodes, and move tasks towards the group with the most
+ * memory accesses. When comparing two nodes at distance
+ * "hoplimit", only nodes closer by than "hoplimit" are part
+ * of each group. Skip other nodes.
+ */
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist > maxdist)
+ continue;
+
+ /* Add up the faults from nearby nodes. */
+ if (task)
+ faults = task_faults(p, node);
+ else
+ faults = group_faults(p, node);
+
+ /*
+ * On systems with a glueless mesh NUMA topology, there are
+ * no fixed "groups of nodes". Instead, nodes that are not
+ * directly connected bounce traffic through intermediate
+ * nodes; a numa_group can occupy any set of nodes.
+ * The further away a node is, the less the faults count.
+ * This seems to result in good task placement.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ faults *= (sched_max_numa_distance - dist);
+ faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+ }
+
+ score += faults;
+ }
+
+ return score;
+}
+
/*
* These return the fraction of accesses done by a particular task, or
* task group, on a particular numa node. The group weight is given a
* larger multiplier, in order to group tasks together that are almost
* evenly spread out between numa nodes.
*/
-static inline unsigned long task_weight(struct task_struct *p, int nid)
+static inline unsigned long task_weight(struct task_struct *p, int nid,
+ int dist)
{
- unsigned long total_faults;
+ unsigned long faults, total_faults;
if (!p->numa_faults_memory)
return 0;
if (!total_faults)
return 0;
- return 1000 * task_faults(p, nid) / total_faults;
+ faults = task_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, true);
+
+ return 1000 * faults / total_faults;
}
-static inline unsigned long group_weight(struct task_struct *p, int nid)
+static inline unsigned long group_weight(struct task_struct *p, int nid,
+ int dist)
{
- if (!p->numa_group || !p->numa_group->total_faults)
+ unsigned long faults, total_faults;
+
+ if (!p->numa_group)
+ return 0;
+
+ total_faults = p->numa_group->total_faults;
+
+ if (!total_faults)
return 0;
- return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
+ faults = group_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, false);
+
+ return 1000 * faults / total_faults;
}
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
*/
static void update_numa_stats(struct numa_stats *ns, int nid)
{
- int cpu, cpus = 0;
+ int smt, cpu, cpus = 0;
+ unsigned long capacity;
memset(ns, 0, sizeof(*ns));
for_each_cpu(cpu, cpumask_of_node(nid)) {
if (!cpus)
return;
- ns->task_capacity =
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
+ /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
+ smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
+ capacity = cpus / smt; /* cores */
+
+ ns->task_capacity = min_t(unsigned, capacity,
+ DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
}
struct numa_stats src_stats, dst_stats;
int imbalance_pct;
+ int dist;
struct task_struct *best_task;
long best_imp;
long load;
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
+ int dist = env->dist;
rcu_read_lock();
- cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+
+ raw_spin_lock_irq(&dst_rq->lock);
+ cur = dst_rq->curr;
+ /*
+ * No need to move the exiting task, and this ensures that ->curr
+ * wasn't reaped and thus get_task_struct() in task_numa_assign()
+ * is safe under RCU read lock.
+ * Note that rcu_read_lock() itself can't protect from the final
+ * put_task_struct() after the last schedule().
+ */
+ if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
+ raw_spin_unlock_irq(&dst_rq->lock);
/*
* "imp" is the fault differential for the source task between the
* in any group then look only at task weights.
*/
if (cur->numa_group == env->p->numa_group) {
- imp = taskimp + task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
+ imp = taskimp + task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
/*
* Add some hysteresis to prevent swapping the
* tasks within a group over tiny differences.
* instead.
*/
if (cur->numa_group)
- imp += group_weight(cur, env->src_nid) -
- group_weight(cur, env->dst_nid);
+ imp += group_weight(cur, env->src_nid, dist) -
+ group_weight(cur, env->dst_nid, dist);
else
- imp += task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
+ imp += task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
}
}
if (!cur) {
/* Is there capacity at our destination? */
- if (env->src_stats.has_free_capacity &&
+ if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
!env->dst_stats.has_free_capacity)
goto unlock;
if (load_too_imbalanced(src_load, dst_load, env))
goto unlock;
+ /*
+ * One idle CPU per node is evaluated for a task numa move.
+ * Call select_idle_sibling to maybe find a better one.
+ */
+ if (!cur)
+ env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+
assign:
task_numa_assign(env, cur, imp);
unlock:
};
struct sched_domain *sd;
unsigned long taskweight, groupweight;
- int nid, ret;
+ int nid, ret, dist;
long taskimp, groupimp;
/*
return -EINVAL;
}
- taskweight = task_weight(p, env.src_nid);
- groupweight = group_weight(p, env.src_nid);
- update_numa_stats(&env.src_stats, env.src_nid);
env.dst_nid = p->numa_preferred_nid;
- taskimp = task_weight(p, env.dst_nid) - taskweight;
- groupimp = group_weight(p, env.dst_nid) - groupweight;
+ dist = env.dist = node_distance(env.src_nid, env.dst_nid);
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ update_numa_stats(&env.src_stats, env.src_nid);
+ taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
+ groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);
/* Try to find a spot on the preferred nid. */
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
+ dist = node_distance(env.src_nid, env.dst_nid);
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist != env.dist) {
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ }
+
/* Only consider nodes where both task and groups benefit */
- taskimp = task_weight(p, nid) - taskweight;
- groupimp = group_weight(p, nid) - groupweight;
+ taskimp = task_weight(p, nid, dist) - taskweight;
+ groupimp = group_weight(p, nid, dist) - groupweight;
if (taskimp < 0 && groupimp < 0)
continue;
+ env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
task_numa_find_cpu(&env, taskimp, groupimp);
* scanning faster if shared accesses dominate as it may
* simply bounce migrations uselessly
*/
- ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+ ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
}
return delta;
}
+/*
+ * Determine the preferred nid for a task in a numa_group. This needs to
+ * be done in a way that produces consistent results with group_weight,
+ * otherwise workloads might not converge.
+ */
+static int preferred_group_nid(struct task_struct *p, int nid)
+{
+ nodemask_t nodes;
+ int dist;
+
+ /* Direct connections between all NUMA nodes. */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return nid;
+
+ /*
+ * On a system with glueless mesh NUMA topology, group_weight
+ * scores nodes according to the number of NUMA hinting faults on
+ * both the node itself, and on nearby nodes.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ unsigned long score, max_score = 0;
+ int node, max_node = nid;
+
+ dist = sched_max_numa_distance;
+
+ for_each_online_node(node) {
+ score = group_weight(p, node, dist);
+ if (score > max_score) {
+ max_score = score;
+ max_node = node;
+ }
+ }
+ return max_node;
+ }
+
+ /*
+ * Finding the preferred nid in a system with NUMA backplane
+ * interconnect topology is more involved. The goal is to locate
+ * tasks from numa_groups near each other in the system, and
+ * untangle workloads from different sides of the system. This requires
+ * searching down the hierarchy of node groups, recursively searching
+ * inside the highest scoring group of nodes. The nodemask tricks
+ * keep the complexity of the search down.
+ */
+ nodes = node_online_map;
+ for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
+ unsigned long max_faults = 0;
+ nodemask_t max_group;
+ int a, b;
+
+ /* Are there nodes at this distance from each other? */
+ if (!find_numa_distance(dist))
+ continue;
+
+ for_each_node_mask(a, nodes) {
+ unsigned long faults = 0;
+ nodemask_t this_group;
+ nodes_clear(this_group);
+
+ /* Sum group's NUMA faults; includes a==b case. */
+ for_each_node_mask(b, nodes) {
+ if (node_distance(a, b) < dist) {
+ faults += group_faults(p, b);
+ node_set(b, this_group);
+ node_clear(b, nodes);
+ }
+ }
+
+ /* Remember the top group. */
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_group = this_group;
+ /*
+ * subtle: at the smallest distance there is
+ * just one node left in each "group", the
+ * winner is the preferred nid.
+ */
+ nid = a;
+ }
+ }
+ /* Next round, evaluate the nodes within max_group. */
+ nodes = max_group;
+ }
+ return nid;
+}
+
static void task_numa_placement(struct task_struct *p)
{
int seq, nid, max_nid = -1, max_group_nid = -1;
if (p->numa_group) {
update_numa_active_node_mask(p->numa_group);
spin_unlock_irq(group_lock);
- max_nid = max_group_nid;
+ max_nid = preferred_group_nid(p, max_group_nid);
}
if (max_faults) {
list_del(&p->numa_entry);
grp->nr_tasks--;
spin_unlock_irqrestore(&grp->lock, flags);
- rcu_assign_pointer(p->numa_group, NULL);
+ RCU_INIT_POINTER(p->numa_group, NULL);
put_numa_group(grp);
}
if (!p->mm)
return;
- /* Do not worry about placement if exiting */
- if (p->state == TASK_DEAD)
- return;
-
/* Allocate buffer to track faults on a per-node basis */
if (unlikely(!p->numa_faults_memory)) {
int size = sizeof(*p->numa_faults_memory) *
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
- if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
+ if (!vma_migratable(vma) || !vma_policy_mof(vma))
continue;
/*
/*
* As y^PERIOD = 1/2, we can combine
- * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
- * With a look-up table which covers k^n (n<PERIOD)
+ * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
+ * With a look-up table which covers y^n (n<PERIOD)
*
* To achieve constant time decay_load.
*/
tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
tg_contrib -= cfs_rq->tg_load_contrib;
+ if (!tg_contrib)
+ return;
+
if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
atomic_long_add(tg_contrib, &tg->load_avg);
cfs_rq->tg_load_contrib += tg_contrib;
resched_curr(rq);
return;
}
-
- /*
- * Don't schedule slices shorter than 10000ns, that just
- * doesn't make sense. Rely on vruntime for fairness.
- */
- if (rq->curr != p)
- delta = max_t(s64, 10000LL, delta);
-
hrtick_start(rq, delta);
}
}
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
+ unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
unsigned long load_avg = rq->cfs.runnable_load_avg;
if (nr_running)
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
s64 this_load, load;
+ s64 this_eff_load, prev_eff_load;
int idx, this_cpu, prev_cpu;
- unsigned long tl_per_task;
struct task_group *tg;
unsigned long weight;
int balanced;
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
- if (this_load > 0) {
- s64 this_eff_load, prev_eff_load;
+ this_eff_load = 100;
+ this_eff_load *= capacity_of(prev_cpu);
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= capacity_of(this_cpu);
- this_eff_load = 100;
- this_eff_load *= capacity_of(prev_cpu);
+ if (this_load > 0) {
this_eff_load *= this_load +
effective_load(tg, this_cpu, weight, weight);
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= capacity_of(this_cpu);
prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+ }
- balanced = this_eff_load <= prev_eff_load;
- } else
- balanced = true;
-
- /*
- * If the currently running task will sleep within
- * a reasonable amount of time then attract this newly
- * woken task:
- */
- if (sync && balanced)
- return 1;
+ balanced = this_eff_load <= prev_eff_load;
schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
- tl_per_task = cpu_avg_load_per_task(this_cpu);
- if (balanced ||
- (this_load <= load &&
- this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
- /*
- * This domain has SD_WAKE_AFFINE and
- * p is cache cold in this domain, and
- * there is no bad imbalance.
- */
- schedstat_inc(sd, ttwu_move_affine);
- schedstat_inc(p, se.statistics.nr_wakeups_affine);
+ if (!balanced)
+ return 0;
- return 1;
- }
- return 0;
+ schedstat_inc(sd, ttwu_move_affine);
+ schedstat_inc(p, se.statistics.nr_wakeups_affine);
+
+ return 1;
}
/*
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
- int idlest = -1;
+ unsigned int min_exit_latency = UINT_MAX;
+ u64 latest_idle_timestamp = 0;
+ int least_loaded_cpu = this_cpu;
+ int shallowest_idle_cpu = -1;
int i;
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
- load = weighted_cpuload(i);
-
- if (load < min_load || (load == min_load && i == this_cpu)) {
- min_load = load;
- idlest = i;
+ if (idle_cpu(i)) {
+ struct rq *rq = cpu_rq(i);
+ struct cpuidle_state *idle = idle_get_state(rq);
+ if (idle && idle->exit_latency < min_exit_latency) {
+ /*
+ * We give priority to a CPU whose idle state
+ * has the smallest exit latency irrespective
+ * of any idle timestamp.
+ */
+ min_exit_latency = idle->exit_latency;
+ latest_idle_timestamp = rq->idle_stamp;
+ shallowest_idle_cpu = i;
+ } else if ((!idle || idle->exit_latency == min_exit_latency) &&
+ rq->idle_stamp > latest_idle_timestamp) {
+ /*
+ * If equal or no active idle state, then
+ * the most recently idled CPU might have
+ * a warmer cache.
+ */
+ latest_idle_timestamp = rq->idle_stamp;
+ shallowest_idle_cpu = i;
+ }
+ } else {
+ load = weighted_cpuload(i);
+ if (load < min_load || (load == min_load && i == this_cpu)) {
+ min_load = load;
+ least_loaded_cpu = i;
+ }
}
}
- return idlest;
+ return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
/*
if (p->nr_cpus_allowed == 1)
return prev_cpu;
- if (sd_flag & SD_BALANCE_WAKE) {
- if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
- want_affine = 1;
- new_cpu = prev_cpu;
- }
+ if (sd_flag & SD_BALANCE_WAKE)
+ want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
rcu_read_lock();
for_each_domain(cpu, tmp) {
return;
/*
- * This is possible from callers such as move_task(), in which we
+ * This is possible from callers such as attach_tasks(), in which we
* unconditionally check_prempt_curr() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
unsigned int loop_max;
enum fbq_type fbq_type;
+ struct list_head tasks;
};
-/*
- * move_task - move a task from one runqueue to another runqueue.
- * Both runqueues must be locked.
- */
-static void move_task(struct task_struct *p, struct lb_env *env)
-{
- deactivate_task(env->src_rq, p, 0);
- set_task_cpu(p, env->dst_cpu);
- activate_task(env->dst_rq, p, 0);
- check_preempt_curr(env->dst_rq, p, 0);
-}
-
/*
* Is this task likely cache-hot:
*/
{
s64 delta;
+ lockdep_assert_held(&env->src_rq->lock);
+
if (p->sched_class != &fair_sched_class)
return 0;
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot = 0;
+
+ lockdep_assert_held(&env->src_rq->lock);
+
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
if (!tsk_cache_hot)
tsk_cache_hot = migrate_degrades_locality(p, env);
- if (migrate_improves_locality(p, env)) {
-#ifdef CONFIG_SCHEDSTATS
- if (tsk_cache_hot) {
- schedstat_inc(env->sd, lb_hot_gained[env->idle]);
- schedstat_inc(p, se.statistics.nr_forced_migrations);
- }
-#endif
- return 1;
- }
-
- if (!tsk_cache_hot ||
- env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
-
+ if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
+ env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
schedstat_inc(p, se.statistics.nr_forced_migrations);
}
-
return 1;
}
}
/*
- * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * detach_task() -- detach the task for the migration specified in env
+ */
+static void detach_task(struct task_struct *p, struct lb_env *env)
+{
+ lockdep_assert_held(&env->src_rq->lock);
+
+ deactivate_task(env->src_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ set_task_cpu(p, env->dst_cpu);
+}
+
+/*
+ * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
* part of active balancing operations within "domain".
- * Returns 1 if successful and 0 otherwise.
*
- * Called with both runqueues locked.
+ * Returns a task if successful and NULL otherwise.
*/
-static int move_one_task(struct lb_env *env)
+static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p, *n;
+ lockdep_assert_held(&env->src_rq->lock);
+
list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
if (!can_migrate_task(p, env))
continue;
- move_task(p, env);
+ detach_task(p, env);
+
/*
- * Right now, this is only the second place move_task()
- * is called, so we can safely collect move_task()
- * stats here rather than inside move_task().
+ * Right now, this is only the second place where
+ * lb_gained[env->idle] is updated (other is detach_tasks)
+ * so we can safely collect stats here rather than
+ * inside detach_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
- return 1;
+ return p;
}
- return 0;
+ return NULL;
}
static const unsigned int sched_nr_migrate_break = 32;
/*
- * move_tasks tries to move up to imbalance weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
+ * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * busiest_rq, as part of a balancing operation within domain "sd".
*
- * Called with both runqueues locked.
+ * Returns number of detached tasks if successful and 0 otherwise.
*/
-static int move_tasks(struct lb_env *env)
+static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
struct task_struct *p;
unsigned long load;
- int pulled = 0;
+ int detached = 0;
+
+ lockdep_assert_held(&env->src_rq->lock);
if (env->imbalance <= 0)
return 0;
if ((load / 2) > env->imbalance)
goto next;
- move_task(p, env);
- pulled++;
+ detach_task(p, env);
+ list_add(&p->se.group_node, &env->tasks);
+
+ detached++;
env->imbalance -= load;
#ifdef CONFIG_PREEMPT
/*
* NEWIDLE balancing is a source of latency, so preemptible
- * kernels will stop after the first task is pulled to minimize
+ * kernels will stop after the first task is detached to minimize
* the critical section.
*/
if (env->idle == CPU_NEWLY_IDLE)
}
/*
- * Right now, this is one of only two places move_task() is called,
- * so we can safely collect move_task() stats here rather than
- * inside move_task().
+ * Right now, this is one of only two places we collect this stat
+ * so we can safely collect detach_one_task() stats here rather
+ * than inside detach_one_task().
*/
- schedstat_add(env->sd, lb_gained[env->idle], pulled);
+ schedstat_add(env->sd, lb_gained[env->idle], detached);
+
+ return detached;
+}
+
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static void attach_task(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_held(&rq->lock);
- return pulled;
+ BUG_ON(task_rq(p) != rq);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ activate_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+ raw_spin_lock(&rq->lock);
+ attach_task(rq, p);
+ raw_spin_unlock(&rq->lock);
+}
+
+/*
+ * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
+ * new rq.
+ */
+static void attach_tasks(struct lb_env *env)
+{
+ struct list_head *tasks = &env->tasks;
+ struct task_struct *p;
+
+ raw_spin_lock(&env->dst_rq->lock);
+
+ while (!list_empty(tasks)) {
+ p = list_first_entry(tasks, struct task_struct, se.group_node);
+ list_del_init(&p->se.group_node);
+
+ attach_task(env->dst_rq, p);
+ }
+
+ raw_spin_unlock(&env->dst_rq->lock);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
#endif
/********** Helpers for find_busiest_group ************************/
+
+enum group_type {
+ group_other = 0,
+ group_imbalanced,
+ group_overloaded,
+};
+
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
unsigned int group_capacity_factor;
unsigned int idle_cpus;
unsigned int group_weight;
- int group_imb; /* Is there an imbalance in the group ? */
+ enum group_type group_type;
int group_has_free_capacity;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
.total_capacity = 0UL,
.busiest_stat = {
.avg_load = 0UL,
+ .sum_nr_running = 0,
+ .group_type = group_other,
},
};
}
return default_scale_capacity(sd, cpu);
}
-static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long weight = sd->span_weight;
- unsigned long smt_gain = sd->smt_gain;
-
- smt_gain /= weight;
+ if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
+ return sd->smt_gain / sd->span_weight;
- return smt_gain;
+ return SCHED_CAPACITY_SCALE;
}
-unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu)
+unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
- return default_scale_smt_capacity(sd, cpu);
+ return default_scale_cpu_capacity(sd, cpu);
}
static unsigned long scale_rt_capacity(int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long weight = sd->span_weight;
unsigned long capacity = SCHED_CAPACITY_SCALE;
struct sched_group *sdg = sd->groups;
- if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) {
- if (sched_feat(ARCH_CAPACITY))
- capacity *= arch_scale_smt_capacity(sd, cpu);
- else
- capacity *= default_scale_smt_capacity(sd, cpu);
+ if (sched_feat(ARCH_CAPACITY))
+ capacity *= arch_scale_cpu_capacity(sd, cpu);
+ else
+ capacity *= default_scale_cpu_capacity(sd, cpu);
- capacity >>= SCHED_CAPACITY_SHIFT;
- }
+ capacity >>= SCHED_CAPACITY_SHIFT;
sdg->sgc->capacity_orig = capacity;
return capacity_factor;
}
+static enum group_type
+group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running > sgs->group_capacity_factor)
+ return group_overloaded;
+
+ if (sg_imbalanced(group))
+ return group_imbalanced;
+
+ return group_other;
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
load = source_load(i, load_idx);
sgs->group_load += load;
- sgs->sum_nr_running += rq->nr_running;
+ sgs->sum_nr_running += rq->cfs.h_nr_running;
if (rq->nr_running > 1)
*overload = true;
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
-
- sgs->group_imb = sg_imbalanced(group);
sgs->group_capacity_factor = sg_capacity_factor(env, group);
+ sgs->group_type = group_classify(group, sgs);
if (sgs->group_capacity_factor > sgs->sum_nr_running)
sgs->group_has_free_capacity = 1;
struct sched_group *sg,
struct sg_lb_stats *sgs)
{
- if (sgs->avg_load <= sds->busiest_stat.avg_load)
- return false;
+ struct sg_lb_stats *busiest = &sds->busiest_stat;
- if (sgs->sum_nr_running > sgs->group_capacity_factor)
+ if (sgs->group_type > busiest->group_type)
return true;
- if (sgs->group_imb)
+ if (sgs->group_type < busiest->group_type)
+ return false;
+
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+
+ /* This is the busiest node in its class. */
+ if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
/*
* numbered CPUs in the group, therefore mark all groups
* higher than ourself as busy.
*/
- if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
- env->dst_cpu < group_first_cpu(sg)) {
+ if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
if (!sds->busiest)
return true;
local = &sds->local_stat;
busiest = &sds->busiest_stat;
- if (busiest->group_imb) {
+ if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
* to ensure cpu-load equilibrium, look at wider averages. XXX
return fix_small_imbalance(env, sds);
}
- if (!busiest->group_imb) {
- /*
- * Don't want to pull so many tasks that a group would go idle.
- * Except of course for the group_imb case, since then we might
- * have to drop below capacity to reach cpu-load equilibrium.
- */
+ /*
+ * If there aren't any idle cpus, avoid creating some.
+ */
+ if (busiest->group_type == group_overloaded &&
+ local->group_type == group_overloaded) {
load_above_capacity =
(busiest->sum_nr_running - busiest->group_capacity_factor);
* work because they assume all things are equal, which typically
* isn't true due to cpus_allowed constraints and the like.
*/
- if (busiest->group_imb)
+ if (busiest->group_type == group_imbalanced)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
goto force_balance;
/*
- * If the local group is more busy than the selected busiest group
+ * If the local group is busier than the selected busiest group
* don't try and pull any tasks.
*/
if (local->avg_load >= busiest->avg_load)
if (env->idle == CPU_IDLE) {
/*
- * This cpu is idle. If the busiest group load doesn't
- * have more tasks than the number of available cpu's and
- * there is no imbalance between this and busiest group
- * wrt to idle cpu's, it is balanced.
+ * This cpu is idle. If the busiest group is not overloaded
+ * and there is no imbalance between this and busiest group
+ * wrt idle cpus, it is balanced. The imbalance becomes
+ * significant if the diff is greater than 1 otherwise we
+ * might end up to just move the imbalance on another group
*/
- if ((local->idle_cpus < busiest->idle_cpus) &&
- busiest->sum_nr_running <= busiest->group_weight)
+ if ((busiest->group_type != group_overloaded) &&
+ (local->idle_cpus <= (busiest->idle_cpus + 1)))
goto out_balanced;
} else {
/*
struct sched_group *group;
struct rq *busiest;
unsigned long flags;
- struct cpumask *cpus = __get_cpu_var(load_balance_mask);
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
struct lb_env env = {
.sd = sd,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
+ .tasks = LIST_HEAD_INIT(env.tasks),
};
/*
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
- local_irq_save(flags);
- double_rq_lock(env.dst_rq, busiest);
+ raw_spin_lock_irqsave(&busiest->lock, flags);
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
- cur_ld_moved = move_tasks(&env);
- ld_moved += cur_ld_moved;
- double_rq_unlock(env.dst_rq, busiest);
- local_irq_restore(flags);
+ cur_ld_moved = detach_tasks(&env);
/*
- * some other cpu did the load balance for us.
+ * We've detached some tasks from busiest_rq. Every
+ * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
+ * unlock busiest->lock, and we are able to be sure
+ * that nobody can manipulate the tasks in parallel.
+ * See task_rq_lock() family for the details.
*/
- if (cur_ld_moved && env.dst_cpu != smp_processor_id())
- resched_cpu(env.dst_cpu);
+
+ raw_spin_unlock(&busiest->lock);
+
+ if (cur_ld_moved) {
+ attach_tasks(&env);
+ ld_moved += cur_ld_moved;
+ }
+
+ local_irq_restore(flags);
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
if (sd_parent) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
- if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+ if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
*group_imbalance = 1;
- } else if (*group_imbalance)
- *group_imbalance = 0;
}
/* All tasks on this runqueue were pinned by CPU affinity */
env.loop_break = sched_nr_migrate_break;
goto redo;
}
- goto out_balanced;
+ goto out_all_pinned;
}
}
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
- * move_tasks).
+ * detach_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
goto out;
out_balanced:
+ /*
+ * We reach balance although we may have faced some affinity
+ * constraints. Clear the imbalance flag if it was set.
+ */
+ if (sd_parent) {
+ int *group_imbalance = &sd_parent->groups->sgc->imbalance;
+
+ if (*group_imbalance)
+ *group_imbalance = 0;
+ }
+
+out_all_pinned:
+ /*
+ * We reach balance because all tasks are pinned at this level so
+ * we can't migrate them. Let the imbalance flag set so parent level
+ * can try to migrate them.
+ */
schedstat_inc(sd, lb_balanced[idle]);
sd->nr_balance_failed = 0;
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
+ struct task_struct *p = NULL;
raw_spin_lock_irq(&busiest_rq->lock);
*/
BUG_ON(busiest_rq == target_rq);
- /* move a task from busiest_rq to target_rq */
- double_lock_balance(busiest_rq, target_rq);
-
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
schedstat_inc(sd, alb_count);
- if (move_one_task(&env))
+ p = detach_one_task(&env);
+ if (p)
schedstat_inc(sd, alb_pushed);
else
schedstat_inc(sd, alb_failed);
}
rcu_read_unlock();
- double_unlock_balance(busiest_rq, target_rq);
out_unlock:
busiest_rq->active_balance = 0;
- raw_spin_unlock_irq(&busiest_rq->lock);
+ raw_spin_unlock(&busiest_rq->lock);
+
+ if (p)
+ attach_one_task(target_rq, p);
+
+ local_irq_enable();
+
return 0;
}
static void
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (!p->se.on_rq)
+ if (!task_on_rq_queued(p))
return;
/*
* switched back to the fair class the enqueue_entity(.flags=0) will
* do the right thing.
*
- * If it's on_rq, then the dequeue_entity(.flags=0) will already
- * have normalized the vruntime, if it's !on_rq, then only when
+ * If it's queued, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it's !queued, then only when
* the task is sleeping will it still have non-normalized vruntime.
*/
- if (!p->on_rq && p->state != TASK_RUNNING) {
+ if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
*/
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
- struct sched_entity *se = &p->se;
#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct sched_entity *se = &p->se;
/*
* Since the real-depth could have been changed (only FAIR
* class maintain depth value), reset depth properly.
*/
se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif
- if (!se->on_rq)
+ if (!task_on_rq_queued(p))
return;
/*
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int queued)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq;
* fair sleeper stuff for the first placement, but who cares.
*/
/*
- * When !on_rq, vruntime of the task has usually NOT been normalized.
+ * When !queued, vruntime of the task has usually NOT been normalized.
* But there are some cases where it has already been normalized:
*
* - Moving a forked child which is waiting for being woken up by
* To prevent boost or penalty in the new cfs_rq caused by delta
* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
*/
- if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
- on_rq = 1;
+ if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
+ queued = 1;
- if (!on_rq)
+ if (!queued)
se->vruntime -= cfs_rq_of(se)->min_vruntime;
set_task_rq(p, task_cpu(p));
se->depth = se->parent ? se->parent->depth + 1 : 0;
- if (!on_rq) {
+ if (!queued) {
cfs_rq = cfs_rq_of(se);
se->vruntime += cfs_rq->min_vruntime;
#ifdef CONFIG_SMP