* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
-#include <linux/latencytop.h>
#include <linux/sched.h>
+#include <linux/latencytop.h>
#include <linux/cpumask.h>
#include <linux/cpuidle.h>
#include <linux/slab.h>
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct task_struct *p;
- u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+ u64 delta;
+
+ delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
if (entity_is_task(se)) {
p = task_of(se);
se->statistics.wait_sum += delta;
se->statistics.wait_start = 0;
}
-#else
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-#endif
/*
* Task is being enqueued - update stats:
*/
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* Are we enqueueing a waiting task? (for current tasks
}
static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Mark the end of the wait period if dequeueing a
*/
if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se);
+
+ if (flags & DEQUEUE_SLEEP) {
+ if (entity_is_task(se)) {
+ struct task_struct *tsk = task_of(se);
+
+ if (tsk->state & TASK_INTERRUPTIBLE)
+ se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
+ if (tsk->state & TASK_UNINTERRUPTIBLE)
+ se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+ }
+ }
+
+}
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
}
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+}
+#endif
+
/*
* We are picking a new current task - update its stats:
*/
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
pid_t gid;
+ int active_nodes;
struct rcu_head rcu;
- nodemask_t active_nodes;
unsigned long total_faults;
+ unsigned long max_faults_cpu;
/*
* Faults_cpu is used to decide whether memory should move
* towards the CPU. As a consequence, these stats are weighted
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+ return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
int maxdist, bool task)
return true;
/*
- * Do not migrate if the destination is not a node that
- * is actively used by this numa group.
- */
- if (!node_isset(dst_nid, ng->active_nodes))
- return false;
-
- /*
- * Source is a node that is not actively used by this
- * numa group, while the destination is. Migrate.
+ * Destination node is much more heavily used than the source
+ * node? Allow migration.
*/
- if (!node_isset(src_nid, ng->active_nodes))
+ if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+ ACTIVE_NODE_FRACTION)
return true;
/*
- * Both source and destination are nodes in active
- * use by this numa group. Maximize memory bandwidth
- * by migrating from more heavily used groups, to less
- * heavily used ones, spreading the load around.
- * Use a 1/4 hysteresis to avoid spurious page movement.
+ * Distribute memory according to CPU & memory use on each node,
+ * with 3/4 hysteresis to avoid unnecessary memory migrations:
+ *
+ * faults_cpu(dst) 3 faults_cpu(src)
+ * --------------- * - > ---------------
+ * faults_mem(dst) 4 faults_mem(src)
*/
- return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+ return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+ group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
static unsigned long weighted_cpuload(const int cpu);
.best_task = NULL,
.best_imp = 0,
- .best_cpu = -1
+ .best_cpu = -1,
};
struct sched_domain *sd;
unsigned long taskweight, groupweight;
* multiple NUMA nodes; in order to better consolidate the group,
* we need to check other locations.
*/
- if (env.best_cpu == -1 || (p->numa_group &&
- nodes_weight(p->numa_group->active_nodes) > 1)) {
+ if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
* trying for a better one later. Do not set the preferred node here.
*/
if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+
if (env.best_cpu == -1)
nid = env.src_nid;
else
nid = env.dst_nid;
- if (node_isset(nid, p->numa_group->active_nodes))
+ if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
sched_setnuma(p, env.dst_nid);
}
}
/*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes on the workload is actively running on. Do this by
* tracking the nodes from which NUMA hinting faults are triggered. This can
* be different from the set of nodes where the workload's memory is currently
* located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations, To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
*/
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
{
unsigned long faults, max_faults = 0;
- int nid;
+ int nid, active_nodes = 0;
for_each_online_node(nid) {
faults = group_faults_cpu(numa_group, nid);
for_each_online_node(nid) {
faults = group_faults_cpu(numa_group, nid);
- if (!node_isset(nid, numa_group->active_nodes)) {
- if (faults > max_faults * 6 / 16)
- node_set(nid, numa_group->active_nodes);
- } else if (faults < max_faults * 3 / 16)
- node_clear(nid, numa_group->active_nodes);
+ if (faults * ACTIVE_NODE_FRACTION > max_faults)
+ active_nodes++;
}
+
+ numa_group->max_faults_cpu = max_faults;
+ numa_group->active_nodes = active_nodes;
}
/*
update_task_scan_period(p, fault_types[0], fault_types[1]);
if (p->numa_group) {
- update_numa_active_node_mask(p->numa_group);
+ numa_group_count_active_nodes(p->numa_group);
spin_unlock_irq(group_lock);
max_nid = preferred_group_nid(p, max_group_nid);
}
return;
atomic_set(&grp->refcount, 1);
+ grp->active_nodes = 1;
+ grp->max_faults_cpu = 0;
spin_lock_init(&grp->lock);
grp->gid = p->pid;
/* Second half of the array tracks nids where faults happen */
grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
nr_node_ids;
- node_set(task_node(current), grp->active_nodes);
-
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] = p->numa_faults[i];
bool migrated = flags & TNF_MIGRATED;
int cpu_node = task_node(current);
int local = !!(flags & TNF_FAULT_LOCAL);
+ struct numa_group *ng;
int priv;
if (!static_branch_likely(&sched_numa_balancing))
* actively using should be counted as local. This allows the
* scan rate to slow down when a workload has settled down.
*/
- if (!priv && !local && p->numa_group &&
- node_isset(cpu_node, p->numa_group->active_nodes) &&
- node_isset(mem_node, p->numa_group->active_nodes))
+ ng = p->numa_group;
+ if (!priv && !local && ng && ng->active_nodes > 1 &&
+ numa_is_active_node(cpu_node, ng) &&
+ numa_is_active_node(mem_node, ng))
local = 1;
task_numa_placement(p);
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+static inline void check_schedstat_required(void)
+{
+#ifdef CONFIG_SCHEDSTATS
+ if (schedstat_enabled())
+ return;
+
+ /* Force schedstat enabled if a dependent tracepoint is active */
+ if (trace_sched_stat_wait_enabled() ||
+ trace_sched_stat_sleep_enabled() ||
+ trace_sched_stat_iowait_enabled() ||
+ trace_sched_stat_blocked_enabled() ||
+ trace_sched_stat_runtime_enabled()) {
+ pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+ "stat_blocked and stat_runtime require the "
+ "kernel parameter schedstats=enabled or "
+ "kernel.sched_schedstats=1\n");
+ }
+#endif
+}
+
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING);
+ bool curr = cfs_rq->curr == se;
+
/*
- * Update the normalized vruntime before updating min_vruntime
- * through calling update_curr().
+ * If we're the current task, we must renormalise before calling
+ * update_curr().
*/
- if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+ if (renorm && curr)
se->vruntime += cfs_rq->min_vruntime;
+ update_curr(cfs_rq);
+
/*
- * Update run-time statistics of the 'current'.
+ * Otherwise, renormalise after, such that we're placed at the current
+ * moment in time, instead of some random moment in the past.
*/
- update_curr(cfs_rq);
+ if (renorm && !curr)
+ se->vruntime += cfs_rq->min_vruntime;
+
enqueue_entity_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
- enqueue_sleeper(cfs_rq, se);
+ if (schedstat_enabled())
+ enqueue_sleeper(cfs_rq, se);
}
- update_stats_enqueue(cfs_rq, se);
- check_spread(cfs_rq, se);
- if (se != cfs_rq->curr)
+ check_schedstat_required();
+ if (schedstat_enabled()) {
+ update_stats_enqueue(cfs_rq, se);
+ check_spread(cfs_rq, se);
+ }
+ if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
update_curr(cfs_rq);
dequeue_entity_load_avg(cfs_rq, se);
- update_stats_dequeue(cfs_rq, se);
- if (flags & DEQUEUE_SLEEP) {
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- struct task_struct *tsk = task_of(se);
-
- if (tsk->state & TASK_INTERRUPTIBLE)
- se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
- if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->statistics.block_start = rq_clock(rq_of(cfs_rq));
- }
-#endif
- }
+ if (schedstat_enabled())
+ update_stats_dequeue(cfs_rq, se, flags);
clear_buddies(cfs_rq, se);
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
- update_stats_wait_end(cfs_rq, se);
+ if (schedstat_enabled())
+ update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(se, 1);
}
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
- if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+ if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
se->statistics.slice_max = max(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
- check_spread(cfs_rq, prev);
+ if (schedstat_enabled()) {
+ check_spread(cfs_rq, prev);
+ if (prev->on_rq)
+ update_stats_wait_start(cfs_rq, prev);
+ }
+
if (prev->on_rq) {
- update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
/* scale is effectively 1 << i now, and >> i divides by scale */
- old_load = this_rq->cpu_load[i] - tickless_load;
+ old_load = this_rq->cpu_load[i];
old_load = decay_load_missed(old_load, pending_updates - 1, i);
- old_load += tickless_load;
+ if (tickless_load) {
+ old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
+ /*
+ * old_load can never be a negative value because a
+ * decayed tickless_load cannot be greater than the
+ * original tickless_load.
+ */
+ old_load += tickless_load;
+ }
new_load = this_load;
/*
* Round up the averaging division if load is increasing. This
}
#ifdef CONFIG_NO_HZ_COMMON
+static void __update_cpu_load_nohz(struct rq *this_rq,
+ unsigned long curr_jiffies,
+ unsigned long load,
+ int active)
+{
+ unsigned long pending_updates;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ if (pending_updates) {
+ this_rq->last_load_update_tick = curr_jiffies;
+ /*
+ * In the regular NOHZ case, we were idle, this means load 0.
+ * In the NOHZ_FULL case, we were non-idle, we should consider
+ * its weighted load.
+ */
+ __update_cpu_load(this_rq, load, pending_updates, active);
+ }
+}
+
/*
* There is no sane way to deal with nohz on smp when using jiffies because the
* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
* Called from nohz_idle_balance() to update the load ratings before doing the
* idle balance.
*/
-static void update_idle_cpu_load(struct rq *this_rq)
+static void update_cpu_load_idle(struct rq *this_rq)
{
- unsigned long curr_jiffies = READ_ONCE(jiffies);
- unsigned long load = weighted_cpuload(cpu_of(this_rq));
- unsigned long pending_updates;
-
/*
* bail if there's load or we're actually up-to-date.
*/
- if (load || curr_jiffies == this_rq->last_load_update_tick)
+ if (weighted_cpuload(cpu_of(this_rq)))
return;
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- this_rq->last_load_update_tick = curr_jiffies;
-
- __update_cpu_load(this_rq, load, pending_updates, 0);
+ __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
}
/*
struct rq *this_rq = this_rq();
unsigned long curr_jiffies = READ_ONCE(jiffies);
unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
- unsigned long pending_updates;
if (curr_jiffies == this_rq->last_load_update_tick)
return;
raw_spin_lock(&this_rq->lock);
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- if (pending_updates) {
- this_rq->last_load_update_tick = curr_jiffies;
- /*
- * In the regular NOHZ case, we were idle, this means load 0.
- * In the NOHZ_FULL case, we were non-idle, we should consider
- * its weighted load.
- */
- __update_cpu_load(this_rq, load, pending_updates, active);
- }
+ __update_cpu_load_nohz(this_rq, curr_jiffies, load, active);
raw_spin_unlock(&this_rq->lock);
}
#endif /* CONFIG_NO_HZ */
{
unsigned long load = weighted_cpuload(cpu_of(this_rq));
/*
- * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+ * See the mess around update_cpu_load_idle() / update_cpu_load_nohz().
*/
this_rq->last_load_update_tick = jiffies;
__update_cpu_load(this_rq, load, 1, 1);
if (time_after_eq(jiffies, rq->next_balance)) {
raw_spin_lock_irq(&rq->lock);
update_rq_clock(rq);
- update_idle_cpu_load(rq);
+ update_cpu_load_idle(rq);
raw_spin_unlock_irq(&rq->lock);
rebalance_domains(rq, CPU_IDLE);
}
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
- if (tg->se) {
- if (tg->se[i])
- remove_entity_load_avg(tg->se[i]);
+ if (tg->se)
kfree(tg->se[i]);
- }
}
kfree(tg->cfs_rq);
return 0;
}
-void unregister_fair_sched_group(struct task_group *tg, int cpu)
+void unregister_fair_sched_group(struct task_group *tg)
{
- struct rq *rq = cpu_rq(cpu);
unsigned long flags;
+ struct rq *rq;
+ int cpu;
- /*
- * Only empty task groups can be destroyed; so we can speculatively
- * check on_list without danger of it being re-added.
- */
- if (!tg->cfs_rq[cpu]->on_list)
- return;
+ for_each_possible_cpu(cpu) {
+ if (tg->se[cpu])
+ remove_entity_load_avg(tg->se[cpu]);
- raw_spin_lock_irqsave(&rq->lock, flags);
- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ /*
+ * Only empty task groups can be destroyed; so we can speculatively
+ * check on_list without danger of it being re-added.
+ */
+ if (!tg->cfs_rq[cpu]->on_list)
+ continue;
+
+ rq = cpu_rq(cpu);
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
}
void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
return 1;
}
-void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
+void unregister_fair_sched_group(struct task_group *tg) { }
#endif /* CONFIG_FAIR_GROUP_SCHED */