/*
* Targeted preemption latency for CPU-bound tasks:
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
*
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
+ *
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_latency = 6000000ULL;
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+unsigned int sysctl_sched_latency = 6000000ULL;
+unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
* The initial- and re-scaling of tunables is configurable
- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*
* Options are:
- * SCHED_TUNABLESCALING_NONE - unscaled, always *1
- * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
- * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ *
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ *
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*/
-enum sched_tunable_scaling sysctl_sched_tunable_scaling
- = SCHED_TUNABLESCALING_LOG;
+enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
+ *
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+unsigned int sysctl_sched_min_granularity = 750000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
/*
- * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
+ * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
*/
static unsigned int sched_nr_latency = 8;
/*
* SCHED_OTHER wake-up granularity.
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
+ *
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
-const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+#ifdef CONFIG_SMP
/*
- * The exponential sliding window over which load is averaged for shares
- * distribution.
- * (default: 10msec)
+ * For asym packing, by default the lower numbered cpu has higher priority.
*/
-unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+int __weak arch_asym_cpu_priority(int cpu)
+{
+ return -cpu;
+}
+#endif
#ifdef CONFIG_CFS_BANDWIDTH
/*
* to consumption or the quota being specified to be smaller than the slice)
* we will always only issue the remaining available time.
*
- * default: 5 msec, units: microseconds
- */
-unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+ * (default: 5 msec, units: microseconds)
+ */
+unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * margin < capacity * 1024
+ *
+ * (default: ~20%)
+ */
+unsigned int capacity_margin = 1280;
+
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
static inline struct task_struct *task_of(struct sched_entity *se)
{
-#ifdef CONFIG_SCHED_DEBUG
- WARN_ON_ONCE(!entity_is_task(se));
-#endif
+ SCHED_WARN_ON(!entity_is_task(se));
return container_of(se, struct task_struct, se);
}
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
/*
* Ensure we either appear before our parent (if already
* enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases.
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases and a special case for the root
+ * cfs_rq. Furthermore, it also means that we will always reset
+ * tmp_alone_branch either when the branch is connected
+ * to a tree or when we reach the beg of the tree
*/
if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
- } else {
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+ /*
+ * If parent is already on the list, we add the child
+ * just before. Thanks to circular linked property of
+ * the list, this means to put the child at the tail
+ * of the list that starts by parent.
+ */
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ /*
+ * The branch is now connected to its tree so we can
+ * reset tmp_alone_branch to the beginning of the
+ * list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else if (!cfs_rq->tg->parent) {
+ /*
+ * cfs rq without parent should be put
+ * at the tail of the list.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq->leaf_cfs_rq_list);
+ /*
+ * We have reach the beg of a tree so we can reset
+ * tmp_alone_branch to the beginning of the list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else {
+ /*
+ * The parent has not already been added so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the beg of the branch
+ * where we will add parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+ rq->tmp_alone_branch);
+ /*
+ * update tmp_alone_branch to points to the new beg
+ * of the branch
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
}
cfs_rq->on_list = 1;
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
+ struct sched_entity *curr = cfs_rq->curr;
+
u64 vruntime = cfs_rq->min_vruntime;
- if (cfs_rq->curr)
- vruntime = cfs_rq->curr->vruntime;
+ if (curr) {
+ if (curr->on_rq)
+ vruntime = curr->vruntime;
+ else
+ curr = NULL;
+ }
if (cfs_rq->rb_leftmost) {
struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
struct sched_entity,
run_node);
- if (!cfs_rq->curr)
+ if (!curr)
vruntime = se->vruntime;
else
vruntime = min_vruntime(vruntime, se->vruntime);
}
#ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
/*
* will definitely be update (after enqueue).
*/
sa->period_contrib = 1023;
- sa->load_avg = scale_load_down(se->load.weight);
+ /*
+ * Tasks are intialized with full load to be seen as heavy tasks until
+ * they get a chance to stabilize to their real load level.
+ * Group entities are intialized with zero load to reflect the fact that
+ * nothing has been attached to the task group yet.
+ */
+ if (entity_is_task(se))
+ sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
/*
* At this point, util_avg won't be used in select_task_rq_fair anyway
}
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+static void attach_entity_cfs_rq(struct sched_entity *se);
/*
* With new tasks being created, their initial util_avgs are extrapolated
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
- u64 now = cfs_rq_clock_task(cfs_rq);
- int tg_update;
if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) {
* such that the next switched_to_fair() has the
* expected state.
*/
- se->avg.last_update_time = now;
+ se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
return;
}
}
- tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
- attach_entity_load_avg(cfs_rq, se);
- if (tg_update)
- update_tg_load_avg(cfs_rq, false);
+ attach_entity_cfs_rq(se);
}
#else /* !CONFIG_SMP */
max(delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
- schedstat_add(cfs_rq, exec_clock, delta_exec);
+ schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
update_min_vruntime(cfs_rq);
update_curr(cfs_rq_of(&rq->curr->se));
}
-#ifdef CONFIG_SCHEDSTATS
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 wait_start = rq_clock(rq_of(cfs_rq));
+ u64 wait_start, prev_wait_start;
+
+ if (!schedstat_enabled())
+ return;
+
+ wait_start = rq_clock(rq_of(cfs_rq));
+ prev_wait_start = schedstat_val(se->statistics.wait_start);
if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
- likely(wait_start > se->statistics.wait_start))
- wait_start -= se->statistics.wait_start;
+ likely(wait_start > prev_wait_start))
+ wait_start -= prev_wait_start;
- se->statistics.wait_start = wait_start;
+ schedstat_set(se->statistics.wait_start, wait_start);
}
-static void
+static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct task_struct *p;
u64 delta;
- delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+ if (!schedstat_enabled())
+ return;
+
+ delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
if (entity_is_task(se)) {
p = task_of(se);
* time stamp can be adjusted to accumulate wait time
* prior to migration.
*/
- se->statistics.wait_start = delta;
+ schedstat_set(se->statistics.wait_start, delta);
return;
}
trace_sched_stat_wait(p, delta);
}
- se->statistics.wait_max = max(se->statistics.wait_max, delta);
- se->statistics.wait_count++;
- se->statistics.wait_sum += delta;
- se->statistics.wait_start = 0;
+ schedstat_set(se->statistics.wait_max,
+ max(schedstat_val(se->statistics.wait_max), delta));
+ schedstat_inc(se->statistics.wait_count);
+ schedstat_add(se->statistics.wait_sum, delta);
+ schedstat_set(se->statistics.wait_start, 0);
+}
+
+static inline void
+update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct task_struct *tsk = NULL;
+ u64 sleep_start, block_start;
+
+ if (!schedstat_enabled())
+ return;
+
+ sleep_start = schedstat_val(se->statistics.sleep_start);
+ block_start = schedstat_val(se->statistics.block_start);
+
+ if (entity_is_task(se))
+ tsk = task_of(se);
+
+ if (sleep_start) {
+ u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
+ schedstat_set(se->statistics.sleep_max, delta);
+
+ schedstat_set(se->statistics.sleep_start, 0);
+ schedstat_add(se->statistics.sum_sleep_runtime, delta);
+
+ if (tsk) {
+ account_scheduler_latency(tsk, delta >> 10, 1);
+ trace_sched_stat_sleep(tsk, delta);
+ }
+ }
+ if (block_start) {
+ u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(se->statistics.block_max)))
+ schedstat_set(se->statistics.block_max, delta);
+
+ schedstat_set(se->statistics.block_start, 0);
+ schedstat_add(se->statistics.sum_sleep_runtime, delta);
+
+ if (tsk) {
+ if (tsk->in_iowait) {
+ schedstat_add(se->statistics.iowait_sum, delta);
+ schedstat_inc(se->statistics.iowait_count);
+ trace_sched_stat_iowait(tsk, delta);
+ }
+
+ trace_sched_stat_blocked(tsk, delta);
+
+ /*
+ * Blocking time is in units of nanosecs, so shift by
+ * 20 to get a milliseconds-range estimation of the
+ * amount of time that the task spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+ profile_hits(SLEEP_PROFILING,
+ (void *)get_wchan(tsk),
+ delta >> 20);
+ }
+ account_scheduler_latency(tsk, delta >> 10, 0);
+ }
+ }
}
/*
* Task is being enqueued - update stats:
*/
static inline void
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ if (!schedstat_enabled())
+ return;
+
/*
* Are we enqueueing a waiting task? (for current tasks
* a dequeue/enqueue event is a NOP)
*/
if (se != cfs_rq->curr)
update_stats_wait_start(cfs_rq, se);
+
+ if (flags & ENQUEUE_WAKEUP)
+ update_stats_enqueue_sleeper(cfs_rq, se);
}
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+
+ if (!schedstat_enabled())
+ return;
+
/*
* Mark the end of the wait period if dequeueing a
* waiting task:
if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se);
- if (flags & DEQUEUE_SLEEP) {
- if (entity_is_task(se)) {
- struct task_struct *tsk = task_of(se);
+ if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
+ struct task_struct *tsk = task_of(se);
- if (tsk->state & TASK_INTERRUPTIBLE)
- se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
- if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->statistics.block_start = rq_clock(rq_of(cfs_rq));
- }
+ if (tsk->state & TASK_INTERRUPTIBLE)
+ schedstat_set(se->statistics.sleep_start,
+ rq_clock(rq_of(cfs_rq)));
+ if (tsk->state & TASK_UNINTERRUPTIBLE)
+ schedstat_set(se->statistics.block_start,
+ rq_clock(rq_of(cfs_rq)));
}
-
-}
-#else
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-{
}
-#endif
/*
* We are picking a new current task - update its stats:
* One idle CPU per node is evaluated for a task numa move.
* Call select_idle_sibling to maybe find a better one.
*/
- if (!cur)
- env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+ if (!cur) {
+ /*
+ * select_idle_siblings() uses an per-cpu cpumask that
+ * can be used from IRQ context.
+ */
+ local_irq_disable();
+ env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+ env->dst_cpu);
+ local_irq_enable();
+ }
assign:
task_numa_assign(env, cur, imp);
unsigned long nr_pte_updates = 0;
long pages, virtpages;
- WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+ SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
work->next = work; /* protect against double add */
/*
return decayed;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
/*
- * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- * and effective_load (which is not done because it is too costly).
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/**
+ * update_tg_load_avg - update the tg's load avg
+ * @cfs_rq: the cfs_rq whose avg changed
+ * @force: update regardless of how small the difference
+ *
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ * However, because tg->load_avg is a global value there are performance
+ * considerations.
+ *
+ * In order to avoid having to look at the other cfs_rq's, we use a
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+ * Updating tg's load_avg is necessary before update_cfs_share() (which is
+ * done) and effective_load() (which is not done because it is too costly).
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
se->avg.last_update_time = n_last_update_time;
}
}
+
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's utilization */
+ se->avg.util_avg = gcfs_rq->avg.util_avg;
+ se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq utilization */
+ add_positive(&cfs_rq->avg.util_avg, delta);
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+}
+
+/* Take into account change of load of a child task group */
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta, load = gcfs_rq->avg.load_avg;
+
+ /*
+ * If the load of group cfs_rq is null, the load of the
+ * sched_entity will also be null so we can skip the formula
+ */
+ if (load) {
+ long tg_load;
+
+ /* Get tg's load and ensure tg_load > 0 */
+ tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+
+ /* Ensure tg_load >= load and updated with current load*/
+ tg_load -= gcfs_rq->tg_load_avg_contrib;
+ tg_load += load;
+
+ /*
+ * We need to compute a correction term in the case that the
+ * task group is consuming more CPU than a task of equal
+ * weight. A task with a weight equals to tg->shares will have
+ * a load less or equal to scale_load_down(tg->shares).
+ * Similarly, the sched_entities that represent the task group
+ * at parent level, can't have a load higher than
+ * scale_load_down(tg->shares). And the Sum of sched_entities'
+ * load must be <= scale_load_down(tg->shares).
+ */
+ if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
+ /* scale gcfs_rq's load into tg's shares*/
+ load *= scale_load_down(gcfs_rq->tg->shares);
+ load /= tg_load;
+ }
+ }
+
+ delta = load - se->avg.load_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's load */
+ se->avg.load_avg = load;
+ se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq load */
+ add_positive(&cfs_rq->avg.load_avg, delta);
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+
+ /*
+ * If the sched_entity is already enqueued, we also have to update the
+ * runnable load avg.
+ */
+ if (se->on_rq) {
+ /* Update parent cfs_rq runnable_load_avg */
+ add_positive(&cfs_rq->runnable_load_avg, delta);
+ cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ }
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->propagate_avg = 1;
+}
+
+static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+
+ if (!cfs_rq->propagate_avg)
+ return 0;
+
+ cfs_rq->propagate_avg = 0;
+ return 1;
+}
+
+/* Update task and its cfs_rq load average */
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ if (entity_is_task(se))
+ return 0;
+
+ if (!test_and_clear_tg_cfs_propagate(se))
+ return 0;
+
+ cfs_rq = cfs_rq_of(se);
+
+ set_tg_cfs_propagate(cfs_rq);
+
+ update_tg_cfs_util(cfs_rq, se);
+ update_tg_cfs_load(cfs_rq, se);
+
+ return 1;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
+
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+static inline int propagate_entity_load_avg(struct sched_entity *se)
{
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
+ return 0;
+}
- if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
- unsigned long max = rq->cpu_capacity_orig;
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ if (&this_rq()->cfs == cfs_rq) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
*
* See cpu_util().
*/
- cpufreq_update_util(rq_clock(rq),
- min(cfs_rq->avg.util_avg, max), max);
+ cpufreq_update_util(rq_of(cfs_rq), 0);
}
}
*
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
*
- * Returns true if the load decayed or we removed utilization. It is expected
- * that one calls update_tg_load_avg() on this condition, but after you've
- * modified the cfs_rq avg (attach/detach), such that we propagate the new
- * avg up.
+ * Returns true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
*/
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
sub_positive(&sa->load_avg, r);
sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
removed_load = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
removed_util = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
return decayed || removed_load;
}
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG 0x1
+#define SKIP_AGE_LOAD 0x2
+
/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline void update_load_avg(struct sched_entity *se, int flags)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
struct rq *rq = rq_of(cfs_rq);
int cpu = cpu_of(rq);
+ int decayed;
/*
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
- __update_load_avg(now, cpu, &se->avg,
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+ __update_load_avg(now, cpu, &se->avg,
se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
+ }
- if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
+ decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+ decayed |= propagate_entity_load_avg(se);
+
+ if (decayed && (flags & UPDATE_TG))
update_tg_load_avg(cfs_rq, 0);
}
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (!sched_feat(ATTACH_AGE_LOAD))
- goto skip_aging;
-
- /*
- * If we got migrated (either between CPUs or between cgroups) we'll
- * have aged the average right before clearing @last_update_time.
- *
- * Or we're fresh through post_init_entity_util_avg().
- */
- if (se->avg.last_update_time) {
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, 0, 0, NULL);
-
- /*
- * XXX: we could have just aged the entire load away if we've been
- * absent from the fair class for too long.
- */
- }
-
-skip_aging:
se->avg.last_update_time = cfs_rq->avg.last_update_time;
cfs_rq->avg.load_avg += se->avg.load_avg;
cfs_rq->avg.load_sum += se->avg.load_sum;
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
+ set_tg_cfs_propagate(cfs_rq);
cfs_rq_util_change(cfs_rq);
}
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ set_tg_cfs_propagate(cfs_rq);
cfs_rq_util_change(cfs_rq);
}
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct sched_avg *sa = &se->avg;
- u64 now = cfs_rq_clock_task(cfs_rq);
- int migrated, decayed;
-
- migrated = !sa->last_update_time;
- if (!migrated) {
- __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
- }
-
- decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
cfs_rq->runnable_load_avg += sa->load_avg;
cfs_rq->runnable_load_sum += sa->load_sum;
- if (migrated)
+ if (!sa->last_update_time) {
attach_entity_load_avg(cfs_rq, se);
-
- if (decayed || migrated)
update_tg_load_avg(cfs_rq, 0);
+ }
}
/* Remove the runnable load generated by se from cfs_rq's runnable load average */
static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- update_load_avg(se, 1);
-
cfs_rq->runnable_load_avg =
max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
cfs_rq->runnable_load_sum =
}
#endif
+/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+}
+
/*
* Task first catches up with cfs_rq, and then subtract
* itself from the cfs_rq (task must be off the queue now).
void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 last_update_time;
/*
* tasks cannot exit without having gone through wake_up_new_task() ->
* calls this.
*/
- last_update_time = cfs_rq_last_update_time(cfs_rq);
-
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ sync_entity_load_avg(se);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
return 0;
}
-static inline void update_load_avg(struct sched_entity *se, int not_used)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- struct rq *rq = rq_of(cfs_rq);
+#define UPDATE_TG 0x0
+#define SKIP_AGE_LOAD 0x0
- cpufreq_trigger_update(rq_clock(rq));
+static inline void update_load_avg(struct sched_entity *se, int not_used1)
+{
+ cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
}
static inline void
#endif /* CONFIG_SMP */
-static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-#ifdef CONFIG_SCHEDSTATS
- struct task_struct *tsk = NULL;
-
- if (entity_is_task(se))
- tsk = task_of(se);
-
- if (se->statistics.sleep_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
-
- if ((s64)delta < 0)
- delta = 0;
-
- if (unlikely(delta > se->statistics.sleep_max))
- se->statistics.sleep_max = delta;
-
- se->statistics.sleep_start = 0;
- se->statistics.sum_sleep_runtime += delta;
-
- if (tsk) {
- account_scheduler_latency(tsk, delta >> 10, 1);
- trace_sched_stat_sleep(tsk, delta);
- }
- }
- if (se->statistics.block_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
-
- if ((s64)delta < 0)
- delta = 0;
-
- if (unlikely(delta > se->statistics.block_max))
- se->statistics.block_max = delta;
-
- se->statistics.block_start = 0;
- se->statistics.sum_sleep_runtime += delta;
-
- if (tsk) {
- if (tsk->in_iowait) {
- se->statistics.iowait_sum += delta;
- se->statistics.iowait_count++;
- trace_sched_stat_iowait(tsk, delta);
- }
-
- trace_sched_stat_blocked(tsk, delta);
-
- /*
- * Blocking time is in units of nanosecs, so shift by
- * 20 to get a milliseconds-range estimation of the
- * amount of time that the task spent sleeping:
- */
- if (unlikely(prof_on == SLEEP_PROFILING)) {
- profile_hits(SLEEP_PROFILING,
- (void *)get_wchan(tsk),
- delta >> 20);
- }
- account_scheduler_latency(tsk, delta >> 10, 0);
- }
- }
-#endif
-}
-
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
d = -d;
if (d > 3*sysctl_sched_latency)
- schedstat_inc(cfs_rq, nr_spread_over);
+ schedstat_inc(cfs_rq->nr_spread_over);
#endif
}
if (renorm && !curr)
se->vruntime += cfs_rq->min_vruntime;
+ update_load_avg(se, UPDATE_TG);
enqueue_entity_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
- if (flags & ENQUEUE_WAKEUP) {
+ if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);
- if (schedstat_enabled())
- enqueue_sleeper(cfs_rq, se);
- }
check_schedstat_required();
- if (schedstat_enabled()) {
- update_stats_enqueue(cfs_rq, se);
- check_spread(cfs_rq, se);
- }
+ update_stats_enqueue(cfs_rq, se, flags);
+ check_spread(cfs_rq, se);
if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
dequeue_entity_load_avg(cfs_rq, se);
- if (schedstat_enabled())
- update_stats_dequeue(cfs_rq, se, flags);
+ update_stats_dequeue(cfs_rq, se, flags);
clear_buddies(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);
/*
- * Normalize the entity after updating the min_vruntime because the
- * update can refer to the ->curr item and we need to reflect this
- * movement in our normalized position.
+ * Normalize after update_curr(); which will also have moved
+ * min_vruntime if @se is the one holding it back. But before doing
+ * update_min_vruntime() again, which will discount @se's position and
+ * can move min_vruntime forward still more.
*/
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
- update_min_vruntime(cfs_rq);
update_cfs_shares(cfs_rq);
+
+ /*
+ * Now advance min_vruntime if @se was the entity holding it back,
+ * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
+ * put back on, and if we advance min_vruntime, we'll be placed back
+ * further than we started -- ie. we'll be penalized.
+ */
+ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+ update_min_vruntime(cfs_rq);
}
/*
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
- if (schedstat_enabled())
- update_stats_wait_end(cfs_rq, se);
+ update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_load_avg(se, 1);
+ update_load_avg(se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
cfs_rq->curr = se;
-#ifdef CONFIG_SCHEDSTATS
+
/*
* Track our maximum slice length, if the CPU's load is at
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- se->statistics.slice_max = max(se->statistics.slice_max,
- se->sum_exec_runtime - se->prev_sum_exec_runtime);
+ schedstat_set(se->statistics.slice_max,
+ max((u64)schedstat_val(se->statistics.slice_max),
+ se->sum_exec_runtime - se->prev_sum_exec_runtime));
}
-#endif
+
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
- if (schedstat_enabled()) {
- check_spread(cfs_rq, prev);
- if (prev->on_rq)
- update_stats_wait_start(cfs_rq, prev);
- }
+ check_spread(cfs_rq, prev);
if (prev->on_rq) {
+ update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
/*
* Ensure that runnable average is periodically updated.
*/
- update_load_avg(curr, 1);
+ update_load_avg(curr, UPDATE_TG);
update_cfs_shares(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- WARN_ON(task_rq(p) != rq);
+ SCHED_WARN_ON(task_rq(p) != rq);
- if (cfs_rq->nr_running > 1) {
+ if (rq->cfs.h_nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+ /*
+ * If in_iowait is set, the code below may not trigger any cpufreq
+ * utilization updates, so do it here explicitly with the IOWAIT flag
+ * passed.
+ */
+ if (p->in_iowait)
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+
for_each_sched_entity(se) {
if (se->on_rq)
break;
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
+ update_load_avg(se, UPDATE_TG);
update_cfs_shares(cfs_rq);
}
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
+ update_load_avg(se, UPDATE_TG);
update_cfs_shares(cfs_rq);
}
}
#ifdef CONFIG_SMP
+
+/* Working cpumask for: load_balance, load_balance_newidle. */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+
#ifdef CONFIG_NO_HZ_COMMON
/*
* per rq 'load' arrray crap; XXX kill this.
* wl = S * s'_i; see (2)
*/
if (W > 0 && w < W)
- wl = (w * (long)tg->shares) / W;
+ wl = (w * (long)scale_load_down(tg->shares)) / W;
else
- wl = tg->shares;
+ wl = scale_load_down(tg->shares);
/*
* Per the above, wl is the new se->load.weight value; since
return 1;
}
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+ int prev_cpu, int sync)
{
s64 this_load, load;
s64 this_eff_load, prev_eff_load;
- int idx, this_cpu, prev_cpu;
+ int idx, this_cpu;
struct task_group *tg;
unsigned long weight;
int balanced;
idx = sd->wake_idx;
this_cpu = smp_processor_id();
- prev_cpu = task_cpu(p);
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
balanced = this_eff_load <= prev_eff_load;
- schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (!balanced)
return 0;
- schedstat_inc(sd, ttwu_move_affine);
- schedstat_inc(p, se.statistics.nr_wakeups_affine);
+ schedstat_inc(sd->ttwu_move_affine);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine);
return 1;
}
+static inline int task_util(struct task_struct *p);
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+{
+ return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+}
+
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
- unsigned long min_load = ULONG_MAX, this_load = 0;
+ struct sched_group *most_spare_sg = NULL;
+ unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
+ unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
+ unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
- int imbalance = 100 + (sd->imbalance_pct-100)/2;
+ int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
+ unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
+ (sd->imbalance_pct-100) / 100;
if (sd_flag & SD_BALANCE_WAKE)
load_idx = sd->wake_idx;
do {
- unsigned long load, avg_load;
+ unsigned long load, avg_load, runnable_load;
+ unsigned long spare_cap, max_spare_cap;
int local_group;
int i;
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
- /* Tally up the load of all CPUs in the group */
+ /*
+ * Tally up the load of all CPUs in the group and find
+ * the group containing the CPU with most spare capacity.
+ */
avg_load = 0;
+ runnable_load = 0;
+ max_spare_cap = 0;
for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
else
load = target_load(i, load_idx);
- avg_load += load;
+ runnable_load += load;
+
+ avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
+
+ spare_cap = capacity_spare_wake(i, p);
+
+ if (spare_cap > max_spare_cap)
+ max_spare_cap = spare_cap;
+ }
+
+ /* Adjust by relative CPU capacity of the group */
+ avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
+ group->sgc->capacity;
+ runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
+ group->sgc->capacity;
+
+ if (local_group) {
+ this_runnable_load = runnable_load;
+ this_avg_load = avg_load;
+ this_spare = max_spare_cap;
+ } else {
+ if (min_runnable_load > (runnable_load + imbalance)) {
+ /*
+ * The runnable load is significantly smaller
+ * so we can pick this new cpu
+ */
+ min_runnable_load = runnable_load;
+ min_avg_load = avg_load;
+ idlest = group;
+ } else if ((runnable_load < (min_runnable_load + imbalance)) &&
+ (100*min_avg_load > imbalance_scale*avg_load)) {
+ /*
+ * The runnable loads are close so take the
+ * blocked load into account through avg_load.
+ */
+ min_avg_load = avg_load;
+ idlest = group;
+ }
+
+ if (most_spare < max_spare_cap) {
+ most_spare = max_spare_cap;
+ most_spare_sg = group;
+ }
}
+ } while (group = group->next, group != sd->groups);
- /* Adjust by relative CPU capacity of the group */
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
+ /*
+ * The cross-over point between using spare capacity or least load
+ * is too conservative for high utilization tasks on partially
+ * utilized systems if we require spare_capacity > task_util(p),
+ * so we allow for some task stuffing by using
+ * spare_capacity > task_util(p)/2.
+ *
+ * Spare capacity can't be used for fork because the utilization has
+ * not been set yet, we must first select a rq to compute the initial
+ * utilization.
+ */
+ if (sd_flag & SD_BALANCE_FORK)
+ goto skip_spare;
- if (local_group) {
- this_load = avg_load;
- } else if (avg_load < min_load) {
- min_load = avg_load;
- idlest = group;
- }
- } while (group = group->next, group != sd->groups);
+ if (this_spare > task_util(p) / 2 &&
+ imbalance_scale*this_spare > 100*most_spare)
+ return NULL;
+
+ if (most_spare > task_util(p) / 2)
+ return most_spare_sg;
+
+skip_spare:
+ if (!idlest)
+ return NULL;
+
+ if (min_runnable_load > (this_runnable_load + imbalance))
+ return NULL;
- if (!idlest || 100*this_load < imbalance*min_load)
+ if ((this_runnable_load < (min_runnable_load + imbalance)) &&
+ (100*this_avg_load < imbalance_scale*min_avg_load))
return NULL;
+
return idlest;
}
int shallowest_idle_cpu = -1;
int i;
+ /* Check if we have any choice: */
+ if (group->group_weight == 1)
+ return cpumask_first(sched_group_cpus(group));
+
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
if (idle_cpu(i)) {
}
/*
- * Try and locate an idle CPU in the sched_domain.
+ * Implement a for_each_cpu() variant that starts the scan at a given cpu
+ * (@start), and wraps around.
+ *
+ * This is used to scan for idle CPUs; such that not all CPUs looking for an
+ * idle CPU find the same CPU. The down-side is that tasks tend to cycle
+ * through the LLC domain.
+ *
+ * Especially tbench is found sensitive to this.
+ */
+
+static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
+{
+ int next;
+
+again:
+ next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
+
+ if (*wrapped) {
+ if (next >= start)
+ return nr_cpumask_bits;
+ } else {
+ if (next >= nr_cpumask_bits) {
+ *wrapped = 1;
+ n = -1;
+ goto again;
+ }
+ }
+
+ return next;
+}
+
+#define for_each_cpu_wrap(cpu, mask, start, wrap) \
+ for ((wrap) = 0, (cpu) = (start)-1; \
+ (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
+ (cpu) < nr_cpumask_bits; )
+
+#ifdef CONFIG_SCHED_SMT
+
+static inline void set_idle_cores(int cpu, int val)
+{
+ struct sched_domain_shared *sds;
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds)
+ WRITE_ONCE(sds->has_idle_cores, val);
+}
+
+static inline bool test_idle_cores(int cpu, bool def)
+{
+ struct sched_domain_shared *sds;
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds)
+ return READ_ONCE(sds->has_idle_cores);
+
+ return def;
+}
+
+/*
+ * Scans the local SMT mask to see if the entire core is idle, and records this
+ * information in sd_llc_shared->has_idle_cores.
+ *
+ * Since SMT siblings share all cache levels, inspecting this limited remote
+ * state should be fairly cheap.
+ */
+void __update_idle_core(struct rq *rq)
+{
+ int core = cpu_of(rq);
+ int cpu;
+
+ rcu_read_lock();
+ if (test_idle_cores(core, true))
+ goto unlock;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ if (cpu == core)
+ continue;
+
+ if (!idle_cpu(cpu))
+ goto unlock;
+ }
+
+ set_idle_cores(core, 1);
+unlock:
+ rcu_read_unlock();
+}
+
+/*
+ * Scan the entire LLC domain for idle cores; this dynamically switches off if
+ * there are no idle cores left in the system; tracked through
+ * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ */
+static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ int core, cpu, wrap;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -1;
+
+ if (!test_idle_cores(target, false))
+ return -1;
+
+ cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+
+ for_each_cpu_wrap(core, cpus, target, wrap) {
+ bool idle = true;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ cpumask_clear_cpu(cpu, cpus);
+ if (!idle_cpu(cpu))
+ idle = false;
+ }
+
+ if (idle)
+ return core;
+ }
+
+ /*
+ * Failed to find an idle core; stop looking for one.
+ */
+ set_idle_cores(target, 0);
+
+ return -1;
+}
+
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ int cpu;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -1;
+
+ for_each_cpu(cpu, cpu_smt_mask(target)) {
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(cpu))
+ return cpu;
+ }
+
+ return -1;
+}
+
+#else /* CONFIG_SCHED_SMT */
+
+static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ return -1;
+}
+
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ return -1;
+}
+
+#endif /* CONFIG_SCHED_SMT */
+
+/*
+ * Scan the LLC domain for idle CPUs; this is dynamically regulated by
+ * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
+ * average idle time for this rq (as found in rq->avg_idle).
+ */
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ struct sched_domain *this_sd;
+ u64 avg_cost, avg_idle = this_rq()->avg_idle;
+ u64 time, cost;
+ s64 delta;
+ int cpu, wrap;
+
+ this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ if (!this_sd)
+ return -1;
+
+ avg_cost = this_sd->avg_scan_cost;
+
+ /*
+ * Due to large variance we need a large fuzz factor; hackbench in
+ * particularly is sensitive here.
+ */
+ if ((avg_idle / 512) < avg_cost)
+ return -1;
+
+ time = local_clock();
+
+ for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(cpu))
+ break;
+ }
+
+ time = local_clock() - time;
+ cost = this_sd->avg_scan_cost;
+ delta = (s64)(time - cost) / 8;
+ this_sd->avg_scan_cost += delta;
+
+ return cpu;
+}
+
+/*
+ * Try and locate an idle core/thread in the LLC cache domain.
*/
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
- struct sched_group *sg;
- int i = task_cpu(p);
+ int i;
if (idle_cpu(target))
return target;
/*
- * If the prevous cpu is cache affine and idle, don't be stupid.
+ * If the previous cpu is cache affine and idle, don't be stupid.
*/
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- return i;
+ if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+ return prev;
- /*
- * Otherwise, iterate the domains and find an eligible idle cpu.
- *
- * A completely idle sched group at higher domains is more
- * desirable than an idle group at a lower level, because lower
- * domains have smaller groups and usually share hardware
- * resources which causes tasks to contend on them, e.g. x86
- * hyperthread siblings in the lowest domain (SMT) can contend
- * on the shared cpu pipeline.
- *
- * However, while we prefer idle groups at higher domains
- * finding an idle cpu at the lowest domain is still better than
- * returning 'target', which we've already established, isn't
- * idle.
- */
sd = rcu_dereference(per_cpu(sd_llc, target));
- for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
-
- /* Ensure the entire group is idle */
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
- goto next;
- }
+ if (!sd)
+ return target;
+
+ i = select_idle_core(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ i = select_idle_cpu(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ i = select_idle_smt(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
- /*
- * It doesn't matter which cpu we pick, the
- * whole group is idle.
- */
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
- }
-done:
return target;
}
return (util >= capacity) ? capacity : util;
}
+static inline int task_util(struct task_struct *p)
+{
+ return p->se.avg.util_avg;
+}
+
+/*
+ * cpu_util_wake: Compute cpu utilization with any contributions from
+ * the waking task p removed.
+ */
+static int cpu_util_wake(int cpu, struct task_struct *p)
+{
+ unsigned long util, capacity;
+
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+ return cpu_util(cpu);
+
+ capacity = capacity_orig_of(cpu);
+ util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
+
+ return (util >= capacity) ? capacity : util;
+}
+
+/*
+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
+ *
+ * In that case WAKE_AFFINE doesn't make sense and we'll let
+ * BALANCE_WAKE sort things out.
+ */
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+{
+ long min_cap, max_cap;
+
+ min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
+ max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
+
+ /* Minimum capacity is close to max, no need to abort wake_affine */
+ if (max_cap - min_cap < max_cap >> 3)
+ return 0;
+
+ /* Bring task utilization in sync with prev_cpu */
+ sync_entity_load_avg(&p->se);
+
+ return min_cap * 1024 < task_util(p) * capacity_margin;
+}
+
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
- want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
+ && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
}
rcu_read_lock();
if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
- if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
new_cpu = cpu;
}
if (!sd) {
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
- new_cpu = select_idle_sibling(p, new_cpu);
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
} else while (sd) {
struct sched_group *group;
*
* The adjacency matrix of the resulting graph is given by:
*
- * log_2 n
+ * log_2 n
* A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
* k = 0
*
*
* [XXX write more on how we solve this.. _after_ merging pjt's patches that
* rewrite all of this once again.]
- */
+ */
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
int cpu;
- schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+ schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
env->flags |= LBF_SOME_PINNED;
env->flags &= ~LBF_ALL_PINNED;
if (task_running(env->src_rq, p)) {
- schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+ schedstat_inc(p->se.statistics.nr_failed_migrations_running);
return 0;
}
if (tsk_cache_hot <= 0 ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot == 1) {
- schedstat_inc(env->sd, lb_hot_gained[env->idle]);
- schedstat_inc(p, se.statistics.nr_forced_migrations);
+ schedstat_inc(env->sd->lb_hot_gained[env->idle]);
+ schedstat_inc(p->se.statistics.nr_forced_migrations);
}
return 1;
}
- schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+ schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
return 0;
}
* so we can safely collect stats here rather than
* inside detach_tasks().
*/
- schedstat_inc(env->sd, lb_gained[env->idle]);
+ schedstat_inc(env->sd->lb_gained[env->idle]);
return p;
}
return NULL;
* so we can safely collect detach_one_task() stats here rather
* than inside detach_one_task().
*/
- schedstat_add(env->sd, lb_gained[env->idle], detached);
+ schedstat_add(env->sd->lb_gained[env->idle], detached);
return detached;
}
if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
update_tg_load_avg(cfs_rq, 0);
+
+ /* Propagate pending load changes to the parent */
+ if (cfs_rq->tg->se[cpu])
+ update_load_avg(cfs_rq->tg->se[cpu], 0);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
+ sdg->sgc->min_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity;
+ unsigned long capacity, min_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
}
capacity = 0;
+ min_capacity = ULONG_MAX;
if (child->flags & SD_OVERLAP) {
/*
*/
if (unlikely(!rq->sd)) {
capacity += capacity_of(cpu);
- continue;
+ } else {
+ sgc = rq->sd->groups->sgc;
+ capacity += sgc->capacity;
}
- sgc = rq->sd->groups->sgc;
- capacity += sgc->capacity;
+ min_capacity = min(capacity, min_capacity);
}
} else {
/*
* !SD_OVERLAP domains can assume that child groups
* span the current group.
- */
+ */
group = child->groups;
do {
- capacity += group->sgc->capacity;
+ struct sched_group_capacity *sgc = group->sgc;
+
+ capacity += sgc->capacity;
+ min_capacity = min(sgc->min_capacity, min_capacity);
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity;
+ sdg->sgc->min_capacity = min_capacity;
}
/*
* cpumask covering 1 cpu of the first group and 3 cpus of the second group.
* Something like:
*
- * { 0 1 2 3 } { 4 5 6 7 }
- * * * * *
+ * { 0 1 2 3 } { 4 5 6 7 }
+ * * * * *
*
* If we were to balance group-wise we'd place two tasks in the first group and
* two tasks in the second group. Clearly this is undesired as it will overload
return false;
}
+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-CPU capacity than sched_group ref.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+ return sg->sgc->min_capacity * capacity_margin <
+ ref->sgc->min_capacity * 1024;
+}
+
static inline enum
group_type group_classify(struct sched_group *group,
struct sg_lb_stats *sgs)
if (sgs->avg_load <= busiest->avg_load)
return false;
+ if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+ goto asym_packing;
+
+ /*
+ * Candidate sg has no more than one task per CPU and
+ * has higher per-CPU capacity. Migrating tasks to less
+ * capable CPUs may harm throughput. Maximize throughput,
+ * power/energy consequences are not considered.
+ */
+ if (sgs->sum_nr_running <= sgs->group_weight &&
+ group_smaller_cpu_capacity(sds->local, sg))
+ return false;
+
+asym_packing:
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
if (env->idle == CPU_NOT_IDLE)
return true;
/*
- * ASYM_PACKING needs to move all the work to the lowest
- * numbered CPUs in the group, therefore mark all groups
- * higher than ourself as busy.
+ * ASYM_PACKING needs to move all the work to the highest
+ * prority CPUs in the group, therefore mark all groups
+ * of lower priority than ourself as busy.
*/
- if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
+ if (sgs->sum_nr_running &&
+ sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
if (!sds->busiest)
return true;
- /* Prefer to move from highest possible cpu's work */
- if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
+ /* Prefer to move from lowest priority cpu's work */
+ if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
+ sg->asym_prefer_cpu))
return true;
}
if (!sds->busiest)
return 0;
- busiest_cpu = group_first_cpu(sds->busiest);
- if (env->dst_cpu > busiest_cpu)
+ busiest_cpu = sds->busiest->asym_prefer_cpu;
+ if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
return 0;
env->imbalance = DIV_ROUND_CLOSEST(
load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
if (load_above_capacity > busiest->group_capacity) {
load_above_capacity -= busiest->group_capacity;
- load_above_capacity *= NICE_0_LOAD;
+ load_above_capacity *= scale_load_down(NICE_0_LOAD);
load_above_capacity /= busiest->group_capacity;
} else
load_above_capacity = ~0UL;
*/
#define MAX_PINNED_INTERVAL 512
-/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-
static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
/*
* ASYM_PACKING needs to force migrate tasks from busy but
- * higher numbered CPUs in order to pack all tasks in the
- * lowest numbered CPUs.
+ * lower priority CPUs in order to pack all tasks in the
+ * highest priority CPUs.
*/
- if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
+ if ((sd->flags & SD_ASYM_PACKING) &&
+ sched_asym_prefer(env->dst_cpu, env->src_cpu))
return 1;
}
cpumask_copy(cpus, cpu_active_mask);
- schedstat_inc(sd, lb_count[idle]);
+ schedstat_inc(sd->lb_count[idle]);
redo:
if (!should_we_balance(&env)) {
group = find_busiest_group(&env);
if (!group) {
- schedstat_inc(sd, lb_nobusyg[idle]);
+ schedstat_inc(sd->lb_nobusyg[idle]);
goto out_balanced;
}
busiest = find_busiest_queue(&env, group);
if (!busiest) {
- schedstat_inc(sd, lb_nobusyq[idle]);
+ schedstat_inc(sd->lb_nobusyq[idle]);
goto out_balanced;
}
BUG_ON(busiest == env.dst_rq);
- schedstat_add(sd, lb_imbalance[idle], env.imbalance);
+ schedstat_add(sd->lb_imbalance[idle], env.imbalance);
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
}
if (!ld_moved) {
- schedstat_inc(sd, lb_failed[idle]);
+ schedstat_inc(sd->lb_failed[idle]);
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* we can't migrate them. Let the imbalance flag set so parent level
* can try to migrate them.
*/
- schedstat_inc(sd, lb_balanced[idle]);
+ schedstat_inc(sd->lb_balanced[idle]);
sd->nr_balance_failed = 0;
}
static inline void
-update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
{
unsigned long interval, next;
- interval = get_sd_balance_interval(sd, cpu_busy);
+ /* used by idle balance, so cpu_busy = 0 */
+ interval = get_sd_balance_interval(sd, 0);
next = sd->last_balance + interval;
if (time_after(*next_balance, next))
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
- update_next_balance(sd, 0, &next_balance);
+ update_next_balance(sd, &next_balance);
rcu_read_unlock();
goto out;
continue;
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
- update_next_balance(sd, 0, &next_balance);
+ update_next_balance(sd, &next_balance);
break;
}
curr_cost += domain_cost;
}
- update_next_balance(sd, 0, &next_balance);
+ update_next_balance(sd, &next_balance);
/*
* Stop searching for tasks to pull if there are
.idle = CPU_IDLE,
};
- schedstat_inc(sd, alb_count);
+ schedstat_inc(sd->alb_count);
p = detach_one_task(&env);
if (p) {
- schedstat_inc(sd, alb_pushed);
+ schedstat_inc(sd->alb_pushed);
/* Active balancing done, reset the failure counter. */
sd->nr_balance_failed = 0;
} else {
- schedstat_inc(sd, alb_failed);
+ schedstat_inc(sd->alb_failed);
}
}
rcu_read_unlock();
int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
- atomic_inc(&sd->groups->sgc->nr_busy_cpus);
+ atomic_inc(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
- atomic_dec(&sd->groups->sgc->nr_busy_cpus);
+ atomic_dec(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
static inline bool nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
+ struct sched_domain_shared *sds;
struct sched_domain *sd;
- struct sched_group_capacity *sgc;
- int nr_busy, cpu = rq->cpu;
+ int nr_busy, i, cpu = rq->cpu;
bool kick = false;
if (unlikely(rq->idle_balance))
return true;
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
- if (sd) {
- sgc = sd->groups->sgc;
- nr_busy = atomic_read(&sgc->nr_busy_cpus);
-
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds) {
+ /*
+ * XXX: write a coherent comment on why we do this.
+ * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
+ */
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
kick = true;
goto unlock;
}
sd = rcu_dereference(per_cpu(sd_asym, cpu));
- if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu)) {
- kick = true;
- goto unlock;
- }
+ if (sd) {
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (i == cpu ||
+ !cpumask_test_cpu(i, nohz.idle_cpus_mask))
+ continue;
+ if (sched_asym_prefer(i, cpu)) {
+ kick = true;
+ goto unlock;
+ }
+ }
+ }
unlock:
rcu_read_unlock();
return kick;
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
*/
-static void run_rebalance_domains(struct softirq_action *h)
+static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
{
struct rq *this_rq = this_rq();
enum cpu_idle_type idle = this_rq->idle_balance ?
return false;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Propagate the changes of the sched_entity across the tg tree to make it
+ * visible to the root
+ */
+static void propagate_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ /* Start to propagate at parent */
+ se = se->parent;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
+ update_load_avg(se, UPDATE_TG);
+ }
+}
+#else
+static void propagate_entity_cfs_rq(struct sched_entity *se) { }
+#endif
+
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /* Catch up with the cfs_rq and remove our load when we leave */
+ update_load_avg(se, 0);
+ detach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
+static void attach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Since the real-depth could have been changed (only FAIR
+ * class maintain depth value), reset depth properly.
+ */
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+#endif
+
+ /* Synchronize entity with its cfs_rq */
+ update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+ attach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
- int tg_update;
if (!vruntime_normalized(p)) {
/*
se->vruntime -= cfs_rq->min_vruntime;
}
- /* Catch up with the cfs_rq and remove our load when we leave */
- tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
- detach_entity_load_avg(cfs_rq, se);
- if (tg_update)
- update_tg_load_avg(cfs_rq, false);
+ detach_entity_cfs_rq(se);
}
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
- int tg_update;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /*
- * Since the real-depth could have been changed (only FAIR
- * class maintain depth value), reset depth properly.
- */
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
- /* Synchronize task with its cfs_rq */
- tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
- attach_entity_load_avg(cfs_rq, se);
- if (tg_update)
- update_tg_load_avg(cfs_rq, false);
+ attach_entity_cfs_rq(se);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->propagate_avg = 0;
+#endif
atomic_long_set(&cfs_rq->removed_load_avg, 0);
atomic_long_set(&cfs_rq->removed_util_avg, 0);
#endif
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
- struct rq *rq;
int i;
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(i) {
- rq = cpu_rq(i);
-
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
se = tg->se[i];
raw_spin_lock_irq(&rq->lock);
- post_init_entity_util_avg(se);
+ attach_entity_cfs_rq(se);
sync_throttle(tg, i);
raw_spin_unlock_irq(&rq->lock);
}