#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ
+ u64 nohz_stamp;
unsigned char in_nohz_recently;
#endif
+ unsigned int skip_clock_update;
+
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+
+ /*
+ * A queue event has occurred, and we're going to schedule. In
+ * this case, we can save a useless back to back clock update.
+ */
+ if (test_tsk_need_resched(p))
+ rq->skip_clock_update = 1;
}
static inline int cpu_of(struct rq *rq)
inline void update_rq_clock(struct rq *rq)
{
- rq->clock = sched_clock_cpu(cpu_of(rq));
+ if (!rq->skip_clock_update)
+ rq->clock = sched_clock_cpu(cpu_of(rq));
}
/*
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
- * Check whether the task is waking, we use this to synchronize against
- * ttwu() so that task_cpu() reports a stable number.
- *
- * We need to make an exception for PF_STARTING tasks because the fork
- * path might require task_rq_lock() to work, eg. it can call
- * set_cpus_allowed_ptr() from the cpuset clone_ns code.
+ * Check whether the task is waking, we use this to synchronize ->cpus_allowed
+ * against ttwu().
*/
static inline int task_is_waking(struct task_struct *p)
{
- return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
+ return unlikely(p->state == TASK_WAKING);
}
/*
struct rq *rq;
for (;;) {
- while (task_is_waking(p))
- cpu_relax();
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_is_waking(p)))
+ if (likely(rq == task_rq(p)))
return rq;
raw_spin_unlock(&rq->lock);
}
struct rq *rq;
for (;;) {
- while (task_is_waking(p))
- cpu_relax();
local_irq_save(*flags);
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_is_waking(p)))
+ if (likely(rq == task_rq(p)))
return rq;
raw_spin_unlock_irqrestore(&rq->lock, *flags);
}
if (!tsk_is_polling(rq->idle))
smp_send_reschedule(cpu);
}
+
+int nohz_ratelimit(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ u64 diff = rq->clock - rq->nohz_stamp;
+
+ rq->nohz_stamp = rq->clock;
+
+ return diff < (NSEC_PER_SEC / HZ) >> 1;
+}
+
#endif /* CONFIG_NO_HZ */
static u64 sched_avg_period(void)
#ifdef CONFIG_FAIR_GROUP_SCHED
-static __read_mostly unsigned long *update_shares_data;
+static __read_mostly unsigned long __percpu *update_shares_data;
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
}
}
- update_rq_clock(rq1);
- update_rq_clock(rq2);
}
/*
static void
enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
{
- if (wakeup)
- p->se.start_runtime = p->se.sum_exec_runtime;
-
+ update_rq_clock(rq);
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, wakeup, head);
p->se.on_rq = 1;
static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
{
- if (sleep) {
- if (p->se.last_wakeup) {
- update_avg(&p->se.avg_overlap,
- p->se.sum_exec_runtime - p->se.last_wakeup);
- p->se.last_wakeup = 0;
- } else {
- update_avg(&p->se.avg_wakeup,
- sysctl_sched_wakeup_granularity);
- }
- }
-
+ update_rq_clock(rq);
sched_info_dequeued(p);
p->sched_class->dequeue_task(rq, p, sleep);
p->se.on_rq = 0;
}
#ifdef CONFIG_SMP
+/*
+ * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ */
static int select_fallback_rq(int cpu, struct task_struct *p)
{
int dest_cpu;
return dest_cpu;
/* No more Mr. Nice Guy. */
- if (dest_cpu >= nr_cpu_ids) {
- rcu_read_lock();
- cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
- rcu_read_unlock();
- dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
-
+ if (unlikely(dest_cpu >= nr_cpu_ids)) {
+ dest_cpu = cpuset_cpus_allowed_fallback(p);
/*
* Don't tell them about moving exiting tasks or
* kernel threads (both mm NULL), since they never
}
/*
- * Gets called from 3 sites (exec, fork, wakeup), since it is called without
- * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
- * by:
- *
- * exec: is unstable, retry loop
- * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
+ * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
{
- int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+ int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
unsigned long flags;
struct rq *rq;
- if (!sched_feat(SYNC_WAKEUPS))
- wake_flags &= ~WF_SYNC;
-
this_cpu = get_cpu();
smp_wmb();
rq = task_rq_lock(p, &flags);
- update_rq_clock(rq);
if (!(p->state & state))
goto out;
*
* First fix up the nr_uninterruptible count:
*/
- if (task_contributes_to_load(p))
- rq->nr_uninterruptible--;
+ if (task_contributes_to_load(p)) {
+ if (likely(cpu_online(orig_cpu)))
+ rq->nr_uninterruptible--;
+ else
+ this_rq()->nr_uninterruptible--;
+ }
p->state = TASK_WAKING;
if (p->sched_class->task_waking)
p->sched_class->task_waking(rq, p);
- __task_rq_unlock(rq);
-
- cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
- if (cpu != orig_cpu) {
- /*
- * Since we migrate the task without holding any rq->lock,
- * we need to be careful with task_rq_lock(), since that
- * might end up locking an invalid rq.
- */
+ cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
+ if (cpu != orig_cpu)
set_task_cpu(p, cpu);
- }
+ __task_rq_unlock(rq);
rq = cpu_rq(cpu);
raw_spin_lock(&rq->lock);
- update_rq_clock(rq);
/*
* We migrated the task without holding either rq->lock, however
activate_task(rq, p, 1);
success = 1;
- /*
- * Only attribute actual wakeups done by this task.
- */
- if (!in_interrupt()) {
- struct sched_entity *se = ¤t->se;
- u64 sample = se->sum_exec_runtime;
-
- if (se->last_wakeup)
- sample -= se->last_wakeup;
- else
- sample -= se->start_runtime;
- update_avg(&se->avg_wakeup, sample);
-
- se->last_wakeup = se->sum_exec_runtime;
- }
-
out_running:
trace_sched_wakeup(rq, p, success);
check_preempt_curr(rq, p, wake_flags);
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
- p->se.last_wakeup = 0;
- p->se.avg_overlap = 0;
- p->se.start_runtime = 0;
- p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
__sched_fork(p);
/*
- * We mark the process as waking here. This guarantees that
+ * We mark the process as running here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_WAKING;
+ p->state = TASK_RUNNING;
/*
* Revert to default priority/policy on fork if requested.
{
unsigned long flags;
struct rq *rq;
- int cpu = get_cpu();
+ int cpu __maybe_unused = get_cpu();
#ifdef CONFIG_SMP
+ rq = task_rq_lock(p, &flags);
+ p->state = TASK_WAKING;
+
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
*
- * We still have TASK_WAKING but PF_STARTING is gone now, meaning
- * ->cpus_allowed is stable, we have preemption disabled, meaning
- * cpu_online_mask is stable.
+ * We set TASK_WAKING so that select_task_rq() can drop rq->lock
+ * without people poking at ->cpus_allowed.
*/
- cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
+ cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
set_task_cpu(p, cpu);
-#endif
- /*
- * Since the task is not on the rq and we still have TASK_WAKING set
- * nobody else will migrate this task.
- */
- rq = cpu_rq(cpu);
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- BUG_ON(p->state != TASK_WAKING);
p->state = TASK_RUNNING;
- update_rq_clock(rq);
+ task_rq_unlock(rq, &flags);
+#endif
+
+ rq = task_rq_lock(p, &flags);
activate_task(rq, p, 0);
trace_sched_wakeup_new(rq, p, 1);
check_preempt_curr(rq, p, WF_FORK);
{
struct task_struct *p = current;
struct migration_req req;
- int dest_cpu, this_cpu;
unsigned long flags;
struct rq *rq;
-
-again:
- this_cpu = get_cpu();
- dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
- if (dest_cpu == this_cpu) {
- put_cpu();
- return;
- }
+ int dest_cpu;
rq = task_rq_lock(p, &flags);
- put_cpu();
+ dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+ if (dest_cpu == smp_processor_id())
+ goto unlock;
/*
* select_task_rq() can race against ->cpus_allowed
*/
- if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
- || unlikely(!cpu_active(dest_cpu))) {
- task_rq_unlock(rq, &flags);
- goto again;
- }
-
- /* force the process onto the specified CPU */
- if (migrate_task(p, dest_cpu, &req)) {
+ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
+ likely(cpu_active(dest_cpu)) &&
+ migrate_task(p, dest_cpu, &req)) {
/* Need to wait for migration thread (might exit: take ref). */
struct task_struct *mt = rq->migration_thread;
return;
}
+unlock:
task_rq_unlock(rq, &flags);
}
static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- if (prev->state == TASK_RUNNING) {
- u64 runtime = prev->se.sum_exec_runtime;
-
- runtime -= prev->se.prev_sum_exec_runtime;
- runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
-
- /*
- * In order to avoid avg_overlap growing stale when we are
- * indeed overlapping and hence not getting put to sleep, grow
- * the avg_overlap on preemption.
- *
- * We use the average preemption runtime because that
- * correlates to the amount of cache footprint a task can
- * build up.
- */
- update_avg(&prev->se.avg_overlap, runtime);
- }
+ if (prev->se.on_rq)
+ update_rq_clock(rq);
+ rq->skip_clock_update = 0;
prev->sched_class->put_prev_task(rq, prev);
}
hrtick_clear(rq);
raw_spin_lock_irq(&rq->lock);
- update_rq_clock(rq);
clear_tsk_need_resched(prev);
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
BUG_ON(prio < 0 || prio > MAX_PRIO);
rq = task_rq_lock(p, &flags);
- update_rq_clock(rq);
oldprio = p->prio;
prev_class = p->sched_class;
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &flags);
- update_rq_clock(rq);
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
/* convert nice value [19,-20] to rlimit style value [1,40] */
int nice_rlim = 20 - nice;
- return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
capable(CAP_SYS_NICE));
}
if (!lock_task_sighand(p, &flags))
return -ESRCH;
- rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
unlock_task_sighand(p, &flags);
/* can't set/change the rt policy */
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
goto recheck;
}
- update_rq_clock(rq);
on_rq = p->se.on_rq;
running = task_current(rq, p);
if (on_rq)
int ret;
cpumask_var_t mask;
- if (len < cpumask_size())
+ if (len < nr_cpu_ids)
+ return -EINVAL;
+ if (len & (sizeof(unsigned long)-1))
return -EINVAL;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
- if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+ size_t retlen = min_t(size_t, len, cpumask_size());
+
+ if (copy_to_user(user_mask_ptr, mask, retlen))
ret = -EFAULT;
else
- ret = cpumask_size();
+ ret = retlen;
}
free_cpumask_var(mask);
struct rq *rq;
int ret = 0;
+ /*
+ * Serialize against TASK_WAKING so that ttwu() and wunt() can
+ * drop the rq->lock and still rely on ->cpus_allowed.
+ */
+again:
+ while (task_is_waking(p))
+ cpu_relax();
rq = task_rq_lock(p, &flags);
+ if (task_is_waking(p)) {
+ task_rq_unlock(rq, &flags);
+ goto again;
+ }
if (!cpumask_intersects(new_mask, cpu_active_mask)) {
ret = -EINVAL;
}
#ifdef CONFIG_HOTPLUG_CPU
-
-static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
-{
- int ret;
-
- local_irq_disable();
- ret = __migrate_task(p, src_cpu, dest_cpu);
- local_irq_enable();
- return ret;
-}
-
/*
* Figure out where task on dead CPU should go, use force if necessary.
*/
-static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
- int dest_cpu;
+ struct rq *rq = cpu_rq(dead_cpu);
+ int needs_cpu, uninitialized_var(dest_cpu);
+ unsigned long flags;
-again:
- dest_cpu = select_fallback_rq(dead_cpu, p);
+ local_irq_save(flags);
- /* It can have affinity changed while we were choosing. */
- if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
- goto again;
+ raw_spin_lock(&rq->lock);
+ needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
+ if (needs_cpu)
+ dest_cpu = select_fallback_rq(dead_cpu, p);
+ raw_spin_unlock(&rq->lock);
+ /*
+ * It can only fail if we race with set_cpus_allowed(),
+ * in the racer should migrate the task anyway.
+ */
+ if (needs_cpu)
+ __migrate_task(p, dead_cpu, dest_cpu);
+ local_irq_restore(flags);
}
/*
__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
- update_rq_clock(rq);
activate_task(rq, p, 0);
raw_spin_unlock_irqrestore(&rq->lock, flags);
for ( ; ; ) {
if (!rq->nr_running)
break;
- update_rq_clock(rq);
next = pick_next_task(rq);
if (!next)
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
migrate_live_tasks(cpu);
rq = cpu_rq(cpu);
kthread_stop(rq->migration_thread);
rq->migration_thread = NULL;
/* Idle task back to normal (off runqueue, low prio) */
raw_spin_lock_irq(&rq->lock);
- update_rq_clock(rq);
deactivate_task(rq, rq->idle, 0);
__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
rq->idle->sched_class = &idle_sched_class;
migrate_dead_tasks(cpu);
raw_spin_unlock_irq(&rq->lock);
- cpuset_unlock();
migrate_nr_uninterruptible(rq);
BUG_ON(rq->nr_running != 0);
calc_global_load_remove(rq);
#ifdef CONFIG_SCHED_MC
static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
+ struct sysdev_class_attribute *attr,
char *page)
{
return sprintf(page, "%u\n", sched_mc_power_savings);
}
static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
+ struct sysdev_class_attribute *attr,
const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 0);
#ifdef CONFIG_SCHED_SMT
static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
+ struct sysdev_class_attribute *attr,
char *page)
{
return sprintf(page, "%u\n", sched_smt_power_savings);
}
static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
+ struct sysdev_class_attribute *attr,
const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 1);
{
int on_rq;
- update_rq_clock(rq);
on_rq = p->se.on_rq;
if (on_rq)
deactivate_task(rq, p, 0);
rq = task_rq_lock(tsk, &flags);
- update_rq_clock(rq);
-
running = task_current(rq, tsk);
on_rq = tsk->se.on_rq;
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
- u64 *cpuusage;
+ u64 __percpu *cpuusage;
struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
struct cpuacct *parent;
};