Merge branch 'sched/urgent' into sched/core

author Ingo Molnar <mingo@elte.hu>

Thu, 5 Aug 2010 07:46:29 +0000 (09:46 +0200)

committer Ingo Molnar <mingo@elte.hu>

Thu, 5 Aug 2010 07:46:29 +0000 (09:46 +0200)
author Ingo Molnar <mingo@elte.hu>
Thu, 5 Aug 2010 07:46:29 +0000 (09:46 +0200)
committer Ingo Molnar <mingo@elte.hu>
Thu, 5 Aug 2010 07:46:29 +0000 (09:46 +0200)
diff --combined include/linux/sched.h

index 9a7bc5ba7e7e501a6850e2cef17afe3a11ddb2c3,6e0bb86de9905b63dd7139bd438f292e3d5ef60c..2091ea2a2c5cbb0b0ad2dca9a534aeac682abf11
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -271,16 -271,13 +271,10 @@@ extern int runqueue_is_locked(int cpu)
   
   extern cpumask_var_t nohz_cpu_mask;
   #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
- -extern int select_nohz_load_balancer(int cpu);
- -extern int get_nohz_load_balancer(void);
+ +extern void select_nohz_load_balancer(int stop_tick);
+ +extern int get_nohz_timer_target(void);
- extern int nohz_ratelimit(int cpu);
   #else
- -static inline int select_nohz_load_balancer(int cpu)
- -{
- -      return 0;
- -}
+ +static inline void select_nohz_load_balancer(int stop_tick) { }
- 
- static inline int nohz_ratelimit(int cpu)
- {
-       return 0;
- }
   #endif
   
   /*
@@@ -801,7 -798,7 +795,7 @@@ enum cpu_idle_type 
   #define SD_POWERSAVINGS_BALANCE       0x0100  /* Balance for power savings */
   #define SD_SHARE_PKG_RESOURCES        0x0200  /* Domain members share cpu pkg resources */
   #define SD_SERIALIZE          0x0400  /* Only a single load balancing instance */
- -
+ +#define SD_ASYM_PACKING               0x0800  /* Place busy groups earlier in the domain */
   #define SD_PREFER_SIBLING     0x1000  /* Prefer to place tasks in a sibling domain */
   
   enum powersavings_balance_level {
@@@ -836,8 -833,6 +830,8 @@@ static inline int sd_balance_for_packag
         return SD_PREFER_SIBLING;
   }
   
+ +extern int __weak arch_sd_sibiling_asym_packing(void);
+ +
   /*
    * Optimise SD flags for power savings:
    * SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings.
@@@ -859,7 -854,7 +853,7 @@@ struct sched_group 
          * CPU power of this group, SCHED_LOAD_SCALE being max power for a
          * single CPU.
          */
- -      unsigned int cpu_power;
+ +      unsigned int cpu_power, cpu_power_orig;
   
         /*
          * The CPUs this group covers.
@@@ -1695,7 -1690,6 +1689,7 @@@ extern void thread_group_times(struct t
   #define PF_EXITING    0x00000004      /* getting shut down */
   #define PF_EXITPIDONE 0x00000008      /* pi exit done on shut down */
   #define PF_VCPU               0x00000010      /* I'm a virtual CPU */
+ +#define PF_WQ_WORKER  0x00000020      /* I'm a workqueue worker */
   #define PF_FORKNOEXEC 0x00000040      /* forked but didn't exec */
   #define PF_MCE_PROCESS  0x00000080      /* process policy on mce errors */
   #define PF_SUPERPRIV  0x00000100      /* used super-user privileges */
@@@ -1790,23 -1784,20 +1784,23 @@@ static inline int set_cpus_allowed(stru
   #endif
   
   /*
- - * Architectures can set this to 1 if they have specified
- - * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
- - * but then during bootup it turns out that sched_clock()
- - * is reliable after all:
+ + * Do not use outside of architecture code which knows its limitations.
+ + *
+ + * sched_clock() has no promise of monotonicity or bounded drift between
+ + * CPUs, use (which you should not) requires disabling IRQs.
+ + *
+ + * Please use one of the three interfaces below.
    */
- -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
- -extern int sched_clock_stable;
- -#endif
- -
- -/* ftrace calls sched_clock() directly */
   extern unsigned long long notrace sched_clock(void);
+ +/*
+ + * See the comment in kernel/sched_clock.c
+ + */
+ +extern u64 cpu_clock(int cpu);
+ +extern u64 local_clock(void);
+ +extern u64 sched_clock_cpu(int cpu);
+ +
   
   extern void sched_clock_init(void);
- -extern u64 sched_clock_cpu(int cpu);
   
   #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
   static inline void sched_clock_tick(void)
@@@ -1821,19 -1812,17 +1815,19 @@@ static inline void sched_clock_idle_wak
   {
   }
   #else
+ +/*
+ + * Architectures can set this to 1 if they have specified
+ + * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
+ + * but then during bootup it turns out that sched_clock()
+ + * is reliable after all:
+ + */
+ +extern int sched_clock_stable;
+ +
   extern void sched_clock_tick(void);
   extern void sched_clock_idle_sleep_event(void);
   extern void sched_clock_idle_wakeup_event(u64 delta_ns);
   #endif
   
- -/*
- - * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
- - * clock constructed from sched_clock():
- - */
- -extern unsigned long long cpu_clock(int cpu);
- -
   extern unsigned long long
   task_sched_runtime(struct task_struct *task);
   extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
diff --combined kernel/sched.c

index 16f3f77f71beccb828d47e3870a3650a9db255c5,63b4a14682faa3193830ec4dfc0ba26c9bada4d0..f6c9bb6ac70b382a29a907833f13bca9bc2bb61e
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -77,7 -77,6 +77,7 @@@
   #include <asm/irq_regs.h>
   
   #include "sched_cpupri.h"
+ +#include "workqueue_sched.h"
   
   #define CREATE_TRACE_POINTS
   #include <trace/events/sched.h>
@@@ -457,10 -456,9 +457,10 @@@ struct rq 
         unsigned long nr_running;
         #define CPU_LOAD_IDX_MAX 5
         unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+ +      unsigned long last_load_update_tick;
   #ifdef CONFIG_NO_HZ
         u64 nohz_stamp;
- -      unsigned char in_nohz_recently;
+ +      unsigned char nohz_balance_kick;
   #endif
         unsigned int skip_clock_update;
   
@@@ -1194,27 -1192,6 +1194,27 @@@ static void resched_cpu(int cpu
   }
   
   #ifdef CONFIG_NO_HZ
+ +/*
+ + * In the semi idle case, use the nearest busy cpu for migrating timers
+ + * from an idle cpu.  This is good for power-savings.
+ + *
+ + * We don't do similar optimization for completely idle system, as
+ + * selecting an idle cpu will add more delays to the timers than intended
+ + * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ + */
+ +int get_nohz_timer_target(void)
+ +{
+ +      int cpu = smp_processor_id();
+ +      int i;
+ +      struct sched_domain *sd;
+ +
+ +      for_each_domain(cpu, sd) {
+ +              for_each_cpu(i, sched_domain_span(sd))
+ +                      if (!idle_cpu(i))
+ +                              return i;
+ +      }
+ +      return cpu;
+ +}
   /*
    * When add_timer_on() enqueues a timer into the timer wheel of an
    * idle CPU then this timer might expire before the next timer event
@@@ -1255,16 -1232,6 +1255,6 @@@ void wake_up_idle_cpu(int cpu
                 smp_send_reschedule(cpu);
   }
   
- int nohz_ratelimit(int cpu)
- {
-       struct rq *rq = cpu_rq(cpu);
-       u64 diff = rq->clock - rq->nohz_stamp;
- 
-       rq->nohz_stamp = rq->clock;
- 
-       return diff < (NSEC_PER_SEC / HZ) >> 1;
- }
- 
   #endif /* CONFIG_NO_HZ */
   
   static u64 sched_avg_period(void)
@@@ -1675,7 -1642,7 +1665,7 @@@ static void update_shares(struct sched_
         if (root_task_group_empty())
                 return;
   
- -      now = cpu_clock(raw_smp_processor_id());
+ +      now = local_clock();
         elapsed = now - sd->last_update;
   
         if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@@ -1828,7 -1795,6 +1818,7 @@@ static void cfs_rq_set_shares(struct cf
   static void calc_load_account_idle(struct rq *this_rq);
   static void update_sysctl(void);
   static int get_update_sysctl_factor(void);
+ +static void update_cpu_load(struct rq *this_rq);
   
   static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
   {
@@@ -2291,55 -2257,11 +2281,55 @@@ static void update_avg(u64 *avg, u64 sa
   }
   #endif
   
- -/***
+ +static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
+ +                               bool is_sync, bool is_migrate, bool is_local,
+ +                               unsigned long en_flags)
+ +{
+ +      schedstat_inc(p, se.statistics.nr_wakeups);
+ +      if (is_sync)
+ +              schedstat_inc(p, se.statistics.nr_wakeups_sync);
+ +      if (is_migrate)
+ +              schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+ +      if (is_local)
+ +              schedstat_inc(p, se.statistics.nr_wakeups_local);
+ +      else
+ +              schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ +
+ +      activate_task(rq, p, en_flags);
+ +}
+ +
+ +static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
+ +                                      int wake_flags, bool success)
+ +{
+ +      trace_sched_wakeup(p, success);
+ +      check_preempt_curr(rq, p, wake_flags);
+ +
+ +      p->state = TASK_RUNNING;
+ +#ifdef CONFIG_SMP
+ +      if (p->sched_class->task_woken)
+ +              p->sched_class->task_woken(rq, p);
+ +
+ +      if (unlikely(rq->idle_stamp)) {
+ +              u64 delta = rq->clock - rq->idle_stamp;
+ +              u64 max = 2*sysctl_sched_migration_cost;
+ +
+ +              if (delta > max)
+ +                      rq->avg_idle = max;
+ +              else
+ +                      update_avg(&rq->avg_idle, delta);
+ +              rq->idle_stamp = 0;
+ +      }
+ +#endif
+ +      /* if a worker is waking up, notify workqueue */
+ +      if ((p->flags & PF_WQ_WORKER) && success)
+ +              wq_worker_waking_up(p, cpu_of(rq));
+ +}
+ +
+ +/**
    * try_to_wake_up - wake up a thread
- - * @p: the to-be-woken-up thread
+ + * @p: the thread to be awakened
    * @state: the mask of task states that can be woken
- - * @sync: do a synchronous wakeup?
+ + * @wake_flags: wake modifier flags (WF_*)
    *
    * Put it on the run-queue if it's not already there. The "current"
    * thread is always on the run-queue (except when the actual
@@@ -2347,8 -2269,7 +2337,8 @@@
    * the simpler "current->state = TASK_RUNNING" to mark yourself
    * runnable without the overhead of this.
    *
- - * returns failure only if the task is already active.
+ + * Returns %true if @p was woken up, %false if it was already running
+ + * or @state didn't match @p's state.
    */
   static int try_to_wake_up(struct task_struct *p, unsigned int state,
                           int wake_flags)
@@@ -2428,11 -2349,38 +2418,11 @@@
   
   out_activate:
   #endif /* CONFIG_SMP */
- -      schedstat_inc(p, se.statistics.nr_wakeups);
- -      if (wake_flags & WF_SYNC)
- -              schedstat_inc(p, se.statistics.nr_wakeups_sync);
- -      if (orig_cpu != cpu)
- -              schedstat_inc(p, se.statistics.nr_wakeups_migrate);
- -      if (cpu == this_cpu)
- -              schedstat_inc(p, se.statistics.nr_wakeups_local);
- -      else
- -              schedstat_inc(p, se.statistics.nr_wakeups_remote);
- -      activate_task(rq, p, en_flags);
+ +      ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
+ +                    cpu == this_cpu, en_flags);
         success = 1;
- -
   out_running:
- -      trace_sched_wakeup(p, success);
- -      check_preempt_curr(rq, p, wake_flags);
- -
- -      p->state = TASK_RUNNING;
- -#ifdef CONFIG_SMP
- -      if (p->sched_class->task_woken)
- -              p->sched_class->task_woken(rq, p);
- -
- -      if (unlikely(rq->idle_stamp)) {
- -              u64 delta = rq->clock - rq->idle_stamp;
- -              u64 max = 2*sysctl_sched_migration_cost;
- -
- -              if (delta > max)
- -                      rq->avg_idle = max;
- -              else
- -                      update_avg(&rq->avg_idle, delta);
- -              rq->idle_stamp = 0;
- -      }
- -#endif
+ +      ttwu_post_activation(p, rq, wake_flags, success);
   out:
         task_rq_unlock(rq, &flags);
         put_cpu();
@@@ -2440,37 -2388,6 +2430,37 @@@
         return success;
   }
   
+ +/**
+ + * try_to_wake_up_local - try to wake up a local task with rq lock held
+ + * @p: the thread to be awakened
+ + *
+ + * Put @p on the run-queue if it's not already there.  The caller must
+ + * ensure that this_rq() is locked, @p is bound to this_rq() and not
+ + * the current task.  this_rq() stays locked over invocation.
+ + */
+ +static void try_to_wake_up_local(struct task_struct *p)
+ +{
+ +      struct rq *rq = task_rq(p);
+ +      bool success = false;
+ +
+ +      BUG_ON(rq != this_rq());
+ +      BUG_ON(p == current);
+ +      lockdep_assert_held(&rq->lock);
+ +
+ +      if (!(p->state & TASK_NORMAL))
+ +              return;
+ +
+ +      if (!p->se.on_rq) {
+ +              if (likely(!task_running(rq, p))) {
+ +                      schedstat_inc(rq, ttwu_count);
+ +                      schedstat_inc(rq, ttwu_local);
+ +              }
+ +              ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
+ +              success = true;
+ +      }
+ +      ttwu_post_activation(p, rq, 0, success);
+ +}
+ +
   /**
    * wake_up_process - Wake up a specific process
    * @p: The process to be woken up.
@@@ -3084,103 -3001,24 +3074,103 @@@ static void calc_load_account_active(st
         this_rq->calc_load_update += LOAD_FREQ;
   }
   
+ +/*
+ + * The exact cpuload at various idx values, calculated at every tick would be
+ + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ + *
+ + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ + * on nth tick when cpu may be busy, then we have:
+ + * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ + * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ + *
+ + * decay_load_missed() below does efficient calculation of
+ + * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ + *
+ + * The calculation is approximated on a 128 point scale.
+ + * degrade_zero_ticks is the number of ticks after which load at any
+ + * particular idx is approximated to be zero.
+ + * degrade_factor is a precomputed table, a row for each load idx.
+ + * Each column corresponds to degradation factor for a power of two ticks,
+ + * based on 128 point scale.
+ + * Example:
+ + * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ + *
+ + * With this power of 2 load factors, we can degrade the load n times
+ + * by looking at 1 bits in n and doing as many mult/shift instead of
+ + * n mult/shifts needed by the exact degradation.
+ + */
+ +#define DEGRADE_SHIFT         7
+ +static const unsigned char
+ +              degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+ +static const unsigned char
+ +              degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ +                                      {0, 0, 0, 0, 0, 0, 0, 0},
+ +                                      {64, 32, 8, 0, 0, 0, 0, 0},
+ +                                      {96, 72, 40, 12, 1, 0, 0},
+ +                                      {112, 98, 75, 43, 15, 1, 0},
+ +                                      {120, 112, 98, 76, 45, 16, 2} };
+ +
+ +/*
+ + * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ + * would be when CPU is idle and so we just decay the old load without
+ + * adding any new load.
+ + */
+ +static unsigned long
+ +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+ +{
+ +      int j = 0;
+ +
+ +      if (!missed_updates)
+ +              return load;
+ +
+ +      if (missed_updates >= degrade_zero_ticks[idx])
+ +              return 0;
+ +
+ +      if (idx == 1)
+ +              return load >> missed_updates;
+ +
+ +      while (missed_updates) {
+ +              if (missed_updates % 2)
+ +                      load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+ +
+ +              missed_updates >>= 1;
+ +              j++;
+ +      }
+ +      return load;
+ +}
+ +
   /*
    * Update rq->cpu_load[] statistics. This function is usually called every
- - * scheduler tick (TICK_NSEC).
+ + * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ + * every tick. We fix it up based on jiffies.
    */
   static void update_cpu_load(struct rq *this_rq)
   {
         unsigned long this_load = this_rq->load.weight;
+ +      unsigned long curr_jiffies = jiffies;
+ +      unsigned long pending_updates;
         int i, scale;
   
         this_rq->nr_load_updates++;
   
+ +      /* Avoid repeated calls on same jiffy, when moving in and out of idle */
+ +      if (curr_jiffies == this_rq->last_load_update_tick)
+ +              return;
+ +
+ +      pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ +      this_rq->last_load_update_tick = curr_jiffies;
+ +
         /* Update our load: */
- -      for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ +      this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+ +      for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
                 unsigned long old_load, new_load;
   
                 /* scale is effectively 1 << i now, and >> i divides by scale */
   
                 old_load = this_rq->cpu_load[i];
+ +              old_load = decay_load_missed(old_load, pending_updates - 1, i);
                 new_load = this_load;
                 /*
                  * Round up the averaging division if load is increasing. This
@@@ -3188,15 -3026,9 +3178,15 @@@
                  * example.
                  */
                 if (new_load > old_load)
- -                      new_load += scale-1;
- -              this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+ +                      new_load += scale - 1;
+ +
+ +              this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
         }
+ +}
+ +
+ +static void update_cpu_load_active(struct rq *this_rq)
+ +{
+ +      update_cpu_load(this_rq);
   
         calc_load_account_active(this_rq);
   }
@@@ -3584,7 -3416,7 +3574,7 @@@ void scheduler_tick(void
   
         raw_spin_lock(&rq->lock);
         update_rq_clock(rq);
- -      update_cpu_load(rq);
+ +      update_cpu_load_active(rq);
         curr->sched_class->task_tick(rq, curr, 0);
         raw_spin_unlock(&rq->lock);
   
@@@ -3756,6 -3588,7 +3746,6 @@@ need_resched
         rq = cpu_rq(cpu);
         rcu_note_context_switch(cpu);
         prev = rq->curr;
- -      switch_count = &prev->nivcsw;
   
         release_kernel_lock(prev);
   need_resched_nonpreemptible:
@@@ -3768,26 -3601,11 +3758,26 @@@
         raw_spin_lock_irq(&rq->lock);
         clear_tsk_need_resched(prev);
   
+ +      switch_count = &prev->nivcsw;
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
- -              if (unlikely(signal_pending_state(prev->state, prev)))
+ +              if (unlikely(signal_pending_state(prev->state, prev))) {
                         prev->state = TASK_RUNNING;
- -              else
+ +              } else {
+ +                      /*
+ +                       * If a worker is going to sleep, notify and
+ +                       * ask workqueue whether it wants to wake up a
+ +                       * task to maintain concurrency.  If so, wake
+ +                       * up the task.
+ +                       */
+ +                      if (prev->flags & PF_WQ_WORKER) {
+ +                              struct task_struct *to_wakeup;
+ +
+ +                              to_wakeup = wq_worker_sleeping(prev, cpu);
+ +                              if (to_wakeup)
+ +                                      try_to_wake_up_local(to_wakeup);
+ +                      }
                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
+ +              }
                 switch_count = &prev->nvcsw;
         }
   
@@@ -3809,10 -3627,8 +3799,10 @@@
   
                 context_switch(rq, prev, next); /* unlocks the rq */
                 /*
- -               * the context switch might have flipped the stack from under
- -               * us, hence refresh the local variables.
+ +               * The context switch has flipped the stack from under us
+ +               * and restored the local variables which were saved when
+ +               * this task called schedule() in the past. prev == current
+ +               * is still correct, but it can be moved to another cpu/rq.
                  */
                 cpu = smp_processor_id();
                 rq = cpu_rq(cpu);
@@@ -3821,8 -3637,11 +3811,8 @@@
   
         post_schedule(rq);
   
- -      if (unlikely(reacquire_kernel_lock(current) < 0)) {
- -              prev = rq->curr;
- -              switch_count = &prev->nivcsw;
+ +      if (unlikely(reacquire_kernel_lock(prev)))
                 goto need_resched_nonpreemptible;
- -      }
   
         preempt_enable_no_resched();
         if (need_resched())
@@@ -4612,8 -4431,12 +4602,8 @@@ recheck
          */
         if (user && !capable(CAP_SYS_NICE)) {
                 if (rt_policy(policy)) {
- -                      unsigned long rlim_rtprio;
- -
- -                      if (!lock_task_sighand(p, &flags))
- -                              return -ESRCH;
- -                      rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
- -                      unlock_task_sighand(p, &flags);
+ +                      unsigned long rlim_rtprio =
+ +                                      task_rlimit(p, RLIMIT_RTPRIO);
   
                         /* can't set/change the rt policy */
                         if (policy != p->policy && !rlim_rtprio)
@@@ -5983,49 -5806,20 +5973,49 @@@ migration_call(struct notifier_block *n
    */
   static struct notifier_block __cpuinitdata migration_notifier = {
         .notifier_call = migration_call,
- -      .priority = 10
+ +      .priority = CPU_PRI_MIGRATION,
   };
   
+ +static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+ +                                    unsigned long action, void *hcpu)
+ +{
+ +      switch (action & ~CPU_TASKS_FROZEN) {
+ +      case CPU_ONLINE:
+ +      case CPU_DOWN_FAILED:
+ +              set_cpu_active((long)hcpu, true);
+ +              return NOTIFY_OK;
+ +      default:
+ +              return NOTIFY_DONE;
+ +      }
+ +}
+ +
+ +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+ +                                      unsigned long action, void *hcpu)
+ +{
+ +      switch (action & ~CPU_TASKS_FROZEN) {
+ +      case CPU_DOWN_PREPARE:
+ +              set_cpu_active((long)hcpu, false);
+ +              return NOTIFY_OK;
+ +      default:
+ +              return NOTIFY_DONE;
+ +      }
+ +}
+ +
   static int __init migration_init(void)
   {
         void *cpu = (void *)(long)smp_processor_id();
         int err;
   
- -      /* Start one for the boot CPU: */
+ +      /* Initialize migration for the boot CPU */
         err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
         BUG_ON(err == NOTIFY_BAD);
         migration_call(&migration_notifier, CPU_ONLINE, cpu);
         register_cpu_notifier(&migration_notifier);
   
+ +      /* Register cpu active notifiers */
+ +      cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+ +      cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+ +
         return 0;
   }
   early_initcall(migration_init);
@@@ -6260,18 -6054,23 +6250,18 @@@ static void rq_attach_root(struct rq *r
                 free_rootdomain(old_rd);
   }
   
- -static int init_rootdomain(struct root_domain *rd, bool bootmem)
+ +static int init_rootdomain(struct root_domain *rd)
   {
- -      gfp_t gfp = GFP_KERNEL;
- -
         memset(rd, 0, sizeof(*rd));
   
- -      if (bootmem)
- -              gfp = GFP_NOWAIT;
- -
- -      if (!alloc_cpumask_var(&rd->span, gfp))
+ +      if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
                 goto out;
- -      if (!alloc_cpumask_var(&rd->online, gfp))
+ +      if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
                 goto free_span;
- -      if (!alloc_cpumask_var(&rd->rto_mask, gfp))
+ +      if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
                 goto free_online;
   
- -      if (cpupri_init(&rd->cpupri, bootmem) != 0)
+ +      if (cpupri_init(&rd->cpupri) != 0)
                 goto free_rto_mask;
         return 0;
   
@@@ -6287,7 -6086,7 +6277,7 @@@ out
   
   static void init_defrootdomain(void)
   {
- -      init_rootdomain(&def_root_domain, true);
+ +      init_rootdomain(&def_root_domain);
   
         atomic_set(&def_root_domain.refcount, 1);
   }
@@@ -6300,7 -6099,7 +6290,7 @@@ static struct root_domain *alloc_rootdo
         if (!rd)
                 return NULL;
   
- -      if (init_rootdomain(rd, false) != 0) {
+ +      if (init_rootdomain(rd) != 0) {
                 kfree(rd);
                 return NULL;
         }
@@@ -7479,35 -7278,29 +7469,35 @@@ int __init sched_create_sysfs_power_sav
   }
   #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
   
- -#ifndef CONFIG_CPUSETS
   /*
- - * Add online and remove offline CPUs from the scheduler domains.
- - * When cpusets are enabled they take over this function.
+ + * Update cpusets according to cpu_active mask.  If cpusets are
+ + * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ + * around partition_sched_domains().
    */
- -static int update_sched_domains(struct notifier_block *nfb,
- -                              unsigned long action, void *hcpu)
+ +static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+ +                           void *hcpu)
   {
- -      switch (action) {
+ +      switch (action & ~CPU_TASKS_FROZEN) {
         case CPU_ONLINE:
- -      case CPU_ONLINE_FROZEN:
- -      case CPU_DOWN_PREPARE:
- -      case CPU_DOWN_PREPARE_FROZEN:
         case CPU_DOWN_FAILED:
- -      case CPU_DOWN_FAILED_FROZEN:
- -              partition_sched_domains(1, NULL, NULL);
+ +              cpuset_update_active_cpus();
                 return NOTIFY_OK;
+ +      default:
+ +              return NOTIFY_DONE;
+ +      }
+ +}
   
+ +static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
+ +                             void *hcpu)
+ +{
+ +      switch (action & ~CPU_TASKS_FROZEN) {
+ +      case CPU_DOWN_PREPARE:
+ +              cpuset_update_active_cpus();
+ +              return NOTIFY_OK;
         default:
                 return NOTIFY_DONE;
         }
   }
- -#endif
   
   static int update_runtime(struct notifier_block *nfb,
                                 unsigned long action, void *hcpu)
@@@ -7553,8 -7346,10 +7543,8 @@@ void __init sched_init_smp(void
         mutex_unlock(&sched_domains_mutex);
         put_online_cpus();
   
- -#ifndef CONFIG_CPUSETS
- -      /* XXX: Theoretical race here - CPU may be hotplugged now */
- -      hotcpu_notifier(update_sched_domains, 0);
- -#endif
+ +      hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+ +      hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
   
         /* RT runtime code needs to handle some hotplug events */
         hotcpu_notifier(update_runtime, 0);
@@@ -7799,9 -7594,6 +7789,9 @@@ void __init sched_init(void
   
                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
                         rq->cpu_load[j] = 0;
+ +
+ +              rq->last_load_update_tick = jiffies;
+ +
   #ifdef CONFIG_SMP
                 rq->sd = NULL;
                 rq->rd = NULL;
@@@ -7815,10 -7607,6 +7805,10 @@@
                 rq->idle_stamp = 0;
                 rq->avg_idle = 2*sysctl_sched_migration_cost;
                 rq_attach_root(rq, &def_root_domain);
+ +#ifdef CONFIG_NO_HZ
+ +              rq->nohz_balance_kick = 0;
+ +              init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+ +#endif
   #endif
                 init_rq_hrtick(rq);
                 atomic_set(&rq->nr_iowait, 0);
@@@ -7863,11 -7651,8 +7853,11 @@@
         zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
   #ifdef CONFIG_SMP
   #ifdef CONFIG_NO_HZ
- -      zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
- -      alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+ +      zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+ +      alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+ +      atomic_set(&nohz.load_balancer, nr_cpu_ids);
+ +      atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+ +      atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
   #endif
         /* May be allocated at isolcpus cmdline parse time */
         if (cpu_isolated_map == NULL)
diff --combined kernel/time/tick-sched.c

index 17525cac6cfefd136d9a4f68e942dfe0346bffa1,f898af60817156507f700abf095653352ab3f431..021d2f878f193bfe6eb2bacebd313dc3a4f5f072
--- 1/kernel/time/tick-sched.c
--- 2/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@@ -325,7 -325,7 +325,7 @@@ void tick_nohz_stop_sched_tick(int inid
         } while (read_seqretry(&xtime_lock, seq));
   
         if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
-           arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) {
+           arch_needs_cpu(cpu)) {
                 next_jiffies = last_jiffies + 1;
                 delta_jiffies = 1;
         } else {
@@@ -405,7 -405,13 +405,7 @@@
                  * the scheduler tick in nohz_restart_sched_tick.
                  */
                 if (!ts->tick_stopped) {
- -                      if (select_nohz_load_balancer(1)) {
- -                              /*
- -                               * sched tick not stopped!
- -                               */
- -                              cpumask_clear_cpu(cpu, nohz_cpu_mask);
- -                              goto out;
- -                      }
+ +                      select_nohz_load_balancer(1);
   
                         ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
                         ts->tick_stopped = 1;
author	Ingo Molnar <mingo@elte.hu>
	Thu, 5 Aug 2010 07:46:29 +0000 (09:46 +0200)
committer	Ingo Molnar <mingo@elte.hu>
	Thu, 5 Aug 2010 07:46:29 +0000 (09:46 +0200)
		1	2
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/time/tick-sched.c	patch \|	diff1 \|	diff2 \|	blob \| history