1 /*
2 * kernel/sched.c
3 *
4 * Kernel scheduler and related syscalls
5 *
6 * Copyright (C) 1991-2002 Linus Torvalds
7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 */
26
27 #include <linux/mm.h>
28 #include <linux/module.h>
29 #include <linux/nmi.h>
30 #include <linux/init.h>
31 #include <linux/uaccess.h>
32 #include <linux/highmem.h>
33 #include <linux/smp_lock.h>
34 #include <asm/mmu_context.h>
35 #include <linux/interrupt.h>
36 #include <linux/capability.h>
37 #include <linux/completion.h>
38 #include <linux/kernel_stat.h>
39 #include <linux/debug_locks.h>
40 #include <linux/security.h>
41 #include <linux/notifier.h>
42 #include <linux/profile.h>
43 #include <linux/freezer.h>
44 #include <linux/vmalloc.h>
45 #include <linux/blkdev.h>
46 #include <linux/delay.h>
47 #include <linux/smp.h>
48 #include <linux/threads.h>
49 #include <linux/timer.h>
50 #include <linux/rcupdate.h>
51 #include <linux/cpu.h>
52 #include <linux/cpuset.h>
53 #include <linux/percpu.h>
54 #include <linux/kthread.h>
55 #include <linux/seq_file.h>
56 #include <linux/syscalls.h>
57 #include <linux/times.h>
58 #include <linux/tsacct_kern.h>
59 #include <linux/kprobes.h>
60 #include <linux/delayacct.h>
61 #include <linux/reciprocal_div.h>
62 #include <linux/unistd.h>
63
64 #include <asm/tlb.h>
65
66 /*
67 * Scheduler clock - returns current time in nanosec units.
68 * This is the default implementation.
69 * Architectures and sub-architectures can override this.
70 */
71 unsigned long long __attribute__((weak)) sched_clock(void)
72 {
73 return (unsigned long long)jiffies * (1000000000 / HZ);
74 }
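/*
 * Editorial note, worked example (HZ value is illustrative): with a
 * HZ == 250 build, 1000000000 / HZ == 4000000, so this fallback
 * advances in 4 ms steps - i.e. jiffy resolution, not true nanosecond
 * resolution. Architectures with a fine-grained clocksource are
 * expected to override this weak definition.
 */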
75
76 /*
77 * Convert user-nice values [ -20 ... 0 ... 19 ]
78 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
79 * and back.
80 */
81 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
82 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
83 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
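/*
 * Editorial worked example, assuming the usual MAX_RT_PRIO == 100:
 * NICE_TO_PRIO(-20) == 100, NICE_TO_PRIO(0) == 120 and
 * NICE_TO_PRIO(19) == 139, so the static priority range
 * [MAX_RT_PRIO..MAX_PRIO-1] maps 1:1 onto nice [-20..19] and back via
 * PRIO_TO_NICE().
 */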
84
85 /*
86 * 'User priority' is the nice value converted to something we
87 * can work with better when scaling various scheduler parameters,
88 * it's a [ 0 ... 39 ] range.
89 */
90 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
91 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
92 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
93
94 /*
95 * Some helpers for converting nanosecond timing to jiffy resolution
96 */
97 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
98 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
99
100 #define NICE_0_LOAD SCHED_LOAD_SCALE
101 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
102
103 /*
104 * These are the 'tuning knobs' of the scheduler:
105 *
106 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
107 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
108 * Timeslices get refilled after they expire.
109 */
110 #define MIN_TIMESLICE max(5 * HZ / 1000, 1)
111 #define DEF_TIMESLICE (100 * HZ / 1000)
112
113 #ifdef CONFIG_SMP
114 /*
115 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
116 * Since cpu_power is a 'constant', we can use a reciprocal divide.
117 */
118 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
119 {
120 return reciprocal_divide(load, sg->reciprocal_cpu_power);
121 }
122
123 /*
124 * Each time a sched group cpu_power is changed,
125 * we must compute its reciprocal value
126 */
127 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
128 {
129 sg->__cpu_power += val;
130 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
131 }
132 #endif
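/*
 * Editorial sketch of the reciprocal-divide trick used above: instead
 * of dividing by cpu_power on every load calculation, we store
 * reciprocal_value(__cpu_power) ~= 2^32 / __cpu_power once, and
 * reciprocal_divide(load, r) then computes (load * r) >> 32 as a cheap
 * multiply. E.g. with __cpu_power == 1024 the stored reciprocal is
 * ~4194304, and a load of 2048 yields (2048 * 4194304) >> 32 == 2.
 */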
133
134 #define SCALE_PRIO(x, prio) \
135 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
136
137 /*
138 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
139 * to time slice values: [800ms ... 100ms ... 5ms]
140 */
141 static unsigned int static_prio_timeslice(int static_prio)
142 {
143 if (static_prio == NICE_TO_PRIO(19))
144 return 1;
145
146 if (static_prio < NICE_TO_PRIO(0))
147 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
148 else
149 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
150 }
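/*
 * Editorial worked example (assuming HZ == 1000, so DEF_TIMESLICE ==
 * 100 and MIN_TIMESLICE == 5 jiffies): nice -20 (static prio 100) maps
 * to SCALE_PRIO(400, 100) == 400 * (140 - 100) / 20 == 800 ticks
 * (800ms), nice 0 (static prio 120) to SCALE_PRIO(100, 120) ==
 * 100 * 20 / 20 == 100 ticks, and nice 19 is special-cased above to a
 * single jiffy.
 */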
151
152 static inline int rt_policy(int policy)
153 {
154 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
155 return 1;
156 return 0;
157 }
158
159 static inline int task_has_rt_policy(struct task_struct *p)
160 {
161 return rt_policy(p->policy);
162 }
163
164 /*
165 * This is the priority-queue data structure of the RT scheduling class:
166 */
167 struct rt_prio_array {
168 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
169 struct list_head queue[MAX_RT_PRIO];
170 };
171
172 struct load_stat {
173 struct load_weight load;
174 u64 load_update_start, load_update_last;
175 unsigned long delta_fair, delta_exec, delta_stat;
176 };
177
178 /* CFS-related fields in a runqueue */
179 struct cfs_rq {
180 struct load_weight load;
181 unsigned long nr_running;
182
183 s64 fair_clock;
184 u64 exec_clock;
185 s64 wait_runtime;
186 u64 sleeper_bonus;
187 unsigned long wait_runtime_overruns, wait_runtime_underruns;
188
189 struct rb_root tasks_timeline;
190 struct rb_node *rb_leftmost;
191 struct rb_node *rb_load_balance_curr;
192 #ifdef CONFIG_FAIR_GROUP_SCHED
193 /* 'curr' points to currently running entity on this cfs_rq.
194 * It is set to NULL otherwise (i.e. when none are currently running).
195 */
196 struct sched_entity *curr;
197 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
198
199 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
200 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
201 * (like users, containers etc.)
202 *
203 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
204 * list is used during load balance.
205 */
206 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
207 #endif
208 };
209
210 /* Real-Time classes' related field in a runqueue: */
211 struct rt_rq {
212 struct rt_prio_array active;
213 int rt_load_balance_idx;
214 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
215 };
216
217 /*
218 * This is the main, per-CPU runqueue data structure.
219 *
220 * Locking rule: code paths that need to lock multiple runqueues
221 * (such as the load balancing or the thread migration code) must
222 * acquire the locks in ascending &runqueue order.
223 */
224 struct rq {
225 spinlock_t lock; /* runqueue lock */
226
227 /*
228 * nr_running and cpu_load should be in the same cacheline because
229 * remote CPUs use both these fields when doing load calculation.
230 */
231 unsigned long nr_running;
232 #define CPU_LOAD_IDX_MAX 5
233 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
234 unsigned char idle_at_tick;
235 #ifdef CONFIG_NO_HZ
236 unsigned char in_nohz_recently;
237 #endif
238 struct load_stat ls; /* capture load from *all* tasks on this cpu */
239 unsigned long nr_load_updates;
240 u64 nr_switches;
241
242 struct cfs_rq cfs;
243 #ifdef CONFIG_FAIR_GROUP_SCHED
244 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
245 #endif
246 struct rt_rq rt;
247
248 /*
249 * This is part of a global counter where only the total sum
250 * over all CPUs matters. A task can increase this counter on
251 * one CPU and if it got migrated afterwards it may decrease
252 * it on another CPU. Always updated under the runqueue lock:
253 */
254 unsigned long nr_uninterruptible;
255
256 struct task_struct *curr, *idle;
257 unsigned long next_balance;
258 struct mm_struct *prev_mm;
259
260 u64 clock, prev_clock_raw;
261 s64 clock_max_delta;
262
263 unsigned int clock_warps, clock_overflows;
264 unsigned int clock_unstable_events;
265
266 struct sched_class *load_balance_class;
267
268 atomic_t nr_iowait;
269
270 #ifdef CONFIG_SMP
271 struct sched_domain *sd;
272
273 /* For active balancing */
274 int active_balance;
275 int push_cpu;
276 int cpu; /* cpu of this runqueue */
277
278 struct task_struct *migration_thread;
279 struct list_head migration_queue;
280 #endif
281
282 #ifdef CONFIG_SCHEDSTATS
283 /* latency stats */
284 struct sched_info rq_sched_info;
285
286 /* sys_sched_yield() stats */
287 unsigned long yld_exp_empty;
288 unsigned long yld_act_empty;
289 unsigned long yld_both_empty;
290 unsigned long yld_cnt;
291
292 /* schedule() stats */
293 unsigned long sched_switch;
294 unsigned long sched_cnt;
295 unsigned long sched_goidle;
296
297 /* try_to_wake_up() stats */
298 unsigned long ttwu_cnt;
299 unsigned long ttwu_local;
300 #endif
301 struct lock_class_key rq_lock_key;
302 };
303
304 static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
305 static DEFINE_MUTEX(sched_hotcpu_mutex);
306
307 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
308 {
309 rq->curr->sched_class->check_preempt_curr(rq, p);
310 }
311
312 static inline int cpu_of(struct rq *rq)
313 {
314 #ifdef CONFIG_SMP
315 return rq->cpu;
316 #else
317 return 0;
318 #endif
319 }
320
321 /*
322 * Per-runqueue clock, as finegrained as the platform can give us:
323 */
324 static unsigned long long __rq_clock(struct rq *rq)
325 {
326 u64 prev_raw = rq->prev_clock_raw;
327 u64 now = sched_clock();
328 s64 delta = now - prev_raw;
329 u64 clock = rq->clock;
330
331 /*
332 * Protect against sched_clock() occasionally going backwards:
333 */
334 if (unlikely(delta < 0)) {
335 clock++;
336 rq->clock_warps++;
337 } else {
338 /*
339 * Catch too large forward jumps too:
340 */
341 if (unlikely(delta > 2*TICK_NSEC)) {
342 clock++;
343 rq->clock_overflows++;
344 } else {
345 if (unlikely(delta > rq->clock_max_delta))
346 rq->clock_max_delta = delta;
347 clock += delta;
348 }
349 }
350
351 rq->prev_clock_raw = now;
352 rq->clock = clock;
353
354 return clock;
355 }
356
357 static inline unsigned long long rq_clock(struct rq *rq)
358 {
359 int this_cpu = smp_processor_id();
360
361 if (this_cpu == cpu_of(rq))
362 return __rq_clock(rq);
363
364 return rq->clock;
365 }
366
367 /*
368 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
369 * See detach_destroy_domains: synchronize_sched for details.
370 *
371 * The domain tree of any CPU may only be accessed from within
372 * preempt-disabled sections.
373 */
374 #define for_each_domain(cpu, __sd) \
375 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
376
377 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
378 #define this_rq() (&__get_cpu_var(runqueues))
379 #define task_rq(p) cpu_rq(task_cpu(p))
380 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
381
382 #ifdef CONFIG_FAIR_GROUP_SCHED
383 /* Change a task's ->cfs_rq if it moves across CPUs */
384 static inline void set_task_cfs_rq(struct task_struct *p)
385 {
386 p->se.cfs_rq = &task_rq(p)->cfs;
387 }
388 #else
389 static inline void set_task_cfs_rq(struct task_struct *p)
390 {
391 }
392 #endif
393
394 #ifndef prepare_arch_switch
395 # define prepare_arch_switch(next) do { } while (0)
396 #endif
397 #ifndef finish_arch_switch
398 # define finish_arch_switch(prev) do { } while (0)
399 #endif
400
401 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
402 static inline int task_running(struct rq *rq, struct task_struct *p)
403 {
404 return rq->curr == p;
405 }
406
407 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
408 {
409 }
410
411 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
412 {
413 #ifdef CONFIG_DEBUG_SPINLOCK
414 /* this is a valid case when another task releases the spinlock */
415 rq->lock.owner = current;
416 #endif
417 /*
418 * If we are tracking spinlock dependencies then we have to
419 * fix up the runqueue lock - which gets 'carried over' from
420 * prev into current:
421 */
422 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
423
424 spin_unlock_irq(&rq->lock);
425 }
426
427 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
428 static inline int task_running(struct rq *rq, struct task_struct *p)
429 {
430 #ifdef CONFIG_SMP
431 return p->oncpu;
432 #else
433 return rq->curr == p;
434 #endif
435 }
436
437 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
438 {
439 #ifdef CONFIG_SMP
440 /*
441 * We can optimise this out completely for !SMP, because the
442 * SMP rebalancing from interrupt is the only thing that cares
443 * here.
444 */
445 next->oncpu = 1;
446 #endif
447 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
448 spin_unlock_irq(&rq->lock);
449 #else
450 spin_unlock(&rq->lock);
451 #endif
452 }
453
454 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
455 {
456 #ifdef CONFIG_SMP
457 /*
458 * After ->oncpu is cleared, the task can be moved to a different CPU.
459 * We must ensure this doesn't happen until the switch is completely
460 * finished.
461 */
462 smp_wmb();
463 prev->oncpu = 0;
464 #endif
465 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
466 local_irq_enable();
467 #endif
468 }
469 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
470
471 /*
472 * __task_rq_lock - lock the runqueue a given task resides on.
473 * Must be called with interrupts disabled.
474 */
475 static inline struct rq *__task_rq_lock(struct task_struct *p)
476 __acquires(rq->lock)
477 {
478 struct rq *rq;
479
480 repeat_lock_task:
481 rq = task_rq(p);
482 spin_lock(&rq->lock);
483 if (unlikely(rq != task_rq(p))) {
484 spin_unlock(&rq->lock);
485 goto repeat_lock_task;
486 }
487 return rq;
488 }
489
490 /*
491 * task_rq_lock - lock the runqueue a given task resides on and disable
492 * interrupts. Note the ordering: we can safely lookup the task_rq without
493 * explicitly disabling preemption.
494 */
495 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
496 __acquires(rq->lock)
497 {
498 struct rq *rq;
499
500 repeat_lock_task:
501 local_irq_save(*flags);
502 rq = task_rq(p);
503 spin_lock(&rq->lock);
504 if (unlikely(rq != task_rq(p))) {
505 spin_unlock_irqrestore(&rq->lock, *flags);
506 goto repeat_lock_task;
507 }
508 return rq;
509 }
510
511 static inline void __task_rq_unlock(struct rq *rq)
512 __releases(rq->lock)
513 {
514 spin_unlock(&rq->lock);
515 }
516
517 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
518 __releases(rq->lock)
519 {
520 spin_unlock_irqrestore(&rq->lock, *flags);
521 }
522
523 /*
524 * this_rq_lock - lock this runqueue and disable interrupts.
525 */
526 static inline struct rq *this_rq_lock(void)
527 __acquires(rq->lock)
528 {
529 struct rq *rq;
530
531 local_irq_disable();
532 rq = this_rq();
533 spin_lock(&rq->lock);
534
535 return rq;
536 }
537
538 /*
539 * CPU frequency is/was unstable - start anew by setting prev_clock_raw:
540 */
541 void sched_clock_unstable_event(void)
542 {
543 unsigned long flags;
544 struct rq *rq;
545
546 rq = task_rq_lock(current, &flags);
547 rq->prev_clock_raw = sched_clock();
548 rq->clock_unstable_events++;
549 task_rq_unlock(rq, &flags);
550 }
551
552 /*
553 * resched_task - mark a task 'to be rescheduled now'.
554 *
555 * On UP this means the setting of the need_resched flag, on SMP it
556 * might also involve a cross-CPU call to trigger the scheduler on
557 * the target CPU.
558 */
559 #ifdef CONFIG_SMP
560
561 #ifndef tsk_is_polling
562 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
563 #endif
564
565 static void resched_task(struct task_struct *p)
566 {
567 int cpu;
568
569 assert_spin_locked(&task_rq(p)->lock);
570
571 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
572 return;
573
574 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
575
576 cpu = task_cpu(p);
577 if (cpu == smp_processor_id())
578 return;
579
580 /* NEED_RESCHED must be visible before we test polling */
581 smp_mb();
582 if (!tsk_is_polling(p))
583 smp_send_reschedule(cpu);
584 }
585
586 static void resched_cpu(int cpu)
587 {
588 struct rq *rq = cpu_rq(cpu);
589 unsigned long flags;
590
591 if (!spin_trylock_irqsave(&rq->lock, flags))
592 return;
593 resched_task(cpu_curr(cpu));
594 spin_unlock_irqrestore(&rq->lock, flags);
595 }
596 #else
597 static inline void resched_task(struct task_struct *p)
598 {
599 assert_spin_locked(&task_rq(p)->lock);
600 set_tsk_need_resched(p);
601 }
602 #endif
603
604 static u64 div64_likely32(u64 dividend, unsigned long divisor)
605 {
606 #if BITS_PER_LONG == 32
607 if (likely(dividend <= 0xffffffffULL))
608 return (u32)dividend / divisor;
609 do_div(dividend, divisor);
610
611 return dividend;
612 #else
613 return dividend / divisor;
614 #endif
615 }
616
617 #if BITS_PER_LONG == 32
618 # define WMULT_CONST (~0UL)
619 #else
620 # define WMULT_CONST (1UL << 32)
621 #endif
622
623 #define WMULT_SHIFT 32
624
625 static inline unsigned long
626 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
627 struct load_weight *lw)
628 {
629 u64 tmp;
630
631 if (unlikely(!lw->inv_weight))
632 lw->inv_weight = WMULT_CONST / lw->weight;
633
634 tmp = (u64)delta_exec * weight;
635 /*
636 * Check whether we'd overflow the 64-bit multiplication:
637 */
638 if (unlikely(tmp > WMULT_CONST)) {
639 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
640 >> (WMULT_SHIFT/2);
641 } else {
642 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
643 }
644
645 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
646 }
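/*
 * Editorial worked example for calc_delta_mine(): it approximates
 * delta_exec * weight / lw->weight using the precomputed inverse.
 * With delta_exec == 1000000 (1ms), weight == NICE_0_LOAD (1024) and
 * lw->weight == 2048, inv_weight ~= 2^32 / 2048 == 2097152 and
 * (1000000 * 1024 * 2097152) >> 32 ~= 500000 - i.e. the task is
 * credited at half wall-clock speed because it owns only half of the
 * queue's weight (ignoring the final clamp to
 * sysctl_sched_runtime_limit).
 */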
647
648 static inline unsigned long
649 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
650 {
651 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
652 }
653
654 static void update_load_add(struct load_weight *lw, unsigned long inc)
655 {
656 lw->weight += inc;
657 lw->inv_weight = 0;
658 }
659
660 static void update_load_sub(struct load_weight *lw, unsigned long dec)
661 {
662 lw->weight -= dec;
663 lw->inv_weight = 0;
664 }
665
666 static void __update_curr_load(struct rq *rq, struct load_stat *ls)
667 {
668 if (rq->curr != rq->idle && ls->load.weight) {
669 ls->delta_exec += ls->delta_stat;
670 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
671 ls->delta_stat = 0;
672 }
673 }
674
675 /*
676 * Update delta_exec, delta_fair fields for rq.
677 *
678 * delta_fair clock advances at a rate inversely proportional to
679 * total load (rq->ls.load.weight) on the runqueue, while
680 * delta_exec advances at the same rate as wall-clock (provided
681 * cpu is not idle).
682 *
683 * delta_exec / delta_fair is a measure of the (smoothened) load on this
684 * runqueue over any given interval. This (smoothened) load is used
685 * during load balance.
686 *
687 * This function is called /before/ updating rq->ls.load
688 * and when switching tasks.
689 */
690 static void update_curr_load(struct rq *rq, u64 now)
691 {
692 struct load_stat *ls = &rq->ls;
693 u64 start;
694
695 start = ls->load_update_start;
696 ls->load_update_start = now;
697 ls->delta_stat += now - start;
698 /*
699 * Stagger updates to ls->delta_fair. Very frequent updates
700 * can be expensive.
701 */
702 if (ls->delta_stat >= sysctl_sched_stat_granularity)
703 __update_curr_load(rq, ls);
704 }
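/*
 * Editorial worked example of the delta_exec/delta_fair relation
 * described above: with two nice-0 tasks runnable, ls->load.weight is
 * 2 * NICE_0_LOAD, so over a 10ms wall-clock interval delta_exec grows
 * by ~10ms while delta_fair (via calc_delta_fair()) grows by only
 * ~5ms; the ratio delta_exec/delta_fair ~= 2 is the smoothed load
 * figure the load balancer consumes.
 */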
705
706 /*
707 * To aid in avoiding the subversion of "niceness" due to uneven distribution
708 * of tasks with abnormal "nice" values across CPUs the contribution that
709 * each task makes to its run queue's load is weighted according to its
710 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
711 * scaled version of the new time slice allocation that they receive on time
712 * slice expiry etc.
713 */
714
715 /*
716 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
717 * If static_prio_timeslice() is ever changed to break this assumption then
718 * this code will need modification
719 */
720 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
721 #define load_weight(lp) \
722 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
723 #define PRIO_TO_LOAD_WEIGHT(prio) \
724 load_weight(static_prio_timeslice(prio))
725 #define RTPRIO_TO_LOAD_WEIGHT(rp) \
726 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
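/*
 * Editorial worked example: since static_prio_timeslice(NICE_TO_PRIO(0))
 * == DEF_TIMESLICE == TIME_SLICE_NICE_ZERO, we get
 * PRIO_TO_LOAD_WEIGHT(NICE_TO_PRIO(0)) == SCHED_LOAD_SCALE - one unit
 * of load for a nice-0 task. RTPRIO_TO_LOAD_WEIGHT() additionally
 * offsets RT tasks by PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) (the weight of
 * the heaviest nice level), so RT load weights sort above all
 * SCHED_NORMAL weights.
 */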
727
728 #define WEIGHT_IDLEPRIO 2
729 #define WMULT_IDLEPRIO (1 << 31)
730
731 /*
732 * Nice levels are multiplicative, with a gentle 10% change for every
733 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
734 * nice 1, it will get ~10% less CPU time than another CPU-bound task
735 * that remained on nice 0.
736 *
737 * The "10% effect" is relative and cumulative: from _any_ nice level,
738 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
739 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
740 * If a task goes up by ~10% and another task goes down by ~10% then
741 * the relative distance between them is ~25%.)
742 */
743 static const int prio_to_weight[40] = {
744 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
745 /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
746 /* 0 */ NICE_0_LOAD /* 1024 */,
747 /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
748 /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
749 };
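/*
 * Editorial worked example of the "10% effect" documented above:
 * prio_to_weight[20] / prio_to_weight[21] == 1024 / 819 ~= 1.25, the
 * multiplier mentioned in the comment. With one nice-0 and one nice-1
 * task runnable, the nice-0 task gets 1024 / (1024 + 819) ~= 55% of
 * the CPU and the nice-1 task ~45% - roughly 10 percentage points
 * apart - and the same ~1.25 ratio holds between any two adjacent
 * nice levels.
 */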
750
751 /*
752 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
753 *
754 * In cases where the weight does not change often, we can use the
755 * precalculated inverse to speed up arithmetics by turning divisions
756 * into multiplications:
757 */
758 static const u32 prio_to_wmult[40] = {
759 48356, 60446, 75558, 94446, 118058, 147573,
760 184467, 230589, 288233, 360285, 450347,
761 562979, 703746, 879575, 1099582, 1374389,
762 1717986, 2147483, 2684354, 3355443, 4194304,
763 5244160, 6557201, 8196502, 10250518, 12782640,
764 16025997, 19976592, 24970740, 31350126, 39045157,
765 49367440, 61356675, 76695844, 95443717, 119304647,
766 148102320, 186737708, 238609294, 286331153,
767 };
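/*
 * Editorial worked example: prio_to_wmult[i] ~= 2^32 / prio_to_weight[i].
 * For nice 0 (index 20) that is 4294967296 / 1024 == 4194304, which is
 * indeed the 21st entry above. set_load_weight() stores this value as
 * se.load.inv_weight, letting calc_delta_mine() replace a division by
 * the weight with a multiply and a 32-bit right shift.
 */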
768
769 static inline void
770 inc_load(struct rq *rq, const struct task_struct *p, u64 now)
771 {
772 update_curr_load(rq, now);
773 update_load_add(&rq->ls.load, p->se.load.weight);
774 }
775
776 static inline void
777 dec_load(struct rq *rq, const struct task_struct *p, u64 now)
778 {
779 update_curr_load(rq, now);
780 update_load_sub(&rq->ls.load, p->se.load.weight);
781 }
782
783 static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
784 {
785 rq->nr_running++;
786 inc_load(rq, p, now);
787 }
788
789 static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
790 {
791 rq->nr_running--;
792 dec_load(rq, p, now);
793 }
794
795 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
796
797 /*
798 * runqueue iterator, to support SMP load-balancing between different
799 * scheduling classes, without having to expose their internal data
800 * structures to the load-balancing proper:
801 */
802 struct rq_iterator {
803 void *arg;
804 struct task_struct *(*start)(void *);
805 struct task_struct *(*next)(void *);
806 };
807
808 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
809 unsigned long max_nr_move, unsigned long max_load_move,
810 struct sched_domain *sd, enum cpu_idle_type idle,
811 int *all_pinned, unsigned long *load_moved,
812 int this_best_prio, int best_prio, int best_prio_seen,
813 struct rq_iterator *iterator);
814
815 #include "sched_stats.h"
816 #include "sched_rt.c"
817 #include "sched_fair.c"
818 #include "sched_idletask.c"
819 #ifdef CONFIG_SCHED_DEBUG
820 # include "sched_debug.c"
821 #endif
822
823 #define sched_class_highest (&rt_sched_class)
824
825 static void set_load_weight(struct task_struct *p)
826 {
827 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
828 p->se.wait_runtime = 0;
829
830 if (task_has_rt_policy(p)) {
831 p->se.load.weight = prio_to_weight[0] * 2;
832 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
833 return;
834 }
835
836 /*
837 * SCHED_IDLE tasks get minimal weight:
838 */
839 if (p->policy == SCHED_IDLE) {
840 p->se.load.weight = WEIGHT_IDLEPRIO;
841 p->se.load.inv_weight = WMULT_IDLEPRIO;
842 return;
843 }
844
845 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
846 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
847 }
848
849 static void
850 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
851 {
852 sched_info_queued(p);
853 p->sched_class->enqueue_task(rq, p, wakeup, now);
854 p->se.on_rq = 1;
855 }
856
857 static void
858 dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
859 {
860 p->sched_class->dequeue_task(rq, p, sleep, now);
861 p->se.on_rq = 0;
862 }
863
864 /*
865 * __normal_prio - return the priority that is based on the static prio
866 */
867 static inline int __normal_prio(struct task_struct *p)
868 {
869 return p->static_prio;
870 }
871
872 /*
873 * Calculate the expected normal priority: i.e. priority
874 * without taking RT-inheritance into account. Might be
875 * boosted by interactivity modifiers. Changes upon fork,
876 * setprio syscalls, and whenever the interactivity
877 * estimator recalculates.
878 */
879 static inline int normal_prio(struct task_struct *p)
880 {
881 int prio;
882
883 if (task_has_rt_policy(p))
884 prio = MAX_RT_PRIO-1 - p->rt_priority;
885 else
886 prio = __normal_prio(p);
887 return prio;
888 }
889
890 /*
891 * Calculate the current priority, i.e. the priority
892 * taken into account by the scheduler. This value might
893 * be boosted by RT tasks, or might be boosted by
894 * interactivity modifiers. Will be RT if the task got
895 * RT-boosted. If not then it returns p->normal_prio.
896 */
897 static int effective_prio(struct task_struct *p)
898 {
899 p->normal_prio = normal_prio(p);
900 /*
901 * If we are RT tasks or we were boosted to RT priority,
902 * keep the priority unchanged. Otherwise, update priority
903 * to the normal priority:
904 */
905 if (!rt_prio(p->prio))
906 return p->normal_prio;
907 return p->prio;
908 }
909
910 /*
911 * activate_task - move a task to the runqueue.
912 */
913 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
914 {
915 u64 now = rq_clock(rq);
916
917 if (p->state == TASK_UNINTERRUPTIBLE)
918 rq->nr_uninterruptible--;
919
920 enqueue_task(rq, p, wakeup, now);
921 inc_nr_running(p, rq, now);
922 }
923
924 /*
925 * activate_idle_task - move idle task to the _front_ of runqueue.
926 */
927 static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
928 {
929 u64 now = rq_clock(rq);
930
931 if (p->state == TASK_UNINTERRUPTIBLE)
932 rq->nr_uninterruptible--;
933
934 enqueue_task(rq, p, 0, now);
935 inc_nr_running(p, rq, now);
936 }
937
938 /*
939 * deactivate_task - remove a task from the runqueue.
940 */
941 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
942 {
943 u64 now = rq_clock(rq);
944
945 if (p->state == TASK_UNINTERRUPTIBLE)
946 rq->nr_uninterruptible++;
947
948 dequeue_task(rq, p, sleep, now);
949 dec_nr_running(p, rq, now);
950 }
951
952 /**
953 * task_curr - is this task currently executing on a CPU?
954 * @p: the task in question.
955 */
956 inline int task_curr(const struct task_struct *p)
957 {
958 return cpu_curr(task_cpu(p)) == p;
959 }
960
961 /* Used instead of source_load when we know the type == 0 */
962 unsigned long weighted_cpuload(const int cpu)
963 {
964 return cpu_rq(cpu)->ls.load.weight;
965 }
966
967 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
968 {
969 #ifdef CONFIG_SMP
970 task_thread_info(p)->cpu = cpu;
971 set_task_cfs_rq(p);
972 #endif
973 }
974
975 #ifdef CONFIG_SMP
976
977 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
978 {
979 int old_cpu = task_cpu(p);
980 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
981 u64 clock_offset, fair_clock_offset;
982
983 clock_offset = old_rq->clock - new_rq->clock;
984 fair_clock_offset = old_rq->cfs.fair_clock -
985 new_rq->cfs.fair_clock;
986 if (p->se.wait_start)
987 p->se.wait_start -= clock_offset;
988 if (p->se.wait_start_fair)
989 p->se.wait_start_fair -= fair_clock_offset;
990 if (p->se.sleep_start)
991 p->se.sleep_start -= clock_offset;
992 if (p->se.block_start)
993 p->se.block_start -= clock_offset;
994 if (p->se.sleep_start_fair)
995 p->se.sleep_start_fair -= fair_clock_offset;
996
997 __set_task_cpu(p, new_cpu);
998 }
999
1000 struct migration_req {
1001 struct list_head list;
1002
1003 struct task_struct *task;
1004 int dest_cpu;
1005
1006 struct completion done;
1007 };
1008
1009 /*
1010 * The task's runqueue lock must be held.
1011 * Returns true if you have to wait for migration thread.
1012 */
1013 static int
1014 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1015 {
1016 struct rq *rq = task_rq(p);
1017
1018 /*
1019 * If the task is not on a runqueue (and not running), then
1020 * it is sufficient to simply update the task's cpu field.
1021 */
1022 if (!p->se.on_rq && !task_running(rq, p)) {
1023 set_task_cpu(p, dest_cpu);
1024 return 0;
1025 }
1026
1027 init_completion(&req->done);
1028 req->task = p;
1029 req->dest_cpu = dest_cpu;
1030 list_add(&req->list, &rq->migration_queue);
1031
1032 return 1;
1033 }
1034
1035 /*
1036 * wait_task_inactive - wait for a thread to unschedule.
1037 *
1038 * The caller must ensure that the task *will* unschedule sometime soon,
1039 * else this function might spin for a *long* time. This function can't
1040 * be called with interrupts off, or it may introduce deadlock with
1041 * smp_call_function() if an IPI is sent by the same process we are
1042 * waiting to become inactive.
1043 */
1044 void wait_task_inactive(struct task_struct *p)
1045 {
1046 unsigned long flags;
1047 int running, on_rq;
1048 struct rq *rq;
1049
1050 repeat:
1051 /*
1052 * We do the initial early heuristics without holding
1053 * any task-queue locks at all. We'll only try to get
1054 * the runqueue lock when things look like they will
1055 * work out!
1056 */
1057 rq = task_rq(p);
1058
1059 /*
1060 * If the task is actively running on another CPU
1061 * still, just relax and busy-wait without holding
1062 * any locks.
1063 *
1064 * NOTE! Since we don't hold any locks, it's not
1065 * even certain that "rq" stays the right runqueue!
1066 * But we don't care, since "task_running()" will
1067 * return false if the runqueue has changed and p
1068 * is actually now running somewhere else!
1069 */
1070 while (task_running(rq, p))
1071 cpu_relax();
1072
1073 /*
1074 * Ok, time to look more closely! We need the rq
1075 * lock now, to be *sure*. If we're wrong, we'll
1076 * just go back and repeat.
1077 */
1078 rq = task_rq_lock(p, &flags);
1079 running = task_running(rq, p);
1080 on_rq = p->se.on_rq;
1081 task_rq_unlock(rq, &flags);
1082
1083 /*
1084 * Was it really running after all now that we
1085 * checked with the proper locks actually held?
1086 *
1087 * Oops. Go back and try again..
1088 */
1089 if (unlikely(running)) {
1090 cpu_relax();
1091 goto repeat;
1092 }
1093
1094 /*
1095 * It's not enough that it's not actively running,
1096 * it must be off the runqueue _entirely_, and not
1097 * preempted!
1098 *
1099 * So if it was still runnable (but just not actively
1100 * running right now), it's preempted, and we should
1101 * yield - it could be a while.
1102 */
1103 if (unlikely(on_rq)) {
1104 yield();
1105 goto repeat;
1106 }
1107
1108 /*
1109 * Ahh, all good. It wasn't running, and it wasn't
1110 * runnable, which means that it will never become
1111 * running in the future either. We're all done!
1112 */
1113 }
1114
1115 /***
1116 * kick_process - kick a running thread to enter/exit the kernel
1117 * @p: the to-be-kicked thread
1118 *
1119 * Cause a process which is running on another CPU to enter
1120 * kernel-mode, without any delay. (to get signals handled.)
1121 *
1122 * NOTE: this function doesn't have to take the runqueue lock,
1123 * because all it wants to ensure is that the remote task enters
1124 * the kernel. If the IPI races and the task has been migrated
1125 * to another CPU then no harm is done and the purpose has been
1126 * achieved as well.
1127 */
1128 void kick_process(struct task_struct *p)
1129 {
1130 int cpu;
1131
1132 preempt_disable();
1133 cpu = task_cpu(p);
1134 if ((cpu != smp_processor_id()) && task_curr(p))
1135 smp_send_reschedule(cpu);
1136 preempt_enable();
1137 }
1138
1139 /*
1140 * Return a low guess at the load of a migration-source cpu weighted
1141 * according to the scheduling class and "nice" value.
1142 *
1143 * We want to under-estimate the load of migration sources, to
1144 * balance conservatively.
1145 */
1146 static inline unsigned long source_load(int cpu, int type)
1147 {
1148 struct rq *rq = cpu_rq(cpu);
1149 unsigned long total = weighted_cpuload(cpu);
1150
1151 if (type == 0)
1152 return total;
1153
1154 return min(rq->cpu_load[type-1], total);
1155 }
1156
1157 /*
1158 * Return a high guess at the load of a migration-target cpu weighted
1159 * according to the scheduling class and "nice" value.
1160 */
1161 static inline unsigned long target_load(int cpu, int type)
1162 {
1163 struct rq *rq = cpu_rq(cpu);
1164 unsigned long total = weighted_cpuload(cpu);
1165
1166 if (type == 0)
1167 return total;
1168
1169 return max(rq->cpu_load[type-1], total);
1170 }
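/*
 * Editorial worked example for source_load()/target_load(): suppose a
 * cpu's instantaneous weighted_cpuload() is 2048 but its decayed
 * history cpu_load[type-1] is only 1024 (the load just spiked). As a
 * migration source it is reported as 1024 (the min, a low guess), as a
 * migration target as 2048 (the max, a high guess), so transient
 * spikes make the balancer reluctant both to pull from and to push
 * onto that cpu.
 */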
1171
1172 /*
1173 * Return the average load per task on the cpu's run queue
1174 */
1175 static inline unsigned long cpu_avg_load_per_task(int cpu)
1176 {
1177 struct rq *rq = cpu_rq(cpu);
1178 unsigned long total = weighted_cpuload(cpu);
1179 unsigned long n = rq->nr_running;
1180
1181 return n ? total / n : SCHED_LOAD_SCALE;
1182 }
1183
1184 /*
1185 * find_idlest_group finds and returns the least busy CPU group within the
1186 * domain.
1187 */
1188 static struct sched_group *
1189 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1190 {
1191 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1192 unsigned long min_load = ULONG_MAX, this_load = 0;
1193 int load_idx = sd->forkexec_idx;
1194 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1195
1196 do {
1197 unsigned long load, avg_load;
1198 int local_group;
1199 int i;
1200
1201 /* Skip over this group if it has no CPUs allowed */
1202 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1203 goto nextgroup;
1204
1205 local_group = cpu_isset(this_cpu, group->cpumask);
1206
1207 /* Tally up the load of all CPUs in the group */
1208 avg_load = 0;
1209
1210 for_each_cpu_mask(i, group->cpumask) {
1211 /* Bias balancing toward cpus of our domain */
1212 if (local_group)
1213 load = source_load(i, load_idx);
1214 else
1215 load = target_load(i, load_idx);
1216
1217 avg_load += load;
1218 }
1219
1220 /* Adjust by relative CPU power of the group */
1221 avg_load = sg_div_cpu_power(group,
1222 avg_load * SCHED_LOAD_SCALE);
1223
1224 if (local_group) {
1225 this_load = avg_load;
1226 this = group;
1227 } else if (avg_load < min_load) {
1228 min_load = avg_load;
1229 idlest = group;
1230 }
1231 nextgroup:
1232 group = group->next;
1233 } while (group != sd->groups);
1234
1235 if (!idlest || 100*this_load < imbalance*min_load)
1236 return NULL;
1237 return idlest;
1238 }
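/*
 * Editorial worked example of the imbalance cutoff above, assuming a
 * typical imbalance_pct of 125: imbalance == 100 + 25/2 == 112, so
 * find_idlest_group() only returns a remote group when
 * 100 * this_load >= 112 * min_load, i.e. when the local group is at
 * least ~12% busier - small differences are ignored to avoid
 * ping-ponging tasks between groups.
 */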
1239
1240 /*
1241 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1242 */
1243 static int
1244 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1245 {
1246 cpumask_t tmp;
1247 unsigned long load, min_load = ULONG_MAX;
1248 int idlest = -1;
1249 int i;
1250
1251 /* Traverse only the allowed CPUs */
1252 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1253
1254 for_each_cpu_mask(i, tmp) {
1255 load = weighted_cpuload(i);
1256
1257 if (load < min_load || (load == min_load && i == this_cpu)) {
1258 min_load = load;
1259 idlest = i;
1260 }
1261 }
1262
1263 return idlest;
1264 }
1265
1266 /*
1267 * sched_balance_self: balance the current task (running on cpu) in domains
1268 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1269 * SD_BALANCE_EXEC.
1270 *
1271 * Balance, ie. select the least loaded group.
1272 *
1273 * Returns the target CPU number, or the same CPU if no balancing is needed.
1274 *
1275 * preempt must be disabled.
1276 */
1277 static int sched_balance_self(int cpu, int flag)
1278 {
1279 struct task_struct *t = current;
1280 struct sched_domain *tmp, *sd = NULL;
1281
1282 for_each_domain(cpu, tmp) {
1283 /*
1284 * If power savings logic is enabled for a domain, stop there.
1285 */
1286 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1287 break;
1288 if (tmp->flags & flag)
1289 sd = tmp;
1290 }
1291
1292 while (sd) {
1293 cpumask_t span;
1294 struct sched_group *group;
1295 int new_cpu, weight;
1296
1297 if (!(sd->flags & flag)) {
1298 sd = sd->child;
1299 continue;
1300 }
1301
1302 span = sd->span;
1303 group = find_idlest_group(sd, t, cpu);
1304 if (!group) {
1305 sd = sd->child;
1306 continue;
1307 }
1308
1309 new_cpu = find_idlest_cpu(group, t, cpu);
1310 if (new_cpu == -1 || new_cpu == cpu) {
1311 /* Now try balancing at a lower domain level of cpu */
1312 sd = sd->child;
1313 continue;
1314 }
1315
1316 /* Now try balancing at a lower domain level of new_cpu */
1317 cpu = new_cpu;
1318 sd = NULL;
1319 weight = cpus_weight(span);
1320 for_each_domain(cpu, tmp) {
1321 if (weight <= cpus_weight(tmp->span))
1322 break;
1323 if (tmp->flags & flag)
1324 sd = tmp;
1325 }
1326 /* while loop will break here if sd == NULL */
1327 }
1328
1329 return cpu;
1330 }
1331
1332 #endif /* CONFIG_SMP */
1333
1334 /*
1335 * wake_idle() will wake a task on an idle cpu if task->cpu is
1336 * not idle and an idle cpu is available. The span of cpus to
1337 * search starts with cpus closest then further out as needed,
1338 * so we always favor a closer, idle cpu.
1339 *
1340 * Returns the CPU we should wake onto.
1341 */
1342 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1343 static int wake_idle(int cpu, struct task_struct *p)
1344 {
1345 cpumask_t tmp;
1346 struct sched_domain *sd;
1347 int i;
1348
1349 /*
1350 * If it is idle, then it is the best cpu to run this task.
1351 *
1352 * This cpu is also the best, if it has more than one task already.
1353 * Siblings must also be busy (in most cases) as they didn't already
1354 * pick up the extra load from this cpu, and hence we need not check
1355 * sibling runqueue info. This avoids the checks and the cache-miss
1356 * penalties associated with them.
1357 */
1358 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1359 return cpu;
1360
1361 for_each_domain(cpu, sd) {
1362 if (sd->flags & SD_WAKE_IDLE) {
1363 cpus_and(tmp, sd->span, p->cpus_allowed);
1364 for_each_cpu_mask(i, tmp) {
1365 if (idle_cpu(i))
1366 return i;
1367 }
1368 } else {
1369 break;
1370 }
1371 }
1372 return cpu;
1373 }
1374 #else
1375 static inline int wake_idle(int cpu, struct task_struct *p)
1376 {
1377 return cpu;
1378 }
1379 #endif
1380
1381 /***
1382 * try_to_wake_up - wake up a thread
1383 * @p: the to-be-woken-up thread
1384 * @state: the mask of task states that can be woken
1385 * @sync: do a synchronous wakeup?
1386 *
1387 * Put it on the run-queue if it's not already there. The "current"
1388 * thread is always on the run-queue (except when the actual
1389 * re-schedule is in progress), and as such you're allowed to do
1390 * the simpler "current->state = TASK_RUNNING" to mark yourself
1391 * runnable without the overhead of this.
1392 *
1393 * returns failure only if the task is already active.
1394 */
1395 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1396 {
1397 int cpu, this_cpu, success = 0;
1398 unsigned long flags;
1399 long old_state;
1400 struct rq *rq;
1401 #ifdef CONFIG_SMP
1402 struct sched_domain *sd, *this_sd = NULL;
1403 unsigned long load, this_load;
1404 int new_cpu;
1405 #endif
1406
1407 rq = task_rq_lock(p, &flags);
1408 old_state = p->state;
1409 if (!(old_state & state))
1410 goto out;
1411
1412 if (p->se.on_rq)
1413 goto out_running;
1414
1415 cpu = task_cpu(p);
1416 this_cpu = smp_processor_id();
1417
1418 #ifdef CONFIG_SMP
1419 if (unlikely(task_running(rq, p)))
1420 goto out_activate;
1421
1422 new_cpu = cpu;
1423
1424 schedstat_inc(rq, ttwu_cnt);
1425 if (cpu == this_cpu) {
1426 schedstat_inc(rq, ttwu_local);
1427 goto out_set_cpu;
1428 }
1429
1430 for_each_domain(this_cpu, sd) {
1431 if (cpu_isset(cpu, sd->span)) {
1432 schedstat_inc(sd, ttwu_wake_remote);
1433 this_sd = sd;
1434 break;
1435 }
1436 }
1437
1438 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1439 goto out_set_cpu;
1440
1441 /*
1442 * Check for affine wakeup and passive balancing possibilities.
1443 */
1444 if (this_sd) {
1445 int idx = this_sd->wake_idx;
1446 unsigned int imbalance;
1447
1448 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1449
1450 load = source_load(cpu, idx);
1451 this_load = target_load(this_cpu, idx);
1452
1453 new_cpu = this_cpu; /* Wake to this CPU if we can */
1454
1455 if (this_sd->flags & SD_WAKE_AFFINE) {
1456 unsigned long tl = this_load;
1457 unsigned long tl_per_task;
1458
1459 tl_per_task = cpu_avg_load_per_task(this_cpu);
1460
1461 /*
1462 * If sync wakeup then subtract the (maximum possible)
1463 * effect of the currently running task from the load
1464 * of the current CPU:
1465 */
1466 if (sync)
1467 tl -= current->se.load.weight;
1468
1469 if ((tl <= load &&
1470 tl + target_load(cpu, idx) <= tl_per_task) ||
1471 100*(tl + p->se.load.weight) <= imbalance*load) {
1472 /*
1473 * This domain has SD_WAKE_AFFINE and
1474 * p is cache cold in this domain, and
1475 * there is no bad imbalance.
1476 */
1477 schedstat_inc(this_sd, ttwu_move_affine);
1478 goto out_set_cpu;
1479 }
1480 }
1481
1482 /*
1483 * Start passive balancing when half the imbalance_pct
1484 * limit is reached.
1485 */
1486 if (this_sd->flags & SD_WAKE_BALANCE) {
1487 if (imbalance*this_load <= 100*load) {
1488 schedstat_inc(this_sd, ttwu_move_balance);
1489 goto out_set_cpu;
1490 }
1491 }
1492 }
1493
1494 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1495 out_set_cpu:
1496 new_cpu = wake_idle(new_cpu, p);
1497 if (new_cpu != cpu) {
1498 set_task_cpu(p, new_cpu);
1499 task_rq_unlock(rq, &flags);
1500 /* might preempt at this point */
1501 rq = task_rq_lock(p, &flags);
1502 old_state = p->state;
1503 if (!(old_state & state))
1504 goto out;
1505 if (p->se.on_rq)
1506 goto out_running;
1507
1508 this_cpu = smp_processor_id();
1509 cpu = task_cpu(p);
1510 }
1511
1512 out_activate:
1513 #endif /* CONFIG_SMP */
1514 activate_task(rq, p, 1);
1515 /*
1516 * Sync wakeups (i.e. those types of wakeups where the waker
1517 * has indicated that it will leave the CPU in short order)
1518 * don't trigger a preemption, if the woken up task will run on
1519 * this cpu. (in this case the 'I will reschedule' promise of
1520 * the waker guarantees that the freshly woken up task is going
1521 * to be considered on this CPU.)
1522 */
1523 if (!sync || cpu != this_cpu)
1524 check_preempt_curr(rq, p);
1525 success = 1;
1526
1527 out_running:
1528 p->state = TASK_RUNNING;
1529 out:
1530 task_rq_unlock(rq, &flags);
1531
1532 return success;
1533 }
1534
1535 int fastcall wake_up_process(struct task_struct *p)
1536 {
1537 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1538 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1539 }
1540 EXPORT_SYMBOL(wake_up_process);
1541
1542 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1543 {
1544 return try_to_wake_up(p, state, 0);
1545 }
1546
1547 /*
1548 * Perform scheduler related setup for a newly forked process p.
1549 * p is forked by current.
1550 *
1551 * __sched_fork() is basic setup used by init_idle() too:
1552 */
1553 static void __sched_fork(struct task_struct *p)
1554 {
1555 p->se.wait_start_fair = 0;
1556 p->se.wait_start = 0;
1557 p->se.exec_start = 0;
1558 p->se.sum_exec_runtime = 0;
1559 p->se.delta_exec = 0;
1560 p->se.delta_fair_run = 0;
1561 p->se.delta_fair_sleep = 0;
1562 p->se.wait_runtime = 0;
1563 p->se.sum_wait_runtime = 0;
1564 p->se.sum_sleep_runtime = 0;
1565 p->se.sleep_start = 0;
1566 p->se.sleep_start_fair = 0;
1567 p->se.block_start = 0;
1568 p->se.sleep_max = 0;
1569 p->se.block_max = 0;
1570 p->se.exec_max = 0;
1571 p->se.wait_max = 0;
1572 p->se.wait_runtime_overruns = 0;
1573 p->se.wait_runtime_underruns = 0;
1574
1575 INIT_LIST_HEAD(&p->run_list);
1576 p->se.on_rq = 0;
1577
1578 /*
1579 * We mark the process as running here, but have not actually
1580 * inserted it onto the runqueue yet. This guarantees that
1581 * nobody will actually run it, and a signal or other external
1582 * event cannot wake it up and insert it on the runqueue either.
1583 */
1584 p->state = TASK_RUNNING;
1585 }
1586
1587 /*
1588 * fork()/clone()-time setup:
1589 */
1590 void sched_fork(struct task_struct *p, int clone_flags)
1591 {
1592 int cpu = get_cpu();
1593
1594 __sched_fork(p);
1595
1596 #ifdef CONFIG_SMP
1597 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1598 #endif
1599 __set_task_cpu(p, cpu);
1600
1601 /*
1602 * Make sure we do not leak PI boosting priority to the child:
1603 */
1604 p->prio = current->normal_prio;
1605
1606 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1607 if (likely(sched_info_on()))
1608 memset(&p->sched_info, 0, sizeof(p->sched_info));
1609 #endif
1610 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1611 p->oncpu = 0;
1612 #endif
1613 #ifdef CONFIG_PREEMPT
1614 /* Want to start with kernel preemption disabled. */
1615 task_thread_info(p)->preempt_count = 1;
1616 #endif
1617 put_cpu();
1618 }
1619
1620 /*
1621 * After fork, the child runs first (the default). If set to 0 then
1622 * the parent will (try to) run first.
1623 */
1624 unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1625
1626 /*
1627 * wake_up_new_task - wake up a newly created task for the first time.
1628 *
1629 * This function will do some initial scheduler statistics housekeeping
1630 * that must be done for every newly created context, then puts the task
1631 * on the runqueue and wakes it.
1632 */
1633 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1634 {
1635 unsigned long flags;
1636 struct rq *rq;
1637 int this_cpu;
1638
1639 rq = task_rq_lock(p, &flags);
1640 BUG_ON(p->state != TASK_RUNNING);
1641 this_cpu = smp_processor_id(); /* parent's CPU */
1642
1643 p->prio = effective_prio(p);
1644
1645 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1646 task_cpu(p) != this_cpu || !current->se.on_rq) {
1647 activate_task(rq, p, 0);
1648 } else {
1649 /*
1650 * Let the scheduling class do new task startup
1651 * management (if any):
1652 */
1653 p->sched_class->task_new(rq, p);
1654 }
1655 check_preempt_curr(rq, p);
1656 task_rq_unlock(rq, &flags);
1657 }
1658
1659 /**
1660 * prepare_task_switch - prepare to switch tasks
1661 * @rq: the runqueue preparing to switch
1662 * @next: the task we are going to switch to.
1663 *
1664 * This is called with the rq lock held and interrupts off. It must
1665 * be paired with a subsequent finish_task_switch after the context
1666 * switch.
1667 *
1668 * prepare_task_switch sets up locking and calls architecture specific
1669 * hooks.
1670 */
1671 static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
1672 {
1673 prepare_lock_switch(rq, next);
1674 prepare_arch_switch(next);
1675 }
1676
1677 /**
1678 * finish_task_switch - clean up after a task-switch
1679 * @rq: runqueue associated with task-switch
1680 * @prev: the thread we just switched away from.
1681 *
1682 * finish_task_switch must be called after the context switch, paired
1683 * with a prepare_task_switch call before the context switch.
1684 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1685 * and do any other architecture-specific cleanup actions.
1686 *
1687 * Note that we may have delayed dropping an mm in context_switch(). If
1688 * so, we finish that here outside of the runqueue lock. (Doing it
1689 * with the lock held can cause deadlocks; see schedule() for
1690 * details.)
1691 */
1692 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1693 __releases(rq->lock)
1694 {
1695 struct mm_struct *mm = rq->prev_mm;
1696 long prev_state;
1697
1698 rq->prev_mm = NULL;
1699
1700 /*
1701 * A task struct has one reference for the use as "current".
1702 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1703 * schedule one last time. The schedule call will never return, and
1704 * the scheduled task must drop that reference.
1705 * The test for TASK_DEAD must occur while the runqueue locks are
1706 * still held, otherwise prev could be scheduled on another cpu, die
1707 * there before we look at prev->state, and then the reference would
1708 * be dropped twice.
1709 * Manfred Spraul <manfred@colorfullife.com>
1710 */
1711 prev_state = prev->state;
1712 finish_arch_switch(prev);
1713 finish_lock_switch(rq, prev);
1714 if (mm)
1715 mmdrop(mm);
1716 if (unlikely(prev_state == TASK_DEAD)) {
1717 /*
1718 * Remove function-return probe instances associated with this
1719 * task and put them back on the free list.
1720 */
1721 kprobe_flush_task(prev);
1722 put_task_struct(prev);
1723 }
1724 }
1725
1726 /**
1727 * schedule_tail - first thing a freshly forked thread must call.
1728 * @prev: the thread we just switched away from.
1729 */
1730 asmlinkage void schedule_tail(struct task_struct *prev)
1731 __releases(rq->lock)
1732 {
1733 struct rq *rq = this_rq();
1734
1735 finish_task_switch(rq, prev);
1736 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1737 /* In this case, finish_task_switch does not reenable preemption */
1738 preempt_enable();
1739 #endif
1740 if (current->set_child_tid)
1741 put_user(current->pid, current->set_child_tid);
1742 }
1743
1744 /*
1745 * context_switch - switch to the new MM and the new
1746 * thread's register state.
1747 */
1748 static inline void
1749 context_switch(struct rq *rq, struct task_struct *prev,
1750 struct task_struct *next)
1751 {
1752 struct mm_struct *mm, *oldmm;
1753
1754 prepare_task_switch(rq, next);
1755 mm = next->mm;
1756 oldmm = prev->active_mm;
1757 /*
1758 * For paravirt, this is coupled with an exit in switch_to to
1759 * combine the page table reload and the switch backend into
1760 * one hypercall.
1761 */
1762 arch_enter_lazy_cpu_mode();
1763
1764 if (unlikely(!mm)) {
1765 next->active_mm = oldmm;
1766 atomic_inc(&oldmm->mm_count);
1767 enter_lazy_tlb(oldmm, next);
1768 } else
1769 switch_mm(oldmm, mm, next);
1770
1771 if (unlikely(!prev->mm)) {
1772 prev->active_mm = NULL;
1773 rq->prev_mm = oldmm;
1774 }
1775 /*
1776 * The runqueue lock will be released by the next
1777 * task (which is an invalid locking op, but in the case
1778 * of the scheduler it's an obvious special-case), so we
1779 * do an early lockdep release here:
1780 */
1781 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
1782 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1783 #endif
1784
1785 /* Here we just switch the register state and the stack. */
1786 switch_to(prev, next, prev);
1787
1788 barrier();
1789 /*
1790 * this_rq must be evaluated again because prev may have moved
1791 * CPUs since it called schedule(), thus the 'rq' on its stack
1792 * frame will be invalid.
1793 */
1794 finish_task_switch(this_rq(), prev);
1795 }
1796
1797 /*
1798 * nr_running, nr_uninterruptible and nr_context_switches:
1799 *
1800 * externally visible scheduler statistics: current number of runnable
1801 * threads, current number of uninterruptible-sleeping threads, total
1802 * number of context switches performed since bootup.
1803 */
1804 unsigned long nr_running(void)
1805 {
1806 unsigned long i, sum = 0;
1807
1808 for_each_online_cpu(i)
1809 sum += cpu_rq(i)->nr_running;
1810
1811 return sum;
1812 }
1813
1814 unsigned long nr_uninterruptible(void)
1815 {
1816 unsigned long i, sum = 0;
1817
1818 for_each_possible_cpu(i)
1819 sum += cpu_rq(i)->nr_uninterruptible;
1820
1821 /*
1822 * Since we read the counters lockless, it might be slightly
1823 * inaccurate. Do not allow it to go below zero though:
1824 */
1825 if (unlikely((long)sum < 0))
1826 sum = 0;
1827
1828 return sum;
1829 }
1830
1831 unsigned long long nr_context_switches(void)
1832 {
1833 int i;
1834 unsigned long long sum = 0;
1835
1836 for_each_possible_cpu(i)
1837 sum += cpu_rq(i)->nr_switches;
1838
1839 return sum;
1840 }
1841
1842 unsigned long nr_iowait(void)
1843 {
1844 unsigned long i, sum = 0;
1845
1846 for_each_possible_cpu(i)
1847 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1848
1849 return sum;
1850 }
1851
1852 unsigned long nr_active(void)
1853 {
1854 unsigned long i, running = 0, uninterruptible = 0;
1855
1856 for_each_online_cpu(i) {
1857 running += cpu_rq(i)->nr_running;
1858 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1859 }
1860
1861 if (unlikely((long)uninterruptible < 0))
1862 uninterruptible = 0;
1863
1864 return running + uninterruptible;
1865 }
1866
1867 /*
1868 * Update rq->cpu_load[] statistics. This function is usually called every
1869 * scheduler tick (TICK_NSEC).
1870 */
1871 static void update_cpu_load(struct rq *this_rq)
1872 {
1873 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1874 unsigned long total_load = this_rq->ls.load.weight;
1875 unsigned long this_load = total_load;
1876 struct load_stat *ls = &this_rq->ls;
1877 u64 now = __rq_clock(this_rq);
1878 int i, scale;
1879
1880 this_rq->nr_load_updates++;
1881 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1882 goto do_avg;
1883
1884 /* Update delta_fair/delta_exec fields first */
1885 update_curr_load(this_rq, now);
1886
1887 fair_delta64 = ls->delta_fair + 1;
1888 ls->delta_fair = 0;
1889
1890 exec_delta64 = ls->delta_exec + 1;
1891 ls->delta_exec = 0;
1892
1893 sample_interval64 = now - ls->load_update_last;
1894 ls->load_update_last = now;
1895
1896 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1897 sample_interval64 = TICK_NSEC;
1898
1899 if (exec_delta64 > sample_interval64)
1900 exec_delta64 = sample_interval64;
1901
1902 idle_delta64 = sample_interval64 - exec_delta64;
1903
1904 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1905 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1906
1907 this_load = (unsigned long)tmp64;
1908
1909 do_avg:
1910
1911 /* Update our load: */
1912 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1913 unsigned long old_load, new_load;
1914
1915 /* scale is effectively 1 << i now, and >> i divides by scale */
1916
1917 old_load = this_rq->cpu_load[i];
1918 new_load = this_load;
1919
1920 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1921 }
1922 }
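/*
 * Editorial worked example of the decay loop above: cpu_load[i] is an
 * exponentially weighted average with scale 2^i, updated as
 * (old * (2^i - 1) + new) >> i. For i == 2 (scale 4), an old load of
 * 1024 and a new sample of 0 gives (1024 * 3 + 0) >> 2 == 768, so each
 * tick moves cpu_load[2] a quarter of the way toward the new sample;
 * higher indices react more slowly and feed the more conservative
 * balancing decisions.
 */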
1923
1924 #ifdef CONFIG_SMP
1925
1926 /*
1927 * double_rq_lock - safely lock two runqueues
1928 *
1929 * Note this does not disable interrupts like task_rq_lock,
1930 * you need to do so manually before calling.
1931 */
1932 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1933 __acquires(rq1->lock)
1934 __acquires(rq2->lock)
1935 {
1936 BUG_ON(!irqs_disabled());
1937 if (rq1 == rq2) {
1938 spin_lock(&rq1->lock);
1939 __acquire(rq2->lock); /* Fake it out ;) */
1940 } else {
1941 if (rq1 < rq2) {
1942 spin_lock(&rq1->lock);
1943 spin_lock(&rq2->lock);
1944 } else {
1945 spin_lock(&rq2->lock);
1946 spin_lock(&rq1->lock);
1947 }
1948 }
1949 }
1950
1951 /*
1952 * double_rq_unlock - safely unlock two runqueues
1953 *
1954 * Note this does not restore interrupts like task_rq_unlock,
1955 * you need to do so manually after calling.
1956 */
1957 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1958 __releases(rq1->lock)
1959 __releases(rq2->lock)
1960 {
1961 spin_unlock(&rq1->lock);
1962 if (rq1 != rq2)
1963 spin_unlock(&rq2->lock);
1964 else
1965 __release(rq2->lock);
1966 }
1967
1968 /*
1969 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1970 */
1971 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1972 __releases(this_rq->lock)
1973 __acquires(busiest->lock)
1974 __acquires(this_rq->lock)
1975 {
1976 if (unlikely(!irqs_disabled())) {
1977 /* printk() doesn't work well under rq->lock */
1978 spin_unlock(&this_rq->lock);
1979 BUG_ON(1);
1980 }
1981 if (unlikely(!spin_trylock(&busiest->lock))) {
1982 if (busiest < this_rq) {
1983 spin_unlock(&this_rq->lock);
1984 spin_lock(&busiest->lock);
1985 spin_lock(&this_rq->lock);
1986 } else
1987 spin_lock(&busiest->lock);
1988 }
1989 }
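
/*
 * A minimal sketch of why the address ordering above matters (the CPU
 * numbers are hypothetical): if CPU0 ran double_rq_lock(rqA, rqB) while
 * CPU1 ran double_rq_lock(rqB, rqA) and each simply locked its first
 * argument first, CPU0 could hold rqA->lock while waiting for rqB->lock
 * and CPU1 could hold rqB->lock while waiting for rqA->lock - a classic
 * ABBA deadlock. Because both double_rq_lock() and double_lock_balance()
 * always take the lower-addressed runqueue lock first, the two CPUs
 * contend on the same lock first and the deadlock cannot happen.
 */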
1990
1991 /*
1992 * If dest_cpu is allowed for this process, migrate the task to it.
1993 * This is accomplished by forcing the cpu_allowed mask to only
1994 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
1995 * the cpu_allowed mask is restored.
1996 */
1997 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
1998 {
1999 struct migration_req req;
2000 unsigned long flags;
2001 struct rq *rq;
2002
2003 rq = task_rq_lock(p, &flags);
2004 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2005 || unlikely(cpu_is_offline(dest_cpu)))
2006 goto out;
2007
2008 /* force the process onto the specified CPU */
2009 if (migrate_task(p, dest_cpu, &req)) {
2010 /* Need to wait for migration thread (might exit: take ref). */
2011 struct task_struct *mt = rq->migration_thread;
2012
2013 get_task_struct(mt);
2014 task_rq_unlock(rq, &flags);
2015 wake_up_process(mt);
2016 put_task_struct(mt);
2017 wait_for_completion(&req.done);
2018
2019 return;
2020 }
2021 out:
2022 task_rq_unlock(rq, &flags);
2023 }
2024
2025 /*
2026 * sched_exec - execve() is a valuable balancing opportunity, because at
2027 * this point the task has the smallest effective memory and cache footprint.
2028 */
2029 void sched_exec(void)
2030 {
2031 int new_cpu, this_cpu = get_cpu();
2032 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2033 put_cpu();
2034 if (new_cpu != this_cpu)
2035 sched_migrate_task(current, new_cpu);
2036 }
2037
2038 /*
2039 * pull_task - move a task from a remote runqueue to the local runqueue.
2040 * Both runqueues must be locked.
2041 */
2042 static void pull_task(struct rq *src_rq, struct task_struct *p,
2043 struct rq *this_rq, int this_cpu)
2044 {
2045 deactivate_task(src_rq, p, 0);
2046 set_task_cpu(p, this_cpu);
2047 activate_task(this_rq, p, 0);
2048 /*
2049 * Note that idle threads have a prio of MAX_PRIO, so this test
2050 * will always be true for them.
2051 */
2052 check_preempt_curr(this_rq, p);
2053 }
2054
2055 /*
2056 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2057 */
2058 static
2059 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2060 struct sched_domain *sd, enum cpu_idle_type idle,
2061 int *all_pinned)
2062 {
2063 /*
2064 * We do not migrate tasks that are:
2065 * 1) running (obviously), or
2066 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2067 * 3) are cache-hot on their current CPU.
2068 */
2069 if (!cpu_isset(this_cpu, p->cpus_allowed))
2070 return 0;
2071 *all_pinned = 0;
2072
2073 if (task_running(rq, p))
2074 return 0;
2075
2076 /*
2077 * Aggressive migration if too many balance attempts have failed:
2078 */
2079 if (sd->nr_balance_failed > sd->cache_nice_tries)
2080 return 1;
2081
2082 return 1;
2083 }
2084
2085 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2086 unsigned long max_nr_move, unsigned long max_load_move,
2087 struct sched_domain *sd, enum cpu_idle_type idle,
2088 int *all_pinned, unsigned long *load_moved,
2089 int this_best_prio, int best_prio, int best_prio_seen,
2090 struct rq_iterator *iterator)
2091 {
2092 int pulled = 0, pinned = 0, skip_for_load;
2093 struct task_struct *p;
2094 long rem_load_move = max_load_move;
2095
2096 if (max_nr_move == 0 || max_load_move == 0)
2097 goto out;
2098
2099 pinned = 1;
2100
2101 /*
2102 * Start the load-balancing iterator:
2103 */
2104 p = iterator->start(iterator->arg);
2105 next:
2106 if (!p)
2107 goto out;
2108 /*
2109 * To help distribute high priority tasks across CPUs we don't
2110 * skip a task if it will be the highest priority task (i.e. smallest
2111 * prio value) on its new queue, regardless of its load weight.
2112 */
2113 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2114 SCHED_LOAD_SCALE_FUZZ;
2115 if (skip_for_load && p->prio < this_best_prio)
2116 skip_for_load = !best_prio_seen && p->prio == best_prio;
2117 if (skip_for_load ||
2118 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2119
2120 best_prio_seen |= p->prio == best_prio;
2121 p = iterator->next(iterator->arg);
2122 goto next;
2123 }
2124
2125 pull_task(busiest, p, this_rq, this_cpu);
2126 pulled++;
2127 rem_load_move -= p->se.load.weight;
2128
2129 /*
2130 * We only want to steal up to the prescribed number of tasks
2131 * and the prescribed amount of weighted load.
2132 */
2133 if (pulled < max_nr_move && rem_load_move > 0) {
2134 if (p->prio < this_best_prio)
2135 this_best_prio = p->prio;
2136 p = iterator->next(iterator->arg);
2137 goto next;
2138 }
2139 out:
2140 /*
2141 * Right now, this is the only place pull_task() is called,
2142 * so we can safely collect pull_task() stats here rather than
2143 * inside pull_task().
2144 */
2145 schedstat_add(sd, lb_gained[idle], pulled);
2146
2147 if (all_pinned)
2148 *all_pinned = pinned;
2149 *load_moved = max_load_move - rem_load_move;
2150 return pulled;
2151 }
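
/*
 * Roughly speaking, the skip_for_load test above skips a task when half
 * of its weight already exceeds the load we still want to move (plus a
 * small fuzz margin). A hypothetical example with rem_load_move == 1024
 * and the usual SCHED_LOAD_SCALE of 1024: a nice-0 task of weight 1024
 * has half-weight 512 and is eligible to be pulled, while a task of
 * weight 8192 has half-weight 4096, well above the remaining load, and
 * is skipped unless the priority checks decide it would become the
 * highest-priority task on its new queue.
 */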
2152
2153 /*
2154 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2155 * load from busiest to this_rq, as part of a balancing operation within
2156 * "domain". Returns the number of tasks moved.
2157 *
2158 * Called with both runqueues locked.
2159 */
2160 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2161 unsigned long max_nr_move, unsigned long max_load_move,
2162 struct sched_domain *sd, enum cpu_idle_type idle,
2163 int *all_pinned)
2164 {
2165 struct sched_class *class = sched_class_highest;
2166 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2167 long rem_load_move = max_load_move;
2168
2169 do {
2170 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2171 max_nr_move, (unsigned long)rem_load_move,
2172 sd, idle, all_pinned, &load_moved);
2173 total_nr_moved += nr_moved;
2174 max_nr_move -= nr_moved;
2175 rem_load_move -= load_moved;
2176 class = class->next;
2177 } while (class && max_nr_move && rem_load_move > 0);
2178
2179 return total_nr_moved;
2180 }
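
/*
 * The loop above walks the scheduling classes starting at
 * sched_class_highest (in this version of the scheduler the chain is
 * rt -> fair -> idle), so real-time tasks are considered for migration
 * before CFS tasks. Each class's ->load_balance() hook does the
 * per-class work, typically via the balance_tasks() iterator above.
 */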
2181
2182 /*
2183 * find_busiest_group finds and returns the busiest CPU group within the
2184 * domain. It calculates and returns the amount of weighted load which
2185 * should be moved to restore balance via the imbalance parameter.
2186 */
2187 static struct sched_group *
2188 find_busiest_group(struct sched_domain *sd, int this_cpu,
2189 unsigned long *imbalance, enum cpu_idle_type idle,
2190 int *sd_idle, cpumask_t *cpus, int *balance)
2191 {
2192 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2193 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2194 unsigned long max_pull;
2195 unsigned long busiest_load_per_task, busiest_nr_running;
2196 unsigned long this_load_per_task, this_nr_running;
2197 int load_idx;
2198 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2199 int power_savings_balance = 1;
2200 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2201 unsigned long min_nr_running = ULONG_MAX;
2202 struct sched_group *group_min = NULL, *group_leader = NULL;
2203 #endif
2204
2205 max_load = this_load = total_load = total_pwr = 0;
2206 busiest_load_per_task = busiest_nr_running = 0;
2207 this_load_per_task = this_nr_running = 0;
2208 if (idle == CPU_NOT_IDLE)
2209 load_idx = sd->busy_idx;
2210 else if (idle == CPU_NEWLY_IDLE)
2211 load_idx = sd->newidle_idx;
2212 else
2213 load_idx = sd->idle_idx;
2214
2215 do {
2216 unsigned long load, group_capacity;
2217 int local_group;
2218 int i;
2219 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2220 unsigned long sum_nr_running, sum_weighted_load;
2221
2222 local_group = cpu_isset(this_cpu, group->cpumask);
2223
2224 if (local_group)
2225 balance_cpu = first_cpu(group->cpumask);
2226
2227 /* Tally up the load of all CPUs in the group */
2228 sum_weighted_load = sum_nr_running = avg_load = 0;
2229
2230 for_each_cpu_mask(i, group->cpumask) {
2231 struct rq *rq;
2232
2233 if (!cpu_isset(i, *cpus))
2234 continue;
2235
2236 rq = cpu_rq(i);
2237
2238 if (*sd_idle && !idle_cpu(i))
2239 *sd_idle = 0;
2240
2241 /* Bias balancing toward cpus of our domain */
2242 if (local_group) {
2243 if (idle_cpu(i) && !first_idle_cpu) {
2244 first_idle_cpu = 1;
2245 balance_cpu = i;
2246 }
2247
2248 load = target_load(i, load_idx);
2249 } else
2250 load = source_load(i, load_idx);
2251
2252 avg_load += load;
2253 sum_nr_running += rq->nr_running;
2254 sum_weighted_load += weighted_cpuload(i);
2255 }
2256
2257 /*
2258 * First idle cpu or the first cpu(busiest) in this sched group
2259 * is eligible for doing load balancing at this and above
2260 * domains.
2261 */
2262 if (local_group && balance_cpu != this_cpu && balance) {
2263 *balance = 0;
2264 goto ret;
2265 }
2266
2267 total_load += avg_load;
2268 total_pwr += group->__cpu_power;
2269
2270 /* Adjust by relative CPU power of the group */
2271 avg_load = sg_div_cpu_power(group,
2272 avg_load * SCHED_LOAD_SCALE);
2273
2274 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2275
2276 if (local_group) {
2277 this_load = avg_load;
2278 this = group;
2279 this_nr_running = sum_nr_running;
2280 this_load_per_task = sum_weighted_load;
2281 } else if (avg_load > max_load &&
2282 sum_nr_running > group_capacity) {
2283 max_load = avg_load;
2284 busiest = group;
2285 busiest_nr_running = sum_nr_running;
2286 busiest_load_per_task = sum_weighted_load;
2287 }
2288
2289 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2290 /*
2291 * Busy processors will not participate in power savings
2292 * balance.
2293 */
2294 if (idle == CPU_NOT_IDLE ||
2295 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2296 goto group_next;
2297
2298 /*
2299 * If the local group is idle or completely loaded
2300 * no need to do power savings balance at this domain
2301 */
2302 if (local_group && (this_nr_running >= group_capacity ||
2303 !this_nr_running))
2304 power_savings_balance = 0;
2305
2306 /*
2307 * If a group is already running at full capacity or idle,
2308 * don't include that group in power savings calculations
2309 */
2310 if (!power_savings_balance || sum_nr_running >= group_capacity
2311 || !sum_nr_running)
2312 goto group_next;
2313
2314 /*
2315 * Calculate the group which has the least non-idle load.
2316 * This is the group from which we need to pick up the load
2317 * for saving power.
2318 */
2319 if ((sum_nr_running < min_nr_running) ||
2320 (sum_nr_running == min_nr_running &&
2321 first_cpu(group->cpumask) <
2322 first_cpu(group_min->cpumask))) {
2323 group_min = group;
2324 min_nr_running = sum_nr_running;
2325 min_load_per_task = sum_weighted_load /
2326 sum_nr_running;
2327 }
2328
2329 /*
2330 * Calculate the group which is nearly at its
2331 * capacity but still has some space to pick up some load
2332 * from another group and save more power.
2333 */
2334 if (sum_nr_running <= group_capacity - 1) {
2335 if (sum_nr_running > leader_nr_running ||
2336 (sum_nr_running == leader_nr_running &&
2337 first_cpu(group->cpumask) >
2338 first_cpu(group_leader->cpumask))) {
2339 group_leader = group;
2340 leader_nr_running = sum_nr_running;
2341 }
2342 }
2343 group_next:
2344 #endif
2345 group = group->next;
2346 } while (group != sd->groups);
2347
2348 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2349 goto out_balanced;
2350
2351 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2352
2353 if (this_load >= avg_load ||
2354 100*max_load <= sd->imbalance_pct*this_load)
2355 goto out_balanced;
2356
2357 busiest_load_per_task /= busiest_nr_running;
2358 /*
2359 * We're trying to get all the cpus to the average_load, so we don't
2360 * want to push ourselves above the average load, nor do we wish to
2361 * reduce the max loaded cpu below the average load, as either of these
2362 * actions would just result in more rebalancing later, and ping-pong
2363 * tasks around. Thus we look for the minimum possible imbalance.
2364 * Negative imbalances (*we* are more loaded than anyone else) will
2365 * be counted as no imbalance for these purposes -- we can't fix that
2366 * by pulling tasks to us. Be careful of negative numbers as they'll
2367 * appear as very large values with unsigned longs.
2368 */
2369 if (max_load <= busiest_load_per_task)
2370 goto out_balanced;
2371
2372 /*
2373 * In the presence of smp nice balancing, certain scenarios can have
2374 * max load less than avg load (as we skip the groups at or below
2375 * their cpu_power while calculating max_load).
2376 */
2377 if (max_load < avg_load) {
2378 *imbalance = 0;
2379 goto small_imbalance;
2380 }
2381
2382 /* Don't want to pull so many tasks that a group would go idle */
2383 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2384
2385 /* How much load to actually move to equalise the imbalance */
2386 *imbalance = min(max_pull * busiest->__cpu_power,
2387 (avg_load - this_load) * this->__cpu_power)
2388 / SCHED_LOAD_SCALE;
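
/*
 * Hypothetical numbers to illustrate the calculation above, assuming
 * SCHED_LOAD_SCALE == 1024 and both groups at __cpu_power == 1024:
 * with max_load == 2048, this_load == 512, avg_load == 1024 and
 * busiest_load_per_task == 1024 we get max_pull == 1024 and
 *
 *	*imbalance = min(1024 * 1024, (1024 - 512) * 1024) / 1024 = 512
 *
 * i.e. move roughly half a nice-0 task's worth of weighted load.
 */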
2389
2390 /*
2391 * If *imbalance is less than the average load per runnable task
2392 * there is no guarantee that any tasks will be moved, so we may
2393 * bump its value to force at least one task to be
2394 * moved.
2395 */
2396 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2397 unsigned long tmp, pwr_now, pwr_move;
2398 unsigned int imbn;
2399
2400 small_imbalance:
2401 pwr_move = pwr_now = 0;
2402 imbn = 2;
2403 if (this_nr_running) {
2404 this_load_per_task /= this_nr_running;
2405 if (busiest_load_per_task > this_load_per_task)
2406 imbn = 1;
2407 } else
2408 this_load_per_task = SCHED_LOAD_SCALE;
2409
2410 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2411 busiest_load_per_task * imbn) {
2412 *imbalance = busiest_load_per_task;
2413 return busiest;
2414 }
2415
2416 /*
2417 * OK, we don't have enough imbalance to justify moving tasks,
2418 * however we may be able to increase total CPU power used by
2419 * moving them.
2420 */
2421
2422 pwr_now += busiest->__cpu_power *
2423 min(busiest_load_per_task, max_load);
2424 pwr_now += this->__cpu_power *
2425 min(this_load_per_task, this_load);
2426 pwr_now /= SCHED_LOAD_SCALE;
2427
2428 /* Amount of load we'd subtract */
2429 tmp = sg_div_cpu_power(busiest,
2430 busiest_load_per_task * SCHED_LOAD_SCALE);
2431 if (max_load > tmp)
2432 pwr_move += busiest->__cpu_power *
2433 min(busiest_load_per_task, max_load - tmp);
2434
2435 /* Amount of load we'd add */
2436 if (max_load * busiest->__cpu_power <
2437 busiest_load_per_task * SCHED_LOAD_SCALE)
2438 tmp = sg_div_cpu_power(this,
2439 max_load * busiest->__cpu_power);
2440 else
2441 tmp = sg_div_cpu_power(this,
2442 busiest_load_per_task * SCHED_LOAD_SCALE);
2443 pwr_move += this->__cpu_power *
2444 min(this_load_per_task, this_load + tmp);
2445 pwr_move /= SCHED_LOAD_SCALE;
2446
2447 /* Move if we gain throughput */
2448 if (pwr_move <= pwr_now)
2449 goto out_balanced;
2450
2451 *imbalance = busiest_load_per_task;
2452 }
2453
2454 return busiest;
2455
2456 out_balanced:
2457 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2458 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2459 goto ret;
2460
2461 if (this == group_leader && group_leader != group_min) {
2462 *imbalance = min_load_per_task;
2463 return group_min;
2464 }
2465 #endif
2466 ret:
2467 *imbalance = 0;
2468 return NULL;
2469 }
2470
2471 /*
2472 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2473 */
2474 static struct rq *
2475 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2476 unsigned long imbalance, cpumask_t *cpus)
2477 {
2478 struct rq *busiest = NULL, *rq;
2479 unsigned long max_load = 0;
2480 int i;
2481
2482 for_each_cpu_mask(i, group->cpumask) {
2483 unsigned long wl;
2484
2485 if (!cpu_isset(i, *cpus))
2486 continue;
2487
2488 rq = cpu_rq(i);
2489 wl = weighted_cpuload(i);
2490
2491 if (rq->nr_running == 1 && wl > imbalance)
2492 continue;
2493
2494 if (wl > max_load) {
2495 max_load = wl;
2496 busiest = rq;
2497 }
2498 }
2499
2500 return busiest;
2501 }
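
/*
 * Note on the nr_running == 1 test above, with hypothetical numbers: if
 * the requested imbalance is 1024 and a candidate cpu is running a
 * single task of weight 3072, pulling that task would move far more
 * load than requested and leave the source cpu idle, so that cpu is
 * skipped in favour of one whose load can actually be reduced toward
 * the average.
 */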
2502
2503 /*
2504 * Max backoff if we encounter pinned tasks. Pretty arbitrary value; the
2505 * exact number doesn't matter so long as it is large enough.
2506 */
2507 #define MAX_PINNED_INTERVAL 512
2508
2509 static inline unsigned long minus_1_or_zero(unsigned long n)
2510 {
2511 return n > 0 ? n - 1 : 0;
2512 }
2513
2514 /*
2515 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2516 * tasks if there is an imbalance.
2517 */
2518 static int load_balance(int this_cpu, struct rq *this_rq,
2519 struct sched_domain *sd, enum cpu_idle_type idle,
2520 int *balance)
2521 {
2522 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2523 struct sched_group *group;
2524 unsigned long imbalance;
2525 struct rq *busiest;
2526 cpumask_t cpus = CPU_MASK_ALL;
2527 unsigned long flags;
2528
2529 /*
2530 * When power savings policy is enabled for the parent domain, idle
2531 * sibling can pick up load irrespective of busy siblings. In this case,
2532 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2533 * portraying it as CPU_NOT_IDLE.
2534 */
2535 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2536 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2537 sd_idle = 1;
2538
2539 schedstat_inc(sd, lb_cnt[idle]);
2540
2541 redo:
2542 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2543 &cpus, balance);
2544
2545 if (*balance == 0)
2546 goto out_balanced;
2547
2548 if (!group) {
2549 schedstat_inc(sd, lb_nobusyg[idle]);
2550 goto out_balanced;
2551 }
2552
2553 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2554 if (!busiest) {
2555 schedstat_inc(sd, lb_nobusyq[idle]);
2556 goto out_balanced;
2557 }
2558
2559 BUG_ON(busiest == this_rq);
2560
2561 schedstat_add(sd, lb_imbalance[idle], imbalance);
2562
2563 nr_moved = 0;
2564 if (busiest->nr_running > 1) {
2565 /*
2566 * Attempt to move tasks. If find_busiest_group has found
2567 * an imbalance but busiest->nr_running <= 1, the group is
2568 * still unbalanced. nr_moved simply stays zero, so it is
2569 * correctly treated as an imbalance.
2570 */
2571 local_irq_save(flags);
2572 double_rq_lock(this_rq, busiest);
2573 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2574 minus_1_or_zero(busiest->nr_running),
2575 imbalance, sd, idle, &all_pinned);
2576 double_rq_unlock(this_rq, busiest);
2577 local_irq_restore(flags);
2578
2579 /*
2580 * some other cpu did the load balance for us.
2581 */
2582 if (nr_moved && this_cpu != smp_processor_id())
2583 resched_cpu(this_cpu);
2584
2585 /* All tasks on this runqueue were pinned by CPU affinity */
2586 if (unlikely(all_pinned)) {
2587 cpu_clear(cpu_of(busiest), cpus);
2588 if (!cpus_empty(cpus))
2589 goto redo;
2590 goto out_balanced;
2591 }
2592 }
2593
2594 if (!nr_moved) {
2595 schedstat_inc(sd, lb_failed[idle]);
2596 sd->nr_balance_failed++;
2597
2598 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2599
2600 spin_lock_irqsave(&busiest->lock, flags);
2601
2602 /* don't kick the migration_thread if the curr
2603 * task on busiest cpu can't be moved to this_cpu
2604 */
2605 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2606 spin_unlock_irqrestore(&busiest->lock, flags);
2607 all_pinned = 1;
2608 goto out_one_pinned;
2609 }
2610
2611 if (!busiest->active_balance) {
2612 busiest->active_balance = 1;
2613 busiest->push_cpu = this_cpu;
2614 active_balance = 1;
2615 }
2616 spin_unlock_irqrestore(&busiest->lock, flags);
2617 if (active_balance)
2618 wake_up_process(busiest->migration_thread);
2619
2620 /*
2621 * We've kicked active balancing, reset the failure
2622 * counter.
2623 */
2624 sd->nr_balance_failed = sd->cache_nice_tries+1;
2625 }
2626 } else
2627 sd->nr_balance_failed = 0;
2628
2629 if (likely(!active_balance)) {
2630 /* We were unbalanced, so reset the balancing interval */
2631 sd->balance_interval = sd->min_interval;
2632 } else {
2633 /*
2634 * If we've begun active balancing, start to back off. This
2635 * case may not be covered by the all_pinned logic if there
2636 * is only 1 task on the busy runqueue (because we don't call
2637 * move_tasks).
2638 */
2639 if (sd->balance_interval < sd->max_interval)
2640 sd->balance_interval *= 2;
2641 }
2642
2643 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2644 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2645 return -1;
2646 return nr_moved;
2647
2648 out_balanced:
2649 schedstat_inc(sd, lb_balanced[idle]);
2650
2651 sd->nr_balance_failed = 0;
2652
2653 out_one_pinned:
2654 /* tune up the balancing interval */
2655 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2656 (sd->balance_interval < sd->max_interval))
2657 sd->balance_interval *= 2;
2658
2659 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2660 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2661 return -1;
2662 return 0;
2663 }
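
/*
 * The interval handling above amounts to a simple exponential backoff
 * (the numbers are hypothetical): starting from a min_interval of, say,
 * 8ms, a domain that keeps hitting the balanced or all-pinned cases
 * doubles its balance_interval to 16, 32, 64, ... ms, roughly capped by
 * MAX_PINNED_INTERVAL and sd->max_interval, and the interval is reset
 * to min_interval once a balance pass finds an imbalance and does not
 * have to resort to active balancing.
 */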
2664
2665 /*
2666 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2667 * tasks if there is an imbalance.
2668 *
2669 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2670 * this_rq is locked.
2671 */
2672 static int
2673 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2674 {
2675 struct sched_group *group;
2676 struct rq *busiest = NULL;
2677 unsigned long imbalance;
2678 int nr_moved = 0;
2679 int sd_idle = 0;
2680 cpumask_t cpus = CPU_MASK_ALL;
2681
2682 /*
2683 * When power savings policy is enabled for the parent domain, idle
2684 * sibling can pick up load irrespective of busy siblings. In this case,
2685 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2686 * portraying it as CPU_NOT_IDLE.
2687 */
2688 if (sd->flags & SD_SHARE_CPUPOWER &&
2689 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2690 sd_idle = 1;
2691
2692 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2693 redo:
2694 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2695 &sd_idle, &cpus, NULL);
2696 if (!group) {
2697 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2698 goto out_balanced;
2699 }
2700
2701 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2702 &cpus);
2703 if (!busiest) {
2704 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2705 goto out_balanced;
2706 }
2707
2708 BUG_ON(busiest == this_rq);
2709
2710 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2711
2712 nr_moved = 0;
2713 if (busiest->nr_running > 1) {
2714 /* Attempt to move tasks */
2715 double_lock_balance(this_rq, busiest);
2716 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2717 minus_1_or_zero(busiest->nr_running),
2718 imbalance, sd, CPU_NEWLY_IDLE, NULL);
2719 spin_unlock(&busiest->lock);
2720
2721 if (!nr_moved) {
2722 cpu_clear(cpu_of(busiest), cpus);
2723 if (!cpus_empty(cpus))
2724 goto redo;
2725 }
2726 }
2727
2728 if (!nr_moved) {
2729 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2730 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2731 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2732 return -1;
2733 } else
2734 sd->nr_balance_failed = 0;
2735
2736 return nr_moved;
2737
2738 out_balanced:
2739 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2740 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2741 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2742 return -1;
2743 sd->nr_balance_failed = 0;
2744
2745 return 0;
2746 }
2747
2748 /*
2749 * idle_balance is called by schedule() if this_cpu is about to become
2750 * idle. Attempts to pull tasks from other CPUs.
2751 */
2752 static void idle_balance(int this_cpu, struct rq *this_rq)
2753 {
2754 struct sched_domain *sd;
2755 int pulled_task = -1;
2756 unsigned long next_balance = jiffies + HZ;
2757
2758 for_each_domain(this_cpu, sd) {
2759 unsigned long interval;
2760
2761 if (!(sd->flags & SD_LOAD_BALANCE))
2762 continue;
2763
2764 if (sd->flags & SD_BALANCE_NEWIDLE)
2765 /* If we've pulled tasks over stop searching: */
2766 pulled_task = load_balance_newidle(this_cpu,
2767 this_rq, sd);
2768
2769 interval = msecs_to_jiffies(sd->balance_interval);
2770 if (time_after(next_balance, sd->last_balance + interval))
2771 next_balance = sd->last_balance + interval;
2772 if (pulled_task)
2773 break;
2774 }
2775 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2776 /*
2777 * We are going idle. next_balance may be set based on
2778 * a busy processor. So reset next_balance.
2779 */
2780 this_rq->next_balance = next_balance;
2781 }
2782 }
2783
2784 /*
2785 * active_load_balance is run by migration threads. It pushes running tasks
2786 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2787 * running on each physical CPU where possible, and avoids physical /
2788 * logical imbalances.
2789 *
2790 * Called with busiest_rq locked.
2791 */
2792 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2793 {
2794 int target_cpu = busiest_rq->push_cpu;
2795 struct sched_domain *sd;
2796 struct rq *target_rq;
2797
2798 /* Is there any task to move? */
2799 if (busiest_rq->nr_running <= 1)
2800 return;
2801
2802 target_rq = cpu_rq(target_cpu);
2803
2804 /*
2805 * This condition is "impossible", if it occurs
2806 * we need to fix it. Originally reported by
2807 * Bjorn Helgaas on a 128-cpu setup.
2808 */
2809 BUG_ON(busiest_rq == target_rq);
2810
2811 /* move a task from busiest_rq to target_rq */
2812 double_lock_balance(busiest_rq, target_rq);
2813
2814 /* Search for an sd spanning us and the target CPU. */
2815 for_each_domain(target_cpu, sd) {
2816 if ((sd->flags & SD_LOAD_BALANCE) &&
2817 cpu_isset(busiest_cpu, sd->span))
2818 break;
2819 }
2820
2821 if (likely(sd)) {
2822 schedstat_inc(sd, alb_cnt);
2823
2824 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2825 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
2826 NULL))
2827 schedstat_inc(sd, alb_pushed);
2828 else
2829 schedstat_inc(sd, alb_failed);
2830 }
2831 spin_unlock(&target_rq->lock);
2832 }
2833
2834 #ifdef CONFIG_NO_HZ
2835 static struct {
2836 atomic_t load_balancer;
2837 cpumask_t cpu_mask;
2838 } nohz ____cacheline_aligned = {
2839 .load_balancer = ATOMIC_INIT(-1),
2840 .cpu_mask = CPU_MASK_NONE,
2841 };
2842
2843 /*
2844 * This routine will try to nominate the ilb (idle load balancing)
2845 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2846 * load balancing on behalf of all those cpus. If all the cpus in the system
2847 * go into this tickless mode, then there will be no ilb owner (as there is
2848 * no need for one) and all the cpus will sleep till the next wakeup event
2849 * arrives...
2850 *
2851 * For the ilb owner, the tick is not stopped, and it will be used
2852 * for idle load balancing. The ilb owner will still be part of
2853 * nohz.cpu_mask.
2854 *
2855 * While stopping the tick, this cpu will become the ilb owner if there
2856 * is no other owner, and it will remain the owner until that cpu becomes
2857 * busy or all cpus in the system stop their ticks, at which point
2858 * there is no need for an ilb owner.
2859 *
2860 * When the ilb owner becomes busy, it nominates another owner during the
2861 * next busy scheduler_tick().
2862 */
2863 int select_nohz_load_balancer(int stop_tick)
2864 {
2865 int cpu = smp_processor_id();
2866
2867 if (stop_tick) {
2868 cpu_set(cpu, nohz.cpu_mask);
2869 cpu_rq(cpu)->in_nohz_recently = 1;
2870
2871 /*
2872 * If we are going offline and still the leader, give up!
2873 */
2874 if (cpu_is_offline(cpu) &&
2875 atomic_read(&nohz.load_balancer) == cpu) {
2876 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2877 BUG();
2878 return 0;
2879 }
2880
2881 /* time for ilb owner also to sleep */
2882 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2883 if (atomic_read(&nohz.load_balancer) == cpu)
2884 atomic_set(&nohz.load_balancer, -1);
2885 return 0;
2886 }
2887
2888 if (atomic_read(&nohz.load_balancer) == -1) {
2889 /* make me the ilb owner */
2890 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2891 return 1;
2892 } else if (atomic_read(&nohz.load_balancer) == cpu)
2893 return 1;
2894 } else {
2895 if (!cpu_isset(cpu, nohz.cpu_mask))
2896 return 0;
2897
2898 cpu_clear(cpu, nohz.cpu_mask);
2899
2900 if (atomic_read(&nohz.load_balancer) == cpu)
2901 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2902 BUG();
2903 }
2904 return 0;
2905 }
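
/*
 * Usage sketch (the actual caller lives in the nohz tick code, outside
 * this file): a cpu about to stop its tick calls
 * select_nohz_load_balancer(1); a return value of 1 means this cpu has
 * just become the ilb owner and keeps its tick running so it can
 * balance on behalf of the sleeping cpus. When the cpu becomes busy
 * again it calls select_nohz_load_balancer(0) to leave nohz.cpu_mask
 * and, if it was the owner, give up ownership.
 */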
2906 #endif
2907
2908 static DEFINE_SPINLOCK(balancing);
2909
2910 /*
2911 * It checks each scheduling domain to see if it is due to be balanced,
2912 * and initiates a balancing operation if so.
2913 *
2914 * Balancing parameters are set up in arch_init_sched_domains.
2915 */
2916 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
2917 {
2918 int balance = 1;
2919 struct rq *rq = cpu_rq(cpu);
2920 unsigned long interval;
2921 struct sched_domain *sd;
2922 /* Earliest time when we have to do rebalance again */
2923 unsigned long next_balance = jiffies + 60*HZ;
2924
2925 for_each_domain(cpu, sd) {
2926 if (!(sd->flags & SD_LOAD_BALANCE))
2927 continue;
2928
2929 interval = sd->balance_interval;
2930 if (idle != CPU_IDLE)
2931 interval *= sd->busy_factor;
2932
2933 /* scale ms to jiffies */
2934 interval = msecs_to_jiffies(interval);
2935 if (unlikely(!interval))
2936 interval = 1;
2937 if (interval > HZ*NR_CPUS/10)
2938 interval = HZ*NR_CPUS/10;
2939
2941 if (sd->flags & SD_SERIALIZE) {
2942 if (!spin_trylock(&balancing))
2943 goto out;
2944 }
2945
2946 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2947 if (load_balance(cpu, rq, sd, idle, &balance)) {
2948 /*
2949 * We've pulled tasks over so either we're no
2950 * longer idle, or one of our SMT siblings is
2951 * not idle.
2952 */
2953 idle = CPU_NOT_IDLE;
2954 }
2955 sd->last_balance = jiffies;
2956 }
2957 if (sd->flags & SD_SERIALIZE)
2958 spin_unlock(&balancing);
2959 out:
2960 if (time_after(next_balance, sd->last_balance + interval))
2961 next_balance = sd->last_balance + interval;
2962
2963 /*
2964 * Stop the load balance at this level. There is another
2965 * CPU in our sched group which is doing load balancing more
2966 * actively.
2967 */
2968 if (!balance)
2969 break;
2970 }
2971 rq->next_balance = next_balance;
2972 }
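
/*
 * Interval arithmetic above, with hypothetical numbers: a domain with
 * balance_interval == 64ms and busy_factor == 32 is rebalanced every
 * 64ms while this cpu is idle, but only every 64 * 32 == 2048ms while
 * it is busy (clamped to at most HZ*NR_CPUS/10 jiffies), the idea being
 * that a busy cpu can afford to rebalance far less often.
 */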
2973
2974 /*
2975 * run_rebalance_domains is triggered when needed from the scheduler tick.
2976 * In the CONFIG_NO_HZ case, the idle load balance owner will do the
2977 * rebalancing for all the cpus for which scheduler ticks are stopped.
2978 */
2979 static void run_rebalance_domains(struct softirq_action *h)
2980 {
2981 int this_cpu = smp_processor_id();
2982 struct rq *this_rq = cpu_rq(this_cpu);
2983 enum cpu_idle_type idle = this_rq->idle_at_tick ?
2984 CPU_IDLE : CPU_NOT_IDLE;
2985
2986 rebalance_domains(this_cpu, idle);
2987
2988 #ifdef CONFIG_NO_HZ
2989 /*
2990 * If this cpu is the owner for idle load balancing, then do the
2991 * balancing on behalf of the other idle cpus whose ticks are
2992 * stopped.
2993 */
2994 if (this_rq->idle_at_tick &&
2995 atomic_read(&nohz.load_balancer) == this_cpu) {
2996 cpumask_t cpus = nohz.cpu_mask;
2997 struct rq *rq;
2998 int balance_cpu;
2999
3000 cpu_clear(this_cpu, cpus);
3001 for_each_cpu_mask(balance_cpu, cpus) {
3002 /*
3003 * If this cpu gets work to do, stop the load balancing
3004 * work being done for other cpus. Next load
3005 * balancing owner will pick it up.
3006 */
3007 if (need_resched())
3008 break;
3009
3010 rebalance_domains(balance_cpu, CPU_IDLE);
3011
3012 rq = cpu_rq(balance_cpu);
3013 if (time_after(this_rq->next_balance, rq->next_balance))
3014 this_rq->next_balance = rq->next_balance;
3015 }
3016 }
3017 #endif
3018 }
3019
3020 /*
3021 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3022 *
3023 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3024 * idle load balancing owner or decide to stop the periodic load balancing,
3025 * if the whole system is idle.
3026 */
3027 static inline void trigger_load_balance(struct rq *rq, int cpu)
3028 {
3029 #ifdef CONFIG_NO_HZ
3030 /*
3031 * If we were in the nohz mode recently and busy at the current
3032 * scheduler tick, then check if we need to nominate new idle
3033 * load balancer.
3034 */
3035 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3036 rq->in_nohz_recently = 0;
3037
3038 if (atomic_read(&nohz.load_balancer) == cpu) {
3039 cpu_clear(cpu, nohz.cpu_mask);
3040 atomic_set(&nohz.load_balancer, -1);
3041 }
3042
3043 if (atomic_read(&nohz.load_balancer) == -1) {
3044 /*
3045 * simple selection for now: Nominate the
3046 * first cpu in the nohz list to be the next
3047 * ilb owner.
3048 *
3049 * TBD: Traverse the sched domains and nominate
3050 * the nearest cpu in the nohz.cpu_mask.
3051 */
3052 int ilb = first_cpu(nohz.cpu_mask);
3053
3054 if (ilb != NR_CPUS)
3055 resched_cpu(ilb);
3056 }
3057 }
3058
3059 /*
3060 * If this cpu is idle and doing idle load balancing for all the
3061 * cpus with ticks stopped, is it time for that to stop?
3062 */
3063 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3064 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3065 resched_cpu(cpu);
3066 return;
3067 }
3068
3069 /*
3070 * If this cpu is idle and the idle load balancing is done by
3071 * someone else, then there is no need to raise the SCHED_SOFTIRQ
3072 */
3073 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3074 cpu_isset(cpu, nohz.cpu_mask))
3075 return;
3076 #endif
3077 if (time_after_eq(jiffies, rq->next_balance))
3078 raise_softirq(SCHED_SOFTIRQ);
3079 }
3080
3081 #else /* CONFIG_SMP */
3082
3083 /*
3084 * on UP we do not need to balance between CPUs:
3085 */
3086 static inline void idle_balance(int cpu, struct rq *rq)
3087 {
3088 }
3089
3090 /* Avoid "used but not defined" warning on UP */
3091 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3092 unsigned long max_nr_move, unsigned long max_load_move,
3093 struct sched_domain *sd, enum cpu_idle_type idle,
3094 int *all_pinned, unsigned long *load_moved,
3095 int this_best_prio, int best_prio, int best_prio_seen,
3096 struct rq_iterator *iterator)
3097 {
3098 *load_moved = 0;
3099
3100 return 0;
3101 }
3102
3103 #endif
3104
3105 DEFINE_PER_CPU(struct kernel_stat, kstat);
3106
3107 EXPORT_PER_CPU_SYMBOL(kstat);
3108
3109 /*
3110 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3111 * that have not yet been banked in case the task is currently running.
3112 */
3113 unsigned long long task_sched_runtime(struct task_struct *p)
3114 {
3115 unsigned long flags;
3116 u64 ns, delta_exec;
3117 struct rq *rq;
3118
3119 rq = task_rq_lock(p, &flags);
3120 ns = p->se.sum_exec_runtime;
3121 if (rq->curr == p) {
3122 delta_exec = rq_clock(rq) - p->se.exec_start;
3123 if ((s64)delta_exec > 0)
3124 ns += delta_exec;
3125 }
3126 task_rq_unlock(rq, &flags);
3127
3128 return ns;
3129 }
3130
3131 /*
3132 * Account user cpu time to a process.
3133 * @p: the process that the cpu time gets accounted to
3135 * @cputime: the cpu time spent in user space since the last update
3136 */
3137 void account_user_time(struct task_struct *p, cputime_t cputime)
3138 {
3139 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3140 cputime64_t tmp;
3141
3142 p->utime = cputime_add(p->utime, cputime);
3143
3144 /* Add user time to cpustat. */
3145 tmp = cputime_to_cputime64(cputime);
3146 if (TASK_NICE(p) > 0)
3147 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3148 else
3149 cpustat->user = cputime64_add(cpustat->user, tmp);
3150 }
3151
3152 /*
3153 * Account system cpu time to a process.
3154 * @p: the process that the cpu time gets accounted to
3155 * @hardirq_offset: the offset to subtract from hardirq_count()
3156 * @cputime: the cpu time spent in kernel space since the last update
3157 */
3158 void account_system_time(struct task_struct *p, int hardirq_offset,
3159 cputime_t cputime)
3160 {
3161 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3162 struct rq *rq = this_rq();
3163 cputime64_t tmp;
3164
3165 p->stime = cputime_add(p->stime, cputime);
3166
3167 /* Add system time to cpustat. */
3168 tmp = cputime_to_cputime64(cputime);
3169 if (hardirq_count() - hardirq_offset)
3170 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3171 else if (softirq_count())
3172 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3173 else if (p != rq->idle)
3174 cpustat->system = cputime64_add(cpustat->system, tmp);
3175 else if (atomic_read(&rq->nr_iowait) > 0)
3176 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3177 else
3178 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3179 /* Account for system time used */
3180 acct_update_integrals(p);
3181 }
3182
3183 /*
3184 * Account for involuntary wait time.
3185 * @p: the process from which the cpu time has been stolen
3186 * @steal: the cpu time spent in involuntary wait
3187 */
3188 void account_steal_time(struct task_struct *p, cputime_t steal)
3189 {
3190 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3191 cputime64_t tmp = cputime_to_cputime64(steal);
3192 struct rq *rq = this_rq();
3193
3194 if (p == rq->idle) {
3195 p->stime = cputime_add(p->stime, steal);
3196 if (atomic_read(&rq->nr_iowait) > 0)
3197 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3198 else
3199 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3200 } else
3201 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3202 }
3203
3204 /*
3205 * This function gets called by the timer code, with HZ frequency.
3206 * We call it with interrupts disabled.
3207 *
3208 * It also gets called by the fork code, when changing the parent's
3209 * timeslices.
3210 */
3211 void scheduler_tick(void)
3212 {
3213 int cpu = smp_processor_id();
3214 struct rq *rq = cpu_rq(cpu);
3215 struct task_struct *curr = rq->curr;
3216
3217 spin_lock(&rq->lock);
3218 if (curr != rq->idle) /* FIXME: needed? */
3219 curr->sched_class->task_tick(rq, curr);
3220 update_cpu_load(rq);
3221 spin_unlock(&rq->lock);
3222
3223 #ifdef CONFIG_SMP
3224 rq->idle_at_tick = idle_cpu(cpu);
3225 trigger_load_balance(rq, cpu);
3226 #endif
3227 }
3228
3229 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3230
3231 void fastcall add_preempt_count(int val)
3232 {
3233 /*
3234 * Underflow?
3235 */
3236 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3237 return;
3238 preempt_count() += val;
3239 /*
3240 * Spinlock count overflowing soon?
3241 */
3242 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3243 PREEMPT_MASK - 10);
3244 }
3245 EXPORT_SYMBOL(add_preempt_count);
3246
3247 void fastcall sub_preempt_count(int val)
3248 {
3249 /*
3250 * Underflow?
3251 */
3252 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3253 return;
3254 /*
3255 * Is the spinlock portion underflowing?
3256 */
3257 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3258 !(preempt_count() & PREEMPT_MASK)))
3259 return;
3260
3261 preempt_count() -= val;
3262 }
3263 EXPORT_SYMBOL(sub_preempt_count);
3264
3265 #endif
3266
3267 /*
3268 * Print scheduling while atomic bug:
3269 */
3270 static noinline void __schedule_bug(struct task_struct *prev)
3271 {
3272 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3273 prev->comm, preempt_count(), prev->pid);
3274 debug_show_held_locks(prev);
3275 if (irqs_disabled())
3276 print_irqtrace_events(prev);
3277 dump_stack();
3278 }
3279
3280 /*
3281 * Various schedule()-time debugging checks and statistics:
3282 */
3283 static inline void schedule_debug(struct task_struct *prev)
3284 {
3285 /*
3286 * Test if we are atomic. Since do_exit() needs to call into
3287 * schedule() atomically, we ignore that path for now.
3288 * Otherwise, whine if we are scheduling when we should not be.
3289 */
3290 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3291 __schedule_bug(prev);
3292
3293 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3294
3295 schedstat_inc(this_rq(), sched_cnt);
3296 }
3297
3298 /*
3299 * Pick up the highest-prio task:
3300 */
3301 static inline struct task_struct *
3302 pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3303 {
3304 struct sched_class *class;
3305 struct task_struct *p;
3306
3307 /*
3308 * Optimization: we know that if all tasks are in
3309 * the fair class we can call that function directly:
3310 */
3311 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3312 p = fair_sched_class.pick_next_task(rq, now);
3313 if (likely(p))
3314 return p;
3315 }
3316
3317 class = sched_class_highest;
3318 for ( ; ; ) {
3319 p = class->pick_next_task(rq, now);
3320 if (p)
3321 return p;
3322 /*
3323 * Will never be NULL as the idle class always
3324 * returns a non-NULL p:
3325 */
3326 class = class->next;
3327 }
3328 }
3329
3330 /*
3331 * schedule() is the main scheduler function.
3332 */
3333 asmlinkage void __sched schedule(void)
3334 {
3335 struct task_struct *prev, *next;
3336 long *switch_count;
3337 struct rq *rq;
3338 u64 now;
3339 int cpu;
3340
3341 need_resched:
3342 preempt_disable();
3343 cpu = smp_processor_id();
3344 rq = cpu_rq(cpu);
3345 rcu_qsctr_inc(cpu);
3346 prev = rq->curr;
3347 switch_count = &prev->nivcsw;
3348
3349 release_kernel_lock(prev);
3350 need_resched_nonpreemptible:
3351
3352 schedule_debug(prev);
3353
3354 spin_lock_irq(&rq->lock);
3355 clear_tsk_need_resched(prev);
3356
3357 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3358 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3359 unlikely(signal_pending(prev)))) {
3360 prev->state = TASK_RUNNING;
3361 } else {
3362 deactivate_task(rq, prev, 1);
3363 }
3364 switch_count = &prev->nvcsw;
3365 }
3366
3367 if (unlikely(!rq->nr_running))
3368 idle_balance(cpu, rq);
3369
3370 now = __rq_clock(rq);
3371 prev->sched_class->put_prev_task(rq, prev, now);
3372 next = pick_next_task(rq, prev, now);
3373
3374 sched_info_switch(prev, next);
3375
3376 if (likely(prev != next)) {
3377 rq->nr_switches++;
3378 rq->curr = next;
3379 ++*switch_count;
3380
3381 context_switch(rq, prev, next); /* unlocks the rq */
3382 } else
3383 spin_unlock_irq(&rq->lock);
3384
3385 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3386 cpu = smp_processor_id();
3387 rq = cpu_rq(cpu);
3388 goto need_resched_nonpreemptible;
3389 }
3390 preempt_enable_no_resched();
3391 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3392 goto need_resched;
3393 }
3394 EXPORT_SYMBOL(schedule);
3395
3396 #ifdef CONFIG_PREEMPT
3397 /*
3398 * This is the entry point to schedule() from in-kernel preemption
3399 * off of preempt_enable. Kernel preemptions off of a return from
3400 * interrupt occur there and call schedule() directly.
3401 */
3402 asmlinkage void __sched preempt_schedule(void)
3403 {
3404 struct thread_info *ti = current_thread_info();
3405 #ifdef CONFIG_PREEMPT_BKL
3406 struct task_struct *task = current;
3407 int saved_lock_depth;
3408 #endif
3409 /*
3410 * If there is a non-zero preempt_count or interrupts are disabled,
3411 * we do not want to preempt the current task. Just return..
3412 */
3413 if (likely(ti->preempt_count || irqs_disabled()))
3414 return;
3415
3416 need_resched:
3417 add_preempt_count(PREEMPT_ACTIVE);
3418 /*
3419 * We keep the big kernel semaphore locked, but we
3420 * clear ->lock_depth so that schedule() doesn't
3421 * auto-release the semaphore:
3422 */
3423 #ifdef CONFIG_PREEMPT_BKL
3424 saved_lock_depth = task->lock_depth;
3425 task->lock_depth = -1;
3426 #endif
3427 schedule();
3428 #ifdef CONFIG_PREEMPT_BKL
3429 task->lock_depth = saved_lock_depth;
3430 #endif
3431 sub_preempt_count(PREEMPT_ACTIVE);
3432
3433 /* we could miss a preemption opportunity between schedule and now */
3434 barrier();
3435 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3436 goto need_resched;
3437 }
3438 EXPORT_SYMBOL(preempt_schedule);
3439
3440 /*
3441 * This is the entry point to schedule() from kernel preemption
3442 * off of irq context.
3443 * Note that this is called and returns with irqs disabled. This will
3444 * protect us against recursive calling from irq.
3445 */
3446 asmlinkage void __sched preempt_schedule_irq(void)
3447 {
3448 struct thread_info *ti = current_thread_info();
3449 #ifdef CONFIG_PREEMPT_BKL
3450 struct task_struct *task = current;
3451 int saved_lock_depth;
3452 #endif
3453 /* Catch callers which need to be fixed */
3454 BUG_ON(ti->preempt_count || !irqs_disabled());
3455
3456 need_resched:
3457 add_preempt_count(PREEMPT_ACTIVE);
3458 /*
3459 * We keep the big kernel semaphore locked, but we
3460 * clear ->lock_depth so that schedule() doesn't
3461 * auto-release the semaphore:
3462 */
3463 #ifdef CONFIG_PREEMPT_BKL
3464 saved_lock_depth = task->lock_depth;
3465 task->lock_depth = -1;
3466 #endif
3467 local_irq_enable();
3468 schedule();
3469 local_irq_disable();
3470 #ifdef CONFIG_PREEMPT_BKL
3471 task->lock_depth = saved_lock_depth;
3472 #endif
3473 sub_preempt_count(PREEMPT_ACTIVE);
3474
3475 /* we could miss a preemption opportunity between schedule and now */
3476 barrier();
3477 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3478 goto need_resched;
3479 }
3480
3481 #endif /* CONFIG_PREEMPT */
3482
3483 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3484 void *key)
3485 {
3486 return try_to_wake_up(curr->private, mode, sync);
3487 }
3488 EXPORT_SYMBOL(default_wake_function);
3489
3490 /*
3491 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3492 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3493 * number) then we wake all the non-exclusive tasks and one exclusive task.
3494 *
3495 * There are circumstances in which we can try to wake a task which has already
3496 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3497 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3498 */
3499 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3500 int nr_exclusive, int sync, void *key)
3501 {
3502 struct list_head *tmp, *next;
3503
3504 list_for_each_safe(tmp, next, &q->task_list) {
3505 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3506 unsigned flags = curr->flags;
3507
3508 if (curr->func(curr, mode, sync, key) &&
3509 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3510 break;
3511 }
3512 }
3513
3514 /**
3515 * __wake_up - wake up threads blocked on a waitqueue.
3516 * @q: the waitqueue
3517 * @mode: which threads
3518 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3519 * @key: is directly passed to the wakeup function
3520 */
3521 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3522 int nr_exclusive, void *key)
3523 {
3524 unsigned long flags;
3525
3526 spin_lock_irqsave(&q->lock, flags);
3527 __wake_up_common(q, mode, nr_exclusive, 0, key);
3528 spin_unlock_irqrestore(&q->lock, flags);
3529 }
3530 EXPORT_SYMBOL(__wake_up);
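
#if 0
/*
 * Minimal usage sketch, not compiled in; the names below are
 * hypothetical and not part of this file. It shows how an exclusive
 * waiter pairs with __wake_up(): with nr_exclusive == 1, every
 * non-exclusive waiter on the queue is woken plus at most one waiter
 * that queued itself with WQ_FLAG_EXCLUSIVE set. Locking and memory
 * barriers around the condition are omitted for brevity.
 */
static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static int example_ready;

static void example_wait(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		/* queues at the tail with WQ_FLAG_EXCLUSIVE set */
		prepare_to_wait_exclusive(&example_wq, &wait,
					  TASK_UNINTERRUPTIBLE);
		if (example_ready)
			break;
		schedule();
	}
	finish_wait(&example_wq, &wait);
}

static void example_wake_one(void)
{
	example_ready = 1;
	/* wakes every non-exclusive waiter plus at most one exclusive one */
	wake_up(&example_wq);
}
#endif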
3531
3532 /*
3533 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3534 */
3535 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3536 {
3537 __wake_up_common(q, mode, 1, 0, NULL);
3538 }
3539
3540 /**
3541 * __wake_up_sync - wake up threads blocked on a waitqueue.
3542 * @q: the waitqueue
3543 * @mode: which threads
3544 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3545 *
3546 * The sync wakeup differs in that the waker knows that it will schedule
3547 * away soon, so while the target thread will be woken up, it will not
3548 * be migrated to another CPU - ie. the two threads are 'synchronized'
3549 * with each other. This can prevent needless bouncing between CPUs.
3550 *
3551 * On UP it can prevent extra preemption.
3552 */
3553 void fastcall
3554 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3555 {
3556 unsigned long flags;
3557 int sync = 1;
3558
3559 if (unlikely(!q))
3560 return;
3561
3562 if (unlikely(!nr_exclusive))
3563 sync = 0;
3564
3565 spin_lock_irqsave(&q->lock, flags);
3566 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3567 spin_unlock_irqrestore(&q->lock, flags);
3568 }
3569 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3570
3571 void fastcall complete(struct completion *x)
3572 {
3573 unsigned long flags;
3574
3575 spin_lock_irqsave(&x->wait.lock, flags);
3576 x->done++;
3577 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3578 1, 0, NULL);
3579 spin_unlock_irqrestore(&x->wait.lock, flags);
3580 }
3581 EXPORT_SYMBOL(complete);
3582
3583 void fastcall complete_all(struct completion *x)
3584 {
3585 unsigned long flags;
3586
3587 spin_lock_irqsave(&x->wait.lock, flags);
3588 x->done += UINT_MAX/2;
3589 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3590 0, 0, NULL);
3591 spin_unlock_irqrestore(&x->wait.lock, flags);
3592 }
3593 EXPORT_SYMBOL(complete_all);
3594
3595 void fastcall __sched wait_for_completion(struct completion *x)
3596 {
3597 might_sleep();
3598
3599 spin_lock_irq(&x->wait.lock);
3600 if (!x->done) {
3601 DECLARE_WAITQUEUE(wait, current);
3602
3603 wait.flags |= WQ_FLAG_EXCLUSIVE;
3604 __add_wait_queue_tail(&x->wait, &wait);
3605 do {
3606 __set_current_state(TASK_UNINTERRUPTIBLE);
3607 spin_unlock_irq(&x->wait.lock);
3608 schedule();
3609 spin_lock_irq(&x->wait.lock);
3610 } while (!x->done);
3611 __remove_wait_queue(&x->wait, &wait);
3612 }
3613 x->done--;
3614 spin_unlock_irq(&x->wait.lock);
3615 }
3616 EXPORT_SYMBOL(wait_for_completion);
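
#if 0
/*
 * Minimal usage sketch, not compiled in; the names below are
 * hypothetical and not part of this file. A completion is a one-shot
 * "this happened" counter built on the wait-queue code above: the
 * waiter sleeps in wait_for_completion() until some other context
 * calls complete(), which increments ->done and wakes one exclusive
 * waiter. Error handling is omitted for brevity.
 */
static struct completion example_done;

static int example_worker(void *unused)
{
	/* ... do the actual work here ... */
	complete(&example_done);		/* let the waiter proceed */
	return 0;
}

static void example_start_and_wait(void)
{
	init_completion(&example_done);
	kthread_run(example_worker, NULL, "example");
	wait_for_completion(&example_done);	/* sleeps until complete() */
}
#endif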
3617
3618 unsigned long fastcall __sched
3619 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3620 {
3621 might_sleep();
3622
3623 spin_lock_irq(&x->wait.lock);
3624 if (!x->done) {
3625 DECLARE_WAITQUEUE(wait, current);
3626
3627 wait.flags |= WQ_FLAG_EXCLUSIVE;
3628 __add_wait_queue_tail(&x->wait, &wait);
3629 do {
3630 __set_current_state(TASK_UNINTERRUPTIBLE);
3631 spin_unlock_irq(&x->wait.lock);
3632 timeout = schedule_timeout(timeout);
3633 spin_lock_irq(&x->wait.lock);
3634 if (!timeout) {
3635 __remove_wait_queue(&x->wait, &wait);
3636 goto out;
3637 }
3638 } while (!x->done);
3639 __remove_wait_queue(&x->wait, &wait);
3640 }
3641 x->done--;
3642 out:
3643 spin_unlock_irq(&x->wait.lock);
3644 return timeout;
3645 }
3646 EXPORT_SYMBOL(wait_for_completion_timeout);
3647
3648 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3649 {
3650 int ret = 0;
3651
3652 might_sleep();
3653
3654 spin_lock_irq(&x->wait.lock);
3655 if (!x->done) {
3656 DECLARE_WAITQUEUE(wait, current);
3657
3658 wait.flags |= WQ_FLAG_EXCLUSIVE;
3659 __add_wait_queue_tail(&x->wait, &wait);
3660 do {
3661 if (signal_pending(current)) {
3662 ret = -ERESTARTSYS;
3663 __remove_wait_queue(&x->wait, &wait);
3664 goto out;
3665 }
3666 __set_current_state(TASK_INTERRUPTIBLE);
3667 spin_unlock_irq(&x->wait.lock);
3668 schedule();
3669 spin_lock_irq(&x->wait.lock);
3670 } while (!x->done);
3671 __remove_wait_queue(&x->wait, &wait);
3672 }
3673 x->done--;
3674 out:
3675 spin_unlock_irq(&x->wait.lock);
3676
3677 return ret;
3678 }
3679 EXPORT_SYMBOL(wait_for_completion_interruptible);
3680
3681 unsigned long fastcall __sched
3682 wait_for_completion_interruptible_timeout(struct completion *x,
3683 unsigned long timeout)
3684 {
3685 might_sleep();
3686
3687 spin_lock_irq(&x->wait.lock);
3688 if (!x->done) {
3689 DECLARE_WAITQUEUE(wait, current);
3690
3691 wait.flags |= WQ_FLAG_EXCLUSIVE;
3692 __add_wait_queue_tail(&x->wait, &wait);
3693 do {
3694 if (signal_pending(current)) {
3695 timeout = -ERESTARTSYS;
3696 __remove_wait_queue(&x->wait, &wait);
3697 goto out;
3698 }
3699 __set_current_state(TASK_INTERRUPTIBLE);
3700 spin_unlock_irq(&x->wait.lock);
3701 timeout = schedule_timeout(timeout);
3702 spin_lock_irq(&x->wait.lock);
3703 if (!timeout) {
3704 __remove_wait_queue(&x->wait, &wait);
3705 goto out;
3706 }
3707 } while (!x->done);
3708 __remove_wait_queue(&x->wait, &wait);
3709 }
3710 x->done--;
3711 out:
3712 spin_unlock_irq(&x->wait.lock);
3713 return timeout;
3714 }
3715 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3716
3717 static inline void
3718 sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3719 {
3720 spin_lock_irqsave(&q->lock, *flags);
3721 __add_wait_queue(q, wait);
3722 spin_unlock(&q->lock);
3723 }
3724
3725 static inline void
3726 sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3727 {
3728 spin_lock_irq(&q->lock);
3729 __remove_wait_queue(q, wait);
3730 spin_unlock_irqrestore(&q->lock, *flags);
3731 }
3732
3733 void __sched interruptible_sleep_on(wait_queue_head_t *q)
3734 {
3735 unsigned long flags;
3736 wait_queue_t wait;
3737
3738 init_waitqueue_entry(&wait, current);
3739
3740 current->state = TASK_INTERRUPTIBLE;
3741
3742 sleep_on_head(q, &wait, &flags);
3743 schedule();
3744 sleep_on_tail(q, &wait, &flags);
3745 }
3746 EXPORT_SYMBOL(interruptible_sleep_on);
3747
3748 long __sched
3749 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3750 {
3751 unsigned long flags;
3752 wait_queue_t wait;
3753
3754 init_waitqueue_entry(&wait, current);
3755
3756 current->state = TASK_INTERRUPTIBLE;
3757
3758 sleep_on_head(q, &wait, &flags);
3759 timeout = schedule_timeout(timeout);
3760 sleep_on_tail(q, &wait, &flags);
3761
3762 return timeout;
3763 }
3764 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3765
3766 void __sched sleep_on(wait_queue_head_t *q)
3767 {
3768 unsigned long flags;
3769 wait_queue_t wait;
3770
3771 init_waitqueue_entry(&wait, current);
3772
3773 current->state = TASK_UNINTERRUPTIBLE;
3774
3775 sleep_on_head(q, &wait, &flags);
3776 schedule();
3777 sleep_on_tail(q, &wait, &flags);
3778 }
3779 EXPORT_SYMBOL(sleep_on);
3780
3781 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3782 {
3783 unsigned long flags;
3784 wait_queue_t wait;
3785
3786 init_waitqueue_entry(&wait, current);
3787
3788 current->state = TASK_UNINTERRUPTIBLE;
3789
3790 sleep_on_head(q, &wait, &flags);
3791 timeout = schedule_timeout(timeout);
3792 sleep_on_tail(q, &wait, &flags);
3793
3794 return timeout;
3795 }
3796 EXPORT_SYMBOL(sleep_on_timeout);
3797
3798 #ifdef CONFIG_RT_MUTEXES
3799
3800 /*
3801 * rt_mutex_setprio - set the current priority of a task
3802 * @p: task
3803 * @prio: prio value (kernel-internal form)
3804 *
3805 * This function changes the 'effective' priority of a task. It does
3806 * not touch ->normal_prio like __setscheduler().
3807 *
3808 * Used by the rt_mutex code to implement priority inheritance logic.
3809 */
3810 void rt_mutex_setprio(struct task_struct *p, int prio)
3811 {
3812 unsigned long flags;
3813 int oldprio, on_rq;
3814 struct rq *rq;
3815 u64 now;
3816
3817 BUG_ON(prio < 0 || prio > MAX_PRIO);
3818
3819 rq = task_rq_lock(p, &flags);
3820 now = rq_clock(rq);
3821
3822 oldprio = p->prio;
3823 on_rq = p->se.on_rq;
3824 if (on_rq)
3825 dequeue_task(rq, p, 0, now);
3826
3827 if (rt_prio(prio))
3828 p->sched_class = &rt_sched_class;
3829 else
3830 p->sched_class = &fair_sched_class;
3831
3832 p->prio = prio;
3833
3834 if (on_rq) {
3835 enqueue_task(rq, p, 0, now);
3836 /*
3837 * Reschedule if we are currently running on this runqueue and
3838 * our priority decreased, or if we are not currently running on
3839 * this runqueue and our priority is higher than the current's
3840 */
3841 if (task_running(rq, p)) {
3842 if (p->prio > oldprio)
3843 resched_task(rq->curr);
3844 } else {
3845 check_preempt_curr(rq, p);
3846 }
3847 }
3848 task_rq_unlock(rq, &flags);
3849 }
3850
3851 #endif
3852
3853 void set_user_nice(struct task_struct *p, long nice)
3854 {
3855 int old_prio, delta, on_rq;
3856 unsigned long flags;
3857 struct rq *rq;
3858 u64 now;
3859
3860 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3861 return;
3862 /*
3863 * We have to be careful, if called from sys_setpriority(),
3864 * the task might be in the middle of scheduling on another CPU.
3865 */
3866 rq = task_rq_lock(p, &flags);
3867 now = rq_clock(rq);
3868 /*
3869 * The RT priorities are set via sched_setscheduler(), but we still
3870 * allow the 'normal' nice value to be set - but as expected
3871 * it won't have any effect on scheduling until the task is
3872 * SCHED_FIFO/SCHED_RR:
3873 */
3874 if (task_has_rt_policy(p)) {
3875 p->static_prio = NICE_TO_PRIO(nice);
3876 goto out_unlock;
3877 }
3878 on_rq = p->se.on_rq;
3879 if (on_rq) {
3880 dequeue_task(rq, p, 0, now);
3881 dec_load(rq, p, now);
3882 }
3883
3884 p->static_prio = NICE_TO_PRIO(nice);
3885 set_load_weight(p);
3886 old_prio = p->prio;
3887 p->prio = effective_prio(p);
3888 delta = p->prio - old_prio;
3889
3890 if (on_rq) {
3891 enqueue_task(rq, p, 0, now);
3892 inc_load(rq, p, now);
3893 /*
3894 * If the task increased its priority or is running and
3895 * lowered its priority, then reschedule its CPU:
3896 */
3897 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3898 resched_task(rq->curr);
3899 }
3900 out_unlock:
3901 task_rq_unlock(rq, &flags);
3902 }
3903 EXPORT_SYMBOL(set_user_nice);
3904
3905 /*
3906 * can_nice - check if a task can reduce its nice value
3907 * @p: task
3908 * @nice: nice value
3909 */
3910 int can_nice(const struct task_struct *p, const int nice)
3911 {
3912 /* convert nice value [19,-20] to rlimit style value [1,40] */
3913 int nice_rlim = 20 - nice;
3914
3915 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3916 capable(CAP_SYS_NICE));
3917 }
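
/*
 * Illustrative sketch (not compiled here): how the RLIMIT_NICE encoding
 * used above looks from user space. A minimal sketch, assuming the glibc
 * getrlimit()/nice() wrappers:
 */
#if 0
#include <errno.h>
#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_NICE, &rl) == 0)
		/* rlim_cur uses the 1..40 encoding, so e.g. a soft limit of
		 * 30 permits nice values down to 20 - 30 = -10. */
		printf("lowest reachable nice value: %ld\n",
		       20 - (long)rl.rlim_cur);

	errno = 0;
	if (nice(-5) == -1 && errno != 0)
		/* Denied by the can_nice() check above (or by the LSM). */
		perror("nice");
	return 0;
}
#endif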
3918
3919 #ifdef __ARCH_WANT_SYS_NICE
3920
3921 /*
3922 * sys_nice - change the priority of the current process.
3923 * @increment: priority increment
3924 *
3925 * sys_setpriority is a more generic, but much slower function that
3926 * does similar things.
3927 */
3928 asmlinkage long sys_nice(int increment)
3929 {
3930 long nice, retval;
3931
3932 /*
3933 * Setpriority might change our priority at the same moment.
3934 * We don't have to worry. Conceptually one call occurs first
3935 * and we have a single winner.
3936 */
3937 if (increment < -40)
3938 increment = -40;
3939 if (increment > 40)
3940 increment = 40;
3941
3942 nice = PRIO_TO_NICE(current->static_prio) + increment;
3943 if (nice < -20)
3944 nice = -20;
3945 if (nice > 19)
3946 nice = 19;
3947
3948 if (increment < 0 && !can_nice(current, nice))
3949 return -EPERM;
3950
3951 retval = security_task_setnice(current, nice);
3952 if (retval)
3953 return retval;
3954
3955 set_user_nice(current, nice);
3956 return 0;
3957 }
3958
3959 #endif
3960
3961 /**
3962 * task_prio - return the priority value of a given task.
3963 * @p: the task in question.
3964 *
3965 * This is the priority value as seen by users in /proc.
3966 * Normal tasks map to [ 0 ... 39 ] (nice value + 20), while
3967 * RT tasks map to negative values [ -2 ... -100 ].
3968 */
3969 int task_prio(const struct task_struct *p)
3970 {
3971 return p->prio - MAX_RT_PRIO;
3972 }
3973
3974 /**
3975 * task_nice - return the nice value of a given task.
3976 * @p: the task in question.
3977 */
3978 int task_nice(const struct task_struct *p)
3979 {
3980 return TASK_NICE(p);
3981 }
3982 EXPORT_SYMBOL_GPL(task_nice);
3983
3984 /**
3985 * idle_cpu - is a given cpu idle currently?
3986 * @cpu: the processor in question.
3987 */
3988 int idle_cpu(int cpu)
3989 {
3990 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3991 }
3992
3993 /**
3994 * idle_task - return the idle task for a given cpu.
3995 * @cpu: the processor in question.
3996 */
3997 struct task_struct *idle_task(int cpu)
3998 {
3999 return cpu_rq(cpu)->idle;
4000 }
4001
4002 /**
4003 * find_process_by_pid - find a process with a matching PID value.
4004 * @pid: the pid in question.
4005 */
4006 static inline struct task_struct *find_process_by_pid(pid_t pid)
4007 {
4008 return pid ? find_task_by_pid(pid) : current;
4009 }
4010
4011 /* Actually do priority change: must hold rq lock. */
4012 static void
4013 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4014 {
4015 BUG_ON(p->se.on_rq);
4016
4017 p->policy = policy;
4018 switch (p->policy) {
4019 case SCHED_NORMAL:
4020 case SCHED_BATCH:
4021 case SCHED_IDLE:
4022 p->sched_class = &fair_sched_class;
4023 break;
4024 case SCHED_FIFO:
4025 case SCHED_RR:
4026 p->sched_class = &rt_sched_class;
4027 break;
4028 }
4029
4030 p->rt_priority = prio;
4031 p->normal_prio = normal_prio(p);
4032 /* we are holding p->pi_lock already */
4033 p->prio = rt_mutex_getprio(p);
4034 set_load_weight(p);
4035 }
4036
4037 /**
4038 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4039 * @p: the task in question.
4040 * @policy: new policy.
4041 * @param: structure containing the new RT priority.
4042 *
4043 * NOTE that the task may already be dead.
4044 */
4045 int sched_setscheduler(struct task_struct *p, int policy,
4046 struct sched_param *param)
4047 {
4048 int retval, oldprio, oldpolicy = -1, on_rq;
4049 unsigned long flags;
4050 struct rq *rq;
4051
4052 /* may grab non-irq protected spin_locks */
4053 BUG_ON(in_interrupt());
4054 recheck:
4055 /* double check policy once rq lock held */
4056 if (policy < 0)
4057 policy = oldpolicy = p->policy;
4058 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4059 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4060 policy != SCHED_IDLE)
4061 return -EINVAL;
4062 /*
4063 * Valid priorities for SCHED_FIFO and SCHED_RR are
4064 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4065 * SCHED_BATCH and SCHED_IDLE is 0.
4066 */
4067 if (param->sched_priority < 0 ||
4068 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4069 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4070 return -EINVAL;
4071 if (rt_policy(policy) != (param->sched_priority != 0))
4072 return -EINVAL;
4073
4074 /*
4075 * Allow unprivileged RT tasks to decrease priority:
4076 */
4077 if (!capable(CAP_SYS_NICE)) {
4078 if (rt_policy(policy)) {
4079 unsigned long rlim_rtprio;
4080
4081 if (!lock_task_sighand(p, &flags))
4082 return -ESRCH;
4083 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4084 unlock_task_sighand(p, &flags);
4085
4086 /* can't set/change the rt policy */
4087 if (policy != p->policy && !rlim_rtprio)
4088 return -EPERM;
4089
4090 /* can't increase priority */
4091 if (param->sched_priority > p->rt_priority &&
4092 param->sched_priority > rlim_rtprio)
4093 return -EPERM;
4094 }
4095 /*
4096 * Like positive nice levels, don't allow tasks to
4097 * move out of SCHED_IDLE either:
4098 */
4099 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4100 return -EPERM;
4101
4102 /* can't change other user's priorities */
4103 if ((current->euid != p->euid) &&
4104 (current->euid != p->uid))
4105 return -EPERM;
4106 }
4107
4108 retval = security_task_setscheduler(p, policy, param);
4109 if (retval)
4110 return retval;
4111 /*
4112 * make sure no PI-waiters arrive (or leave) while we are
4113 * changing the priority of the task:
4114 */
4115 spin_lock_irqsave(&p->pi_lock, flags);
4116 /*
4117 * To be able to change p->policy safely, the appropriate
4118 * runqueue lock must be held.
4119 */
4120 rq = __task_rq_lock(p);
4121 /* recheck policy now with rq lock held */
4122 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4123 policy = oldpolicy = -1;
4124 __task_rq_unlock(rq);
4125 spin_unlock_irqrestore(&p->pi_lock, flags);
4126 goto recheck;
4127 }
4128 on_rq = p->se.on_rq;
4129 if (on_rq)
4130 deactivate_task(rq, p, 0);
4131 oldprio = p->prio;
4132 __setscheduler(rq, p, policy, param->sched_priority);
4133 if (on_rq) {
4134 activate_task(rq, p, 0);
4135 /*
4136 * Reschedule if we are currently running on this runqueue and
4137 * our priority decreased, or if we are not currently running on
4138 * this runqueue and our priority is higher than the current's
4139 */
4140 if (task_running(rq, p)) {
4141 if (p->prio > oldprio)
4142 resched_task(rq->curr);
4143 } else {
4144 check_preempt_curr(rq, p);
4145 }
4146 }
4147 __task_rq_unlock(rq);
4148 spin_unlock_irqrestore(&p->pi_lock, flags);
4149
4150 rt_mutex_adjust_pi(p);
4151
4152 return 0;
4153 }
4154 EXPORT_SYMBOL_GPL(sched_setscheduler);
4155
4156 static int
4157 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4158 {
4159 struct sched_param lparam;
4160 struct task_struct *p;
4161 int retval;
4162
4163 if (!param || pid < 0)
4164 return -EINVAL;
4165 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4166 return -EFAULT;
4167
4168 rcu_read_lock();
4169 retval = -ESRCH;
4170 p = find_process_by_pid(pid);
4171 if (p != NULL)
4172 retval = sched_setscheduler(p, policy, &lparam);
4173 rcu_read_unlock();
4174
4175 return retval;
4176 }
4177
4178 /**
4179 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4180 * @pid: the pid in question.
4181 * @policy: new policy.
4182 * @param: structure containing the new RT priority.
4183 */
4184 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4185 struct sched_param __user *param)
4186 {
4187 /* negative values for policy are not valid */
4188 if (policy < 0)
4189 return -EINVAL;
4190
4191 return do_sched_setscheduler(pid, policy, param);
4192 }
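
/*
 * Illustrative sketch (not compiled here): driving the syscall above
 * through the POSIX wrapper of the same name. A minimal user-space sketch;
 * the priority rules mirror the checks in sched_setscheduler():
 */
#if 0
#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	/* SCHED_FIFO/SCHED_RR require a priority in 1..MAX_USER_RT_PRIO-1. */
	if (sched_setscheduler(0, SCHED_FIFO, &sp))
		perror("sched_setscheduler(SCHED_FIFO)");

	printf("policy is now %d\n", sched_getscheduler(0));

	/* SCHED_OTHER (SCHED_NORMAL in the kernel) requires priority 0. */
	sp.sched_priority = 0;
	if (sched_setscheduler(0, SCHED_OTHER, &sp))
		perror("sched_setscheduler(SCHED_OTHER)");
	return 0;
}
#endif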
4193
4194 /**
4195 * sys_sched_setparam - set/change the RT priority of a thread
4196 * @pid: the pid in question.
4197 * @param: structure containing the new RT priority.
4198 */
4199 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4200 {
4201 return do_sched_setscheduler(pid, -1, param);
4202 }
4203
4204 /**
4205 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4206 * @pid: the pid in question.
4207 */
4208 asmlinkage long sys_sched_getscheduler(pid_t pid)
4209 {
4210 struct task_struct *p;
4211 int retval = -EINVAL;
4212
4213 if (pid < 0)
4214 goto out_nounlock;
4215
4216 retval = -ESRCH;
4217 read_lock(&tasklist_lock);
4218 p = find_process_by_pid(pid);
4219 if (p) {
4220 retval = security_task_getscheduler(p);
4221 if (!retval)
4222 retval = p->policy;
4223 }
4224 read_unlock(&tasklist_lock);
4225
4226 out_nounlock:
4227 return retval;
4228 }
4229
4230 /**
4231 * sys_sched_getparam - get the RT priority of a thread
4232 * @pid: the pid in question.
4233 * @param: structure containing the RT priority.
4234 */
4235 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4236 {
4237 struct sched_param lp;
4238 struct task_struct *p;
4239 int retval = -EINVAL;
4240
4241 if (!param || pid < 0)
4242 goto out_nounlock;
4243
4244 read_lock(&tasklist_lock);
4245 p = find_process_by_pid(pid);
4246 retval = -ESRCH;
4247 if (!p)
4248 goto out_unlock;
4249
4250 retval = security_task_getscheduler(p);
4251 if (retval)
4252 goto out_unlock;
4253
4254 lp.sched_priority = p->rt_priority;
4255 read_unlock(&tasklist_lock);
4256
4257 /*
4258 * This one might sleep, we cannot do it with a spinlock held ...
4259 */
4260 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4261
4262 out_nounlock:
4263 return retval;
4264
4265 out_unlock:
4266 read_unlock(&tasklist_lock);
4267 return retval;
4268 }
4269
4270 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4271 {
4272 cpumask_t cpus_allowed;
4273 struct task_struct *p;
4274 int retval;
4275
4276 mutex_lock(&sched_hotcpu_mutex);
4277 read_lock(&tasklist_lock);
4278
4279 p = find_process_by_pid(pid);
4280 if (!p) {
4281 read_unlock(&tasklist_lock);
4282 mutex_unlock(&sched_hotcpu_mutex);
4283 return -ESRCH;
4284 }
4285
4286 /*
4287 * It is not safe to call set_cpus_allowed with the
4288 * tasklist_lock held. We will bump the task_struct's
4289 * usage count and then drop tasklist_lock.
4290 */
4291 get_task_struct(p);
4292 read_unlock(&tasklist_lock);
4293
4294 retval = -EPERM;
4295 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4296 !capable(CAP_SYS_NICE))
4297 goto out_unlock;
4298
4299 retval = security_task_setscheduler(p, 0, NULL);
4300 if (retval)
4301 goto out_unlock;
4302
4303 cpus_allowed = cpuset_cpus_allowed(p);
4304 cpus_and(new_mask, new_mask, cpus_allowed);
4305 retval = set_cpus_allowed(p, new_mask);
4306
4307 out_unlock:
4308 put_task_struct(p);
4309 mutex_unlock(&sched_hotcpu_mutex);
4310 return retval;
4311 }
4312
4313 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4314 cpumask_t *new_mask)
4315 {
4316 if (len < sizeof(cpumask_t)) {
4317 memset(new_mask, 0, sizeof(cpumask_t));
4318 } else if (len > sizeof(cpumask_t)) {
4319 len = sizeof(cpumask_t);
4320 }
4321 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4322 }
4323
4324 /**
4325 * sys_sched_setaffinity - set the cpu affinity of a process
4326 * @pid: pid of the process
4327 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4328 * @user_mask_ptr: user-space pointer to the new cpu mask
4329 */
4330 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4331 unsigned long __user *user_mask_ptr)
4332 {
4333 cpumask_t new_mask;
4334 int retval;
4335
4336 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4337 if (retval)
4338 return retval;
4339
4340 return sched_setaffinity(pid, new_mask);
4341 }
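
/*
 * Illustrative sketch (not compiled here): pinning the calling process to
 * CPU 0 via the glibc wrapper. As in sched_setaffinity() above, the kernel
 * additionally ANDs the requested mask with the cpuset-allowed CPUs.
 * A minimal user-space sketch (needs _GNU_SOURCE):
 */
#if 0
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);
	if (sched_setaffinity(0, sizeof(set), &set))
		perror("sched_setaffinity");
	return 0;
}
#endif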
4342
4343 /*
4344 * Represents all CPUs present in the system.
4345 * In systems capable of hotplug, this map can dynamically grow
4346 * as new CPUs are detected in the system via any platform-specific
4347 * method, e.g. ACPI.
4348 */
4349
4350 cpumask_t cpu_present_map __read_mostly;
4351 EXPORT_SYMBOL(cpu_present_map);
4352
4353 #ifndef CONFIG_SMP
4354 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4355 EXPORT_SYMBOL(cpu_online_map);
4356
4357 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4358 EXPORT_SYMBOL(cpu_possible_map);
4359 #endif
4360
4361 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4362 {
4363 struct task_struct *p;
4364 int retval;
4365
4366 mutex_lock(&sched_hotcpu_mutex);
4367 read_lock(&tasklist_lock);
4368
4369 retval = -ESRCH;
4370 p = find_process_by_pid(pid);
4371 if (!p)
4372 goto out_unlock;
4373
4374 retval = security_task_getscheduler(p);
4375 if (retval)
4376 goto out_unlock;
4377
4378 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4379
4380 out_unlock:
4381 read_unlock(&tasklist_lock);
4382 mutex_unlock(&sched_hotcpu_mutex);
4383 if (retval)
4384 return retval;
4385
4386 return 0;
4387 }
4388
4389 /**
4390 * sys_sched_getaffinity - get the cpu affinity of a process
4391 * @pid: pid of the process
4392 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4393 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4394 */
4395 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4396 unsigned long __user *user_mask_ptr)
4397 {
4398 int ret;
4399 cpumask_t mask;
4400
4401 if (len < sizeof(cpumask_t))
4402 return -EINVAL;
4403
4404 ret = sched_getaffinity(pid, &mask);
4405 if (ret < 0)
4406 return ret;
4407
4408 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4409 return -EFAULT;
4410
4411 return sizeof(cpumask_t);
4412 }
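
/*
 * Illustrative sketch (not compiled here): reading the affinity mask back.
 * The raw syscall above returns sizeof(cpumask_t) on success; the glibc
 * wrapper hides that and returns 0. A minimal user-space sketch (needs
 * _GNU_SOURCE):
 */
#if 0
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;
	int cpu, usable = 0;

	CPU_ZERO(&set);
	if (sched_getaffinity(0, sizeof(set), &set) == 0)
		for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
			if (CPU_ISSET(cpu, &set))
				usable++;
	printf("runnable on %d CPUs\n", usable);
	return 0;
}
#endif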
4413
4414 /**
4415 * sys_sched_yield - yield the current processor to other threads.
4416 *
4417 * This function yields the current CPU to other tasks. If there are no
4418 * other threads running on this CPU then this function will return.
4419 */
4420 asmlinkage long sys_sched_yield(void)
4421 {
4422 struct rq *rq = this_rq_lock();
4423
4424 schedstat_inc(rq, yld_cnt);
4425 if (unlikely(rq->nr_running == 1))
4426 schedstat_inc(rq, yld_act_empty);
4427 else
4428 current->sched_class->yield_task(rq, current);
4429
4430 /*
4431 * Since we are going to call schedule() anyway, there's
4432 * no need to preempt or enable interrupts:
4433 */
4434 __release(rq->lock);
4435 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4436 _raw_spin_unlock(&rq->lock);
4437 preempt_enable_no_resched();
4438
4439 schedule();
4440
4441 return 0;
4442 }
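
/*
 * Illustrative sketch (not compiled here): the typical user-space use of
 * the syscall above - a polite busy-wait that gives other runnable tasks a
 * turn between polls. 'flag' is an assumed shared variable:
 */
#if 0
#include <sched.h>

extern volatile int flag;

void wait_for_flag(void)
{
	while (!flag)
		sched_yield();
}
#endif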
4443
4444 static void __cond_resched(void)
4445 {
4446 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4447 __might_sleep(__FILE__, __LINE__);
4448 #endif
4449 /*
4450 * The BKS might be reacquired before we have dropped
4451 * PREEMPT_ACTIVE, which could trigger a second
4452 * cond_resched() call.
4453 */
4454 do {
4455 add_preempt_count(PREEMPT_ACTIVE);
4456 schedule();
4457 sub_preempt_count(PREEMPT_ACTIVE);
4458 } while (need_resched());
4459 }
4460
4461 int __sched cond_resched(void)
4462 {
4463 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4464 system_state == SYSTEM_RUNNING) {
4465 __cond_resched();
4466 return 1;
4467 }
4468 return 0;
4469 }
4470 EXPORT_SYMBOL(cond_resched);
4471
4472 /*
4473 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4474 * call schedule, and on return reacquire the lock.
4475 *
4476 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4477 * operations here to prevent schedule() from being called twice (once via
4478 * spin_unlock(), once by hand).
4479 */
4480 int cond_resched_lock(spinlock_t *lock)
4481 {
4482 int ret = 0;
4483
4484 if (need_lockbreak(lock)) {
4485 spin_unlock(lock);
4486 cpu_relax();
4487 ret = 1;
4488 spin_lock(lock);
4489 }
4490 if (need_resched() && system_state == SYSTEM_RUNNING) {
4491 spin_release(&lock->dep_map, 1, _THIS_IP_);
4492 _raw_spin_unlock(lock);
4493 preempt_enable_no_resched();
4494 __cond_resched();
4495 ret = 1;
4496 spin_lock(lock);
4497 }
4498 return ret;
4499 }
4500 EXPORT_SYMBOL(cond_resched_lock);
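
/*
 * Illustrative sketch (not compiled here): a typical cond_resched_lock()
 * caller. my_lock, my_list, my_entry and process() are hypothetical; the
 * point is that a non-zero return means the lock was dropped and
 * reacquired, so the walk restarts from the head:
 */
#if 0
static void scan_entries(void)
{
	struct my_entry *e;

	spin_lock(&my_lock);
restart:
	list_for_each_entry(e, &my_list, node) {
		process(e);
		if (cond_resched_lock(&my_lock))
			goto restart;
	}
	spin_unlock(&my_lock);
}
#endif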
4501
4502 int __sched cond_resched_softirq(void)
4503 {
4504 BUG_ON(!in_softirq());
4505
4506 if (need_resched() && system_state == SYSTEM_RUNNING) {
4507 local_bh_enable();
4508 __cond_resched();
4509 local_bh_disable();
4510 return 1;
4511 }
4512 return 0;
4513 }
4514 EXPORT_SYMBOL(cond_resched_softirq);
4515
4516 /**
4517 * yield - yield the current processor to other threads.
4518 *
4519 * This is a shortcut for kernel-space yielding - it marks the
4520 * thread runnable and calls sys_sched_yield().
4521 */
4522 void __sched yield(void)
4523 {
4524 set_current_state(TASK_RUNNING);
4525 sys_sched_yield();
4526 }
4527 EXPORT_SYMBOL(yield);
4528
4529 /*
4530 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4531 * that process accounting knows that this is a task in IO wait state.
4532 *
4533 * But don't do that if it is a deliberate, throttling IO wait (this task
4534 * has set its backing_dev_info: the queue against which it should throttle)
4535 */
4536 void __sched io_schedule(void)
4537 {
4538 struct rq *rq = &__raw_get_cpu_var(runqueues);
4539
4540 delayacct_blkio_start();
4541 atomic_inc(&rq->nr_iowait);
4542 schedule();
4543 atomic_dec(&rq->nr_iowait);
4544 delayacct_blkio_end();
4545 }
4546 EXPORT_SYMBOL(io_schedule);
4547
4548 long __sched io_schedule_timeout(long timeout)
4549 {
4550 struct rq *rq = &__raw_get_cpu_var(runqueues);
4551 long ret;
4552
4553 delayacct_blkio_start();
4554 atomic_inc(&rq->nr_iowait);
4555 ret = schedule_timeout(timeout);
4556 atomic_dec(&rq->nr_iowait);
4557 delayacct_blkio_end();
4558 return ret;
4559 }
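
/*
 * Illustrative sketch (not compiled here): the classic "sleep until the
 * I/O completes" loop. Using io_schedule() instead of a plain schedule()
 * keeps rq->nr_iowait, and thus iowait accounting, correct. my_wq,
 * struct my_request and io_done() are hypothetical; the completion side
 * would do wake_up(&my_wq):
 */
#if 0
static DECLARE_WAIT_QUEUE_HEAD(my_wq);

static void wait_for_io(struct my_request *req)
{
	DEFINE_WAIT(wait);

	while (!io_done(req)) {
		prepare_to_wait(&my_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (io_done(req))
			break;
		io_schedule();
	}
	finish_wait(&my_wq, &wait);
}
#endif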
4560
4561 /**
4562 * sys_sched_get_priority_max - return maximum RT priority.
4563 * @policy: scheduling class.
4564 *
4565 * this syscall returns the maximum rt_priority that can be used
4566 * by a given scheduling class.
4567 */
4568 asmlinkage long sys_sched_get_priority_max(int policy)
4569 {
4570 int ret = -EINVAL;
4571
4572 switch (policy) {
4573 case SCHED_FIFO:
4574 case SCHED_RR:
4575 ret = MAX_USER_RT_PRIO-1;
4576 break;
4577 case SCHED_NORMAL:
4578 case SCHED_BATCH:
4579 case SCHED_IDLE:
4580 ret = 0;
4581 break;
4582 }
4583 return ret;
4584 }
4585
4586 /**
4587 * sys_sched_get_priority_min - return minimum RT priority.
4588 * @policy: scheduling class.
4589 *
4590 * this syscall returns the minimum rt_priority that can be used
4591 * by a given scheduling class.
4592 */
4593 asmlinkage long sys_sched_get_priority_min(int policy)
4594 {
4595 int ret = -EINVAL;
4596
4597 switch (policy) {
4598 case SCHED_FIFO:
4599 case SCHED_RR:
4600 ret = 1;
4601 break;
4602 case SCHED_NORMAL:
4603 case SCHED_BATCH:
4604 case SCHED_IDLE:
4605 ret = 0;
4606 }
4607 return ret;
4608 }
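
/*
 * Illustrative sketch (not compiled here): querying the valid static
 * priority range for SCHED_FIFO from user space before calling
 * sched_setscheduler(). A minimal sketch using the POSIX wrappers:
 */
#if 0
#include <sched.h>
#include <stdio.h>

int main(void)
{
	printf("SCHED_FIFO priorities: %d..%d\n",
	       sched_get_priority_min(SCHED_FIFO),
	       sched_get_priority_max(SCHED_FIFO));
	return 0;
}
#endif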
4609
4610 /**
4611 * sys_sched_rr_get_interval - return the default timeslice of a process.
4612 * @pid: pid of the process.
4613 * @interval: userspace pointer to the timeslice value.
4614 *
4615 * this syscall writes the default timeslice value of a given process
4616 * into the user-space timespec buffer. A value of '0' means infinity.
4617 */
4618 asmlinkage
4619 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4620 {
4621 struct task_struct *p;
4622 int retval = -EINVAL;
4623 struct timespec t;
4624
4625 if (pid < 0)
4626 goto out_nounlock;
4627
4628 retval = -ESRCH;
4629 read_lock(&tasklist_lock);
4630 p = find_process_by_pid(pid);
4631 if (!p)
4632 goto out_unlock;
4633
4634 retval = security_task_getscheduler(p);
4635 if (retval)
4636 goto out_unlock;
4637
4638 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4639 0 : static_prio_timeslice(p->static_prio), &t);
4640 read_unlock(&tasklist_lock);
4641 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4642 out_nounlock:
4643 return retval;
4644 out_unlock:
4645 read_unlock(&tasklist_lock);
4646 return retval;
4647 }
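
/*
 * Illustrative sketch (not compiled here): reading the calling process's
 * round-robin timeslice; a 0/0 result means "no timeslice" (e.g.
 * SCHED_FIFO). A minimal user-space sketch using the POSIX wrapper:
 */
#if 0
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == 0)
		printf("timeslice: %ld.%09ld s\n",
		       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
#endif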
4648
4649 static const char stat_nam[] = "RSDTtZX";
4650
4651 static void show_task(struct task_struct *p)
4652 {
4653 unsigned long free = 0;
4654 unsigned state;
4655
4656 state = p->state ? __ffs(p->state) + 1 : 0;
4657 printk("%-13.13s %c", p->comm,
4658 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4659 #if BITS_PER_LONG == 32
4660 if (state == TASK_RUNNING)
4661 printk(" running ");
4662 else
4663 printk(" %08lx ", thread_saved_pc(p));
4664 #else
4665 if (state == TASK_RUNNING)
4666 printk(" running task ");
4667 else
4668 printk(" %016lx ", thread_saved_pc(p));
4669 #endif
4670 #ifdef CONFIG_DEBUG_STACK_USAGE
4671 {
4672 unsigned long *n = end_of_stack(p);
4673 while (!*n)
4674 n++;
4675 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4676 }
4677 #endif
4678 printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid);
4679
4680 if (state != TASK_RUNNING)
4681 show_stack(p, NULL);
4682 }
4683
4684 void show_state_filter(unsigned long state_filter)
4685 {
4686 struct task_struct *g, *p;
4687
4688 #if BITS_PER_LONG == 32
4689 printk(KERN_INFO
4690 " task PC stack pid father\n");
4691 #else
4692 printk(KERN_INFO
4693 " task PC stack pid father\n");
4694 #endif
4695 read_lock(&tasklist_lock);
4696 do_each_thread(g, p) {
4697 /*
4698 * reset the NMI-timeout, listing all tasks on a slow
4699 * console might take a lot of time:
4700 */
4701 touch_nmi_watchdog();
4702 if (!state_filter || (p->state & state_filter))
4703 show_task(p);
4704 } while_each_thread(g, p);
4705
4706 touch_all_softlockup_watchdogs();
4707
4708 #ifdef CONFIG_SCHED_DEBUG
4709 sysrq_sched_debug_show();
4710 #endif
4711 read_unlock(&tasklist_lock);
4712 /*
4713 * Only show locks if all tasks are dumped:
4714 */
4715 if (state_filter == -1)
4716 debug_show_all_locks();
4717 }
4718
4719 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4720 {
4721 idle->sched_class = &idle_sched_class;
4722 }
4723
4724 /**
4725 * init_idle - set up an idle thread for a given CPU
4726 * @idle: task in question
4727 * @cpu: cpu the idle task belongs to
4728 *
4729 * NOTE: this function does not set the idle thread's NEED_RESCHED
4730 * flag, to make booting more robust.
4731 */
4732 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4733 {
4734 struct rq *rq = cpu_rq(cpu);
4735 unsigned long flags;
4736
4737 __sched_fork(idle);
4738 idle->se.exec_start = sched_clock();
4739
4740 idle->prio = idle->normal_prio = MAX_PRIO;
4741 idle->cpus_allowed = cpumask_of_cpu(cpu);
4742 __set_task_cpu(idle, cpu);
4743
4744 spin_lock_irqsave(&rq->lock, flags);
4745 rq->curr = rq->idle = idle;
4746 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4747 idle->oncpu = 1;
4748 #endif
4749 spin_unlock_irqrestore(&rq->lock, flags);
4750
4751 /* Set the preempt count _outside_ the spinlocks! */
4752 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4753 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4754 #else
4755 task_thread_info(idle)->preempt_count = 0;
4756 #endif
4757 /*
4758 * The idle tasks have their own, simple scheduling class:
4759 */
4760 idle->sched_class = &idle_sched_class;
4761 }
4762
4763 /*
4764 * In a system that switches off the HZ timer nohz_cpu_mask
4765 * indicates which cpus entered this state. This is used
4766 * by the RCU update code to wait only for active cpus. On systems
4767 * which do not switch off the HZ timer, nohz_cpu_mask should
4768 * always be CPU_MASK_NONE.
4769 */
4770 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4771
4772 /*
4773 * Increase the granularity value when there are more CPUs,
4774 * because with more CPUs the 'effective latency' as visible
4775 * to users decreases. But the relationship is not linear,
4776 * so pick a second-best guess by going with the log2 of the
4777 * number of CPUs.
4778 *
4779 * This idea comes from the SD scheduler of Con Kolivas:
4780 */
4781 static inline void sched_init_granularity(void)
4782 {
4783 unsigned int factor = 1 + ilog2(num_online_cpus());
4784 const unsigned long gran_limit = 100000000;
4785
4786 sysctl_sched_granularity *= factor;
4787 if (sysctl_sched_granularity > gran_limit)
4788 sysctl_sched_granularity = gran_limit;
4789
4790 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4791 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4792 }
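
/*
 * A worked example of the scaling above: on an 8-CPU box ilog2(8) = 3,
 * so factor = 4. The granularity is therefore multiplied by four (but
 * never beyond the 100 ms gran_limit), the runtime limit becomes four
 * times the resulting granularity and the wakeup granularity half of it.
 */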
4793
4794 #ifdef CONFIG_SMP
4795 /*
4796 * This is how migration works:
4797 *
4798 * 1) we queue a struct migration_req structure in the source CPU's
4799 * runqueue and wake up that CPU's migration thread.
4800 * 2) we wait_for_completion() on the request (req.done) => thread blocks.
4801 * 3) migration thread wakes up (implicitly it forces the migrated
4802 * thread off the CPU)
4803 * 4) it gets the migration request and checks whether the migrated
4804 * task is still in the wrong runqueue.
4805 * 5) if it's in the wrong runqueue then the migration thread removes
4806 * it and puts it into the right queue.
4807 * 6) the migration thread complete()s the request.
4808 * 7) we wake up and the migration is done.
4809 */
4810
4811 /*
4812 * Change a given task's CPU affinity. Migrate the thread to a
4813 * proper CPU and schedule it away if the CPU it's executing on
4814 * is removed from the allowed bitmask.
4815 *
4816 * NOTE: the caller must have a valid reference to the task, the
4817 * task must not exit() & deallocate itself prematurely. The
4818 * call is not atomic; no spinlocks may be held.
4819 */
4820 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4821 {
4822 struct migration_req req;
4823 unsigned long flags;
4824 struct rq *rq;
4825 int ret = 0;
4826
4827 rq = task_rq_lock(p, &flags);
4828 if (!cpus_intersects(new_mask, cpu_online_map)) {
4829 ret = -EINVAL;
4830 goto out;
4831 }
4832
4833 p->cpus_allowed = new_mask;
4834 /* Can the task run on the task's current CPU? If so, we're done */
4835 if (cpu_isset(task_cpu(p), new_mask))
4836 goto out;
4837
4838 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4839 /* Need help from migration thread: drop lock and wait. */
4840 task_rq_unlock(rq, &flags);
4841 wake_up_process(rq->migration_thread);
4842 wait_for_completion(&req.done);
4843 tlb_migrate_finish(p->mm);
4844 return 0;
4845 }
4846 out:
4847 task_rq_unlock(rq, &flags);
4848
4849 return ret;
4850 }
4851 EXPORT_SYMBOL_GPL(set_cpus_allowed);
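
/*
 * Illustrative sketch (not compiled here): a typical in-kernel caller - a
 * kthread restricting itself to one CPU. my_thread_fn and its argument are
 * hypothetical. On failure the mask did not intersect the online map; on
 * success the thread may already have been migrated by the time the call
 * returns:
 */
#if 0
static int my_thread_fn(void *data)
{
	int cpu = (long)data;

	if (set_cpus_allowed(current, cpumask_of_cpu(cpu)))
		printk(KERN_WARNING "could not bind to CPU%d\n", cpu);

	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}
#endif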
4852
4853 /*
4854 * Move (not current) task off this cpu, onto dest cpu. We're doing
4855 * this because either it can't run here any more (set_cpus_allowed()
4856 * away from this CPU, or CPU going down), or because we're
4857 * attempting to rebalance this task on exec (sched_exec).
4858 *
4859 * So we race with normal scheduler movements, but that's OK, as long
4860 * as the task is no longer on this CPU.
4861 *
4862 * Returns non-zero if task was successfully migrated.
4863 */
4864 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4865 {
4866 struct rq *rq_dest, *rq_src;
4867 int ret = 0, on_rq;
4868
4869 if (unlikely(cpu_is_offline(dest_cpu)))
4870 return ret;
4871
4872 rq_src = cpu_rq(src_cpu);
4873 rq_dest = cpu_rq(dest_cpu);
4874
4875 double_rq_lock(rq_src, rq_dest);
4876 /* Already moved. */
4877 if (task_cpu(p) != src_cpu)
4878 goto out;
4879 /* Affinity changed (again). */
4880 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4881 goto out;
4882
4883 on_rq = p->se.on_rq;
4884 if (on_rq)
4885 deactivate_task(rq_src, p, 0);
4886 set_task_cpu(p, dest_cpu);
4887 if (on_rq) {
4888 activate_task(rq_dest, p, 0);
4889 check_preempt_curr(rq_dest, p);
4890 }
4891 ret = 1;
4892 out:
4893 double_rq_unlock(rq_src, rq_dest);
4894 return ret;
4895 }
4896
4897 /*
4898 * migration_thread - this is a highprio system thread that performs
4899 * thread migration by bumping thread off CPU then 'pushing' onto
4900 * another runqueue.
4901 */
4902 static int migration_thread(void *data)
4903 {
4904 int cpu = (long)data;
4905 struct rq *rq;
4906
4907 rq = cpu_rq(cpu);
4908 BUG_ON(rq->migration_thread != current);
4909
4910 set_current_state(TASK_INTERRUPTIBLE);
4911 while (!kthread_should_stop()) {
4912 struct migration_req *req;
4913 struct list_head *head;
4914
4915 try_to_freeze();
4916
4917 spin_lock_irq(&rq->lock);
4918
4919 if (cpu_is_offline(cpu)) {
4920 spin_unlock_irq(&rq->lock);
4921 goto wait_to_die;
4922 }
4923
4924 if (rq->active_balance) {
4925 active_load_balance(rq, cpu);
4926 rq->active_balance = 0;
4927 }
4928
4929 head = &rq->migration_queue;
4930
4931 if (list_empty(head)) {
4932 spin_unlock_irq(&rq->lock);
4933 schedule();
4934 set_current_state(TASK_INTERRUPTIBLE);
4935 continue;
4936 }
4937 req = list_entry(head->next, struct migration_req, list);
4938 list_del_init(head->next);
4939
4940 spin_unlock(&rq->lock);
4941 __migrate_task(req->task, cpu, req->dest_cpu);
4942 local_irq_enable();
4943
4944 complete(&req->done);
4945 }
4946 __set_current_state(TASK_RUNNING);
4947 return 0;
4948
4949 wait_to_die:
4950 /* Wait for kthread_stop */
4951 set_current_state(TASK_INTERRUPTIBLE);
4952 while (!kthread_should_stop()) {
4953 schedule();
4954 set_current_state(TASK_INTERRUPTIBLE);
4955 }
4956 __set_current_state(TASK_RUNNING);
4957 return 0;
4958 }
4959
4960 #ifdef CONFIG_HOTPLUG_CPU
4961 /*
4962 * Figure out where a task on the dead CPU should go, use force if necessary.
4963 * NOTE: interrupts should be disabled by the caller
4964 */
4965 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
4966 {
4967 unsigned long flags;
4968 cpumask_t mask;
4969 struct rq *rq;
4970 int dest_cpu;
4971
4972 restart:
4973 /* On same node? */
4974 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4975 cpus_and(mask, mask, p->cpus_allowed);
4976 dest_cpu = any_online_cpu(mask);
4977
4978 /* On any allowed CPU? */
4979 if (dest_cpu == NR_CPUS)
4980 dest_cpu = any_online_cpu(p->cpus_allowed);
4981
4982 /* No more Mr. Nice Guy. */
4983 if (dest_cpu == NR_CPUS) {
4984 rq = task_rq_lock(p, &flags);
4985 cpus_setall(p->cpus_allowed);
4986 dest_cpu = any_online_cpu(p->cpus_allowed);
4987 task_rq_unlock(rq, &flags);
4988
4989 /*
4990 * Don't tell them about moving exiting tasks or
4991 * kernel threads (both mm NULL), since they never
4992 * leave kernel.
4993 */
4994 if (p->mm && printk_ratelimit())
4995 printk(KERN_INFO "process %d (%s) no "
4996 "longer affine to cpu%d\n",
4997 p->pid, p->comm, dead_cpu);
4998 }
4999 if (!__migrate_task(p, dead_cpu, dest_cpu))
5000 goto restart;
5001 }
5002
5003 /*
5004 * While a dead CPU has no uninterruptible tasks queued at this point,
5005 * it might still have a nonzero ->nr_uninterruptible counter, because
5006 * for performance reasons the counter is not strictly tracking tasks to
5007 * their home CPUs. So we just add the counter to another CPU's counter,
5008 * to keep the global sum constant after CPU-down:
5009 */
5010 static void migrate_nr_uninterruptible(struct rq *rq_src)
5011 {
5012 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5013 unsigned long flags;
5014
5015 local_irq_save(flags);
5016 double_rq_lock(rq_src, rq_dest);
5017 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5018 rq_src->nr_uninterruptible = 0;
5019 double_rq_unlock(rq_src, rq_dest);
5020 local_irq_restore(flags);
5021 }
5022
5023 /* Run through task list and migrate tasks from the dead cpu. */
5024 static void migrate_live_tasks(int src_cpu)
5025 {
5026 struct task_struct *p, *t;
5027
5028 write_lock_irq(&tasklist_lock);
5029
5030 do_each_thread(t, p) {
5031 if (p == current)
5032 continue;
5033
5034 if (task_cpu(p) == src_cpu)
5035 move_task_off_dead_cpu(src_cpu, p);
5036 } while_each_thread(t, p);
5037
5038 write_unlock_irq(&tasklist_lock);
5039 }
5040
5041 /*
5042 * Schedules idle task to be the next runnable task on current CPU.
5043 * It does so by boosting its priority to highest possible and adding it to
5044 * the _front_ of the runqueue. Used by CPU offline code.
5045 */
5046 void sched_idle_next(void)
5047 {
5048 int this_cpu = smp_processor_id();
5049 struct rq *rq = cpu_rq(this_cpu);
5050 struct task_struct *p = rq->idle;
5051 unsigned long flags;
5052
5053 /* cpu has to be offline */
5054 BUG_ON(cpu_online(this_cpu));
5055
5056 /*
5057 * Strictly not necessary since the rest of the CPUs are stopped by now
5058 * and interrupts are disabled on the current cpu.
5059 */
5060 spin_lock_irqsave(&rq->lock, flags);
5061
5062 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5063
5064 /* Add idle task to the _front_ of its priority queue: */
5065 activate_idle_task(p, rq);
5066
5067 spin_unlock_irqrestore(&rq->lock, flags);
5068 }
5069
5070 /*
5071 * Ensures that the idle task is using init_mm right before its cpu goes
5072 * offline.
5073 */
5074 void idle_task_exit(void)
5075 {
5076 struct mm_struct *mm = current->active_mm;
5077
5078 BUG_ON(cpu_online(smp_processor_id()));
5079
5080 if (mm != &init_mm)
5081 switch_mm(mm, &init_mm, current);
5082 mmdrop(mm);
5083 }
5084
5085 /* called under rq->lock with disabled interrupts */
5086 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5087 {
5088 struct rq *rq = cpu_rq(dead_cpu);
5089
5090 /* Must be exiting, otherwise would be on tasklist. */
5091 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5092
5093 /* Cannot have done final schedule yet: would have vanished. */
5094 BUG_ON(p->state == TASK_DEAD);
5095
5096 get_task_struct(p);
5097
5098 /*
5099 * Drop lock around migration; if someone else moves it,
5100 * that's OK. No task can be added to this CPU, so iteration is
5101 * fine.
5102 * NOTE: interrupts should be left disabled --dev@
5103 */
5104 spin_unlock(&rq->lock);
5105 move_task_off_dead_cpu(dead_cpu, p);
5106 spin_lock(&rq->lock);
5107
5108 put_task_struct(p);
5109 }
5110
5111 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5112 static void migrate_dead_tasks(unsigned int dead_cpu)
5113 {
5114 struct rq *rq = cpu_rq(dead_cpu);
5115 struct task_struct *next;
5116
5117 for ( ; ; ) {
5118 if (!rq->nr_running)
5119 break;
5120 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5121 if (!next)
5122 break;
5123 migrate_dead(dead_cpu, next);
5124 }
5125 }
5126 #endif /* CONFIG_HOTPLUG_CPU */
5127
5128 /*
5129 * migration_call - callback that gets triggered when a CPU is added.
5130 * Here we can start up the necessary migration thread for the new CPU.
5131 */
5132 static int __cpuinit
5133 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5134 {
5135 struct task_struct *p;
5136 int cpu = (long)hcpu;
5137 unsigned long flags;
5138 struct rq *rq;
5139
5140 switch (action) {
5141 case CPU_LOCK_ACQUIRE:
5142 mutex_lock(&sched_hotcpu_mutex);
5143 break;
5144
5145 case CPU_UP_PREPARE:
5146 case CPU_UP_PREPARE_FROZEN:
5147 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5148 if (IS_ERR(p))
5149 return NOTIFY_BAD;
5150 p->flags |= PF_NOFREEZE;
5151 kthread_bind(p, cpu);
5152 /* Must be high prio: stop_machine expects to yield to it. */
5153 rq = task_rq_lock(p, &flags);
5154 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5155 task_rq_unlock(rq, &flags);
5156 cpu_rq(cpu)->migration_thread = p;
5157 break;
5158
5159 case CPU_ONLINE:
5160 case CPU_ONLINE_FROZEN:
5161 /* Strictly unnecessary, as the first user will wake it. */
5162 wake_up_process(cpu_rq(cpu)->migration_thread);
5163 break;
5164
5165 #ifdef CONFIG_HOTPLUG_CPU
5166 case CPU_UP_CANCELED:
5167 case CPU_UP_CANCELED_FROZEN:
5168 if (!cpu_rq(cpu)->migration_thread)
5169 break;
5170 /* Unbind it from offline cpu so it can run. Fall thru. */
5171 kthread_bind(cpu_rq(cpu)->migration_thread,
5172 any_online_cpu(cpu_online_map));
5173 kthread_stop(cpu_rq(cpu)->migration_thread);
5174 cpu_rq(cpu)->migration_thread = NULL;
5175 break;
5176
5177 case CPU_DEAD:
5178 case CPU_DEAD_FROZEN:
5179 migrate_live_tasks(cpu);
5180 rq = cpu_rq(cpu);
5181 kthread_stop(rq->migration_thread);
5182 rq->migration_thread = NULL;
5183 /* Idle task back to normal (off runqueue, low prio) */
5184 rq = task_rq_lock(rq->idle, &flags);
5185 deactivate_task(rq, rq->idle, 0);
5186 rq->idle->static_prio = MAX_PRIO;
5187 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5188 rq->idle->sched_class = &idle_sched_class;
5189 migrate_dead_tasks(cpu);
5190 task_rq_unlock(rq, &flags);
5191 migrate_nr_uninterruptible(rq);
5192 BUG_ON(rq->nr_running != 0);
5193
5194 /* No need to migrate the tasks: it was best-effort if
5195 * they didn't take sched_hotcpu_mutex. Just wake up
5196 * the requestors. */
5197 spin_lock_irq(&rq->lock);
5198 while (!list_empty(&rq->migration_queue)) {
5199 struct migration_req *req;
5200
5201 req = list_entry(rq->migration_queue.next,
5202 struct migration_req, list);
5203 list_del_init(&req->list);
5204 complete(&req->done);
5205 }
5206 spin_unlock_irq(&rq->lock);
5207 break;
5208 #endif
5209 case CPU_LOCK_RELEASE:
5210 mutex_unlock(&sched_hotcpu_mutex);
5211 break;
5212 }
5213 return NOTIFY_OK;
5214 }
5215
5216 /* Register at highest priority so that task migration (migrate_all_tasks)
5217 * happens before everything else.
5218 */
5219 static struct notifier_block __cpuinitdata migration_notifier = {
5220 .notifier_call = migration_call,
5221 .priority = 10
5222 };
5223
5224 int __init migration_init(void)
5225 {
5226 void *cpu = (void *)(long)smp_processor_id();
5227 int err;
5228
5229 /* Start one for the boot CPU: */
5230 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5231 BUG_ON(err == NOTIFY_BAD);
5232 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5233 register_cpu_notifier(&migration_notifier);
5234
5235 return 0;
5236 }
5237 #endif
5238
5239 #ifdef CONFIG_SMP
5240
5241 /* Number of possible processor ids */
5242 int nr_cpu_ids __read_mostly = NR_CPUS;
5243 EXPORT_SYMBOL(nr_cpu_ids);
5244
5245 #undef SCHED_DOMAIN_DEBUG
5246 #ifdef SCHED_DOMAIN_DEBUG
5247 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5248 {
5249 int level = 0;
5250
5251 if (!sd) {
5252 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5253 return;
5254 }
5255
5256 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5257
5258 do {
5259 int i;
5260 char str[NR_CPUS];
5261 struct sched_group *group = sd->groups;
5262 cpumask_t groupmask;
5263
5264 cpumask_scnprintf(str, NR_CPUS, sd->span);
5265 cpus_clear(groupmask);
5266
5267 printk(KERN_DEBUG);
5268 for (i = 0; i < level + 1; i++)
5269 printk(" ");
5270 printk("domain %d: ", level);
5271
5272 if (!(sd->flags & SD_LOAD_BALANCE)) {
5273 printk("does not load-balance\n");
5274 if (sd->parent)
5275 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5276 " has parent");
5277 break;
5278 }
5279
5280 printk("span %s\n", str);
5281
5282 if (!cpu_isset(cpu, sd->span))
5283 printk(KERN_ERR "ERROR: domain->span does not contain "
5284 "CPU%d\n", cpu);
5285 if (!cpu_isset(cpu, group->cpumask))
5286 printk(KERN_ERR "ERROR: domain->groups does not contain"
5287 " CPU%d\n", cpu);
5288
5289 printk(KERN_DEBUG);
5290 for (i = 0; i < level + 2; i++)
5291 printk(" ");
5292 printk("groups:");
5293 do {
5294 if (!group) {
5295 printk("\n");
5296 printk(KERN_ERR "ERROR: group is NULL\n");
5297 break;
5298 }
5299
5300 if (!group->__cpu_power) {
5301 printk("\n");
5302 printk(KERN_ERR "ERROR: domain->cpu_power not "
5303 "set\n");
5304 }
5305
5306 if (!cpus_weight(group->cpumask)) {
5307 printk("\n");
5308 printk(KERN_ERR "ERROR: empty group\n");
5309 }
5310
5311 if (cpus_intersects(groupmask, group->cpumask)) {
5312 printk("\n");
5313 printk(KERN_ERR "ERROR: repeated CPUs\n");
5314 }
5315
5316 cpus_or(groupmask, groupmask, group->cpumask);
5317
5318 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5319 printk(" %s", str);
5320
5321 group = group->next;
5322 } while (group != sd->groups);
5323 printk("\n");
5324
5325 if (!cpus_equal(sd->span, groupmask))
5326 printk(KERN_ERR "ERROR: groups don't span "
5327 "domain->span\n");
5328
5329 level++;
5330 sd = sd->parent;
5331 if (!sd)
5332 continue;
5333
5334 if (!cpus_subset(groupmask, sd->span))
5335 printk(KERN_ERR "ERROR: parent span is not a superset "
5336 "of domain->span\n");
5337
5338 } while (sd);
5339 }
5340 #else
5341 # define sched_domain_debug(sd, cpu) do { } while (0)
5342 #endif
5343
5344 static int sd_degenerate(struct sched_domain *sd)
5345 {
5346 if (cpus_weight(sd->span) == 1)
5347 return 1;
5348
5349 /* Following flags need at least 2 groups */
5350 if (sd->flags & (SD_LOAD_BALANCE |
5351 SD_BALANCE_NEWIDLE |
5352 SD_BALANCE_FORK |
5353 SD_BALANCE_EXEC |
5354 SD_SHARE_CPUPOWER |
5355 SD_SHARE_PKG_RESOURCES)) {
5356 if (sd->groups != sd->groups->next)
5357 return 0;
5358 }
5359
5360 /* Following flags don't use groups */
5361 if (sd->flags & (SD_WAKE_IDLE |
5362 SD_WAKE_AFFINE |
5363 SD_WAKE_BALANCE))
5364 return 0;
5365
5366 return 1;
5367 }
5368
5369 static int
5370 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5371 {
5372 unsigned long cflags = sd->flags, pflags = parent->flags;
5373
5374 if (sd_degenerate(parent))
5375 return 1;
5376
5377 if (!cpus_equal(sd->span, parent->span))
5378 return 0;
5379
5380 /* Does parent contain flags not in child? */
5381 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5382 if (cflags & SD_WAKE_AFFINE)
5383 pflags &= ~SD_WAKE_BALANCE;
5384 /* Flags needing groups don't count if only 1 group in parent */
5385 if (parent->groups == parent->groups->next) {
5386 pflags &= ~(SD_LOAD_BALANCE |
5387 SD_BALANCE_NEWIDLE |
5388 SD_BALANCE_FORK |
5389 SD_BALANCE_EXEC |
5390 SD_SHARE_CPUPOWER |
5391 SD_SHARE_PKG_RESOURCES);
5392 }
5393 if (~cflags & pflags)
5394 return 0;
5395
5396 return 1;
5397 }
5398
5399 /*
5400 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5401 * hold the hotplug lock.
5402 */
5403 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5404 {
5405 struct rq *rq = cpu_rq(cpu);
5406 struct sched_domain *tmp;
5407
5408 /* Remove the sched domains which do not contribute to scheduling. */
5409 for (tmp = sd; tmp; tmp = tmp->parent) {
5410 struct sched_domain *parent = tmp->parent;
5411 if (!parent)
5412 break;
5413 if (sd_parent_degenerate(tmp, parent)) {
5414 tmp->parent = parent->parent;
5415 if (parent->parent)
5416 parent->parent->child = tmp;
5417 }
5418 }
5419
5420 if (sd && sd_degenerate(sd)) {
5421 sd = sd->parent;
5422 if (sd)
5423 sd->child = NULL;
5424 }
5425
5426 sched_domain_debug(sd, cpu);
5427
5428 rcu_assign_pointer(rq->sd, sd);
5429 }
5430
5431 /* cpus with isolated domains */
5432 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5433
5434 /* Setup the mask of cpus configured for isolated domains */
5435 static int __init isolated_cpu_setup(char *str)
5436 {
5437 int ints[NR_CPUS], i;
5438
5439 str = get_options(str, ARRAY_SIZE(ints), ints);
5440 cpus_clear(cpu_isolated_map);
5441 for (i = 1; i <= ints[0]; i++)
5442 if (ints[i] < NR_CPUS)
5443 cpu_set(ints[i], cpu_isolated_map);
5444 return 1;
5445 }
5446
5447 __setup ("isolcpus=", isolated_cpu_setup);
5448
5449 /*
5450 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5451 * to a function which identifies what group (along with sched group) a CPU
5452 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5453 * (due to the fact that we keep track of groups covered with a cpumask_t).
5454 *
5455 * init_sched_build_groups will build a circular linked list of the groups
5456 * covered by the given span, and will set each group's ->cpumask correctly,
5457 * and ->cpu_power to 0.
5458 */
5459 static void
5460 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5461 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5462 struct sched_group **sg))
5463 {
5464 struct sched_group *first = NULL, *last = NULL;
5465 cpumask_t covered = CPU_MASK_NONE;
5466 int i;
5467
5468 for_each_cpu_mask(i, span) {
5469 struct sched_group *sg;
5470 int group = group_fn(i, cpu_map, &sg);
5471 int j;
5472
5473 if (cpu_isset(i, covered))
5474 continue;
5475
5476 sg->cpumask = CPU_MASK_NONE;
5477 sg->__cpu_power = 0;
5478
5479 for_each_cpu_mask(j, span) {
5480 if (group_fn(j, cpu_map, NULL) != group)
5481 continue;
5482
5483 cpu_set(j, covered);
5484 cpu_set(j, sg->cpumask);
5485 }
5486 if (!first)
5487 first = sg;
5488 if (last)
5489 last->next = sg;
5490 last = sg;
5491 }
5492 last->next = first;
5493 }
5494
5495 #define SD_NODES_PER_DOMAIN 16
5496
5497 #ifdef CONFIG_NUMA
5498
5499 /**
5500 * find_next_best_node - find the next node to include in a sched_domain
5501 * @node: node whose sched_domain we're building
5502 * @used_nodes: nodes already in the sched_domain
5503 *
5504 * Find the next node to include in a given scheduling domain. Simply
5505 * finds the closest node not already in the @used_nodes map.
5506 *
5507 * Should use nodemask_t.
5508 */
5509 static int find_next_best_node(int node, unsigned long *used_nodes)
5510 {
5511 int i, n, val, min_val, best_node = 0;
5512
5513 min_val = INT_MAX;
5514
5515 for (i = 0; i < MAX_NUMNODES; i++) {
5516 /* Start at @node */
5517 n = (node + i) % MAX_NUMNODES;
5518
5519 if (!nr_cpus_node(n))
5520 continue;
5521
5522 /* Skip already used nodes */
5523 if (test_bit(n, used_nodes))
5524 continue;
5525
5526 /* Simple min distance search */
5527 val = node_distance(node, n);
5528
5529 if (val < min_val) {
5530 min_val = val;
5531 best_node = n;
5532 }
5533 }
5534
5535 set_bit(best_node, used_nodes);
5536 return best_node;
5537 }
5538
5539 /**
5540 * sched_domain_node_span - get a cpumask for a node's sched_domain
5541 * @node: node whose cpumask we're constructing
5543 *
5544 * Given a node, construct a good cpumask for its sched_domain to span. It
5545 * should be one that prevents unnecessary balancing, but also spreads tasks
5546 * out optimally.
5547 */
5548 static cpumask_t sched_domain_node_span(int node)
5549 {
5550 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5551 cpumask_t span, nodemask;
5552 int i;
5553
5554 cpus_clear(span);
5555 bitmap_zero(used_nodes, MAX_NUMNODES);
5556
5557 nodemask = node_to_cpumask(node);
5558 cpus_or(span, span, nodemask);
5559 set_bit(node, used_nodes);
5560
5561 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5562 int next_node = find_next_best_node(node, used_nodes);
5563
5564 nodemask = node_to_cpumask(next_node);
5565 cpus_or(span, span, nodemask);
5566 }
5567
5568 return span;
5569 }
5570 #endif
5571
5572 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5573
5574 /*
5575 * SMT sched-domains:
5576 */
5577 #ifdef CONFIG_SCHED_SMT
5578 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5579 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
5580
5581 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5582 struct sched_group **sg)
5583 {
5584 if (sg)
5585 *sg = &per_cpu(sched_group_cpus, cpu);
5586 return cpu;
5587 }
5588 #endif
5589
5590 /*
5591 * multi-core sched-domains:
5592 */
5593 #ifdef CONFIG_SCHED_MC
5594 static DEFINE_PER_CPU(struct sched_domain, core_domains);
5595 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
5596 #endif
5597
5598 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5599 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5600 struct sched_group **sg)
5601 {
5602 int group;
5603 cpumask_t mask = cpu_sibling_map[cpu];
5604 cpus_and(mask, mask, *cpu_map);
5605 group = first_cpu(mask);
5606 if (sg)
5607 *sg = &per_cpu(sched_group_core, group);
5608 return group;
5609 }
5610 #elif defined(CONFIG_SCHED_MC)
5611 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5612 struct sched_group **sg)
5613 {
5614 if (sg)
5615 *sg = &per_cpu(sched_group_core, cpu);
5616 return cpu;
5617 }
5618 #endif
5619
5620 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5621 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
5622
5623 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5624 struct sched_group **sg)
5625 {
5626 int group;
5627 #ifdef CONFIG_SCHED_MC
5628 cpumask_t mask = cpu_coregroup_map(cpu);
5629 cpus_and(mask, mask, *cpu_map);
5630 group = first_cpu(mask);
5631 #elif defined(CONFIG_SCHED_SMT)
5632 cpumask_t mask = cpu_sibling_map[cpu];
5633 cpus_and(mask, mask, *cpu_map);
5634 group = first_cpu(mask);
5635 #else
5636 group = cpu;
5637 #endif
5638 if (sg)
5639 *sg = &per_cpu(sched_group_phys, group);
5640 return group;
5641 }
5642
5643 #ifdef CONFIG_NUMA
5644 /*
5645 * The init_sched_build_groups can't handle what we want to do with node
5646 * groups, so roll our own. Now each node has its own list of groups which
5647 * gets dynamically allocated.
5648 */
5649 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5650 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5651
5652 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5653 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
5654
5655 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5656 struct sched_group **sg)
5657 {
5658 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5659 int group;
5660
5661 cpus_and(nodemask, nodemask, *cpu_map);
5662 group = first_cpu(nodemask);
5663
5664 if (sg)
5665 *sg = &per_cpu(sched_group_allnodes, group);
5666 return group;
5667 }
5668
5669 static void init_numa_sched_groups_power(struct sched_group *group_head)
5670 {
5671 struct sched_group *sg = group_head;
5672 int j;
5673
5674 if (!sg)
5675 return;
5676 next_sg:
5677 for_each_cpu_mask(j, sg->cpumask) {
5678 struct sched_domain *sd;
5679
5680 sd = &per_cpu(phys_domains, j);
5681 if (j != first_cpu(sd->groups->cpumask)) {
5682 /*
5683 * Only add "power" once for each
5684 * physical package.
5685 */
5686 continue;
5687 }
5688
5689 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5690 }
5691 sg = sg->next;
5692 if (sg != group_head)
5693 goto next_sg;
5694 }
5695 #endif
5696
5697 #ifdef CONFIG_NUMA
5698 /* Free memory allocated for various sched_group structures */
5699 static void free_sched_groups(const cpumask_t *cpu_map)
5700 {
5701 int cpu, i;
5702
5703 for_each_cpu_mask(cpu, *cpu_map) {
5704 struct sched_group **sched_group_nodes
5705 = sched_group_nodes_bycpu[cpu];
5706
5707 if (!sched_group_nodes)
5708 continue;
5709
5710 for (i = 0; i < MAX_NUMNODES; i++) {
5711 cpumask_t nodemask = node_to_cpumask(i);
5712 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5713
5714 cpus_and(nodemask, nodemask, *cpu_map);
5715 if (cpus_empty(nodemask))
5716 continue;
5717
5718 if (sg == NULL)
5719 continue;
5720 sg = sg->next;
5721 next_sg:
5722 oldsg = sg;
5723 sg = sg->next;
5724 kfree(oldsg);
5725 if (oldsg != sched_group_nodes[i])
5726 goto next_sg;
5727 }
5728 kfree(sched_group_nodes);
5729 sched_group_nodes_bycpu[cpu] = NULL;
5730 }
5731 }
5732 #else
5733 static void free_sched_groups(const cpumask_t *cpu_map)
5734 {
5735 }
5736 #endif
5737
5738 /*
5739 * Initialize sched groups cpu_power.
5740 *
5741 * cpu_power indicates the capacity of sched group, which is used while
5742 * distributing the load between different sched groups in a sched domain.
5743 * Typically cpu_power for all the groups in a sched domain will be the same
5744 * unless there are asymmetries in the topology. If there are asymmetries, a
5745 * group having more cpu_power will pick up more load compared to a group
5746 * having less cpu_power.
5747 *
5748 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5749 * the maximum number of tasks a group can handle in the presence of other idle
5750 * or lightly loaded groups in the same sched domain.
5751 */
5752 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5753 {
5754 struct sched_domain *child;
5755 struct sched_group *group;
5756
5757 WARN_ON(!sd || !sd->groups);
5758
5759 if (cpu != first_cpu(sd->groups->cpumask))
5760 return;
5761
5762 child = sd->child;
5763
5764 sd->groups->__cpu_power = 0;
5765
5766 /*
5767 * For the performance policy, if the groups in the child domain share
5768 * resources (for example cores sharing some portions of the cache
5769 * hierarchy, or SMT), then set this domain's group cpu_power such that
5770 * each group can handle only one task when there are other idle groups
5771 * in the same sched domain.
5772 */
5773 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5774 (child->flags &
5775 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5776 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
5777 return;
5778 }
5779
5780 /*
5781 * add the cpu_power of each child group to this group's cpu_power
5782 */
5783 group = child->groups;
5784 do {
5785 sg_inc_cpu_power(sd->groups, group->__cpu_power);
5786 group = group->next;
5787 } while (group != child->groups);
5788 }
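
/*
 * A worked example of the rules above: for a physical package whose child
 * domain is a pair of SMT siblings (SD_SHARE_CPUPOWER set in the child)
 * and with SD_POWERSAVINGS_BALANCE clear, the package group gets exactly
 * SCHED_LOAD_SCALE, i.e. capacity for a single task. Otherwise the child
 * group powers are simply summed.
 */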
5789
5790 /*
5791 * Build sched domains for a given set of cpus and attach the sched domains
5792 * to the individual cpus
5793 */
5794 static int build_sched_domains(const cpumask_t *cpu_map)
5795 {
5796 int i;
5797 #ifdef CONFIG_NUMA
5798 struct sched_group **sched_group_nodes = NULL;
5799 int sd_allnodes = 0;
5800
5801 /*
5802 * Allocate the per-node list of sched groups
5803 */
5804 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
5805 GFP_KERNEL);
5806 if (!sched_group_nodes) {
5807 printk(KERN_WARNING "Can not alloc sched group node list\n");
5808 return -ENOMEM;
5809 }
5810 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5811 #endif
5812
5813 /*
5814 * Set up domains for cpus specified by the cpu_map.
5815 */
5816 for_each_cpu_mask(i, *cpu_map) {
5817 struct sched_domain *sd = NULL, *p;
5818 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5819
5820 cpus_and(nodemask, nodemask, *cpu_map);
5821
5822 #ifdef CONFIG_NUMA
5823 if (cpus_weight(*cpu_map) >
5824 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5825 sd = &per_cpu(allnodes_domains, i);
5826 *sd = SD_ALLNODES_INIT;
5827 sd->span = *cpu_map;
5828 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
5829 p = sd;
5830 sd_allnodes = 1;
5831 } else
5832 p = NULL;
5833
5834 sd = &per_cpu(node_domains, i);
5835 *sd = SD_NODE_INIT;
5836 sd->span = sched_domain_node_span(cpu_to_node(i));
5837 sd->parent = p;
5838 if (p)
5839 p->child = sd;
5840 cpus_and(sd->span, sd->span, *cpu_map);
5841 #endif
5842
5843 p = sd;
5844 sd = &per_cpu(phys_domains, i);
5845 *sd = SD_CPU_INIT;
5846 sd->span = nodemask;
5847 sd->parent = p;
5848 if (p)
5849 p->child = sd;
5850 cpu_to_phys_group(i, cpu_map, &sd->groups);
5851
5852 #ifdef CONFIG_SCHED_MC
5853 p = sd;
5854 sd = &per_cpu(core_domains, i);
5855 *sd = SD_MC_INIT;
5856 sd->span = cpu_coregroup_map(i);
5857 cpus_and(sd->span, sd->span, *cpu_map);
5858 sd->parent = p;
5859 p->child = sd;
5860 cpu_to_core_group(i, cpu_map, &sd->groups);
5861 #endif
5862
5863 #ifdef CONFIG_SCHED_SMT
5864 p = sd;
5865 sd = &per_cpu(cpu_domains, i);
5866 *sd = SD_SIBLING_INIT;
5867 sd->span = cpu_sibling_map[i];
5868 cpus_and(sd->span, sd->span, *cpu_map);
5869 sd->parent = p;
5870 p->child = sd;
5871 cpu_to_cpu_group(i, cpu_map, &sd->groups);
5872 #endif
5873 }
5874
5875 #ifdef CONFIG_SCHED_SMT
5876 /* Set up CPU (sibling) groups */
5877 for_each_cpu_mask(i, *cpu_map) {
5878 cpumask_t this_sibling_map = cpu_sibling_map[i];
5879 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
5880 if (i != first_cpu(this_sibling_map))
5881 continue;
5882
5883 init_sched_build_groups(this_sibling_map, cpu_map,
5884 &cpu_to_cpu_group);
5885 }
5886 #endif
5887
5888 #ifdef CONFIG_SCHED_MC
5889 /* Set up multi-core groups */
5890 for_each_cpu_mask(i, *cpu_map) {
5891 cpumask_t this_core_map = cpu_coregroup_map(i);
5892 cpus_and(this_core_map, this_core_map, *cpu_map);
5893 if (i != first_cpu(this_core_map))
5894 continue;
5895 init_sched_build_groups(this_core_map, cpu_map,
5896 &cpu_to_core_group);
5897 }
5898 #endif
5899
5900 /* Set up physical groups */
5901 for (i = 0; i < MAX_NUMNODES; i++) {
5902 cpumask_t nodemask = node_to_cpumask(i);
5903
5904 cpus_and(nodemask, nodemask, *cpu_map);
5905 if (cpus_empty(nodemask))
5906 continue;
5907
5908 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
5909 }
5910
5911 #ifdef CONFIG_NUMA
5912 /* Set up node groups */
5913 if (sd_allnodes)
5914 init_sched_build_groups(*cpu_map, cpu_map,
5915 &cpu_to_allnodes_group);
5916
5917 for (i = 0; i < MAX_NUMNODES; i++) {
5918 /* Set up node groups */
5919 struct sched_group *sg, *prev;
5920 cpumask_t nodemask = node_to_cpumask(i);
5921 cpumask_t domainspan;
5922 cpumask_t covered = CPU_MASK_NONE;
5923 int j;
5924
5925 cpus_and(nodemask, nodemask, *cpu_map);
5926 if (cpus_empty(nodemask)) {
5927 sched_group_nodes[i] = NULL;
5928 continue;
5929 }
5930
5931 domainspan = sched_domain_node_span(i);
5932 cpus_and(domainspan, domainspan, *cpu_map);
5933
5934 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
5935 if (!sg) {
5936 printk(KERN_WARNING "Can not alloc domain group for "
5937 "node %d\n", i);
5938 goto error;
5939 }
5940 sched_group_nodes[i] = sg;
5941 for_each_cpu_mask(j, nodemask) {
5942 struct sched_domain *sd;
5943
5944 sd = &per_cpu(node_domains, j);
5945 sd->groups = sg;
5946 }
5947 sg->__cpu_power = 0;
5948 sg->cpumask = nodemask;
5949 sg->next = sg;
5950 cpus_or(covered, covered, nodemask);
5951 prev = sg;
5952
5953 for (j = 0; j < MAX_NUMNODES; j++) {
5954 cpumask_t tmp, notcovered;
5955 int n = (i + j) % MAX_NUMNODES;
5956
5957 cpus_complement(notcovered, covered);
5958 cpus_and(tmp, notcovered, *cpu_map);
5959 cpus_and(tmp, tmp, domainspan);
5960 if (cpus_empty(tmp))
5961 break;
5962
5963 nodemask = node_to_cpumask(n);
5964 cpus_and(tmp, tmp, nodemask);
5965 if (cpus_empty(tmp))
5966 continue;
5967
5968 sg = kmalloc_node(sizeof(struct sched_group),
5969 GFP_KERNEL, i);
5970 if (!sg) {
5971 printk(KERN_WARNING
5972 "Can not alloc domain group for node %d\n", j);
5973 goto error;
5974 }
5975 sg->__cpu_power = 0;
5976 sg->cpumask = tmp;
5977 sg->next = prev->next;
5978 cpus_or(covered, covered, tmp);
5979 prev->next = sg;
5980 prev = sg;
5981 }
5982 }
5983 #endif
5984
5985 /* Calculate CPU power for physical packages and nodes */
5986 #ifdef CONFIG_SCHED_SMT
5987 for_each_cpu_mask(i, *cpu_map) {
5988 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5989
5990 init_sched_groups_power(i, sd);
5991 }
5992 #endif
5993 #ifdef CONFIG_SCHED_MC
5994 for_each_cpu_mask(i, *cpu_map) {
5995 struct sched_domain *sd = &per_cpu(core_domains, i);
5996
5997 init_sched_groups_power(i, sd);
5998 }
5999 #endif
6000
6001 for_each_cpu_mask(i, *cpu_map) {
6002 struct sched_domain *sd = &per_cpu(phys_domains, i);
6003
6004 init_sched_groups_power(i, sd);
6005 }
6006
6007 #ifdef CONFIG_NUMA
6008 for (i = 0; i < MAX_NUMNODES; i++)
6009 init_numa_sched_groups_power(sched_group_nodes[i]);
6010
6011 if (sd_allnodes) {
6012 struct sched_group *sg;
6013
6014 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6015 init_numa_sched_groups_power(sg);
6016 }
6017 #endif
6018
6019 /* Attach the domains */
6020 for_each_cpu_mask(i, *cpu_map) {
6021 struct sched_domain *sd;
6022 #ifdef CONFIG_SCHED_SMT
6023 sd = &per_cpu(cpu_domains, i);
6024 #elif defined(CONFIG_SCHED_MC)
6025 sd = &per_cpu(core_domains, i);
6026 #else
6027 sd = &per_cpu(phys_domains, i);
6028 #endif
6029 cpu_attach_domain(sd, i);
6030 }
6031
6032 return 0;
6033
6034 #ifdef CONFIG_NUMA
6035 error:
6036 free_sched_groups(cpu_map);
6037 return -ENOMEM;
6038 #endif
6039 }
6040 /*
6041 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6042 */
6043 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6044 {
6045 cpumask_t cpu_default_map;
6046 int err;
6047
6048 /*
6049 * Set up mask for cpus without special case scheduling requirements.
6050 * For now this just excludes isolated cpus, but could be used to
6051 * exclude other special cases in the future.
6052 */
6053 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6054
6055 err = build_sched_domains(&cpu_default_map);
6056
6057 return err;
6058 }
6059
6060 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6061 {
6062 free_sched_groups(cpu_map);
6063 }
6064
6065 /*
6066 * Detach sched domains from a group of cpus specified in cpu_map.
6067 * These cpus will now be attached to the NULL domain.
6068 */
6069 static void detach_destroy_domains(const cpumask_t *cpu_map)
6070 {
6071 int i;
6072
6073 for_each_cpu_mask(i, *cpu_map)
6074 cpu_attach_domain(NULL, i);
6075 synchronize_sched();
6076 arch_destroy_sched_domains(cpu_map);
6077 }
6078
6079 /*
6080 * Partition sched domains as specified by the cpumasks below.
6081 * This attaches all cpus from the cpumasks to the NULL domain,
6082 * waits for an RCU quiescent period, recalculates sched
6083 * domain information and then attaches them back to the
6084 * correct sched domains.
6085 * Call with the hotplug lock held.
6086 */
6087 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6088 {
6089 cpumask_t change_map;
6090 int err = 0;
6091
6092 cpus_and(*partition1, *partition1, cpu_online_map);
6093 cpus_and(*partition2, *partition2, cpu_online_map);
6094 cpus_or(change_map, *partition1, *partition2);
6095
6096 /* Detach sched domains from all of the affected cpus */
6097 detach_destroy_domains(&change_map);
6098 if (!cpus_empty(*partition1))
6099 err = build_sched_domains(partition1);
6100 if (!err && !cpus_empty(*partition2))
6101 err = build_sched_domains(partition2);
6102
6103 return err;
6104 }
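
/*
 * Illustrative sketch, not part of the original file: how a caller that
 * already holds the hotplug lock (as required above) might split the
 * online cpus into two exclusive partitions and rebuild their domains.
 * The split point and the function name are made up for illustration.
 */
static int example_repartition_sched_domains(int example_split_cpu)
{
	cpumask_t part1 = CPU_MASK_NONE, part2 = CPU_MASK_NONE;
	int cpu;

	for_each_online_cpu(cpu) {
		if (cpu < example_split_cpu)
			cpu_set(cpu, part1);
		else
			cpu_set(cpu, part2);
	}

	/* detaches both partitions, waits for RCU, then rebuilds them */
	return partition_sched_domains(&part1, &part2);
}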
6105
6106 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6107 int arch_reinit_sched_domains(void)
6108 {
6109 int err;
6110
6111 mutex_lock(&sched_hotcpu_mutex);
6112 detach_destroy_domains(&cpu_online_map);
6113 err = arch_init_sched_domains(&cpu_online_map);
6114 mutex_unlock(&sched_hotcpu_mutex);
6115
6116 return err;
6117 }
6118
6119 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6120 {
6121 int ret;
6122
6123 if (buf[0] != '0' && buf[0] != '1')
6124 return -EINVAL;
6125
6126 if (smt)
6127 sched_smt_power_savings = (buf[0] == '1');
6128 else
6129 sched_mc_power_savings = (buf[0] == '1');
6130
6131 ret = arch_reinit_sched_domains();
6132
6133 return ret ? ret : count;
6134 }
6135
6136 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6137 {
6138 int err = 0;
6139
6140 #ifdef CONFIG_SCHED_SMT
6141 if (smt_capable())
6142 err = sysfs_create_file(&cls->kset.kobj,
6143 &attr_sched_smt_power_savings.attr);
6144 #endif
6145 #ifdef CONFIG_SCHED_MC
6146 if (!err && mc_capable())
6147 err = sysfs_create_file(&cls->kset.kobj,
6148 &attr_sched_mc_power_savings.attr);
6149 #endif
6150 return err;
6151 }
6152 #endif
6153
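/*
 * The show/store handlers below back the sched_mc_power_savings and
 * sched_smt_power_savings attributes registered by
 * sched_create_sysfs_power_savings_entries() above; writing '0' or '1'
 * to either file flips the corresponding flag and rebuilds the domains
 * via arch_reinit_sched_domains().
 */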
6154 #ifdef CONFIG_SCHED_MC
6155 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6156 {
6157 return sprintf(page, "%u\n", sched_mc_power_savings);
6158 }
6159 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6160 const char *buf, size_t count)
6161 {
6162 return sched_power_savings_store(buf, count, 0);
6163 }
6164 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6165 sched_mc_power_savings_store);
6166 #endif
6167
6168 #ifdef CONFIG_SCHED_SMT
6169 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6170 {
6171 return sprintf(page, "%u\n", sched_smt_power_savings);
6172 }
6173 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6174 const char *buf, size_t count)
6175 {
6176 return sched_power_savings_store(buf, count, 1);
6177 }
6178 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6179 sched_smt_power_savings_store);
6180 #endif
6181
6182 /*
6183 * Force a reinitialization of the sched domains hierarchy. The domains
6184 * and groups cannot be updated in place without racing with the balancing
6185 * code, so we temporarily attach all running cpus to the NULL domain
6186 * which will prevent rebalancing while the sched domains are recalculated.
6187 */
6188 static int update_sched_domains(struct notifier_block *nfb,
6189 unsigned long action, void *hcpu)
6190 {
6191 switch (action) {
6192 case CPU_UP_PREPARE:
6193 case CPU_UP_PREPARE_FROZEN:
6194 case CPU_DOWN_PREPARE:
6195 case CPU_DOWN_PREPARE_FROZEN:
6196 detach_destroy_domains(&cpu_online_map);
6197 return NOTIFY_OK;
6198
6199 case CPU_UP_CANCELED:
6200 case CPU_UP_CANCELED_FROZEN:
6201 case CPU_DOWN_FAILED:
6202 case CPU_DOWN_FAILED_FROZEN:
6203 case CPU_ONLINE:
6204 case CPU_ONLINE_FROZEN:
6205 case CPU_DEAD:
6206 case CPU_DEAD_FROZEN:
6207 /*
6208 * Fall through and re-initialise the domains.
6209 */
6210 break;
6211 default:
6212 return NOTIFY_DONE;
6213 }
6214
6215 /* The hotplug lock is already held by cpu_up/cpu_down */
6216 arch_init_sched_domains(&cpu_online_map);
6217
6218 return NOTIFY_OK;
6219 }
6220
6221 void __init sched_init_smp(void)
6222 {
6223 cpumask_t non_isolated_cpus;
6224
6225 mutex_lock(&sched_hotcpu_mutex);
6226 arch_init_sched_domains(&cpu_online_map);
6227 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6228 if (cpus_empty(non_isolated_cpus))
6229 cpu_set(smp_processor_id(), non_isolated_cpus);
6230 mutex_unlock(&sched_hotcpu_mutex);
6231 /* XXX: Theoretical race here - CPU may be hotplugged now */
6232 hotcpu_notifier(update_sched_domains, 0);
6233
6234 /* Move init over to a non-isolated CPU */
6235 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6236 BUG();
6237 sched_init_granularity();
6238 }
6239 #else
6240 void __init sched_init_smp(void)
6241 {
6242 sched_init_granularity();
6243 }
6244 #endif /* CONFIG_SMP */
6245
6246 int in_sched_functions(unsigned long addr)
6247 {
6248 /* Linker adds these: start and end of __sched functions */
6249 extern char __sched_text_start[], __sched_text_end[];
6250
6251 return in_lock_functions(addr) ||
6252 (addr >= (unsigned long)__sched_text_start
6253 && addr < (unsigned long)__sched_text_end);
6254 }
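
/*
 * Illustrative sketch, not part of the original file: skipping
 * scheduler-internal return addresses the way a get_wchan()-style stack
 * walker might, using the check above.  The address array and the
 * function name are made up for illustration.
 */
static unsigned long example_first_non_sched_addr(unsigned long *pcs, int n)
{
	int i;

	for (i = 0; i < n; i++)
		if (!in_sched_functions(pcs[i]))
			return pcs[i];
	return 0;
}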
6255
6256 static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6257 {
6258 cfs_rq->tasks_timeline = RB_ROOT;
6259 cfs_rq->fair_clock = 1;
6260 #ifdef CONFIG_FAIR_GROUP_SCHED
6261 cfs_rq->rq = rq;
6262 #endif
6263 }
6264
6265 void __init sched_init(void)
6266 {
6267 u64 now = sched_clock();
6268 int highest_cpu = 0;
6269 int i, j;
6270
6271 /*
6272 * Link up the scheduling class hierarchy:
6273 */
6274 rt_sched_class.next = &fair_sched_class;
6275 fair_sched_class.next = &idle_sched_class;
6276 idle_sched_class.next = NULL;
6277
6278 for_each_possible_cpu(i) {
6279 struct rt_prio_array *array;
6280 struct rq *rq;
6281
6282 rq = cpu_rq(i);
6283 spin_lock_init(&rq->lock);
6284 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6285 rq->nr_running = 0;
6286 rq->clock = 1;
6287 init_cfs_rq(&rq->cfs, rq);
6288 #ifdef CONFIG_FAIR_GROUP_SCHED
6289 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6290 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6291 #endif
6292 rq->ls.load_update_last = now;
6293 rq->ls.load_update_start = now;
6294
6295 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6296 rq->cpu_load[j] = 0;
6297 #ifdef CONFIG_SMP
6298 rq->sd = NULL;
6299 rq->active_balance = 0;
6300 rq->next_balance = jiffies;
6301 rq->push_cpu = 0;
6302 rq->cpu = i;
6303 rq->migration_thread = NULL;
6304 INIT_LIST_HEAD(&rq->migration_queue);
6305 #endif
6306 atomic_set(&rq->nr_iowait, 0);
6307
6308 array = &rq->rt.active;
6309 for (j = 0; j < MAX_RT_PRIO; j++) {
6310 INIT_LIST_HEAD(array->queue + j);
6311 __clear_bit(j, array->bitmap);
6312 }
6313 highest_cpu = i;
6314 /* delimiter for bitsearch: */
6315 __set_bit(MAX_RT_PRIO, array->bitmap);
6316 }
6317
6318 set_load_weight(&init_task);
6319
6320 #ifdef CONFIG_SMP
6321 nr_cpu_ids = highest_cpu + 1;
6322 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6323 #endif
6324
6325 #ifdef CONFIG_RT_MUTEXES
6326 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6327 #endif
6328
6329 /*
6330 * The boot idle thread does lazy MMU switching as well:
6331 */
6332 atomic_inc(&init_mm.mm_count);
6333 enter_lazy_tlb(&init_mm, current);
6334
6335 /*
6336 * Make us the idle thread. Technically, schedule() should not be
6337 * called from this thread; however, somewhere below it might be.
6338 * Because we are the idle thread, we just pick up running again
6339 * when this runqueue becomes "idle".
6340 */
6341 init_idle(current, smp_processor_id());
6342 /*
6343 * During early bootup we pretend to be a normal task:
6344 */
6345 current->sched_class = &fair_sched_class;
6346 }
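
/*
 * Illustrative sketch, not part of the original file: walking the singly
 * linked class list set up in sched_init() above, from the highest
 * priority class (rt) down to idle.  The callback is made up for
 * illustration.
 */
static void example_for_each_sched_class(void (*fn)(const struct sched_class *))
{
	const struct sched_class *class;

	for (class = &rt_sched_class; class; class = class->next)
		fn(class);
}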
6347
6348 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6349 void __might_sleep(char *file, int line)
6350 {
6351 #ifdef in_atomic
6352 static unsigned long prev_jiffy; /* ratelimiting */
6353
6354 if ((in_atomic() || irqs_disabled()) &&
6355 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6356 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6357 return;
6358 prev_jiffy = jiffies;
6359 printk(KERN_ERR "BUG: sleeping function called from invalid"
6360 " context at %s:%d\n", file, line);
6361 printk("in_atomic():%d, irqs_disabled():%d\n",
6362 in_atomic(), irqs_disabled());
6363 debug_show_held_locks(current);
6364 if (irqs_disabled())
6365 print_irqtrace_events(current);
6366 dump_stack();
6367 }
6368 #endif
6369 }
6370 EXPORT_SYMBOL(__might_sleep);
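
/*
 * Illustrative sketch, not part of the original file: the pattern the
 * check above warns about, i.e. calling a function that may sleep while
 * holding a spinlock (atomic context).  The lock and the function name
 * are made up for illustration only.
 */
static DEFINE_SPINLOCK(example_lock);

static void example_sleep_while_atomic(void)
{
	void *p;

	spin_lock(&example_lock);
	p = kmalloc(16, GFP_KERNEL);	/* may sleep: __might_sleep() territory */
	kfree(p);
	spin_unlock(&example_lock);
}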
6371 #endif
6372
6373 #ifdef CONFIG_MAGIC_SYSRQ
6374 void normalize_rt_tasks(void)
6375 {
6376 struct task_struct *g, *p;
6377 unsigned long flags;
6378 struct rq *rq;
6379 int on_rq;
6380
6381 read_lock_irq(&tasklist_lock);
6382 do_each_thread(g, p) {
6383 p->se.fair_key = 0;
6384 p->se.wait_runtime = 0;
6385 p->se.wait_start_fair = 0;
6386 p->se.wait_start = 0;
6387 p->se.exec_start = 0;
6388 p->se.sleep_start = 0;
6389 p->se.sleep_start_fair = 0;
6390 p->se.block_start = 0;
6391 task_rq(p)->cfs.fair_clock = 0;
6392 task_rq(p)->clock = 0;
6393
6394 if (!rt_task(p)) {
6395 /*
6396 * Renice negative nice level userspace
6397 * tasks back to 0:
6398 */
6399 if (TASK_NICE(p) < 0 && p->mm)
6400 set_user_nice(p, 0);
6401 continue;
6402 }
6403
6404 spin_lock_irqsave(&p->pi_lock, flags);
6405 rq = __task_rq_lock(p);
6406 #ifdef CONFIG_SMP
6407 /*
6408 * Do not touch the migration thread:
6409 */
6410 if (p == rq->migration_thread)
6411 goto out_unlock;
6412 #endif
6413
6414 on_rq = p->se.on_rq;
6415 if (on_rq)
6416 deactivate_task(task_rq(p), p, 0);
6417 __setscheduler(rq, p, SCHED_NORMAL, 0);
6418 if (on_rq) {
6419 activate_task(task_rq(p), p, 0);
6420 resched_task(rq->curr);
6421 }
6422 #ifdef CONFIG_SMP
6423 out_unlock:
6424 #endif
6425 __task_rq_unlock(rq);
6426 spin_unlock_irqrestore(&p->pi_lock, flags);
6427 } while_each_thread(g, p);
6428
6429 read_unlock_irq(&tasklist_lock);
6430 }
6431
6432 #endif /* CONFIG_MAGIC_SYSRQ */
6433
6434 #ifdef CONFIG_IA64
6435 /*
6436 * These functions are only useful for the IA64 MCA handling.
6437 *
6438 * They can only be called when the whole system has been
6439 * stopped - every CPU needs to be quiescent, and no scheduling
6440 * activity can take place. Using them for anything else would
6441 * be a serious bug, and as a result, they aren't even visible
6442 * under any other configuration.
6443 */
6444
6445 /**
6446 * curr_task - return the current task for a given cpu.
6447 * @cpu: the processor in question.
6448 *
6449 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6450 */
6451 struct task_struct *curr_task(int cpu)
6452 {
6453 return cpu_curr(cpu);
6454 }
6455
6456 /**
6457 * set_curr_task - set the current task for a given cpu.
6458 * @cpu: the processor in question.
6459 * @p: the task pointer to set.
6460 *
6461 * Description: This function must only be used when non-maskable interrupts
6462 * are serviced on a separate stack. It allows the architecture to switch the
6463 * notion of the current task on a cpu in a non-blocking manner. This function
6464 * must be called with all CPUs synchronized and interrupts disabled, and the
6465 * caller must save the original value of the current task (see
6466 * curr_task() above) and restore that value before reenabling interrupts and
6467 * restarting the system.
6468 *
6469 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6470 */
6471 void set_curr_task(int cpu, struct task_struct *p)
6472 {
6473 cpu_curr(cpu) = p;
6474 }
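
/*
 * Illustrative sketch, not part of the original file: the
 * save/switch/restore pattern described in the comment above.  The
 * handler task pointer and the function name are made up for
 * illustration; as stated above, the whole system must be stopped.
 */
static void example_mca_switch_task(int cpu, struct task_struct *handler_task)
{
	struct task_struct *saved = curr_task(cpu);

	set_curr_task(cpu, handler_task);
	/* ... service the machine check on the handler's own stack ... */
	set_curr_task(cpu, saved);
}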
6475
6476 #endif