/*
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *              make semaphores SMP safe
 *  1998-11-19  Implemented schedule_timeout() and related stuff
 *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
 *              hybrid priority-list and round-robin design with
 *              an array-switch method of distributing timeslices
 *              and per-CPU runqueues.  Cleanups and useful suggestions
 *              by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03  Interactivity tuning by Con Kolivas.
 *  2004-04-02  Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing (smp-nice) and other improvements
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 */
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/reciprocal_div.h>
#include <linux/unistd.h>
/*
 * Scheduler clock - returns current time in nanosec units.
 * This is the default implementation.
 * Architectures and sub-architectures can override this.
 */
unsigned long long __attribute__((weak)) sched_clock(void)
{
	return (unsigned long long)jiffies * (1000000000 / HZ);
}
/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
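
/*
 * Illustrative note (not part of the original source): with the usual
 * MAX_RT_PRIO == 100 and MAX_PRIO == 140, NICE_TO_PRIO() maps nice
 * -20/0/+19 to static priorities 100/120/139, and TASK_USER_PRIO() folds
 * those back into the [ 0 ... 39 ] user-priority range as 0, 20 and 39.
 */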
/*
 * Some helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))

#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define NICE_0_SHIFT		SCHED_LOAD_SHIFT

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
 * Timeslices get refilled after they expire.
 */
#define MIN_TIMESLICE		max(5 * HZ / 1000, 1)
#define DEF_TIMESLICE		(100 * HZ / 1000)

/*
 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
 * Since cpu_power is a 'constant', we can use a reciprocal divide.
 */
static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
{
	return reciprocal_divide(load, sg->reciprocal_cpu_power);
}

/*
 * Each time a sched group cpu_power is changed,
 * we must compute its reciprocal value
 */
static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
{
	sg->__cpu_power += val;
	sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
}
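
/*
 * Illustrative note (not from the original source): reciprocal_value(x)
 * precomputes roughly 2^32/x, so sg_div_cpu_power() replaces a division by
 * __cpu_power with a multiply and shift, e.g. load/1024 becomes
 * ((u64)load * reciprocal_value(1024)) >> 32, which is cheaper than a
 * hardware divide on most CPUs.
 */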
#define SCALE_PRIO(x, prio) \
	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

/*
 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
 * to time slice values: [800ms ... 100ms ... 5ms]
 */
static unsigned int static_prio_timeslice(int static_prio)
{
	if (static_prio == NICE_TO_PRIO(19))
		return 1;

	if (static_prio < NICE_TO_PRIO(0))
		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
	else
		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
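
/*
 * Worked example (illustrative, not in the original source): with HZ=1000,
 * DEF_TIMESLICE is 100 jiffies. A nice 0 task (prio 120) gets
 * SCALE_PRIO(100, 120) = max(100 * 20 / 20, 5) = 100ms, nice -20 gets
 * SCALE_PRIO(400, 100) = 800ms, and nice +19 is special-cased to 1 jiffy.
 */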
static inline int rt_policy(int policy)
{
	if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
		return 1;
	return 0;
}

static inline int task_has_rt_policy(struct task_struct *p)
{
	return rt_policy(p->policy);
}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_RT_PRIO];
};

struct load_stat {
	struct load_weight load;
	u64 load_update_start, load_update_last;
	unsigned long delta_fair, delta_exec, delta_stat;
};
/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;
	unsigned long nr_running;

	s64 fair_clock;
	s64 wait_runtime;
	unsigned long wait_runtime_overruns, wait_runtime_underruns;

	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;
	struct rb_node *rb_load_balance_curr;
#ifdef CONFIG_FAIR_GROUP_SCHED
	/* 'curr' points to the currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity *curr;
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together the list of leaf cfs_rq's in a cpu.
	 * This list is used during load balance.
	 */
	struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
#endif
};

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	struct rt_prio_array active;
	int rt_load_balance_idx;
	struct list_head *rt_load_balance_head, *rt_load_balance_curr;
};
/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
	spinlock_t lock;	/* runqueue lock */

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned long nr_running;
	#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
	unsigned char idle_at_tick;
	unsigned char in_nohz_recently;

	struct load_stat ls;	/* capture load from *all* tasks on this cpu */
	unsigned long nr_load_updates;
	u64 nr_switches;

	struct cfs_rq cfs;
#ifdef CONFIG_FAIR_GROUP_SCHED
	struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
#endif

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	unsigned long nr_uninterruptible;

	struct task_struct *curr, *idle;
	unsigned long next_balance;
	struct mm_struct *prev_mm;

	u64 clock, prev_clock_raw;
	s64 clock_max_delta;

	unsigned int clock_warps, clock_overflows;
	unsigned int clock_unstable_events;

	struct sched_class *load_balance_class;

	atomic_t nr_iowait;

	struct sched_domain *sd;

	/* For active balancing */
	int cpu;		/* cpu of this runqueue */

	struct task_struct *migration_thread;
	struct list_head migration_queue;

#ifdef CONFIG_SCHEDSTATS
	struct sched_info rq_sched_info;

	/* sys_sched_yield() stats */
	unsigned long yld_exp_empty;
	unsigned long yld_act_empty;
	unsigned long yld_both_empty;
	unsigned long yld_cnt;

	/* schedule() stats */
	unsigned long sched_switch;
	unsigned long sched_cnt;
	unsigned long sched_goidle;

	/* try_to_wake_up() stats */
	unsigned long ttwu_cnt;
	unsigned long ttwu_local;
#endif
	struct lock_class_key rq_lock_key;
};

static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
static DEFINE_MUTEX(sched_hotcpu_mutex);
static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
{
	rq->curr->sched_class->check_preempt_curr(rq, p);
}

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}
/*
 * Per-runqueue clock, as fine-grained as the platform can give us:
 */
static unsigned long long __rq_clock(struct rq *rq)
{
	u64 prev_raw = rq->prev_clock_raw;
	u64 now = sched_clock();
	s64 delta = now - prev_raw;
	u64 clock = rq->clock;

	/*
	 * Protect against sched_clock() occasionally going backwards:
	 */
	if (unlikely(delta < 0)) {
		clock++;
		rq->clock_warps++;
	} else {
		/*
		 * Catch too large forward jumps too:
		 */
		if (unlikely(delta > 2*TICK_NSEC)) {
			clock++;
			rq->clock_overflows++;
		} else {
			if (unlikely(delta > rq->clock_max_delta))
				rq->clock_max_delta = delta;
			clock += delta;
		}
	}

	rq->prev_clock_raw = now;
	rq->clock = clock;

	return clock;
}

static inline unsigned long long rq_clock(struct rq *rq)
{
	int this_cpu = smp_processor_id();

	if (this_cpu == cpu_of(rq))
		return __rq_clock(rq);

	return rq->clock;
}
/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		(&__get_cpu_var(runqueues))
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Change a task's ->cfs_rq if it moves across CPUs */
static inline void set_task_cfs_rq(struct task_struct *p)
{
	p->se.cfs_rq = &task_rq(p)->cfs;
}
#else
static inline void set_task_cfs_rq(struct task_struct *p)
{
}
#endif

#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev)	do { } while (0)
#endif
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	return rq->curr == p;
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	/* this is a valid case when another task releases the spinlock */
	rq->lock.owner = current;
#endif
	/*
	 * If we are tracking spinlock dependencies then we have to
	 * fix up the runqueue lock - which gets 'carried over' from
	 * prev into current:
	 */
	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

	spin_unlock_irq(&rq->lock);
}

#else /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	return rq->curr == p;
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
	/*
	 * We can optimise this out completely for !SMP, because the
	 * SMP rebalancing from interrupt is the only thing that cares
	 * here.
	 */
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	spin_unlock_irq(&rq->lock);
#else
	spin_unlock(&rq->lock);
#endif
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
	/*
	 * After ->oncpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 */
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called with interrupts disabled.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
{
	struct rq *rq;

repeat_lock_task:
	rq = task_rq(p);
	spin_lock(&rq->lock);
	if (unlikely(rq != task_rq(p))) {
		spin_unlock(&rq->lock);
		goto repeat_lock_task;
	}
	return rq;
}

/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
{
	struct rq *rq;

repeat_lock_task:
	local_irq_save(*flags);
	rq = task_rq(p);
	spin_lock(&rq->lock);
	if (unlikely(rq != task_rq(p))) {
		spin_unlock_irqrestore(&rq->lock, *flags);
		goto repeat_lock_task;
	}
	return rq;
}

static inline void __task_rq_unlock(struct rq *rq)
{
	spin_unlock(&rq->lock);
}

static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
{
	spin_unlock_irqrestore(&rq->lock, *flags);
}

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static inline struct rq *this_rq_lock(void)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	spin_lock(&rq->lock);

	return rq;
}
/*
 * CPU frequency is/was unstable - start new by setting prev_clock_raw:
 */
void sched_clock_unstable_event(void)
{
	unsigned long flags;
	struct rq *rq;

	rq = task_rq_lock(current, &flags);
	rq->prev_clock_raw = sched_clock();
	rq->clock_unstable_events++;
	task_rq_unlock(rq, &flags);
}
/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
#ifdef CONFIG_SMP

#ifndef tsk_is_polling
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif

static void resched_task(struct task_struct *p)
{
	int cpu;

	assert_spin_locked(&task_rq(p)->lock);

	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
		return;

	set_tsk_thread_flag(p, TIF_NEED_RESCHED);

	cpu = task_cpu(p);
	if (cpu == smp_processor_id())
		return;

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(p))
		smp_send_reschedule(cpu);
}

static void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_task(cpu_curr(cpu));
	spin_unlock_irqrestore(&rq->lock, flags);
}
#else
static inline void resched_task(struct task_struct *p)
{
	assert_spin_locked(&task_rq(p)->lock);
	set_tsk_need_resched(p);
}
#endif
static u64 div64_likely32(u64 divident, unsigned long divisor)
{
#if BITS_PER_LONG == 32
	if (likely(divident <= 0xffffffffULL))
		return (u32)divident / divisor;
	do_div(divident, divisor);

	return divident;
#else
	return divident / divisor;
#endif
}

#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32
static inline unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	if (unlikely(!lw->inv_weight))
		lw->inv_weight = WMULT_CONST / lw->weight;

	tmp = (u64)delta_exec * weight;
	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST)) {
		tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
				>> (WMULT_SHIFT/2);
	} else {
		tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
	}

	return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
}
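
/*
 * Worked example (illustrative, not from the original source):
 * calc_delta_mine(delta, weight, lw) is roughly delta * weight / lw->weight,
 * computed with the precalculated inverse. For a nice 0 task,
 * weight == NICE_0_LOAD == 1024, so on a runqueue whose total load weight is
 * 2048 the scaled delta comes out at about half of the raw delta_exec.
 */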
static inline unsigned long
calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
{
	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
}

static void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static void __update_curr_load(struct rq *rq, struct load_stat *ls)
{
	if (rq->curr != rq->idle && ls->load.weight) {
		ls->delta_exec += ls->delta_stat;
		ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
		ls->delta_stat = 0;
	}
}
/*
 * Update delta_exec, delta_fair fields for rq.
 *
 * delta_fair clock advances at a rate inversely proportional to
 * total load (rq->ls.load.weight) on the runqueue, while
 * delta_exec advances at the same rate as wall-clock (provided
 * cpu is not idle).
 *
 * delta_exec / delta_fair is a measure of the (smoothed) load on this
 * runqueue over any given interval. This (smoothed) load is used
 * during load balance.
 *
 * This function is called /before/ updating rq->ls.load
 * and when switching tasks.
 */
static void update_curr_load(struct rq *rq, u64 now)
{
	struct load_stat *ls = &rq->ls;
	u64 start;

	start = ls->load_update_start;
	ls->load_update_start = now;
	ls->delta_stat += now - start;
	/*
	 * Stagger updates to ls->delta_fair. Very frequent updates
	 * can be expensive.
	 */
	if (ls->delta_stat >= sysctl_sched_stat_granularity)
		__update_curr_load(rq, ls);
}
/*
 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 * of tasks with abnormal "nice" values across CPUs, the contribution that
 * each task makes to its run queue's load is weighted according to its
 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
 * scaled version of the new time slice allocation that they receive on time
 * slice expiry etc.
 */

/*
 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
 * If static_prio_timeslice() is ever changed to break this assumption then
 * this code will need modification.
 */
#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
#define load_weight(lp) \
	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
#define PRIO_TO_LOAD_WEIGHT(prio) \
	load_weight(static_prio_timeslice(prio))
#define RTPRIO_TO_LOAD_WEIGHT(rp) \
	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))

#define WEIGHT_IDLEPRIO		2
#define WMULT_IDLEPRIO		(1 << 31)
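
/*
 * Worked example (illustrative, not in the original source): with HZ=1000,
 * TIME_SLICE_NICE_ZERO is 100 jiffies, so load_weight(100) == SCHED_LOAD_SCALE
 * (1024) and PRIO_TO_LOAD_WEIGHT(NICE_TO_PRIO(0)) == 1024, i.e. a nice 0 task
 * contributes exactly one SCHED_LOAD_SCALE unit of runqueue load, while
 * SCHED_IDLE tasks contribute only the tiny WEIGHT_IDLEPRIO.
 */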
/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
static const int prio_to_weight[40] = {
 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
 /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
 /*   0 */ NICE_0_LOAD /* 1024 */,
 /*   1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
 /*  10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
};
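
/*
 * Worked example (illustrative, not from the original source): adjacent
 * entries in prio_to_weight[] differ by roughly the 1.25 multiplier named
 * above, e.g. nice 0 vs nice 1 is 1024/819 ~= 1.25. Two such tasks sharing
 * a CPU therefore split it roughly 55%/45%, i.e. one ends up ~10% above and
 * the other ~10% below an even share.
 */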
/*
 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetic by turning divisions
 * into multiplications:
 */
static const u32 prio_to_wmult[40] = {
	48356, 60446, 75558, 94446, 118058, 147573,
	184467, 230589, 288233, 360285, 450347,
	562979, 703746, 879575, 1099582, 1374389,
	1717986, 2147483, 2684354, 3355443, 4194304,
	5244160, 6557201, 8196502, 10250518, 12782640,
	16025997, 19976592, 24970740, 31350126, 39045157,
	49367440, 61356675, 76695844, 95443717, 119304647,
	148102320, 186737708, 238609294, 286331153,
};
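
/*
 * Illustrative check (not from the original source): each entry is
 * 2^32 / prio_to_weight[i], e.g. for nice 0 the inverse is
 * 4294967296 / 1024 == 4194304, the middle entry of the table above.
 */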
static inline void
inc_load(struct rq *rq, const struct task_struct *p, u64 now)
{
	update_curr_load(rq, now);
	update_load_add(&rq->ls.load, p->se.load.weight);
}

static inline void
dec_load(struct rq *rq, const struct task_struct *p, u64 now)
{
	update_curr_load(rq, now);
	update_load_sub(&rq->ls.load, p->se.load.weight);
}

static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
{
	rq->nr_running++;
	inc_load(rq, p, now);
}

static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
{
	rq->nr_running--;
	dec_load(rq, p, now);
}

static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
/*
 * runqueue iterator, to support SMP load-balancing between different
 * scheduling classes, without having to expose their internal data
 * structures to the load-balancing proper:
 */
struct rq_iterator {
	void *arg;
	struct task_struct *(*start)(void *);
	struct task_struct *(*next)(void *);
};

static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
		      unsigned long max_nr_move, unsigned long max_load_move,
		      struct sched_domain *sd, enum cpu_idle_type idle,
		      int *all_pinned, unsigned long *load_moved,
		      int this_best_prio, int best_prio, int best_prio_seen,
		      struct rq_iterator *iterator);

#include "sched_stats.h"
#include "sched_rt.c"
#include "sched_fair.c"
#include "sched_idletask.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif

#define sched_class_highest (&rt_sched_class)
static void set_load_weight(struct task_struct *p)
{
	task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
	p->se.wait_runtime = 0;

	if (task_has_rt_policy(p)) {
		p->se.load.weight = prio_to_weight[0] * 2;
		p->se.load.inv_weight = prio_to_wmult[0] >> 1;
		return;
	}

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (p->policy == SCHED_IDLE) {
		p->se.load.weight = WEIGHT_IDLEPRIO;
		p->se.load.inv_weight = WMULT_IDLEPRIO;
		return;
	}

	p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
}
static void
enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
{
	sched_info_queued(p);
	p->sched_class->enqueue_task(rq, p, wakeup, now);
	p->se.on_rq = 1;
}

static void
dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
{
	p->sched_class->dequeue_task(rq, p, sleep, now);
	p->se.on_rq = 0;
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}
/*
 * activate_task - move a task to the runqueue.
 */
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
{
	u64 now = rq_clock(rq);

	if (p->state == TASK_UNINTERRUPTIBLE)
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, wakeup, now);
	inc_nr_running(p, rq, now);
}

/*
 * activate_idle_task - move idle task to the _front_ of runqueue.
 */
static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
{
	u64 now = rq_clock(rq);

	if (p->state == TASK_UNINTERRUPTIBLE)
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, 0, now);
	inc_nr_running(p, rq, now);
}

/*
 * deactivate_task - remove a task from the runqueue.
 */
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
{
	u64 now = rq_clock(rq);

	if (p->state == TASK_UNINTERRUPTIBLE)
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, sleep, now);
	dec_nr_running(p, rq, now);
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

/* Used instead of source_load when we know the type == 0 */
unsigned long weighted_cpuload(const int cpu)
{
	return cpu_rq(cpu)->ls.load.weight;
}

static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	task_thread_info(p)->cpu = cpu;
	set_task_cfs_rq(p);
}
#ifdef CONFIG_SMP

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
	int old_cpu = task_cpu(p);
	struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
	u64 clock_offset, fair_clock_offset;

	clock_offset = old_rq->clock - new_rq->clock;
	fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock;

	if (p->se.wait_start)
		p->se.wait_start -= clock_offset;
	if (p->se.wait_start_fair)
		p->se.wait_start_fair -= fair_clock_offset;
	if (p->se.sleep_start)
		p->se.sleep_start -= clock_offset;
	if (p->se.block_start)
		p->se.block_start -= clock_offset;
	if (p->se.sleep_start_fair)
		p->se.sleep_start_fair -= fair_clock_offset;

	__set_task_cpu(p, new_cpu);
}

struct migration_req {
	struct list_head list;

	struct task_struct *task;
	int dest_cpu;

	struct completion done;
};

/*
 * The task's runqueue lock must be held.
 * Returns true if you have to wait for migration thread.
 */
static int
migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
{
	struct rq *rq = task_rq(p);

	/*
	 * If the task is not on a runqueue (and not running), then
	 * it is sufficient to simply update the task's cpu field.
	 */
	if (!p->se.on_rq && !task_running(rq, p)) {
		set_task_cpu(p, dest_cpu);
		return 0;
	}

	init_completion(&req->done);
	req->task = p;
	req->dest_cpu = dest_cpu;
	list_add(&req->list, &rq->migration_queue);

	return 1;
}
/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
void wait_task_inactive(struct task_struct *p)
{
	unsigned long flags;
	int running, on_rq;
	struct rq *rq;

repeat:
	/*
	 * We do the initial early heuristics without holding
	 * any task-queue locks at all. We'll only try to get
	 * the runqueue lock when things look like they will
	 * work out!
	 */
	rq = task_rq(p);

	/*
	 * If the task is actively running on another CPU
	 * still, just relax and busy-wait without holding
	 * any locks.
	 *
	 * NOTE! Since we don't hold any locks, it's not
	 * even sure that "rq" stays as the right runqueue!
	 * But we don't care, since "task_running()" will
	 * return false if the runqueue has changed and p
	 * is actually now running somewhere else!
	 */
	while (task_running(rq, p))
		cpu_relax();

	/*
	 * Ok, time to look more closely! We need the rq
	 * lock now, to be *sure*. If we're wrong, we'll
	 * just go back and repeat.
	 */
	rq = task_rq_lock(p, &flags);
	running = task_running(rq, p);
	on_rq = p->se.on_rq;
	task_rq_unlock(rq, &flags);

	/*
	 * Was it really running after all now that we
	 * checked with the proper locks actually held?
	 *
	 * Oops. Go back and try again..
	 */
	if (unlikely(running)) {
		cpu_relax();
		goto repeat;
	}

	/*
	 * It's not enough that it's not actively running,
	 * it must be off the runqueue _entirely_, and not
	 * preempted!
	 *
	 * So if it was still runnable (but just not actively
	 * running right now), it's preempted, and we should
	 * yield - it could be a while.
	 */
	if (unlikely(on_rq)) {
		yield();
		goto repeat;
	}

	/*
	 * Ahh, all good. It wasn't running, and it wasn't
	 * runnable, which means that it will never become
	 * running in the future either. We're all done!
	 */
}

/*
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
/*
 * Return a low guess at the load of a migration-source cpu weighted
 * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
static inline unsigned long source_load(int cpu, int type)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);

	if (type == 0)
		return total;

	return min(rq->cpu_load[type-1], total);
}

/*
 * Return a high guess at the load of a migration-target cpu weighted
 * according to the scheduling class and "nice" value.
 */
static inline unsigned long target_load(int cpu, int type)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);

	if (type == 0)
		return total;

	return max(rq->cpu_load[type-1], total);
}

/*
 * Return the average load per task on the cpu's run queue
 */
static inline unsigned long cpu_avg_load_per_task(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);
	unsigned long n = rq->nr_running;

	return n ? total / n : SCHED_LOAD_SCALE;
}
/*
 * find_idlest_group finds and returns the least busy CPU group within the
 * domain.
 */
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
	unsigned long min_load = ULONG_MAX, this_load = 0;
	int load_idx = sd->forkexec_idx;
	int imbalance = 100 + (sd->imbalance_pct-100)/2;

	do {
		unsigned long load, avg_load;
		int local_group;
		int i;

		/* Skip over this group if it has no CPUs allowed */
		if (!cpus_intersects(group->cpumask, p->cpus_allowed))
			goto nextgroup;

		local_group = cpu_isset(this_cpu, group->cpumask);

		/* Tally up the load of all CPUs in the group */
		avg_load = 0;

		for_each_cpu_mask(i, group->cpumask) {
			/* Bias balancing toward cpus of our domain */
			if (local_group)
				load = source_load(i, load_idx);
			else
				load = target_load(i, load_idx);

			avg_load += load;
		}

		/* Adjust by relative CPU power of the group */
		avg_load = sg_div_cpu_power(group,
				avg_load * SCHED_LOAD_SCALE);

		if (local_group) {
			this_load = avg_load;
			this = group;
		} else if (avg_load < min_load) {
			min_load = avg_load;
			idlest = group;
		}
nextgroup:
		group = group->next;
	} while (group != sd->groups);

	if (!idlest || 100*this_load < imbalance*min_load)
		return NULL;
	return idlest;
}

/*
 * find_idlest_cpu - find the idlest cpu among the cpus in group.
 */
static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
	cpumask_t tmp;
	unsigned long load, min_load = ULONG_MAX;
	int idlest = -1;
	int i;

	/* Traverse only the allowed CPUs */
	cpus_and(tmp, group->cpumask, p->cpus_allowed);

	for_each_cpu_mask(i, tmp) {
		load = weighted_cpuload(i);

		if (load < min_load || (load == min_load && i == this_cpu)) {
			min_load = load;
			idlest = i;
		}
	}

	return idlest;
}
/*
 * sched_balance_self: balance the current task (running on cpu) in domains
 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
 * SD_BALANCE_EXEC.
 *
 * Balance, ie. select the least loaded group.
 *
 * Returns the target CPU number, or the same CPU if no balancing is needed.
 *
 * preempt must be disabled.
 */
static int sched_balance_self(int cpu, int flag)
{
	struct task_struct *t = current;
	struct sched_domain *tmp, *sd = NULL;

	for_each_domain(cpu, tmp) {
		/*
		 * If power savings logic is enabled for a domain, stop there.
		 */
		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
			break;
		if (tmp->flags & flag)
			sd = tmp;
	}

	while (sd) {
		cpumask_t span;
		struct sched_group *group;
		int new_cpu, weight;

		if (!(sd->flags & flag)) {
			sd = sd->child;
			continue;
		}

		span = sd->span;
		group = find_idlest_group(sd, t, cpu);
		if (!group) {
			sd = sd->child;
			continue;
		}

		new_cpu = find_idlest_cpu(group, t, cpu);
		if (new_cpu == -1 || new_cpu == cpu) {
			/* Now try balancing at a lower domain level of cpu */
			sd = sd->child;
			continue;
		}

		/* Now try balancing at a lower domain level of new_cpu */
		cpu = new_cpu;
		sd = NULL;
		weight = cpus_weight(span);
		for_each_domain(cpu, tmp) {
			if (weight <= cpus_weight(tmp->span))
				break;
			if (tmp->flags & flag)
				sd = tmp;
		}
		/* while loop will break here if sd == NULL */
	}

	return cpu;
}

#endif /* CONFIG_SMP */
/*
 * wake_idle() will wake a task on an idle cpu if task->cpu is
 * not idle and an idle cpu is available. The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
 *
 * Returns the CPU we should wake onto.
 */
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
	cpumask_t tmp;
	struct sched_domain *sd;
	int i;

	/*
	 * If it is idle, then it is the best cpu to run this task.
	 *
	 * This cpu is also the best, if it has more than one task already.
	 * Siblings must be also busy (in most cases) as they didn't already
	 * pick up the extra load from this cpu and hence we need not check
	 * sibling runqueue info. This will avoid the checks and cache miss
	 * penalties associated with that.
	 */
	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
		return cpu;

	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_IDLE) {
			cpus_and(tmp, sd->span, p->cpus_allowed);
			for_each_cpu_mask(i, tmp) {
				if (idle_cpu(i))
					return i;
			}
		} else
			break;
	}
	return cpu;
}
#else
static inline int wake_idle(int cpu, struct task_struct *p)
{
	return cpu;
}
#endif
/*
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
 * @state: the mask of task states that can be woken
 * @sync: do a synchronous wakeup?
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * returns failure only if the task is already active.
 */
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
	int cpu, this_cpu, success = 0;
	unsigned long flags;
	long old_state;
	struct rq *rq;
#ifdef CONFIG_SMP
	struct sched_domain *sd, *this_sd = NULL;
	unsigned long load, this_load;
	int new_cpu;
#endif

	rq = task_rq_lock(p, &flags);
	old_state = p->state;
	if (!(old_state & state))
		goto out;

	if (p->se.on_rq)
		goto out_running;

	cpu = task_cpu(p);
	this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
	if (unlikely(task_running(rq, p)))
		goto out_activate;

	new_cpu = cpu;

	schedstat_inc(rq, ttwu_cnt);
	if (cpu == this_cpu) {
		schedstat_inc(rq, ttwu_local);
		goto out_set_cpu;
	}

	for_each_domain(this_cpu, sd) {
		if (cpu_isset(cpu, sd->span)) {
			schedstat_inc(sd, ttwu_wake_remote);
			this_sd = sd;
			break;
		}
	}

	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
		goto out_set_cpu;

	/*
	 * Check for affine wakeup and passive balancing possibilities.
	 */
	if (this_sd) {
		int idx = this_sd->wake_idx;
		unsigned int imbalance;

		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;

		load = source_load(cpu, idx);
		this_load = target_load(this_cpu, idx);

		new_cpu = this_cpu; /* Wake to this CPU if we can */

		if (this_sd->flags & SD_WAKE_AFFINE) {
			unsigned long tl = this_load;
			unsigned long tl_per_task;

			tl_per_task = cpu_avg_load_per_task(this_cpu);

			/*
			 * If sync wakeup then subtract the (maximum possible)
			 * effect of the currently running task from the load
			 * of the current CPU:
			 */
			if (sync)
				tl -= current->se.load.weight;

			if ((tl <= load &&
				tl + target_load(cpu, idx) <= tl_per_task) ||
			       100*(tl + p->se.load.weight) <= imbalance*load) {
				/*
				 * This domain has SD_WAKE_AFFINE and
				 * p is cache cold in this domain, and
				 * there is no bad imbalance.
				 */
				schedstat_inc(this_sd, ttwu_move_affine);
				goto out_set_cpu;
			}
		}

		/*
		 * Start passive balancing when half the imbalance_pct
		 * limit is reached.
		 */
		if (this_sd->flags & SD_WAKE_BALANCE) {
			if (imbalance*this_load <= 100*load) {
				schedstat_inc(this_sd, ttwu_move_balance);
				goto out_set_cpu;
			}
		}
	}

	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
	new_cpu = wake_idle(new_cpu, p);
	if (new_cpu != cpu) {
		set_task_cpu(p, new_cpu);
		task_rq_unlock(rq, &flags);
		/* might preempt at this point */
		rq = task_rq_lock(p, &flags);
		old_state = p->state;
		if (!(old_state & state))
			goto out;
		if (p->se.on_rq)
			goto out_running;

		this_cpu = smp_processor_id();
		cpu = task_cpu(p);
	}

out_activate:
#endif /* CONFIG_SMP */
	activate_task(rq, p, 1);
	/*
	 * Sync wakeups (i.e. those types of wakeups where the waker
	 * has indicated that it will leave the CPU in short order)
	 * don't trigger a preemption, if the woken up task will run on
	 * this cpu. (in this case the 'I will reschedule' promise of
	 * the waker guarantees that the freshly woken up task is going
	 * to be considered on this CPU.)
	 */
	if (!sync || cpu != this_cpu)
		check_preempt_curr(rq, p);
	success = 1;

out_running:
	p->state = TASK_RUNNING;
out:
	task_rq_unlock(rq, &flags);

	return success;
}

int fastcall wake_up_process(struct task_struct *p)
{
	return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
			      TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
}
EXPORT_SYMBOL(wake_up_process);

int fastcall wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}
/*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
 *
 * __sched_fork() is basic setup used by init_idle() too:
 */
static void __sched_fork(struct task_struct *p)
{
	p->se.wait_start_fair = 0;
	p->se.wait_start = 0;
	p->se.exec_start = 0;
	p->se.sum_exec_runtime = 0;
	p->se.delta_exec = 0;
	p->se.delta_fair_run = 0;
	p->se.delta_fair_sleep = 0;
	p->se.wait_runtime = 0;
	p->se.sum_wait_runtime = 0;
	p->se.sum_sleep_runtime = 0;
	p->se.sleep_start = 0;
	p->se.sleep_start_fair = 0;
	p->se.block_start = 0;
	p->se.sleep_max = 0;
	p->se.block_max = 0;
	p->se.wait_runtime_overruns = 0;
	p->se.wait_runtime_underruns = 0;

	INIT_LIST_HEAD(&p->run_list);
	p->se.on_rq = 0;

	/*
	 * We mark the process as running here, but have not actually
	 * inserted it onto the runqueue yet. This guarantees that
	 * nobody will actually run it, and a signal or other external
	 * event cannot wake it up and insert it on the runqueue either.
	 */
	p->state = TASK_RUNNING;
}
/*
 * fork()/clone()-time setup:
 */
void sched_fork(struct task_struct *p, int clone_flags)
{
	int cpu = get_cpu();

	__sched_fork(p);

#ifdef CONFIG_SMP
	cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
#endif
	__set_task_cpu(p, cpu);

	/*
	 * Make sure we do not leak PI boosting priority to the child:
	 */
	p->prio = current->normal_prio;

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	if (likely(sched_info_on()))
		memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
	p->oncpu = 0;
#endif
#ifdef CONFIG_PREEMPT
	/* Want to start with kernel preemption disabled. */
	task_thread_info(p)->preempt_count = 1;
#endif
	put_cpu();
}

/*
 * After fork, child runs first. (default) If set to 0 then
 * parent will (try to) run first.
 */
unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
	unsigned long flags;
	struct rq *rq;
	int this_cpu;

	rq = task_rq_lock(p, &flags);
	BUG_ON(p->state != TASK_RUNNING);
	this_cpu = smp_processor_id(); /* parent's CPU */

	p->prio = effective_prio(p);

	if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
			task_cpu(p) != this_cpu || !current->se.on_rq) {
		activate_task(rq, p, 0);
	} else {
		/*
		 * Let the scheduling class do new task startup
		 * management (if any):
		 */
		p->sched_class->task_new(rq, p);
	}
	check_preempt_curr(rq, p);
	task_rq_unlock(rq, &flags);
}
/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
{
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}

/**
 * finish_task_switch - clean up after a task-switch
 * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 */
static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
	__releases(rq->lock)
{
	struct mm_struct *mm = rq->prev_mm;
	long prev_state;

	rq->prev_mm = NULL;

	/*
	 * A task struct has one reference for the use as "current".
	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
	 * schedule one last time. The schedule call will never return, and
	 * the scheduled task must drop that reference.
	 * The test for TASK_DEAD must occur while the runqueue locks are
	 * still held, otherwise prev could be scheduled on another cpu, die
	 * there before we look at prev->state, and then the reference would
	 * be dropped twice.
	 *		Manfred Spraul <manfred@colorfullife.com>
	 */
	prev_state = prev->state;
	finish_arch_switch(prev);
	finish_lock_switch(rq, prev);
	if (mm)
		mmdrop(mm);
	if (unlikely(prev_state == TASK_DEAD)) {
		/*
		 * Remove function-return probe instances associated with this
		 * task and put them back on the free list.
		 */
		kprobe_flush_task(prev);
		put_task_struct(prev);
	}
}
1728 * @prev: the thread we just switched away from.
1730 asmlinkage
void schedule_tail(struct task_struct
*prev
)
1731 __releases(rq
->lock
)
1733 struct rq
*rq
= this_rq();
1735 finish_task_switch(rq
, prev
);
1736 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1737 /* In this case, finish_task_switch does not reenable preemption */
1740 if (current
->set_child_tid
)
1741 put_user(current
->pid
, current
->set_child_tid
);
/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, next);
	mm = next->mm;
	oldmm = prev->active_mm;
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_enter_lazy_cpu_mode();

	if (unlikely(!mm)) {
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	if (unlikely(!prev->mm)) {
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}
	/*
	 * The runqueue lock will be released by the next task (which is an
	 * invalid locking op, but in the case of the scheduler it's an
	 * obvious special-case), so we do an early lockdep release here:
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);

	barrier();
	/*
	 * this_rq must be evaluated again because prev may have moved
	 * CPUs since it called schedule(), thus the 'rq' on its stack
	 * frame will be invalid.
	 */
	finish_task_switch(this_rq(), prev);
}
/*
 * nr_running, nr_uninterruptible and nr_context_switches:
 *
 * externally visible scheduler statistics: current number of runnable
 * threads, current number of uninterruptible-sleeping threads, total
 * number of context switches performed since bootup.
 */
unsigned long nr_running(void)
{
	unsigned long i, sum = 0;

	for_each_online_cpu(i)
		sum += cpu_rq(i)->nr_running;

	return sum;
}

unsigned long nr_uninterruptible(void)
{
	unsigned long i, sum = 0;

	for_each_possible_cpu(i)
		sum += cpu_rq(i)->nr_uninterruptible;

	/*
	 * Since we read the counters lockless, it might be slightly
	 * inaccurate. Do not allow it to go below zero though:
	 */
	if (unlikely((long)sum < 0))
		sum = 0;

	return sum;
}

unsigned long long nr_context_switches(void)
{
	int i;
	unsigned long long sum = 0;

	for_each_possible_cpu(i)
		sum += cpu_rq(i)->nr_switches;

	return sum;
}

unsigned long nr_iowait(void)
{
	unsigned long i, sum = 0;

	for_each_possible_cpu(i)
		sum += atomic_read(&cpu_rq(i)->nr_iowait);

	return sum;
}

unsigned long nr_active(void)
{
	unsigned long i, running = 0, uninterruptible = 0;

	for_each_online_cpu(i) {
		running += cpu_rq(i)->nr_running;
		uninterruptible += cpu_rq(i)->nr_uninterruptible;
	}

	if (unlikely((long)uninterruptible < 0))
		uninterruptible = 0;

	return running + uninterruptible;
}
/*
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC).
 */
static void update_cpu_load(struct rq *this_rq)
{
	u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
	unsigned long total_load = this_rq->ls.load.weight;
	unsigned long this_load = total_load;
	struct load_stat *ls = &this_rq->ls;
	u64 now = __rq_clock(this_rq);
	int i, scale;

	this_rq->nr_load_updates++;
	if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
		goto do_avg;

	/* Update delta_fair/delta_exec fields first */
	update_curr_load(this_rq, now);

	fair_delta64 = ls->delta_fair + 1;
	ls->delta_fair = 0;

	exec_delta64 = ls->delta_exec + 1;
	ls->delta_exec = 0;

	sample_interval64 = now - ls->load_update_last;
	ls->load_update_last = now;

	if ((s64)sample_interval64 < (s64)TICK_NSEC)
		sample_interval64 = TICK_NSEC;

	if (exec_delta64 > sample_interval64)
		exec_delta64 = sample_interval64;

	idle_delta64 = sample_interval64 - exec_delta64;

	tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
	tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);

	this_load = (unsigned long)tmp64;

do_avg:

	/* Update our load: */
	for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load, new_load;

		/* scale is effectively 1 << i now, and >> i divides by scale */

		old_load = this_rq->cpu_load[i];
		new_load = this_load;

		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
	}
}
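
/*
 * Illustrative note (not from the original source): the cpu_load[] update
 * above is an exponential moving average with a per-index decay factor of
 * (2^i - 1)/2^i, e.g. cpu_load[2] = (3*old + new)/4, so higher indices react
 * more slowly to load changes and feed the more conservative balancing
 * decisions.
 */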
/*
 * double_rq_lock - safely lock two runqueues
 *
 * Note this does not disable interrupts like task_rq_lock,
 * you need to do so manually before calling.
 */
static void double_rq_lock(struct rq *rq1, struct rq *rq2)
	__acquires(rq1->lock)
	__acquires(rq2->lock)
{
	BUG_ON(!irqs_disabled());
	if (rq1 == rq2) {
		spin_lock(&rq1->lock);
		__acquire(rq2->lock);	/* Fake it out ;) */
	} else {
		if (rq1 < rq2) {
			spin_lock(&rq1->lock);
			spin_lock(&rq2->lock);
		} else {
			spin_lock(&rq2->lock);
			spin_lock(&rq1->lock);
		}
	}
}
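
/*
 * Illustrative note (not from the original source): taking the two locks in
 * ascending address order is what prevents an AB-BA deadlock when two CPUs
 * call double_rq_lock() concurrently with the runqueues swapped; e.g. both
 * a caller passing (rq A, rq B) and one passing (rq B, rq A) will lock
 * min(A, B) first.
 */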
/*
 * double_rq_unlock - safely unlock two runqueues
 *
 * Note this does not restore interrupts like task_rq_unlock,
 * you need to do so manually after calling.
 */
static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
	__releases(rq1->lock)
	__releases(rq2->lock)
{
	spin_unlock(&rq1->lock);
	if (rq1 != rq2)
		spin_unlock(&rq2->lock);
	else
		__release(rq2->lock);
}

/*
 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 */
static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(this_rq->lock)
	__acquires(busiest->lock)
	__acquires(this_rq->lock)
{
	if (unlikely(!irqs_disabled())) {
		/* printk() doesn't work well under rq->lock */
		spin_unlock(&this_rq->lock);
		BUG_ON(1);
	}
	if (unlikely(!spin_trylock(&busiest->lock))) {
		if (busiest < this_rq) {
			spin_unlock(&this_rq->lock);
			spin_lock(&busiest->lock);
			spin_lock(&this_rq->lock);
		} else
			spin_lock(&busiest->lock);
	}
}
/*
 * If dest_cpu is allowed for this process, migrate the task to it.
 * This is accomplished by forcing the cpu_allowed mask to only
 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
 * the cpu_allowed mask is restored.
 */
static void sched_migrate_task(struct task_struct *p, int dest_cpu)
{
	struct migration_req req;
	unsigned long flags;
	struct rq *rq;

	rq = task_rq_lock(p, &flags);
	if (!cpu_isset(dest_cpu, p->cpus_allowed)
	    || unlikely(cpu_is_offline(dest_cpu)))
		goto out;

	/* force the process onto the specified CPU */
	if (migrate_task(p, dest_cpu, &req)) {
		/* Need to wait for migration thread (might exit: take ref). */
		struct task_struct *mt = rq->migration_thread;

		get_task_struct(mt);
		task_rq_unlock(rq, &flags);
		wake_up_process(mt);
		put_task_struct(mt);
		wait_for_completion(&req.done);

		return;
	}
out:
	task_rq_unlock(rq, &flags);
}

/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */
void sched_exec(void)
{
	int new_cpu, this_cpu = get_cpu();
	new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
	put_cpu();
	if (new_cpu != this_cpu)
		sched_migrate_task(current, new_cpu);
}
/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
		      struct rq *this_rq, int this_cpu)
{
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, this_cpu);
	activate_task(this_rq, p, 0);
	/*
	 * Note that idle threads have a prio of MAX_PRIO, for this test
	 * to be always true for them.
	 */
	check_preempt_curr(this_rq, p);
}

/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
static
int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
		     struct sched_domain *sd, enum cpu_idle_type idle,
		     int *all_pinned)
{
	/*
	 * We do not migrate tasks that are:
	 * 1) running (obviously), or
	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
	 * 3) are cache-hot on their current CPU.
	 */
	if (!cpu_isset(this_cpu, p->cpus_allowed))
		return 0;
	*all_pinned = 0;

	if (task_running(rq, p))
		return 0;

	/*
	 * Aggressive migration if too many balance attempts have failed:
	 */
	if (sd->nr_balance_failed > sd->cache_nice_tries)
		return 1;

	return 1;
}
static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
		      unsigned long max_nr_move, unsigned long max_load_move,
		      struct sched_domain *sd, enum cpu_idle_type idle,
		      int *all_pinned, unsigned long *load_moved,
		      int this_best_prio, int best_prio, int best_prio_seen,
		      struct rq_iterator *iterator)
{
	int pulled = 0, pinned = 0, skip_for_load;
	struct task_struct *p;
	long rem_load_move = max_load_move;

	if (max_nr_move == 0 || max_load_move == 0)
		goto out;

	pinned = 1;

	/*
	 * Start the load-balancing iterator:
	 */
	p = iterator->start(iterator->arg);
next:
	if (!p)
		goto out;
	/*
	 * To help distribute high priority tasks across CPUs we don't
	 * skip a task if it will be the highest priority task (i.e. smallest
	 * prio value) on its new queue regardless of its load weight
	 */
	skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
							 SCHED_LOAD_SCALE_FUZZ;
	if (skip_for_load && p->prio < this_best_prio)
		skip_for_load = !best_prio_seen && p->prio == best_prio;
	if (skip_for_load ||
	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {

		best_prio_seen |= p->prio == best_prio;
		p = iterator->next(iterator->arg);
		goto next;
	}

	pull_task(busiest, p, this_rq, this_cpu);
	pulled++;
	rem_load_move -= p->se.load.weight;

	/*
	 * We only want to steal up to the prescribed number of tasks
	 * and the prescribed amount of weighted load.
	 */
	if (pulled < max_nr_move && rem_load_move > 0) {
		if (p->prio < this_best_prio)
			this_best_prio = p->prio;
		p = iterator->next(iterator->arg);
		goto next;
	}
out:
	/*
	 * Right now, this is the only place pull_task() is called,
	 * so we can safely collect pull_task() stats here rather than
	 * inside pull_task().
	 */
	schedstat_add(sd, lb_gained[idle], pulled);

	if (all_pinned)
		*all_pinned = pinned;
	*load_moved = max_load_move - rem_load_move;
	return pulled;
}
/*
 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
 * load from busiest to this_rq, as part of a balancing operation within
 * "domain". Returns the number of tasks moved.
 *
 * Called with both runqueues locked.
 */
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
		      unsigned long max_nr_move, unsigned long max_load_move,
		      struct sched_domain *sd, enum cpu_idle_type idle,
		      int *all_pinned)
{
	struct sched_class *class = sched_class_highest;
	unsigned long load_moved, total_nr_moved = 0, nr_moved;
	long rem_load_move = max_load_move;

	do {
		nr_moved = class->load_balance(this_rq, this_cpu, busiest,
				max_nr_move, (unsigned long)rem_load_move,
				sd, idle, all_pinned, &load_moved);
		total_nr_moved += nr_moved;
		max_nr_move -= nr_moved;
		rem_load_move -= load_moved;
		class = class->next;
	} while (class && max_nr_move && rem_load_move > 0);

	return total_nr_moved;
}
2183 * find_busiest_group finds and returns the busiest CPU group within the
2184 * domain. It calculates and returns the amount of weighted load which
2185 * should be moved to restore balance via the imbalance parameter.
2187 static struct sched_group
*
2188 find_busiest_group(struct sched_domain
*sd
, int this_cpu
,
2189 unsigned long *imbalance
, enum cpu_idle_type idle
,
2190 int *sd_idle
, cpumask_t
*cpus
, int *balance
)
2192 struct sched_group
*busiest
= NULL
, *this = NULL
, *group
= sd
->groups
;
2193 unsigned long max_load
, avg_load
, total_load
, this_load
, total_pwr
;
2194 unsigned long max_pull
;
2195 unsigned long busiest_load_per_task
, busiest_nr_running
;
2196 unsigned long this_load_per_task
, this_nr_running
;
2198 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2199 int power_savings_balance
= 1;
2200 unsigned long leader_nr_running
= 0, min_load_per_task
= 0;
2201 unsigned long min_nr_running
= ULONG_MAX
;
2202 struct sched_group
*group_min
= NULL
, *group_leader
= NULL
;
2205 max_load
= this_load
= total_load
= total_pwr
= 0;
2206 busiest_load_per_task
= busiest_nr_running
= 0;
2207 this_load_per_task
= this_nr_running
= 0;
2208 if (idle
== CPU_NOT_IDLE
)
2209 load_idx
= sd
->busy_idx
;
2210 else if (idle
== CPU_NEWLY_IDLE
)
2211 load_idx
= sd
->newidle_idx
;
2213 load_idx
= sd
->idle_idx
;
2216 unsigned long load
, group_capacity
;
2219 unsigned int balance_cpu
= -1, first_idle_cpu
= 0;
2220 unsigned long sum_nr_running
, sum_weighted_load
;
2222 local_group
= cpu_isset(this_cpu
, group
->cpumask
);
2225 balance_cpu
= first_cpu(group
->cpumask
);
2227 /* Tally up the load of all CPUs in the group */
2228 sum_weighted_load
= sum_nr_running
= avg_load
= 0;
2230 for_each_cpu_mask(i
, group
->cpumask
) {
2233 if (!cpu_isset(i
, *cpus
))
2238 if (*sd_idle
&& !idle_cpu(i
))
2241 /* Bias balancing toward cpus of our domain */
2243 if (idle_cpu(i
) && !first_idle_cpu
) {
2248 load
= target_load(i
, load_idx
);
2250 load
= source_load(i
, load_idx
);
2253 sum_nr_running
+= rq
->nr_running
;
2254 sum_weighted_load
+= weighted_cpuload(i
);
2258 * First idle cpu or the first cpu(busiest) in this sched group
2259 * is eligible for doing load balancing at this and above
2262 if (local_group
&& balance_cpu
!= this_cpu
&& balance
) {
2267 total_load
+= avg_load
;
2268 total_pwr
+= group
->__cpu_power
;
2270 /* Adjust by relative CPU power of the group */
2271 avg_load
= sg_div_cpu_power(group
,
2272 avg_load
* SCHED_LOAD_SCALE
);
2274 group_capacity
= group
->__cpu_power
/ SCHED_LOAD_SCALE
;
2277 this_load
= avg_load
;
2279 this_nr_running
= sum_nr_running
;
2280 this_load_per_task
= sum_weighted_load
;
2281 } else if (avg_load
> max_load
&&
2282 sum_nr_running
> group_capacity
) {
2283 max_load
= avg_load
;
2285 busiest_nr_running
= sum_nr_running
;
2286 busiest_load_per_task
= sum_weighted_load
;
2289 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2291 * Busy processors will not participate in power savings
2294 if (idle
== CPU_NOT_IDLE
||
2295 !(sd
->flags
& SD_POWERSAVINGS_BALANCE
))
2299 * If the local group is idle or completely loaded
2300 * no need to do power savings balance at this domain
2302 if (local_group
&& (this_nr_running
>= group_capacity
||
2304 power_savings_balance
= 0;
2307 * If a group is already running at full capacity or idle,
2308 * don't include that group in power savings calculations
2310 if (!power_savings_balance
|| sum_nr_running
>= group_capacity
2315 * Calculate the group which has the least non-idle load.
2316 * This is the group from where we need to pick up the load
2319 if ((sum_nr_running
< min_nr_running
) ||
2320 (sum_nr_running
== min_nr_running
&&
2321 first_cpu(group
->cpumask
) <
2322 first_cpu(group_min
->cpumask
))) {
2324 min_nr_running
= sum_nr_running
;
2325 min_load_per_task
= sum_weighted_load
/
2330 * Calculate the group which is almost near its
2331 * capacity but still has some space to pick up some load
2332 * from other group and save more power
2334 if (sum_nr_running
<= group_capacity
- 1) {
2335 if (sum_nr_running
> leader_nr_running
||
2336 (sum_nr_running
== leader_nr_running
&&
2337 first_cpu(group
->cpumask
) >
2338 first_cpu(group_leader
->cpumask
))) {
2339 group_leader
= group
;
2340 leader_nr_running
= sum_nr_running
;
2345 group
= group
->next
;
2346 } while (group
!= sd
->groups
);
2348 if (!busiest
|| this_load
>= max_load
|| busiest_nr_running
== 0)
2351 avg_load
= (SCHED_LOAD_SCALE
* total_load
) / total_pwr
;
2353 if (this_load
>= avg_load
||
2354 100*max_load
<= sd
->imbalance_pct
*this_load
)
2357 busiest_load_per_task
/= busiest_nr_running
;
2359 * We're trying to get all the cpus to the average_load, so we don't
2360 * want to push ourselves above the average load, nor do we wish to
2361 * reduce the max loaded cpu below the average load, as either of these
2362 * actions would just result in more rebalancing later, and ping-pong
2363 * tasks around. Thus we look for the minimum possible imbalance.
2364 * Negative imbalances (*we* are more loaded than anyone else) will
2365 * be counted as no imbalance for these purposes -- we can't fix that
2366 * by pulling tasks to us. Be careful of negative numbers as they'll
2367 * appear as very large values with unsigned longs.
2369 if (max_load
<= busiest_load_per_task
)
	/*
	 * In the presence of smp nice balancing, certain scenarios can have
	 * max load less than avg load (as we skip the groups at or below
	 * its cpu_power, while calculating max_load..)
	 */
2377 if (max_load
< avg_load
) {
2379 goto small_imbalance
;
2382 /* Don't want to pull so many tasks that a group would go idle */
2383 max_pull
= min(max_load
- avg_load
, max_load
- busiest_load_per_task
);
2385 /* How much load to actually move to equalise the imbalance */
2386 *imbalance
= min(max_pull
* busiest
->__cpu_power
,
2387 (avg_load
- this_load
) * this->__cpu_power
)
	/*
	 * if *imbalance is less than the average load per runnable task
	 * there is no guarantee that any tasks will be moved so we'll have
	 * a think about bumping its value to force at least one task to be
	 * moved
	 */
2396 if (*imbalance
+ SCHED_LOAD_SCALE_FUZZ
< busiest_load_per_task
/2) {
2397 unsigned long tmp
, pwr_now
, pwr_move
;
2401 pwr_move
= pwr_now
= 0;
2403 if (this_nr_running
) {
2404 this_load_per_task
/= this_nr_running
;
2405 if (busiest_load_per_task
> this_load_per_task
)
2408 this_load_per_task
= SCHED_LOAD_SCALE
;
2410 if (max_load
- this_load
+ SCHED_LOAD_SCALE_FUZZ
>=
2411 busiest_load_per_task
* imbn
) {
2412 *imbalance
= busiest_load_per_task
;
2417 * OK, we don't have enough imbalance to justify moving tasks,
2418 * however we may be able to increase total CPU power used by
2422 pwr_now
+= busiest
->__cpu_power
*
2423 min(busiest_load_per_task
, max_load
);
2424 pwr_now
+= this->__cpu_power
*
2425 min(this_load_per_task
, this_load
);
2426 pwr_now
/= SCHED_LOAD_SCALE
;
2428 /* Amount of load we'd subtract */
2429 tmp
= sg_div_cpu_power(busiest
,
2430 busiest_load_per_task
* SCHED_LOAD_SCALE
);
2432 pwr_move
+= busiest
->__cpu_power
*
2433 min(busiest_load_per_task
, max_load
- tmp
);
2435 /* Amount of load we'd add */
2436 if (max_load
* busiest
->__cpu_power
<
2437 busiest_load_per_task
* SCHED_LOAD_SCALE
)
2438 tmp
= sg_div_cpu_power(this,
2439 max_load
* busiest
->__cpu_power
);
2441 tmp
= sg_div_cpu_power(this,
2442 busiest_load_per_task
* SCHED_LOAD_SCALE
);
2443 pwr_move
+= this->__cpu_power
*
2444 min(this_load_per_task
, this_load
+ tmp
);
2445 pwr_move
/= SCHED_LOAD_SCALE
;
2447 /* Move if we gain throughput */
2448 if (pwr_move
<= pwr_now
)
2451 *imbalance
= busiest_load_per_task
;
2457 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2458 if (idle
== CPU_NOT_IDLE
|| !(sd
->flags
& SD_POWERSAVINGS_BALANCE
))
2461 if (this == group_leader
&& group_leader
!= group_min
) {
2462 *imbalance
= min_load_per_task
;
2472 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2475 find_busiest_queue(struct sched_group
*group
, enum cpu_idle_type idle
,
2476 unsigned long imbalance
, cpumask_t
*cpus
)
2478 struct rq
*busiest
= NULL
, *rq
;
2479 unsigned long max_load
= 0;
2482 for_each_cpu_mask(i
, group
->cpumask
) {
2485 if (!cpu_isset(i
, *cpus
))
2489 wl
= weighted_cpuload(i
);
2491 if (rq
->nr_running
== 1 && wl
> imbalance
)
2494 if (wl
> max_load
) {
/*
 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
 * so long as it is large enough.
 */
#define MAX_PINNED_INTERVAL	512

static inline unsigned long minus_1_or_zero(unsigned long n)
{
	return n > 0 ? n - 1 : 0;
}
2515 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2516 * tasks if there is an imbalance.
2518 static int load_balance(int this_cpu
, struct rq
*this_rq
,
2519 struct sched_domain
*sd
, enum cpu_idle_type idle
,
2522 int nr_moved
, all_pinned
= 0, active_balance
= 0, sd_idle
= 0;
2523 struct sched_group
*group
;
2524 unsigned long imbalance
;
2526 cpumask_t cpus
= CPU_MASK_ALL
;
2527 unsigned long flags
;
2530 * When power savings policy is enabled for the parent domain, idle
2531 * sibling can pick up load irrespective of busy siblings. In this case,
2532 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2533 * portraying it as CPU_NOT_IDLE.
2535 if (idle
!= CPU_NOT_IDLE
&& sd
->flags
& SD_SHARE_CPUPOWER
&&
2536 !test_sd_parent(sd
, SD_POWERSAVINGS_BALANCE
))
2539 schedstat_inc(sd
, lb_cnt
[idle
]);
2542 group
= find_busiest_group(sd
, this_cpu
, &imbalance
, idle
, &sd_idle
,
2549 schedstat_inc(sd
, lb_nobusyg
[idle
]);
2553 busiest
= find_busiest_queue(group
, idle
, imbalance
, &cpus
);
2555 schedstat_inc(sd
, lb_nobusyq
[idle
]);
2559 BUG_ON(busiest
== this_rq
);
2561 schedstat_add(sd
, lb_imbalance
[idle
], imbalance
);
2564 if (busiest
->nr_running
> 1) {
2566 * Attempt to move tasks. If find_busiest_group has found
2567 * an imbalance but busiest->nr_running <= 1, the group is
2568 * still unbalanced. nr_moved simply stays zero, so it is
2569 * correctly treated as an imbalance.
2571 local_irq_save(flags
);
2572 double_rq_lock(this_rq
, busiest
);
2573 nr_moved
= move_tasks(this_rq
, this_cpu
, busiest
,
2574 minus_1_or_zero(busiest
->nr_running
),
2575 imbalance
, sd
, idle
, &all_pinned
);
2576 double_rq_unlock(this_rq
, busiest
);
2577 local_irq_restore(flags
);
2580 * some other cpu did the load balance for us.
2582 if (nr_moved
&& this_cpu
!= smp_processor_id())
2583 resched_cpu(this_cpu
);
2585 /* All tasks on this runqueue were pinned by CPU affinity */
2586 if (unlikely(all_pinned
)) {
2587 cpu_clear(cpu_of(busiest
), cpus
);
2588 if (!cpus_empty(cpus
))
2595 schedstat_inc(sd
, lb_failed
[idle
]);
2596 sd
->nr_balance_failed
++;
2598 if (unlikely(sd
->nr_balance_failed
> sd
->cache_nice_tries
+2)) {
2600 spin_lock_irqsave(&busiest
->lock
, flags
);
2602 /* don't kick the migration_thread, if the curr
2603 * task on busiest cpu can't be moved to this_cpu
2605 if (!cpu_isset(this_cpu
, busiest
->curr
->cpus_allowed
)) {
2606 spin_unlock_irqrestore(&busiest
->lock
, flags
);
2608 goto out_one_pinned
;
2611 if (!busiest
->active_balance
) {
2612 busiest
->active_balance
= 1;
2613 busiest
->push_cpu
= this_cpu
;
2616 spin_unlock_irqrestore(&busiest
->lock
, flags
);
2618 wake_up_process(busiest
->migration_thread
);
2621 * We've kicked active balancing, reset the failure
2624 sd
->nr_balance_failed
= sd
->cache_nice_tries
+1;
2627 sd
->nr_balance_failed
= 0;
2629 if (likely(!active_balance
)) {
2630 /* We were unbalanced, so reset the balancing interval */
2631 sd
->balance_interval
= sd
->min_interval
;
2634 * If we've begun active balancing, start to back off. This
2635 * case may not be covered by the all_pinned logic if there
2636 * is only 1 task on the busy runqueue (because we don't call
2639 if (sd
->balance_interval
< sd
->max_interval
)
2640 sd
->balance_interval
*= 2;
2643 if (!nr_moved
&& !sd_idle
&& sd
->flags
& SD_SHARE_CPUPOWER
&&
2644 !test_sd_parent(sd
, SD_POWERSAVINGS_BALANCE
))
2649 schedstat_inc(sd
, lb_balanced
[idle
]);
2651 sd
->nr_balance_failed
= 0;
2654 /* tune up the balancing interval */
2655 if ((all_pinned
&& sd
->balance_interval
< MAX_PINNED_INTERVAL
) ||
2656 (sd
->balance_interval
< sd
->max_interval
))
2657 sd
->balance_interval
*= 2;
2659 if (!sd_idle
&& sd
->flags
& SD_SHARE_CPUPOWER
&&
2660 !test_sd_parent(sd
, SD_POWERSAVINGS_BALANCE
))
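/*
 * Editor's note: an editor-added, simplified sketch of the interval
 * back-off performed above when a balance attempt fails or all tasks turn
 * out to be pinned: the interval doubles, bounded by MAX_PINNED_INTERVAL
 * in the all-pinned case and by the domain's max_interval otherwise. The
 * helper name and parameters are illustrative only.
 */
static void ex_backoff_interval(unsigned int *interval,
				unsigned int max_interval, int all_pinned)
{
	if ((all_pinned && *interval < MAX_PINNED_INTERVAL) ||
	    *interval < max_interval)
		*interval *= 2;
}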
2666 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2667 * tasks if there is an imbalance.
2669 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2670 * this_rq is locked.
2673 load_balance_newidle(int this_cpu
, struct rq
*this_rq
, struct sched_domain
*sd
)
2675 struct sched_group
*group
;
2676 struct rq
*busiest
= NULL
;
2677 unsigned long imbalance
;
2680 cpumask_t cpus
= CPU_MASK_ALL
;
2683 * When power savings policy is enabled for the parent domain, idle
2684 * sibling can pick up load irrespective of busy siblings. In this case,
2685 * let the state of idle sibling percolate up as IDLE, instead of
2686 * portraying it as CPU_NOT_IDLE.
2688 if (sd
->flags
& SD_SHARE_CPUPOWER
&&
2689 !test_sd_parent(sd
, SD_POWERSAVINGS_BALANCE
))
2692 schedstat_inc(sd
, lb_cnt
[CPU_NEWLY_IDLE
]);
2694 group
= find_busiest_group(sd
, this_cpu
, &imbalance
, CPU_NEWLY_IDLE
,
2695 &sd_idle
, &cpus
, NULL
);
2697 schedstat_inc(sd
, lb_nobusyg
[CPU_NEWLY_IDLE
]);
2701 busiest
= find_busiest_queue(group
, CPU_NEWLY_IDLE
, imbalance
,
2704 schedstat_inc(sd
, lb_nobusyq
[CPU_NEWLY_IDLE
]);
2708 BUG_ON(busiest
== this_rq
);
2710 schedstat_add(sd
, lb_imbalance
[CPU_NEWLY_IDLE
], imbalance
);
2713 if (busiest
->nr_running
> 1) {
2714 /* Attempt to move tasks */
2715 double_lock_balance(this_rq
, busiest
);
2716 nr_moved
= move_tasks(this_rq
, this_cpu
, busiest
,
2717 minus_1_or_zero(busiest
->nr_running
),
2718 imbalance
, sd
, CPU_NEWLY_IDLE
, NULL
);
2719 spin_unlock(&busiest
->lock
);
2722 cpu_clear(cpu_of(busiest
), cpus
);
2723 if (!cpus_empty(cpus
))
2729 schedstat_inc(sd
, lb_failed
[CPU_NEWLY_IDLE
]);
2730 if (!sd_idle
&& sd
->flags
& SD_SHARE_CPUPOWER
&&
2731 !test_sd_parent(sd
, SD_POWERSAVINGS_BALANCE
))
2734 sd
->nr_balance_failed
= 0;
2739 schedstat_inc(sd
, lb_balanced
[CPU_NEWLY_IDLE
]);
2740 if (!sd_idle
&& sd
->flags
& SD_SHARE_CPUPOWER
&&
2741 !test_sd_parent(sd
, SD_POWERSAVINGS_BALANCE
))
2743 sd
->nr_balance_failed
= 0;
/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */
static void idle_balance(int this_cpu, struct rq *this_rq)
{
	struct sched_domain *sd;
	int pulled_task = -1;
	unsigned long next_balance = jiffies + HZ;

	for_each_domain(this_cpu, sd) {
		unsigned long interval;

		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		if (sd->flags & SD_BALANCE_NEWIDLE)
			/* If we've pulled tasks over stop searching: */
			pulled_task = load_balance_newidle(this_cpu,
							   this_rq, sd);

		interval = msecs_to_jiffies(sd->balance_interval);
		if (time_after(next_balance, sd->last_balance + interval))
			next_balance = sd->last_balance + interval;
		if (pulled_task)
			break;
	}
	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
		/*
		 * We are going idle. next_balance may be set based on
		 * a busy processor. So reset next_balance.
		 */
		this_rq->next_balance = next_balance;
	}
}
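/*
 * Editor's note: an editor-added sketch of the "earliest due time wins"
 * bookkeeping used above; the arrays here are hypothetical stand-ins for
 * walking the domain hierarchy, while jiffies, HZ, msecs_to_jiffies() and
 * time_after() are the standard kernel interfaces.
 */
static unsigned long ex_earliest_next_balance(const unsigned long *last_balance,
					      const unsigned int *interval_ms,
					      int nr_domains)
{
	unsigned long next_balance = jiffies + HZ;
	int i;

	for (i = 0; i < nr_domains; i++) {
		unsigned long due = last_balance[i] +
				    msecs_to_jiffies(interval_ms[i]);
		if (time_after(next_balance, due))
			next_balance = due;
	}
	return next_balance;
}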
2785 * active_load_balance is run by migration threads. It pushes running tasks
2786 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2787 * running on each physical CPU where possible, and avoids physical /
2788 * logical imbalances.
2790 * Called with busiest_rq locked.
2792 static void active_load_balance(struct rq
*busiest_rq
, int busiest_cpu
)
2794 int target_cpu
= busiest_rq
->push_cpu
;
2795 struct sched_domain
*sd
;
2796 struct rq
*target_rq
;
2798 /* Is there any task to move? */
2799 if (busiest_rq
->nr_running
<= 1)
2802 target_rq
= cpu_rq(target_cpu
);
2805 * This condition is "impossible", if it occurs
2806 * we need to fix it. Originally reported by
2807 * Bjorn Helgaas on a 128-cpu setup.
2809 BUG_ON(busiest_rq
== target_rq
);
2811 /* move a task from busiest_rq to target_rq */
2812 double_lock_balance(busiest_rq
, target_rq
);
2814 /* Search for an sd spanning us and the target CPU. */
2815 for_each_domain(target_cpu
, sd
) {
2816 if ((sd
->flags
& SD_LOAD_BALANCE
) &&
2817 cpu_isset(busiest_cpu
, sd
->span
))
2822 schedstat_inc(sd
, alb_cnt
);
2824 if (move_tasks(target_rq
, target_cpu
, busiest_rq
, 1,
2825 RTPRIO_TO_LOAD_WEIGHT(100), sd
, CPU_IDLE
,
2827 schedstat_inc(sd
, alb_pushed
);
2829 schedstat_inc(sd
, alb_failed
);
2831 spin_unlock(&target_rq
->lock
);
2836 atomic_t load_balancer
;
2838 } nohz ____cacheline_aligned
= {
2839 .load_balancer
= ATOMIC_INIT(-1),
2840 .cpu_mask
= CPU_MASK_NONE
,
2844 * This routine will try to nominate the ilb (idle load balancing)
2845 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2846 * load balancing on behalf of all those cpus. If all the cpus in the system
2847 * go into this tickless mode, then there will be no ilb owner (as there is
2848 * no need for one) and all the cpus will sleep till the next wakeup event
2851 * For the ilb owner, tick is not stopped. And this tick will be used
2852 * for idle load balancing. ilb owner will still be part of
2855 * While stopping the tick, this cpu will become the ilb owner if there
2856 * is no other owner. And will be the owner till that cpu becomes busy
2857 * or if all cpus in the system stop their ticks at which point
2858 * there is no need for ilb owner.
2860 * When the ilb owner becomes busy, it nominates another owner, during the
2861 * next busy scheduler_tick()
2863 int select_nohz_load_balancer(int stop_tick
)
2865 int cpu
= smp_processor_id();
2868 cpu_set(cpu
, nohz
.cpu_mask
);
2869 cpu_rq(cpu
)->in_nohz_recently
= 1;
2872 * If we are going offline and still the leader, give up!
2874 if (cpu_is_offline(cpu
) &&
2875 atomic_read(&nohz
.load_balancer
) == cpu
) {
2876 if (atomic_cmpxchg(&nohz
.load_balancer
, cpu
, -1) != cpu
)
2881 /* time for ilb owner also to sleep */
2882 if (cpus_weight(nohz
.cpu_mask
) == num_online_cpus()) {
2883 if (atomic_read(&nohz
.load_balancer
) == cpu
)
2884 atomic_set(&nohz
.load_balancer
, -1);
2888 if (atomic_read(&nohz
.load_balancer
) == -1) {
2889 /* make me the ilb owner */
2890 if (atomic_cmpxchg(&nohz
.load_balancer
, -1, cpu
) == -1)
2892 } else if (atomic_read(&nohz
.load_balancer
) == cpu
)
2895 if (!cpu_isset(cpu
, nohz
.cpu_mask
))
2898 cpu_clear(cpu
, nohz
.cpu_mask
);
2900 if (atomic_read(&nohz
.load_balancer
) == cpu
)
2901 if (atomic_cmpxchg(&nohz
.load_balancer
, cpu
, -1) != cpu
)
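/*
 * Editor's note: an editor-added sketch of the lock-free ownership handoff
 * used by select_nohz_load_balancer() above; ex_owner is a hypothetical
 * stand-in for nohz.load_balancer, with -1 meaning "no owner".
 */
static atomic_t ex_owner = ATOMIC_INIT(-1);

static int ex_try_become_owner(int cpu)
{
	/* claim ownership only if nobody currently owns it */
	return atomic_cmpxchg(&ex_owner, -1, cpu) == -1;
}

static void ex_release_ownership(int cpu)
{
	/* release only if we are still the owner */
	atomic_cmpxchg(&ex_owner, cpu, -1);
}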
2908 static DEFINE_SPINLOCK(balancing
);
2911 * It checks each scheduling domain to see if it is due to be balanced,
2912 * and initiates a balancing operation if so.
2914 * Balancing parameters are set up in arch_init_sched_domains.
2916 static inline void rebalance_domains(int cpu
, enum cpu_idle_type idle
)
2919 struct rq
*rq
= cpu_rq(cpu
);
2920 unsigned long interval
;
2921 struct sched_domain
*sd
;
2922 /* Earliest time when we have to do rebalance again */
2923 unsigned long next_balance
= jiffies
+ 60*HZ
;
2925 for_each_domain(cpu
, sd
) {
2926 if (!(sd
->flags
& SD_LOAD_BALANCE
))
2929 interval
= sd
->balance_interval
;
2930 if (idle
!= CPU_IDLE
)
2931 interval
*= sd
->busy_factor
;
2933 /* scale ms to jiffies */
2934 interval
= msecs_to_jiffies(interval
);
2935 if (unlikely(!interval
))
2937 if (interval
> HZ
*NR_CPUS
/10)
2938 interval
= HZ
*NR_CPUS
/10;
2941 if (sd
->flags
& SD_SERIALIZE
) {
2942 if (!spin_trylock(&balancing
))
2946 if (time_after_eq(jiffies
, sd
->last_balance
+ interval
)) {
2947 if (load_balance(cpu
, rq
, sd
, idle
, &balance
)) {
2949 * We've pulled tasks over so either we're no
2950 * longer idle, or one of our SMT siblings is
2953 idle
= CPU_NOT_IDLE
;
2955 sd
->last_balance
= jiffies
;
2957 if (sd
->flags
& SD_SERIALIZE
)
2958 spin_unlock(&balancing
);
2960 if (time_after(next_balance
, sd
->last_balance
+ interval
))
2961 next_balance
= sd
->last_balance
+ interval
;
2964 * Stop the load balance at this level. There is another
2965 * CPU in our sched group which is doing load balancing more
2971 rq
->next_balance
= next_balance
;
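/*
 * Editor's note: an editor-added sketch of the per-domain interval
 * computation in rebalance_domains() above: busy CPUs stretch the interval
 * by busy_factor, the millisecond value is converted to jiffies, and the
 * result is clamped to an arbitrary upper bound. The helper name and
 * parameters are illustrative only.
 */
static unsigned long ex_domain_interval(unsigned int interval_ms,
					unsigned int busy_factor, int cpu_busy)
{
	unsigned long interval = interval_ms;

	if (cpu_busy)
		interval *= busy_factor;	/* balance less often when busy */

	interval = msecs_to_jiffies(interval);
	if (unlikely(!interval))
		interval = 1;
	if (interval > HZ*NR_CPUS/10)
		interval = HZ*NR_CPUS/10;

	return interval;
}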
2975 * run_rebalance_domains is triggered when needed from the scheduler tick.
2976 * In CONFIG_NO_HZ case, the idle load balance owner will do the
2977 * rebalancing for all the cpus for whom scheduler ticks are stopped.
2979 static void run_rebalance_domains(struct softirq_action
*h
)
2981 int this_cpu
= smp_processor_id();
2982 struct rq
*this_rq
= cpu_rq(this_cpu
);
2983 enum cpu_idle_type idle
= this_rq
->idle_at_tick
?
2984 CPU_IDLE
: CPU_NOT_IDLE
;
2986 rebalance_domains(this_cpu
, idle
);
2990 * If this cpu is the owner for idle load balancing, then do the
2991 * balancing on behalf of the other idle cpus whose ticks are
2994 if (this_rq
->idle_at_tick
&&
2995 atomic_read(&nohz
.load_balancer
) == this_cpu
) {
2996 cpumask_t cpus
= nohz
.cpu_mask
;
3000 cpu_clear(this_cpu
, cpus
);
3001 for_each_cpu_mask(balance_cpu
, cpus
) {
3003 * If this cpu gets work to do, stop the load balancing
3004 * work being done for other cpus. Next load
3005 * balancing owner will pick it up.
			rebalance_domains(balance_cpu, CPU_IDLE);
3012 rq
= cpu_rq(balance_cpu
);
3013 if (time_after(this_rq
->next_balance
, rq
->next_balance
))
3014 this_rq
->next_balance
= rq
->next_balance
;
3021 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3023 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3024 * idle load balancing owner or decide to stop the periodic load balancing,
3025 * if the whole system is idle.
3027 static inline void trigger_load_balance(struct rq
*rq
, int cpu
)
3031 * If we were in the nohz mode recently and busy at the current
3032 * scheduler tick, then check if we need to nominate new idle
3035 if (rq
->in_nohz_recently
&& !rq
->idle_at_tick
) {
3036 rq
->in_nohz_recently
= 0;
3038 if (atomic_read(&nohz
.load_balancer
) == cpu
) {
3039 cpu_clear(cpu
, nohz
.cpu_mask
);
3040 atomic_set(&nohz
.load_balancer
, -1);
3043 if (atomic_read(&nohz
.load_balancer
) == -1) {
3045 * simple selection for now: Nominate the
3046 * first cpu in the nohz list to be the next
3049 * TBD: Traverse the sched domains and nominate
3050 * the nearest cpu in the nohz.cpu_mask.
3052 int ilb
= first_cpu(nohz
.cpu_mask
);
3060 * If this cpu is idle and doing idle load balancing for all the
3061 * cpus with ticks stopped, is it time for that to stop?
3063 if (rq
->idle_at_tick
&& atomic_read(&nohz
.load_balancer
) == cpu
&&
3064 cpus_weight(nohz
.cpu_mask
) == num_online_cpus()) {
3070 * If this cpu is idle and the idle load balancing is done by
3071 * someone else, then no need raise the SCHED_SOFTIRQ
3073 if (rq
->idle_at_tick
&& atomic_read(&nohz
.load_balancer
) != cpu
&&
3074 cpu_isset(cpu
, nohz
.cpu_mask
))
3077 if (time_after_eq(jiffies
, rq
->next_balance
))
3078 raise_softirq(SCHED_SOFTIRQ
);
3081 #else /* CONFIG_SMP */
3084 * on UP we do not need to balance between CPUs:
3086 static inline void idle_balance(int cpu
, struct rq
*rq
)
3090 /* Avoid "used but not defined" warning on UP */
3091 static int balance_tasks(struct rq
*this_rq
, int this_cpu
, struct rq
*busiest
,
3092 unsigned long max_nr_move
, unsigned long max_load_move
,
3093 struct sched_domain
*sd
, enum cpu_idle_type idle
,
3094 int *all_pinned
, unsigned long *load_moved
,
3095 int this_best_prio
, int best_prio
, int best_prio_seen
,
3096 struct rq_iterator
*iterator
)
DEFINE_PER_CPU(struct kernel_stat, kstat);

EXPORT_PER_CPU_SYMBOL(kstat);
3110 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3111 * that have not yet been banked in case the task is currently running.
3113 unsigned long long task_sched_runtime(struct task_struct
*p
)
3115 unsigned long flags
;
3119 rq
= task_rq_lock(p
, &flags
);
3120 ns
= p
->se
.sum_exec_runtime
;
3121 if (rq
->curr
== p
) {
3122 delta_exec
= rq_clock(rq
) - p
->se
.exec_start
;
3123 if ((s64
)delta_exec
> 0)
3126 task_rq_unlock(rq
, &flags
);
3132 * Account user cpu time to a process.
3133 * @p: the process that the cpu time gets accounted to
3134 * @hardirq_offset: the offset to subtract from hardirq_count()
3135 * @cputime: the cpu time spent in user space since the last update
3137 void account_user_time(struct task_struct
*p
, cputime_t cputime
)
3139 struct cpu_usage_stat
*cpustat
= &kstat_this_cpu
.cpustat
;
3142 p
->utime
= cputime_add(p
->utime
, cputime
);
3144 /* Add user time to cpustat. */
3145 tmp
= cputime_to_cputime64(cputime
);
3146 if (TASK_NICE(p
) > 0)
3147 cpustat
->nice
= cputime64_add(cpustat
->nice
, tmp
);
3149 cpustat
->user
= cputime64_add(cpustat
->user
, tmp
);
3153 * Account system cpu time to a process.
3154 * @p: the process that the cpu time gets accounted to
3155 * @hardirq_offset: the offset to subtract from hardirq_count()
3156 * @cputime: the cpu time spent in kernel space since the last update
3158 void account_system_time(struct task_struct
*p
, int hardirq_offset
,
3161 struct cpu_usage_stat
*cpustat
= &kstat_this_cpu
.cpustat
;
3162 struct rq
*rq
= this_rq();
3165 p
->stime
= cputime_add(p
->stime
, cputime
);
3167 /* Add system time to cpustat. */
3168 tmp
= cputime_to_cputime64(cputime
);
3169 if (hardirq_count() - hardirq_offset
)
3170 cpustat
->irq
= cputime64_add(cpustat
->irq
, tmp
);
3171 else if (softirq_count())
3172 cpustat
->softirq
= cputime64_add(cpustat
->softirq
, tmp
);
3173 else if (p
!= rq
->idle
)
3174 cpustat
->system
= cputime64_add(cpustat
->system
, tmp
);
3175 else if (atomic_read(&rq
->nr_iowait
) > 0)
3176 cpustat
->iowait
= cputime64_add(cpustat
->iowait
, tmp
);
3178 cpustat
->idle
= cputime64_add(cpustat
->idle
, tmp
);
3179 /* Account for system time used */
3180 acct_update_integrals(p
);
3184 * Account for involuntary wait time.
3185 * @p: the process from which the cpu time has been stolen
3186 * @steal: the cpu time spent in involuntary wait
3188 void account_steal_time(struct task_struct
*p
, cputime_t steal
)
3190 struct cpu_usage_stat
*cpustat
= &kstat_this_cpu
.cpustat
;
3191 cputime64_t tmp
= cputime_to_cputime64(steal
);
3192 struct rq
*rq
= this_rq();
3194 if (p
== rq
->idle
) {
3195 p
->stime
= cputime_add(p
->stime
, steal
);
3196 if (atomic_read(&rq
->nr_iowait
) > 0)
3197 cpustat
->iowait
= cputime64_add(cpustat
->iowait
, tmp
);
3199 cpustat
->idle
= cputime64_add(cpustat
->idle
, tmp
);
3201 cpustat
->steal
= cputime64_add(cpustat
->steal
, tmp
);
3205 * This function gets called by the timer code, with HZ frequency.
3206 * We call it with interrupts disabled.
3208 * It also gets called by the fork code, when changing the parent's
3211 void scheduler_tick(void)
3213 int cpu
= smp_processor_id();
3214 struct rq
*rq
= cpu_rq(cpu
);
3215 struct task_struct
*curr
= rq
->curr
;
3217 spin_lock(&rq
->lock
);
3218 if (curr
!= rq
->idle
) /* FIXME: needed? */
3219 curr
->sched_class
->task_tick(rq
, curr
);
3220 update_cpu_load(rq
);
3221 spin_unlock(&rq
->lock
);
3224 rq
->idle_at_tick
= idle_cpu(cpu
);
3225 trigger_load_balance(rq
, cpu
);
3229 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3231 void fastcall
add_preempt_count(int val
)
3236 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3238 preempt_count() += val
;
3240 * Spinlock count overflowing soon?
3242 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK
) >=
3245 EXPORT_SYMBOL(add_preempt_count
);
3247 void fastcall
sub_preempt_count(int val
)
3252 if (DEBUG_LOCKS_WARN_ON(val
> preempt_count()))
3255 * Is the spinlock portion underflowing?
3257 if (DEBUG_LOCKS_WARN_ON((val
< PREEMPT_MASK
) &&
3258 !(preempt_count() & PREEMPT_MASK
)))
3261 preempt_count() -= val
;
3263 EXPORT_SYMBOL(sub_preempt_count
);
3268 * Print scheduling while atomic bug:
3270 static noinline
void __schedule_bug(struct task_struct
*prev
)
3272 printk(KERN_ERR
"BUG: scheduling while atomic: %s/0x%08x/%d\n",
3273 prev
->comm
, preempt_count(), prev
->pid
);
3274 debug_show_held_locks(prev
);
3275 if (irqs_disabled())
3276 print_irqtrace_events(prev
);
3281 * Various schedule()-time debugging checks and statistics:
3283 static inline void schedule_debug(struct task_struct
*prev
)
3286 * Test if we are atomic. Since do_exit() needs to call into
3287 * schedule() atomically, we ignore that path for now.
3288 * Otherwise, whine if we are scheduling when we should not be.
3290 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev
->exit_state
))
3291 __schedule_bug(prev
);
3293 profile_hit(SCHED_PROFILING
, __builtin_return_address(0));
3295 schedstat_inc(this_rq(), sched_cnt
);
3299 * Pick up the highest-prio task:
3301 static inline struct task_struct
*
3302 pick_next_task(struct rq
*rq
, struct task_struct
*prev
, u64 now
)
3304 struct sched_class
*class;
3305 struct task_struct
*p
;
3308 * Optimization: we know that if all tasks are in
3309 * the fair class we can call that function directly:
3311 if (likely(rq
->nr_running
== rq
->cfs
.nr_running
)) {
3312 p
= fair_sched_class
.pick_next_task(rq
, now
);
3317 class = sched_class_highest
;
3319 p
= class->pick_next_task(rq
, now
);
3323 * Will never be NULL as the idle class always
3324 * returns a non-NULL p:
3326 class = class->next
;
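/*
 * Editor's note: an editor-added, simplified sketch of the pick loop
 * above; ex_sched_class is an illustrative stand-in for the kernel's
 * sched_class list, ordered from highest to lowest priority.
 */
struct ex_sched_class {
	const struct ex_sched_class *next;
	struct task_struct *(*pick_next_task)(struct rq *rq, u64 now);
};

static struct task_struct *ex_pick_next(const struct ex_sched_class *highest,
					struct rq *rq, u64 now)
{
	const struct ex_sched_class *class;
	struct task_struct *p;

	for (class = highest; class; class = class->next) {
		p = class->pick_next_task(rq, now);
		if (p)
			return p;
	}
	return NULL;	/* not reached: the idle class always returns a task */
}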
3331 * schedule() is the main scheduler function.
3333 asmlinkage
void __sched
schedule(void)
3335 struct task_struct
*prev
, *next
;
3343 cpu
= smp_processor_id();
3347 switch_count
= &prev
->nivcsw
;
3349 release_kernel_lock(prev
);
3350 need_resched_nonpreemptible
:
3352 schedule_debug(prev
);
3354 spin_lock_irq(&rq
->lock
);
3355 clear_tsk_need_resched(prev
);
3357 if (prev
->state
&& !(preempt_count() & PREEMPT_ACTIVE
)) {
3358 if (unlikely((prev
->state
& TASK_INTERRUPTIBLE
) &&
3359 unlikely(signal_pending(prev
)))) {
3360 prev
->state
= TASK_RUNNING
;
3362 deactivate_task(rq
, prev
, 1);
3364 switch_count
= &prev
->nvcsw
;
3367 if (unlikely(!rq
->nr_running
))
3368 idle_balance(cpu
, rq
);
3370 now
= __rq_clock(rq
);
3371 prev
->sched_class
->put_prev_task(rq
, prev
, now
);
3372 next
= pick_next_task(rq
, prev
, now
);
3374 sched_info_switch(prev
, next
);
3376 if (likely(prev
!= next
)) {
3381 context_switch(rq
, prev
, next
); /* unlocks the rq */
3383 spin_unlock_irq(&rq
->lock
);
3385 if (unlikely(reacquire_kernel_lock(current
) < 0)) {
3386 cpu
= smp_processor_id();
3388 goto need_resched_nonpreemptible
;
3390 preempt_enable_no_resched();
3391 if (unlikely(test_thread_flag(TIF_NEED_RESCHED
)))
3394 EXPORT_SYMBOL(schedule
);
3396 #ifdef CONFIG_PREEMPT
3398 * this is the entry point to schedule() from in-kernel preemption
3399 * off of preempt_enable. Kernel preemptions off return from interrupt
3400 * occur there and call schedule directly.
3402 asmlinkage
void __sched
preempt_schedule(void)
3404 struct thread_info
*ti
= current_thread_info();
3405 #ifdef CONFIG_PREEMPT_BKL
3406 struct task_struct
*task
= current
;
3407 int saved_lock_depth
;
3410 * If there is a non-zero preempt_count or interrupts are disabled,
3411 * we do not want to preempt the current task. Just return..
3413 if (likely(ti
->preempt_count
|| irqs_disabled()))
3417 add_preempt_count(PREEMPT_ACTIVE
);
	/*
	 * We keep the big kernel semaphore locked, but we
	 * clear ->lock_depth so that schedule() doesn't
	 * auto-release the semaphore:
	 */
3423 #ifdef CONFIG_PREEMPT_BKL
3424 saved_lock_depth
= task
->lock_depth
;
3425 task
->lock_depth
= -1;
3428 #ifdef CONFIG_PREEMPT_BKL
3429 task
->lock_depth
= saved_lock_depth
;
3431 sub_preempt_count(PREEMPT_ACTIVE
);
3433 /* we could miss a preemption opportunity between schedule and now */
3435 if (unlikely(test_thread_flag(TIF_NEED_RESCHED
)))
3438 EXPORT_SYMBOL(preempt_schedule
);
3441 * this is the entry point to schedule() from kernel preemption
3442 * off of irq context.
3443 * Note, that this is called and return with irqs disabled. This will
3444 * protect us against recursive calling from irq.
3446 asmlinkage
void __sched
preempt_schedule_irq(void)
3448 struct thread_info
*ti
= current_thread_info();
3449 #ifdef CONFIG_PREEMPT_BKL
3450 struct task_struct
*task
= current
;
3451 int saved_lock_depth
;
3453 /* Catch callers which need to be fixed */
3454 BUG_ON(ti
->preempt_count
|| !irqs_disabled());
3457 add_preempt_count(PREEMPT_ACTIVE
);
	/*
	 * We keep the big kernel semaphore locked, but we
	 * clear ->lock_depth so that schedule() doesn't
	 * auto-release the semaphore:
	 */
3463 #ifdef CONFIG_PREEMPT_BKL
3464 saved_lock_depth
= task
->lock_depth
;
3465 task
->lock_depth
= -1;
3469 local_irq_disable();
3470 #ifdef CONFIG_PREEMPT_BKL
3471 task
->lock_depth
= saved_lock_depth
;
3473 sub_preempt_count(PREEMPT_ACTIVE
);
3475 /* we could miss a preemption opportunity between schedule and now */
3477 if (unlikely(test_thread_flag(TIF_NEED_RESCHED
)))
3481 #endif /* CONFIG_PREEMPT */
3483 int default_wake_function(wait_queue_t
*curr
, unsigned mode
, int sync
,
3486 return try_to_wake_up(curr
->private, mode
, sync
);
3488 EXPORT_SYMBOL(default_wake_function
);
3491 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3492 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3493 * number) then we wake all the non-exclusive tasks and one exclusive task.
3495 * There are circumstances in which we can try to wake a task which has already
3496 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3497 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3499 static void __wake_up_common(wait_queue_head_t
*q
, unsigned int mode
,
3500 int nr_exclusive
, int sync
, void *key
)
3502 struct list_head
*tmp
, *next
;
3504 list_for_each_safe(tmp
, next
, &q
->task_list
) {
3505 wait_queue_t
*curr
= list_entry(tmp
, wait_queue_t
, task_list
);
3506 unsigned flags
= curr
->flags
;
3508 if (curr
->func(curr
, mode
, sync
, key
) &&
3509 (flags
& WQ_FLAG_EXCLUSIVE
) && !--nr_exclusive
)
3515 * __wake_up - wake up threads blocked on a waitqueue.
3517 * @mode: which threads
3518 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3519 * @key: is directly passed to the wakeup function
3521 void fastcall
__wake_up(wait_queue_head_t
*q
, unsigned int mode
,
3522 int nr_exclusive
, void *key
)
3524 unsigned long flags
;
3526 spin_lock_irqsave(&q
->lock
, flags
);
3527 __wake_up_common(q
, mode
, nr_exclusive
, 0, key
);
3528 spin_unlock_irqrestore(&q
->lock
, flags
);
3530 EXPORT_SYMBOL(__wake_up
);
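/*
 * Editor's note: an editor-added usage sketch of the exclusive-wakeup
 * semantics described above. The waitqueue, flag and function names are
 * hypothetical; the waitqueue calls themselves are the standard
 * <linux/wait.h> interfaces.
 */
static DECLARE_WAIT_QUEUE_HEAD(ex_waitq);
static int ex_data_ready;

/* consumer: queue exclusively, so one wakeup picks exactly one of us */
static void ex_consumer_wait(void)
{
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&ex_waitq, &wait, TASK_INTERRUPTIBLE);
	if (!ex_data_ready)
		schedule();
	finish_wait(&ex_waitq, &wait);
}

/* producer: wakes all non-exclusive waiters plus a single exclusive one */
static void ex_producer_signal(void)
{
	ex_data_ready = 1;
	wake_up(&ex_waitq);
}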
3533 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3535 void fastcall
__wake_up_locked(wait_queue_head_t
*q
, unsigned int mode
)
3537 __wake_up_common(q
, mode
, 1, 0, NULL
);
3541 * __wake_up_sync - wake up threads blocked on a waitqueue.
3543 * @mode: which threads
3544 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3546 * The sync wakeup differs that the waker knows that it will schedule
3547 * away soon, so while the target thread will be woken up, it will not
3548 * be migrated to another CPU - ie. the two threads are 'synchronized'
3549 * with each other. This can prevent needless bouncing between CPUs.
3551 * On UP it can prevent extra preemption.
3554 __wake_up_sync(wait_queue_head_t
*q
, unsigned int mode
, int nr_exclusive
)
3556 unsigned long flags
;
3562 if (unlikely(!nr_exclusive
))
3565 spin_lock_irqsave(&q
->lock
, flags
);
3566 __wake_up_common(q
, mode
, nr_exclusive
, sync
, NULL
);
3567 spin_unlock_irqrestore(&q
->lock
, flags
);
3569 EXPORT_SYMBOL_GPL(__wake_up_sync
); /* For internal use only */
3571 void fastcall
complete(struct completion
*x
)
3573 unsigned long flags
;
3575 spin_lock_irqsave(&x
->wait
.lock
, flags
);
3577 __wake_up_common(&x
->wait
, TASK_UNINTERRUPTIBLE
| TASK_INTERRUPTIBLE
,
3579 spin_unlock_irqrestore(&x
->wait
.lock
, flags
);
3581 EXPORT_SYMBOL(complete
);
3583 void fastcall
complete_all(struct completion
*x
)
3585 unsigned long flags
;
3587 spin_lock_irqsave(&x
->wait
.lock
, flags
);
3588 x
->done
+= UINT_MAX
/2;
3589 __wake_up_common(&x
->wait
, TASK_UNINTERRUPTIBLE
| TASK_INTERRUPTIBLE
,
3591 spin_unlock_irqrestore(&x
->wait
.lock
, flags
);
3593 EXPORT_SYMBOL(complete_all
);
3595 void fastcall __sched
wait_for_completion(struct completion
*x
)
3599 spin_lock_irq(&x
->wait
.lock
);
3601 DECLARE_WAITQUEUE(wait
, current
);
3603 wait
.flags
|= WQ_FLAG_EXCLUSIVE
;
3604 __add_wait_queue_tail(&x
->wait
, &wait
);
3606 __set_current_state(TASK_UNINTERRUPTIBLE
);
3607 spin_unlock_irq(&x
->wait
.lock
);
3609 spin_lock_irq(&x
->wait
.lock
);
3611 __remove_wait_queue(&x
->wait
, &wait
);
3614 spin_unlock_irq(&x
->wait
.lock
);
3616 EXPORT_SYMBOL(wait_for_completion
);
3618 unsigned long fastcall __sched
3619 wait_for_completion_timeout(struct completion
*x
, unsigned long timeout
)
3623 spin_lock_irq(&x
->wait
.lock
);
3625 DECLARE_WAITQUEUE(wait
, current
);
3627 wait
.flags
|= WQ_FLAG_EXCLUSIVE
;
3628 __add_wait_queue_tail(&x
->wait
, &wait
);
3630 __set_current_state(TASK_UNINTERRUPTIBLE
);
3631 spin_unlock_irq(&x
->wait
.lock
);
3632 timeout
= schedule_timeout(timeout
);
3633 spin_lock_irq(&x
->wait
.lock
);
3635 __remove_wait_queue(&x
->wait
, &wait
);
3639 __remove_wait_queue(&x
->wait
, &wait
);
3643 spin_unlock_irq(&x
->wait
.lock
);
3646 EXPORT_SYMBOL(wait_for_completion_timeout
);
3648 int fastcall __sched
wait_for_completion_interruptible(struct completion
*x
)
3654 spin_lock_irq(&x
->wait
.lock
);
3656 DECLARE_WAITQUEUE(wait
, current
);
3658 wait
.flags
|= WQ_FLAG_EXCLUSIVE
;
3659 __add_wait_queue_tail(&x
->wait
, &wait
);
3661 if (signal_pending(current
)) {
3663 __remove_wait_queue(&x
->wait
, &wait
);
3666 __set_current_state(TASK_INTERRUPTIBLE
);
3667 spin_unlock_irq(&x
->wait
.lock
);
3669 spin_lock_irq(&x
->wait
.lock
);
3671 __remove_wait_queue(&x
->wait
, &wait
);
3675 spin_unlock_irq(&x
->wait
.lock
);
3679 EXPORT_SYMBOL(wait_for_completion_interruptible
);
3681 unsigned long fastcall __sched
3682 wait_for_completion_interruptible_timeout(struct completion
*x
,
3683 unsigned long timeout
)
3687 spin_lock_irq(&x
->wait
.lock
);
3689 DECLARE_WAITQUEUE(wait
, current
);
3691 wait
.flags
|= WQ_FLAG_EXCLUSIVE
;
3692 __add_wait_queue_tail(&x
->wait
, &wait
);
3694 if (signal_pending(current
)) {
3695 timeout
= -ERESTARTSYS
;
3696 __remove_wait_queue(&x
->wait
, &wait
);
3699 __set_current_state(TASK_INTERRUPTIBLE
);
3700 spin_unlock_irq(&x
->wait
.lock
);
3701 timeout
= schedule_timeout(timeout
);
3702 spin_lock_irq(&x
->wait
.lock
);
3704 __remove_wait_queue(&x
->wait
, &wait
);
3708 __remove_wait_queue(&x
->wait
, &wait
);
3712 spin_unlock_irq(&x
->wait
.lock
);
3715 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout
);
static inline void
sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
{
	spin_lock_irqsave(&q->lock, *flags);
	__add_wait_queue(q, wait);
	spin_unlock(&q->lock);
}

static inline void
sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
{
	spin_lock_irq(&q->lock);
	__remove_wait_queue(q, wait);
	spin_unlock_irqrestore(&q->lock, *flags);
}
3733 void __sched
interruptible_sleep_on(wait_queue_head_t
*q
)
3735 unsigned long flags
;
3738 init_waitqueue_entry(&wait
, current
);
3740 current
->state
= TASK_INTERRUPTIBLE
;
3742 sleep_on_head(q
, &wait
, &flags
);
3744 sleep_on_tail(q
, &wait
, &flags
);
3746 EXPORT_SYMBOL(interruptible_sleep_on
);
3749 interruptible_sleep_on_timeout(wait_queue_head_t
*q
, long timeout
)
3751 unsigned long flags
;
3754 init_waitqueue_entry(&wait
, current
);
3756 current
->state
= TASK_INTERRUPTIBLE
;
3758 sleep_on_head(q
, &wait
, &flags
);
3759 timeout
= schedule_timeout(timeout
);
3760 sleep_on_tail(q
, &wait
, &flags
);
3764 EXPORT_SYMBOL(interruptible_sleep_on_timeout
);
3766 void __sched
sleep_on(wait_queue_head_t
*q
)
3768 unsigned long flags
;
3771 init_waitqueue_entry(&wait
, current
);
3773 current
->state
= TASK_UNINTERRUPTIBLE
;
3775 sleep_on_head(q
, &wait
, &flags
);
3777 sleep_on_tail(q
, &wait
, &flags
);
3779 EXPORT_SYMBOL(sleep_on
);
3781 long __sched
sleep_on_timeout(wait_queue_head_t
*q
, long timeout
)
3783 unsigned long flags
;
3786 init_waitqueue_entry(&wait
, current
);
3788 current
->state
= TASK_UNINTERRUPTIBLE
;
3790 sleep_on_head(q
, &wait
, &flags
);
3791 timeout
= schedule_timeout(timeout
);
3792 sleep_on_tail(q
, &wait
, &flags
);
3796 EXPORT_SYMBOL(sleep_on_timeout
);
3798 #ifdef CONFIG_RT_MUTEXES
3801 * rt_mutex_setprio - set the current priority of a task
3803 * @prio: prio value (kernel-internal form)
3805 * This function changes the 'effective' priority of a task. It does
3806 * not touch ->normal_prio like __setscheduler().
3808 * Used by the rt_mutex code to implement priority inheritance logic.
3810 void rt_mutex_setprio(struct task_struct
*p
, int prio
)
3812 unsigned long flags
;
3817 BUG_ON(prio
< 0 || prio
> MAX_PRIO
);
3819 rq
= task_rq_lock(p
, &flags
);
3823 on_rq
= p
->se
.on_rq
;
3825 dequeue_task(rq
, p
, 0, now
);
3828 p
->sched_class
= &rt_sched_class
;
3830 p
->sched_class
= &fair_sched_class
;
3835 enqueue_task(rq
, p
, 0, now
);
3837 * Reschedule if we are currently running on this runqueue and
3838 * our priority decreased, or if we are not currently running on
3839 * this runqueue and our priority is higher than the current's
3841 if (task_running(rq
, p
)) {
3842 if (p
->prio
> oldprio
)
3843 resched_task(rq
->curr
);
3845 check_preempt_curr(rq
, p
);
3848 task_rq_unlock(rq
, &flags
);
3853 void set_user_nice(struct task_struct
*p
, long nice
)
3855 int old_prio
, delta
, on_rq
;
3856 unsigned long flags
;
3860 if (TASK_NICE(p
) == nice
|| nice
< -20 || nice
> 19)
3863 * We have to be careful, if called from sys_setpriority(),
3864 * the task might be in the middle of scheduling on another CPU.
3866 rq
= task_rq_lock(p
, &flags
);
	/*
	 * The RT priorities are set via sched_setscheduler(), but we still
	 * allow the 'normal' nice value to be set - but as expected
	 * it won't have any effect on scheduling until the task is
	 * SCHED_FIFO/SCHED_RR:
	 */
3874 if (task_has_rt_policy(p
)) {
3875 p
->static_prio
= NICE_TO_PRIO(nice
);
3878 on_rq
= p
->se
.on_rq
;
3880 dequeue_task(rq
, p
, 0, now
);
3881 dec_load(rq
, p
, now
);
3884 p
->static_prio
= NICE_TO_PRIO(nice
);
3887 p
->prio
= effective_prio(p
);
3888 delta
= p
->prio
- old_prio
;
3891 enqueue_task(rq
, p
, 0, now
);
3892 inc_load(rq
, p
, now
);
3894 * If the task increased its priority or is running and
3895 * lowered its priority, then reschedule its CPU:
3897 if (delta
< 0 || (delta
> 0 && task_running(rq
, p
)))
3898 resched_task(rq
->curr
);
3901 task_rq_unlock(rq
, &flags
);
3903 EXPORT_SYMBOL(set_user_nice
);
/*
 * can_nice - check if a task can reduce its nice value
 */
int can_nice(const struct task_struct *p, const int nice)
{
	/* convert nice value [19,-20] to rlimit style value [1,40] */
	int nice_rlim = 20 - nice;

	return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
		capable(CAP_SYS_NICE));
}
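/*
 * Editor's note: a standalone, user-space worked example (editor-added,
 * with a made-up rlimit) of the conversion used by can_nice() above:
 * nice 19 maps to 1 and nice -20 maps to 40, so a soft RLIMIT_NICE of 25
 * permits unprivileged nice values down to -5.
 */
#include <stdio.h>

int main(void)
{
	long rlim_cur = 25;	/* hypothetical RLIMIT_NICE soft limit */
	int nice;

	for (nice = -20; nice <= 19; nice += 13) {
		int nice_rlim = 20 - nice;

		printf("nice %3d -> rlimit style %2d -> %s without CAP_SYS_NICE\n",
		       nice, nice_rlim,
		       nice_rlim <= rlim_cur ? "allowed" : "denied");
	}
	return 0;
}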
3919 #ifdef __ARCH_WANT_SYS_NICE
3922 * sys_nice - change the priority of the current process.
3923 * @increment: priority increment
3925 * sys_setpriority is a more generic, but much slower function that
3926 * does similar things.
3928 asmlinkage
long sys_nice(int increment
)
3933 * Setpriority might change our priority at the same moment.
3934 * We don't have to worry. Conceptually one call occurs first
3935 * and we have a single winner.
3937 if (increment
< -40)
3942 nice
= PRIO_TO_NICE(current
->static_prio
) + increment
;
3948 if (increment
< 0 && !can_nice(current
, nice
))
3951 retval
= security_task_setnice(current
, nice
);
3955 set_user_nice(current
, nice
);
/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * This is the priority value as seen by users in /proc.
 * RT tasks are offset by -200. Normal tasks are centered
 * around 0, value goes from -16 to +15.
 */
int task_prio(const struct task_struct *p)
{
	return p->prio - MAX_RT_PRIO;
}

/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 */
int task_nice(const struct task_struct *p)
{
	return TASK_NICE(p);
}
EXPORT_SYMBOL_GPL(task_nice);
/**
 * idle_cpu - is a given cpu idle currently?
 * @cpu: the processor in question.
 */
int idle_cpu(int cpu)
{
	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
}

/**
 * idle_task - return the idle task for a given cpu.
 * @cpu: the processor in question.
 */
struct task_struct *idle_task(int cpu)
{
	return cpu_rq(cpu)->idle;
}

/**
 * find_process_by_pid - find a process with a matching PID value.
 * @pid: the pid in question.
 */
static inline struct task_struct *find_process_by_pid(pid_t pid)
{
	return pid ? find_task_by_pid(pid) : current;
}
4011 /* Actually do priority change: must hold rq lock. */
4013 __setscheduler(struct rq
*rq
, struct task_struct
*p
, int policy
, int prio
)
4015 BUG_ON(p
->se
.on_rq
);
4018 switch (p
->policy
) {
4022 p
->sched_class
= &fair_sched_class
;
4026 p
->sched_class
= &rt_sched_class
;
4030 p
->rt_priority
= prio
;
4031 p
->normal_prio
= normal_prio(p
);
4032 /* we are holding p->pi_lock already */
4033 p
->prio
= rt_mutex_getprio(p
);
4038 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4039 * @p: the task in question.
4040 * @policy: new policy.
4041 * @param: structure containing the new RT priority.
4043 * NOTE that the task may be already dead.
4045 int sched_setscheduler(struct task_struct
*p
, int policy
,
4046 struct sched_param
*param
)
4048 int retval
, oldprio
, oldpolicy
= -1, on_rq
;
4049 unsigned long flags
;
4052 /* may grab non-irq protected spin_locks */
4053 BUG_ON(in_interrupt());
4055 /* double check policy once rq lock held */
4057 policy
= oldpolicy
= p
->policy
;
4058 else if (policy
!= SCHED_FIFO
&& policy
!= SCHED_RR
&&
4059 policy
!= SCHED_NORMAL
&& policy
!= SCHED_BATCH
&&
4060 policy
!= SCHED_IDLE
)
4063 * Valid priorities for SCHED_FIFO and SCHED_RR are
4064 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4065 * SCHED_BATCH and SCHED_IDLE is 0.
4067 if (param
->sched_priority
< 0 ||
4068 (p
->mm
&& param
->sched_priority
> MAX_USER_RT_PRIO
-1) ||
4069 (!p
->mm
&& param
->sched_priority
> MAX_RT_PRIO
-1))
4071 if (rt_policy(policy
) != (param
->sched_priority
!= 0))
4075 * Allow unprivileged RT tasks to decrease priority:
4077 if (!capable(CAP_SYS_NICE
)) {
4078 if (rt_policy(policy
)) {
4079 unsigned long rlim_rtprio
;
4081 if (!lock_task_sighand(p
, &flags
))
4083 rlim_rtprio
= p
->signal
->rlim
[RLIMIT_RTPRIO
].rlim_cur
;
4084 unlock_task_sighand(p
, &flags
);
4086 /* can't set/change the rt policy */
4087 if (policy
!= p
->policy
&& !rlim_rtprio
)
4090 /* can't increase priority */
4091 if (param
->sched_priority
> p
->rt_priority
&&
4092 param
->sched_priority
> rlim_rtprio
)
		/*
		 * Like positive nice levels, don't allow tasks to
		 * move out of SCHED_IDLE either:
		 */
4099 if (p
->policy
== SCHED_IDLE
&& policy
!= SCHED_IDLE
)
4102 /* can't change other user's priorities */
4103 if ((current
->euid
!= p
->euid
) &&
4104 (current
->euid
!= p
->uid
))
4108 retval
= security_task_setscheduler(p
, policy
, param
);
4112 * make sure no PI-waiters arrive (or leave) while we are
4113 * changing the priority of the task:
4115 spin_lock_irqsave(&p
->pi_lock
, flags
);
	/*
	 * To be able to change p->policy safely, the appropriate
	 * runqueue lock must be held.
	 */
4120 rq
= __task_rq_lock(p
);
4121 /* recheck policy now with rq lock held */
4122 if (unlikely(oldpolicy
!= -1 && oldpolicy
!= p
->policy
)) {
4123 policy
= oldpolicy
= -1;
4124 __task_rq_unlock(rq
);
4125 spin_unlock_irqrestore(&p
->pi_lock
, flags
);
4128 on_rq
= p
->se
.on_rq
;
4130 deactivate_task(rq
, p
, 0);
4132 __setscheduler(rq
, p
, policy
, param
->sched_priority
);
4134 activate_task(rq
, p
, 0);
4136 * Reschedule if we are currently running on this runqueue and
4137 * our priority decreased, or if we are not currently running on
4138 * this runqueue and our priority is higher than the current's
4140 if (task_running(rq
, p
)) {
4141 if (p
->prio
> oldprio
)
4142 resched_task(rq
->curr
);
4144 check_preempt_curr(rq
, p
);
4147 __task_rq_unlock(rq
);
4148 spin_unlock_irqrestore(&p
->pi_lock
, flags
);
4150 rt_mutex_adjust_pi(p
);
4154 EXPORT_SYMBOL_GPL(sched_setscheduler
);
4157 do_sched_setscheduler(pid_t pid
, int policy
, struct sched_param __user
*param
)
4159 struct sched_param lparam
;
4160 struct task_struct
*p
;
4163 if (!param
|| pid
< 0)
4165 if (copy_from_user(&lparam
, param
, sizeof(struct sched_param
)))
4170 p
= find_process_by_pid(pid
);
4172 retval
= sched_setscheduler(p
, policy
, &lparam
);
4179 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4180 * @pid: the pid in question.
4181 * @policy: new policy.
4182 * @param: structure containing the new RT priority.
4184 asmlinkage
long sys_sched_setscheduler(pid_t pid
, int policy
,
4185 struct sched_param __user
*param
)
4187 /* negative values for policy are not valid */
4191 return do_sched_setscheduler(pid
, policy
, param
);
4195 * sys_sched_setparam - set/change the RT priority of a thread
4196 * @pid: the pid in question.
4197 * @param: structure containing the new RT priority.
4199 asmlinkage
long sys_sched_setparam(pid_t pid
, struct sched_param __user
*param
)
4201 return do_sched_setscheduler(pid
, -1, param
);
4205 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4206 * @pid: the pid in question.
4208 asmlinkage
long sys_sched_getscheduler(pid_t pid
)
4210 struct task_struct
*p
;
4211 int retval
= -EINVAL
;
4217 read_lock(&tasklist_lock
);
4218 p
= find_process_by_pid(pid
);
4220 retval
= security_task_getscheduler(p
);
4224 read_unlock(&tasklist_lock
);
/**
 * sys_sched_getparam - get the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the RT priority.
 */
4235 asmlinkage
long sys_sched_getparam(pid_t pid
, struct sched_param __user
*param
)
4237 struct sched_param lp
;
4238 struct task_struct
*p
;
4239 int retval
= -EINVAL
;
4241 if (!param
|| pid
< 0)
4244 read_lock(&tasklist_lock
);
4245 p
= find_process_by_pid(pid
);
4250 retval
= security_task_getscheduler(p
);
4254 lp
.sched_priority
= p
->rt_priority
;
4255 read_unlock(&tasklist_lock
);
4258 * This one might sleep, we cannot do it with a spinlock held ...
4260 retval
= copy_to_user(param
, &lp
, sizeof(*param
)) ? -EFAULT
: 0;
4266 read_unlock(&tasklist_lock
);
4270 long sched_setaffinity(pid_t pid
, cpumask_t new_mask
)
4272 cpumask_t cpus_allowed
;
4273 struct task_struct
*p
;
4276 mutex_lock(&sched_hotcpu_mutex
);
4277 read_lock(&tasklist_lock
);
4279 p
= find_process_by_pid(pid
);
4281 read_unlock(&tasklist_lock
);
4282 mutex_unlock(&sched_hotcpu_mutex
);
4287 * It is not safe to call set_cpus_allowed with the
4288 * tasklist_lock held. We will bump the task_struct's
4289 * usage count and then drop tasklist_lock.
4292 read_unlock(&tasklist_lock
);
4295 if ((current
->euid
!= p
->euid
) && (current
->euid
!= p
->uid
) &&
4296 !capable(CAP_SYS_NICE
))
4299 retval
= security_task_setscheduler(p
, 0, NULL
);
4303 cpus_allowed
= cpuset_cpus_allowed(p
);
4304 cpus_and(new_mask
, new_mask
, cpus_allowed
);
4305 retval
= set_cpus_allowed(p
, new_mask
);
4309 mutex_unlock(&sched_hotcpu_mutex
);
4313 static int get_user_cpu_mask(unsigned long __user
*user_mask_ptr
, unsigned len
,
4314 cpumask_t
*new_mask
)
4316 if (len
< sizeof(cpumask_t
)) {
4317 memset(new_mask
, 0, sizeof(cpumask_t
));
4318 } else if (len
> sizeof(cpumask_t
)) {
4319 len
= sizeof(cpumask_t
);
4321 return copy_from_user(new_mask
, user_mask_ptr
, len
) ? -EFAULT
: 0;
4325 * sys_sched_setaffinity - set the cpu affinity of a process
4326 * @pid: pid of the process
4327 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4328 * @user_mask_ptr: user-space pointer to the new cpu mask
4330 asmlinkage
long sys_sched_setaffinity(pid_t pid
, unsigned int len
,
4331 unsigned long __user
*user_mask_ptr
)
4336 retval
= get_user_cpu_mask(user_mask_ptr
, len
, &new_mask
);
4340 return sched_setaffinity(pid
, new_mask
);
4344 * Represents all cpu's present in the system
4345 * In systems capable of hotplug, this map could dynamically grow
4346 * as new cpu's are detected in the system via any platform specific
4347 * method, such as ACPI for e.g.
4350 cpumask_t cpu_present_map __read_mostly
;
4351 EXPORT_SYMBOL(cpu_present_map
);
4354 cpumask_t cpu_online_map __read_mostly
= CPU_MASK_ALL
;
4355 EXPORT_SYMBOL(cpu_online_map
);
4357 cpumask_t cpu_possible_map __read_mostly
= CPU_MASK_ALL
;
4358 EXPORT_SYMBOL(cpu_possible_map
);
4361 long sched_getaffinity(pid_t pid
, cpumask_t
*mask
)
4363 struct task_struct
*p
;
4366 mutex_lock(&sched_hotcpu_mutex
);
4367 read_lock(&tasklist_lock
);
4370 p
= find_process_by_pid(pid
);
4374 retval
= security_task_getscheduler(p
);
4378 cpus_and(*mask
, p
->cpus_allowed
, cpu_online_map
);
4381 read_unlock(&tasklist_lock
);
4382 mutex_unlock(&sched_hotcpu_mutex
);
4390 * sys_sched_getaffinity - get the cpu affinity of a process
4391 * @pid: pid of the process
4392 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4393 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4395 asmlinkage
long sys_sched_getaffinity(pid_t pid
, unsigned int len
,
4396 unsigned long __user
*user_mask_ptr
)
4401 if (len
< sizeof(cpumask_t
))
4404 ret
= sched_getaffinity(pid
, &mask
);
4408 if (copy_to_user(user_mask_ptr
, &mask
, sizeof(cpumask_t
)))
4411 return sizeof(cpumask_t
);
4415 * sys_sched_yield - yield the current processor to other threads.
4417 * This function yields the current CPU to other tasks. If there are no
4418 * other threads running on this CPU then this function will return.
4420 asmlinkage
long sys_sched_yield(void)
4422 struct rq
*rq
= this_rq_lock();
4424 schedstat_inc(rq
, yld_cnt
);
4425 if (unlikely(rq
->nr_running
== 1))
4426 schedstat_inc(rq
, yld_act_empty
);
4428 current
->sched_class
->yield_task(rq
, current
);
4431 * Since we are going to call schedule() anyway, there's
4432 * no need to preempt or enable interrupts:
4434 __release(rq
->lock
);
4435 spin_release(&rq
->lock
.dep_map
, 1, _THIS_IP_
);
4436 _raw_spin_unlock(&rq
->lock
);
4437 preempt_enable_no_resched();
4444 static void __cond_resched(void)
4446 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4447 __might_sleep(__FILE__
, __LINE__
);
4450 * The BKS might be reacquired before we have dropped
4451 * PREEMPT_ACTIVE, which could trigger a second
4452 * cond_resched() call.
4455 add_preempt_count(PREEMPT_ACTIVE
);
4457 sub_preempt_count(PREEMPT_ACTIVE
);
4458 } while (need_resched());
4461 int __sched
cond_resched(void)
4463 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE
) &&
4464 system_state
== SYSTEM_RUNNING
) {
4470 EXPORT_SYMBOL(cond_resched
);
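/*
 * Editor's note: an editor-added usage sketch; the list-walking helper is
 * hypothetical, cond_resched() is the interface exported above.
 */
static void ex_process_many(struct list_head *items)
{
	struct list_head *pos;

	list_for_each(pos, items) {
		/* ... handle one item ... */
		cond_resched();	/* offer the CPU back between items */
	}
}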
4473 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4474 * call schedule, and on return reacquire the lock.
4476 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4477 * operations here to prevent schedule() from being called twice (once via
4478 * spin_unlock(), once by hand).
4480 int cond_resched_lock(spinlock_t
*lock
)
4484 if (need_lockbreak(lock
)) {
4490 if (need_resched() && system_state
== SYSTEM_RUNNING
) {
4491 spin_release(&lock
->dep_map
, 1, _THIS_IP_
);
4492 _raw_spin_unlock(lock
);
4493 preempt_enable_no_resched();
4500 EXPORT_SYMBOL(cond_resched_lock
);
4502 int __sched
cond_resched_softirq(void)
4504 BUG_ON(!in_softirq());
4506 if (need_resched() && system_state
== SYSTEM_RUNNING
) {
4514 EXPORT_SYMBOL(cond_resched_softirq
);
/**
 * yield - yield the current processor to other threads.
 *
 * This is a shortcut for kernel-space yielding - it marks the
 * thread runnable and calls sys_sched_yield().
 */
void __sched yield(void)
{
	set_current_state(TASK_RUNNING);
	sys_sched_yield();
}
EXPORT_SYMBOL(yield);
/*
 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
 * that process accounting knows that this is a task in IO wait state.
 *
 * But don't do that if it is a deliberate, throttling IO wait (this task
 * has set its backing_dev_info: the queue against which it should throttle)
 */
void __sched io_schedule(void)
{
	struct rq *rq = &__raw_get_cpu_var(runqueues);

	delayacct_blkio_start();
	atomic_inc(&rq->nr_iowait);
	schedule();
	atomic_dec(&rq->nr_iowait);
	delayacct_blkio_end();
}
EXPORT_SYMBOL(io_schedule);

long __sched io_schedule_timeout(long timeout)
{
	struct rq *rq = &__raw_get_cpu_var(runqueues);
	long ret;

	delayacct_blkio_start();
	atomic_inc(&rq->nr_iowait);
	ret = schedule_timeout(timeout);
	atomic_dec(&rq->nr_iowait);
	delayacct_blkio_end();
	return ret;
}
4562 * sys_sched_get_priority_max - return maximum RT priority.
4563 * @policy: scheduling class.
4565 * this syscall returns the maximum rt_priority that can be used
4566 * by a given scheduling class.
4568 asmlinkage
long sys_sched_get_priority_max(int policy
)
4575 ret
= MAX_USER_RT_PRIO
-1;
4587 * sys_sched_get_priority_min - return minimum RT priority.
4588 * @policy: scheduling class.
4590 * this syscall returns the minimum rt_priority that can be used
4591 * by a given scheduling class.
4593 asmlinkage
long sys_sched_get_priority_min(int policy
)
4611 * sys_sched_rr_get_interval - return the default timeslice of a process.
4612 * @pid: pid of the process.
4613 * @interval: userspace pointer to the timeslice value.
4615 * this syscall writes the default timeslice value of a given process
4616 * into the user-space timespec buffer. A value of '0' means infinity.
4619 long sys_sched_rr_get_interval(pid_t pid
, struct timespec __user
*interval
)
4621 struct task_struct
*p
;
4622 int retval
= -EINVAL
;
4629 read_lock(&tasklist_lock
);
4630 p
= find_process_by_pid(pid
);
4634 retval
= security_task_getscheduler(p
);
4638 jiffies_to_timespec(p
->policy
== SCHED_FIFO
?
4639 0 : static_prio_timeslice(p
->static_prio
), &t
);
4640 read_unlock(&tasklist_lock
);
4641 retval
= copy_to_user(interval
, &t
, sizeof(t
)) ? -EFAULT
: 0;
4645 read_unlock(&tasklist_lock
);
4649 static const char stat_nam
[] = "RSDTtZX";
4651 static void show_task(struct task_struct
*p
)
4653 unsigned long free
= 0;
4656 state
= p
->state
? __ffs(p
->state
) + 1 : 0;
4657 printk("%-13.13s %c", p
->comm
,
4658 state
< sizeof(stat_nam
) - 1 ? stat_nam
[state
] : '?');
4659 #if BITS_PER_LONG == 32
4660 if (state
== TASK_RUNNING
)
4661 printk(" running ");
4663 printk(" %08lx ", thread_saved_pc(p
));
4665 if (state
== TASK_RUNNING
)
4666 printk(" running task ");
4668 printk(" %016lx ", thread_saved_pc(p
));
4670 #ifdef CONFIG_DEBUG_STACK_USAGE
4672 unsigned long *n
= end_of_stack(p
);
4675 free
= (unsigned long)n
- (unsigned long)end_of_stack(p
);
4678 printk("%5lu %5d %6d\n", free
, p
->pid
, p
->parent
->pid
);
4680 if (state
!= TASK_RUNNING
)
4681 show_stack(p
, NULL
);
4684 void show_state_filter(unsigned long state_filter
)
4686 struct task_struct
*g
, *p
;
4688 #if BITS_PER_LONG == 32
4690 " task PC stack pid father\n");
4693 " task PC stack pid father\n");
4695 read_lock(&tasklist_lock
);
4696 do_each_thread(g
, p
) {
		/*
		 * reset the NMI-timeout, listing all files on a slow
		 * console might take a lot of time:
		 */
4701 touch_nmi_watchdog();
4702 if (!state_filter
|| (p
->state
& state_filter
))
4704 } while_each_thread(g
, p
);
4706 touch_all_softlockup_watchdogs();
4708 #ifdef CONFIG_SCHED_DEBUG
4709 sysrq_sched_debug_show();
4711 read_unlock(&tasklist_lock
);
4713 * Only show locks if all tasks are dumped:
4715 if (state_filter
== -1)
4716 debug_show_all_locks();
4719 void __cpuinit
init_idle_bootup_task(struct task_struct
*idle
)
4721 idle
->sched_class
= &idle_sched_class
;
/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */
void __cpuinit init_idle(struct task_struct *idle, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	idle->se.exec_start = sched_clock();

	idle->prio = idle->normal_prio = MAX_PRIO;
	idle->cpus_allowed = cpumask_of_cpu(cpu);
	__set_task_cpu(idle, cpu);

	spin_lock_irqsave(&rq->lock, flags);
	rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
	idle->oncpu = 1;
#endif
	spin_unlock_irqrestore(&rq->lock, flags);

	/* Set the preempt count _outside_ the spinlocks! */
#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
	task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
#else
	task_thread_info(idle)->preempt_count = 0;
#endif
	/*
	 * The idle tasks have their own, simple scheduling class:
	 */
	idle->sched_class = &idle_sched_class;
}

/*
 * In a system that switches off the HZ timer nohz_cpu_mask
 * indicates which cpus entered this state. This is used
 * in the rcu update to wait only for active cpus. For systems
 * which do not switch off the HZ timer nohz_cpu_mask should
 * always be CPU_MASK_NONE.
 */
cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
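
/*
 * Illustrative sketch (not part of this file): a dynticks implementation is
 * expected to mark and unmark a CPU roughly like this when it stops and
 * restarts the periodic tick; the function names below are made up.
 */
#if 0
static void example_enter_nohz(int cpu)
{
	cpu_set(cpu, nohz_cpu_mask);	/* tell RCU not to wait on this CPU */
}

static void example_exit_nohz(int cpu)
{
	cpu_clear(cpu, nohz_cpu_mask);	/* CPU is ticking (active) again */
}
#endif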
/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
static inline void sched_init_granularity(void)
{
	unsigned int factor = 1 + ilog2(num_online_cpus());
	const unsigned long gran_limit = 100000000;

	sysctl_sched_granularity *= factor;
	if (sysctl_sched_granularity > gran_limit)
		sysctl_sched_granularity = gran_limit;

	sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
	sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
}
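
/*
 * Worked example (illustrative): with 4 CPUs online, factor = 1 + ilog2(4) = 3.
 * Assuming a base granularity of 10000000 ns (10 ms, a value made up for the
 * example), the scaling above yields:
 *
 *   sysctl_sched_granularity        = 10 ms * 3 = 30 ms  (capped at 100 ms)
 *   sysctl_sched_runtime_limit      = 30 ms * 4 = 120 ms
 *   sysctl_sched_wakeup_granularity = 30 ms / 2 = 15 ms
 */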
#ifdef CONFIG_SMP
/*
 * This is how migration works:
 *
 * 1) we queue a struct migration_req structure in the source CPU's
 *    runqueue and wake up that CPU's migration thread.
 * 2) we down() the locked semaphore => thread blocks.
 * 3) migration thread wakes up (implicitly it forces the migrated
 *    thread off the CPU)
 * 4) it gets the migration request and checks whether the migrated
 *    task is still in the wrong runqueue.
 * 5) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 6) migration thread up()s the semaphore.
 * 7) we wake up and the migration is done.
 */

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */
int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
{
	struct migration_req req;
	unsigned long flags;
	struct rq *rq;
	int ret = 0;

	rq = task_rq_lock(p, &flags);
	if (!cpus_intersects(new_mask, cpu_online_map)) {
		ret = -EINVAL;
		goto out;
	}

	p->cpus_allowed = new_mask;
	/* Can the task run on the task's current CPU? If so, we're done */
	if (cpu_isset(task_cpu(p), new_mask))
		goto out;

	if (migrate_task(p, any_online_cpu(new_mask), &req)) {
		/* Need help from migration thread: drop lock and wait. */
		task_rq_unlock(rq, &flags);
		wake_up_process(rq->migration_thread);
		wait_for_completion(&req.done);
		tlb_migrate_finish(p->mm);
		return 0;
	}
out:
	task_rq_unlock(rq, &flags);

	return ret;
}
EXPORT_SYMBOL_GPL(set_cpus_allowed);
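
/*
 * Illustrative sketch (not part of this file): a caller can use the
 * interface above to pin a task, here a kernel thread pinning itself;
 * the thread function name is made up for the example.
 */
#if 0
static int my_worker(void *unused)
{
	/* Run only on CPU 0; fails with -EINVAL if CPU 0 is offline. */
	if (set_cpus_allowed(current, cpumask_of_cpu(0)) < 0)
		printk(KERN_WARNING "my_worker: could not pin to CPU 0\n");

	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}
#endif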

/*
 * Move (not current) task off this cpu, onto dest cpu. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 *
 * Returns non-zero if task was successfully migrated.
 */
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
	struct rq *rq_dest, *rq_src;
	int ret = 0, on_rq;

	if (unlikely(cpu_is_offline(dest_cpu)))
		return ret;

	rq_src = cpu_rq(src_cpu);
	rq_dest = cpu_rq(dest_cpu);

	double_rq_lock(rq_src, rq_dest);
	/* Already moved. */
	if (task_cpu(p) != src_cpu)
		goto out;
	/* Affinity changed (again). */
	if (!cpu_isset(dest_cpu, p->cpus_allowed))
		goto out;

	on_rq = p->se.on_rq;
	if (on_rq)
		deactivate_task(rq_src, p, 0);
	set_task_cpu(p, dest_cpu);
	if (on_rq) {
		activate_task(rq_dest, p, 0);
		check_preempt_curr(rq_dest, p);
	}
	ret = 1;
out:
	double_rq_unlock(rq_src, rq_dest);
	return ret;
}

/*
 * migration_thread - this is a highprio system thread that performs
 * thread migration by bumping thread off CPU then 'pushing' onto
 * another runqueue.
 */
static int migration_thread(void *data)
{
	int cpu = (long)data;
	struct rq *rq;

	rq = cpu_rq(cpu);
	BUG_ON(rq->migration_thread != current);

	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		struct migration_req *req;
		struct list_head *head;

		spin_lock_irq(&rq->lock);

		if (cpu_is_offline(cpu)) {
			spin_unlock_irq(&rq->lock);
			goto wait_to_die;
		}

		if (rq->active_balance) {
			active_load_balance(rq, cpu);
			rq->active_balance = 0;
		}

		head = &rq->migration_queue;

		if (list_empty(head)) {
			spin_unlock_irq(&rq->lock);
			schedule();
			set_current_state(TASK_INTERRUPTIBLE);
			continue;
		}
		req = list_entry(head->next, struct migration_req, list);
		list_del_init(head->next);

		spin_unlock(&rq->lock);
		__migrate_task(req->task, cpu, req->dest_cpu);
		local_irq_enable();

		complete(&req->done);
	}
	__set_current_state(TASK_RUNNING);
	return 0;

wait_to_die:
	/* Wait for kthread_stop */
	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		schedule();
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Figure out where task on dead CPU should go, use force if necessary.
 * NOTE: interrupts should be disabled by the caller
 */
static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
	unsigned long flags;
	cpumask_t mask;
	struct rq *rq;
	int dest_cpu;

restart:
	/* On same node? */
	mask = node_to_cpumask(cpu_to_node(dead_cpu));
	cpus_and(mask, mask, p->cpus_allowed);
	dest_cpu = any_online_cpu(mask);

	/* On any allowed CPU? */
	if (dest_cpu == NR_CPUS)
		dest_cpu = any_online_cpu(p->cpus_allowed);

	/* No more Mr. Nice Guy. */
	if (dest_cpu == NR_CPUS) {
		rq = task_rq_lock(p, &flags);
		cpus_setall(p->cpus_allowed);
		dest_cpu = any_online_cpu(p->cpus_allowed);
		task_rq_unlock(rq, &flags);

		/*
		 * Don't tell them about moving exiting tasks or
		 * kernel threads (both mm NULL), since they never
		 * leave kernel.
		 */
		if (p->mm && printk_ratelimit())
			printk(KERN_INFO "process %d (%s) no "
			       "longer affine to cpu%d\n",
			       p->pid, p->comm, dead_cpu);
	}
	if (!__migrate_task(p, dead_cpu, dest_cpu))
		goto restart;
}

/*
 * While a dead CPU has no uninterruptible tasks queued at this point,
 * it might still have a nonzero ->nr_uninterruptible counter, because
 * for performance reasons the counter is not strictly tracking tasks to
 * their home CPUs. So we just add the counter to another CPU's counter,
 * to keep the global sum constant after CPU-down:
 */
static void migrate_nr_uninterruptible(struct rq *rq_src)
{
	struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
	unsigned long flags;

	local_irq_save(flags);
	double_rq_lock(rq_src, rq_dest);
	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
	rq_src->nr_uninterruptible = 0;
	double_rq_unlock(rq_src, rq_dest);
	local_irq_restore(flags);
}

/* Run through task list and migrate tasks from the dead cpu. */
static void migrate_live_tasks(int src_cpu)
{
	struct task_struct *p, *t;

	write_lock_irq(&tasklist_lock);

	do_each_thread(t, p) {
		if (task_cpu(p) == src_cpu)
			move_task_off_dead_cpu(src_cpu, p);
	} while_each_thread(t, p);

	write_unlock_irq(&tasklist_lock);
}

/*
 * Schedules idle task to be the next runnable task on current CPU.
 * It does so by boosting its priority to highest possible and adding it to
 * the _front_ of the runqueue. Used by CPU offline code.
 */
void sched_idle_next(void)
{
	int this_cpu = smp_processor_id();
	struct rq *rq = cpu_rq(this_cpu);
	struct task_struct *p = rq->idle;
	unsigned long flags;

	/* cpu has to be offline */
	BUG_ON(cpu_online(this_cpu));

	/*
	 * Strictly not necessary since rest of the CPUs are stopped by now
	 * and interrupts disabled on the current cpu.
	 */
	spin_lock_irqsave(&rq->lock, flags);

	__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);

	/* Add idle task to the _front_ of its priority queue: */
	activate_idle_task(p, rq);

	spin_unlock_irqrestore(&rq->lock, flags);
}

/*
 * Ensures that the idle task is using init_mm right before its cpu goes
 * offline.
 */
void idle_task_exit(void)
{
	struct mm_struct *mm = current->active_mm;

	BUG_ON(cpu_online(smp_processor_id()));

	if (mm != &init_mm)
		switch_mm(mm, &init_mm, current);
	mmdrop(mm);
}

/* called under rq->lock with disabled interrupts */
static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
{
	struct rq *rq = cpu_rq(dead_cpu);

	/* Must be exiting, otherwise would be on tasklist. */
	BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);

	/* Cannot have done final schedule yet: would have vanished. */
	BUG_ON(p->state == TASK_DEAD);

	/*
	 * Drop lock around migration; if someone else moves it,
	 * that's OK. No task can be added to this CPU, so iteration is
	 * fine.
	 * NOTE: interrupts should be left disabled --dev@
	 */
	spin_unlock(&rq->lock);
	move_task_off_dead_cpu(dead_cpu, p);
	spin_lock(&rq->lock);
}

/* release_task() removes task from tasklist, so we won't find dead tasks. */
static void migrate_dead_tasks(unsigned int dead_cpu)
{
	struct rq *rq = cpu_rq(dead_cpu);
	struct task_struct *next;

	for ( ; ; ) {
		if (!rq->nr_running)
			break;
		next = pick_next_task(rq, rq->curr, rq_clock(rq));
		if (!next)
			break;
		migrate_dead(dead_cpu, next);
	}
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * migration_call - callback that gets triggered when a CPU is added.
 * Here we can start up the necessary migration thread for the new CPU.
 */
static int __cpuinit
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	struct task_struct *p;
	int cpu = (long)hcpu;
	unsigned long flags;
	struct rq *rq;

	switch (action) {
	case CPU_LOCK_ACQUIRE:
		mutex_lock(&sched_hotcpu_mutex);
		break;

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
		if (IS_ERR(p))
			return NOTIFY_BAD;
		p->flags |= PF_NOFREEZE;
		kthread_bind(p, cpu);
		/* Must be high prio: stop_machine expects to yield to it. */
		rq = task_rq_lock(p, &flags);
		__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
		task_rq_unlock(rq, &flags);
		cpu_rq(cpu)->migration_thread = p;
		break;

	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		/* Strictly unnecessary, as first user will wake it. */
		wake_up_process(cpu_rq(cpu)->migration_thread);
		break;

#ifdef CONFIG_HOTPLUG_CPU
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		if (!cpu_rq(cpu)->migration_thread)
			break;
		/* Unbind it from offline cpu so it can run. Fall thru. */
		kthread_bind(cpu_rq(cpu)->migration_thread,
			     any_online_cpu(cpu_online_map));
		kthread_stop(cpu_rq(cpu)->migration_thread);
		cpu_rq(cpu)->migration_thread = NULL;
		break;

	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		migrate_live_tasks(cpu);
		rq = cpu_rq(cpu);
		kthread_stop(rq->migration_thread);
		rq->migration_thread = NULL;
		/* Idle task back to normal (off runqueue, low prio) */
		rq = task_rq_lock(rq->idle, &flags);
		deactivate_task(rq, rq->idle, 0);
		rq->idle->static_prio = MAX_PRIO;
		__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
		rq->idle->sched_class = &idle_sched_class;
		migrate_dead_tasks(cpu);
		task_rq_unlock(rq, &flags);
		migrate_nr_uninterruptible(rq);
		BUG_ON(rq->nr_running != 0);

		/* No need to migrate the tasks: it was best-effort if
		 * they didn't take sched_hotcpu_mutex. Just wake up
		 * the requestors. */
		spin_lock_irq(&rq->lock);
		while (!list_empty(&rq->migration_queue)) {
			struct migration_req *req;

			req = list_entry(rq->migration_queue.next,
					 struct migration_req, list);
			list_del_init(&req->list);
			complete(&req->done);
		}
		spin_unlock_irq(&rq->lock);
		break;
#endif

	case CPU_LOCK_RELEASE:
		mutex_unlock(&sched_hotcpu_mutex);
		break;
	}
	return NOTIFY_OK;
}

/* Register at highest priority so that task migration (migrate_all_tasks)
 * happens before everything else.
 */
static struct notifier_block __cpuinitdata migration_notifier = {
	.notifier_call = migration_call,
};

int __init migration_init(void)
{
	void *cpu = (void *)(long)smp_processor_id();
	int err;

	/* Start one for the boot CPU: */
	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
	BUG_ON(err == NOTIFY_BAD);
	migration_call(&migration_notifier, CPU_ONLINE, cpu);
	register_cpu_notifier(&migration_notifier);

	return 0;
}

/* Number of possible processor ids */
int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);

#undef SCHED_DOMAIN_DEBUG
#ifdef SCHED_DOMAIN_DEBUG
static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
	int level = 0;

	if (!sd) {
		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
		return;
	}

	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);

	do {
		int i;
		char str[NR_CPUS];
		struct sched_group *group = sd->groups;
		cpumask_t groupmask;

		cpumask_scnprintf(str, NR_CPUS, sd->span);
		cpus_clear(groupmask);

		for (i = 0; i < level + 1; i++)
			printk(" ");
		printk("domain %d: ", level);

		if (!(sd->flags & SD_LOAD_BALANCE)) {
			printk("does not load-balance\n");
			if (sd->parent)
				printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
						" has parent");
			break;
		}

		printk("span %s\n", str);

		if (!cpu_isset(cpu, sd->span))
			printk(KERN_ERR "ERROR: domain->span does not contain "
					"CPU%d\n", cpu);
		if (!cpu_isset(cpu, group->cpumask))
			printk(KERN_ERR "ERROR: domain->groups does not contain"
					" CPU%d\n", cpu);

		for (i = 0; i < level + 2; i++)
			printk(" ");
		printk("groups:");
		do {
			if (!group) {
				printk(KERN_ERR "ERROR: group is NULL\n");
				break;
			}

			if (!group->__cpu_power) {
				printk(KERN_ERR "ERROR: domain->cpu_power not "
						"set\n");
			}

			if (!cpus_weight(group->cpumask)) {
				printk(KERN_ERR "ERROR: empty group\n");
			}

			if (cpus_intersects(groupmask, group->cpumask)) {
				printk(KERN_ERR "ERROR: repeated CPUs\n");
			}

			cpus_or(groupmask, groupmask, group->cpumask);

			cpumask_scnprintf(str, NR_CPUS, group->cpumask);
			printk(" %s", str);

			group = group->next;
		} while (group != sd->groups);
		printk("\n");

		if (!cpus_equal(sd->span, groupmask))
			printk(KERN_ERR "ERROR: groups don't span "
					"domain->span\n");

		level++;
		sd = sd->parent;
		if (!sd)
			continue;

		if (!cpus_subset(groupmask, sd->span))
			printk(KERN_ERR "ERROR: parent span is not a superset "
					"of domain->span\n");

	} while (sd);
}
#else
# define sched_domain_debug(sd, cpu) do { } while (0)
#endif

static int sd_degenerate(struct sched_domain *sd)
{
	if (cpus_weight(sd->span) == 1)
		return 1;

	/* Following flags need at least 2 groups */
	if (sd->flags & (SD_LOAD_BALANCE |
			 SD_BALANCE_NEWIDLE |
			 SD_BALANCE_FORK |
			 SD_BALANCE_EXEC |
			 SD_SHARE_CPUPOWER |
			 SD_SHARE_PKG_RESOURCES)) {
		if (sd->groups != sd->groups->next)
			return 0;
	}

	/* Following flags don't use groups */
	if (sd->flags & (SD_WAKE_IDLE |
			 SD_WAKE_AFFINE |
			 SD_WAKE_BALANCE))
		return 0;

	return 1;
}

static int
sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
{
	unsigned long cflags = sd->flags, pflags = parent->flags;

	if (sd_degenerate(parent))
		return 1;

	if (!cpus_equal(sd->span, parent->span))
		return 0;

	/* Does parent contain flags not in child? */
	/* WAKE_BALANCE is a subset of WAKE_AFFINE */
	if (cflags & SD_WAKE_AFFINE)
		pflags &= ~SD_WAKE_BALANCE;
	/* Flags needing groups don't count if only 1 group in parent */
	if (parent->groups == parent->groups->next) {
		pflags &= ~(SD_LOAD_BALANCE |
				SD_BALANCE_NEWIDLE |
				SD_BALANCE_FORK |
				SD_BALANCE_EXEC |
				SD_SHARE_CPUPOWER |
				SD_SHARE_PKG_RESOURCES);
	}
	if (~cflags & pflags)
		return 0;

	return 1;
}

/*
 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
 * hold the hotplug lock.
 */
static void cpu_attach_domain(struct sched_domain *sd, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct sched_domain *tmp;

	/* Remove the sched domains which do not contribute to scheduling. */
	for (tmp = sd; tmp; tmp = tmp->parent) {
		struct sched_domain *parent = tmp->parent;

		if (!parent)
			break;
		if (sd_parent_degenerate(tmp, parent)) {
			tmp->parent = parent->parent;
			if (parent->parent)
				parent->parent->child = tmp;
		}
	}

	if (sd && sd_degenerate(sd)) {
		sd = sd->parent;
		if (sd)
			sd->child = NULL;
	}

	sched_domain_debug(sd, cpu);

	rcu_assign_pointer(rq->sd, sd);
}

/* cpus with isolated domains */
static cpumask_t cpu_isolated_map = CPU_MASK_NONE;

/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
{
	int ints[NR_CPUS], i;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	cpus_clear(cpu_isolated_map);
	for (i = 1; i <= ints[0]; i++)
		if (ints[i] < NR_CPUS)
			cpu_set(ints[i], cpu_isolated_map);
	return 1;
}

__setup ("isolcpus=", isolated_cpu_setup);
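
/*
 * Usage example (illustrative): booting with "isolcpus=2,3" on the kernel
 * command line removes CPUs 2 and 3 from the default sched domains, so the
 * balancer never moves work there.  Tasks reach an isolated CPU only through
 * explicit affinity, e.g. from userspace (sketch, not part of this file):
 */
#if 0
#define _GNU_SOURCE
#include <sched.h>

static int pin_self_to_cpu2(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(2, &set);
	return sched_setaffinity(0, sizeof(set), &set);	/* 0 == this process */
}
#endif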

/*
 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
 * to a function which identifies what group (along with sched group) a CPU
 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
 * (due to the fact that we keep track of groups covered with a cpumask_t).
 *
 * init_sched_build_groups will build a circular linked list of the groups
 * covered by the given span, and will set each group's ->cpumask correctly,
 * and ->cpu_power to 0.
 */
static void
init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
			int (*group_fn)(int cpu, const cpumask_t *cpu_map,
					struct sched_group **sg))
{
	struct sched_group *first = NULL, *last = NULL;
	cpumask_t covered = CPU_MASK_NONE;
	int i;

	for_each_cpu_mask(i, span) {
		struct sched_group *sg;
		int group = group_fn(i, cpu_map, &sg);
		int j;

		if (cpu_isset(i, covered))
			continue;

		sg->cpumask = CPU_MASK_NONE;
		sg->__cpu_power = 0;

		for_each_cpu_mask(j, span) {
			if (group_fn(j, cpu_map, NULL) != group)
				continue;

			cpu_set(j, covered);
			cpu_set(j, sg->cpumask);
		}
		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	last->next = first;
}

#define SD_NODES_PER_DOMAIN 16

/**
 * find_next_best_node - find the next node to include in a sched_domain
 * @node: node whose sched_domain we're building
 * @used_nodes: nodes already in the sched_domain
 *
 * Find the next node to include in a given scheduling domain. Simply
 * finds the closest node not already in the @used_nodes map.
 *
 * Should use nodemask_t.
 */
static int find_next_best_node(int node, unsigned long *used_nodes)
{
	int i, n, val, min_val, best_node = 0;

	min_val = INT_MAX;

	for (i = 0; i < MAX_NUMNODES; i++) {
		/* Start at @node */
		n = (node + i) % MAX_NUMNODES;

		if (!nr_cpus_node(n))
			continue;

		/* Skip already used nodes */
		if (test_bit(n, used_nodes))
			continue;

		/* Simple min distance search */
		val = node_distance(node, n);

		if (val < min_val) {
			min_val = val;
			best_node = n;
		}
	}

	set_bit(best_node, used_nodes);
	return best_node;
}

/**
 * sched_domain_node_span - get a cpumask for a node's sched_domain
 * @node: node whose cpumask we're constructing
 * @size: number of nodes to include in this span
 *
 * Given a node, construct a good cpumask for its sched_domain to span. It
 * should be one that prevents unnecessary balancing, but also spreads tasks
 * out optimally.
 */
static cpumask_t sched_domain_node_span(int node)
{
	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
	cpumask_t span, nodemask;
	int i;

	cpus_clear(span);
	bitmap_zero(used_nodes, MAX_NUMNODES);

	nodemask = node_to_cpumask(node);
	cpus_or(span, span, nodemask);
	set_bit(node, used_nodes);

	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
		int next_node = find_next_best_node(node, used_nodes);

		nodemask = node_to_cpumask(next_node);
		cpus_or(span, span, nodemask);
	}

	return span;
}

int sched_smt_power_savings = 0, sched_mc_power_savings = 0;

/*
 * SMT sched-domains:
 */
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);

static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
			    struct sched_group **sg)
{
	if (sg)
		*sg = &per_cpu(sched_group_cpus, cpu);
	return cpu;
}
#endif

/*
 * multi-core sched-domains:
 */
#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct sched_domain, core_domains);
static DEFINE_PER_CPU(struct sched_group, sched_group_core);
#endif

#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
			     struct sched_group **sg)
{
	int group;
	cpumask_t mask = cpu_sibling_map[cpu];

	cpus_and(mask, mask, *cpu_map);
	group = first_cpu(mask);
	if (sg)
		*sg = &per_cpu(sched_group_core, group);
	return group;
}
#elif defined(CONFIG_SCHED_MC)
static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
			     struct sched_group **sg)
{
	if (sg)
		*sg = &per_cpu(sched_group_core, cpu);
	return cpu;
}
#endif

static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static DEFINE_PER_CPU(struct sched_group, sched_group_phys);

static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
			     struct sched_group **sg)
{
	int group;
#ifdef CONFIG_SCHED_MC
	cpumask_t mask = cpu_coregroup_map(cpu);
	cpus_and(mask, mask, *cpu_map);
	group = first_cpu(mask);
#elif defined(CONFIG_SCHED_SMT)
	cpumask_t mask = cpu_sibling_map[cpu];
	cpus_and(mask, mask, *cpu_map);
	group = first_cpu(mask);
#else
	group = cpu;
#endif
	if (sg)
		*sg = &per_cpu(sched_group_phys, group);
	return group;
}

#ifdef CONFIG_NUMA
/*
 * The init_sched_build_groups can't handle what we want to do with node
 * groups, so roll our own. Now each node has its own list of groups which
 * gets dynamically allocated.
 */
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];

static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);

static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
				 struct sched_group **sg)
{
	cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
	int group;

	cpus_and(nodemask, nodemask, *cpu_map);
	group = first_cpu(nodemask);

	if (sg)
		*sg = &per_cpu(sched_group_allnodes, group);
	return group;
}

static void init_numa_sched_groups_power(struct sched_group *group_head)
{
	struct sched_group *sg = group_head;
	int j;

	if (!sg)
		return;
next_sg:
	for_each_cpu_mask(j, sg->cpumask) {
		struct sched_domain *sd;

		sd = &per_cpu(phys_domains, j);
		if (j != first_cpu(sd->groups->cpumask)) {
			/*
			 * Only add "power" once for each
			 * physical package.
			 */
			continue;
		}

		sg_inc_cpu_power(sg, sd->groups->__cpu_power);
	}
	sg = sg->next;
	if (sg != group_head)
		goto next_sg;
}

/* Free memory allocated for various sched_group structures */
static void free_sched_groups(const cpumask_t *cpu_map)
{
	int cpu, i;

	for_each_cpu_mask(cpu, *cpu_map) {
		struct sched_group **sched_group_nodes
			= sched_group_nodes_bycpu[cpu];

		if (!sched_group_nodes)
			continue;

		for (i = 0; i < MAX_NUMNODES; i++) {
			cpumask_t nodemask = node_to_cpumask(i);
			struct sched_group *oldsg, *sg = sched_group_nodes[i];

			cpus_and(nodemask, nodemask, *cpu_map);
			if (cpus_empty(nodemask))
				continue;

			if (sg == NULL)
				continue;
			sg = sg->next;
next_sg:
			oldsg = sg;
			sg = sg->next;
			kfree(oldsg);
			if (oldsg != sched_group_nodes[i])
				goto next_sg;
		}
		kfree(sched_group_nodes);
		sched_group_nodes_bycpu[cpu] = NULL;
	}
}
#else
static void free_sched_groups(const cpumask_t *cpu_map)
{
}
#endif

/*
 * Initialize sched groups cpu_power.
 *
 * cpu_power indicates the capacity of sched group, which is used while
 * distributing the load between different sched groups in a sched domain.
 * Typically cpu_power for all the groups in a sched domain will be the same
 * unless there are asymmetries in the topology. If there are asymmetries,
 * the group having more cpu_power will pick up more load compared to the
 * group having less cpu_power.
 *
 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
 * the maximum number of tasks a group can handle in the presence of other idle
 * or lightly loaded groups in the same sched domain.
 */
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
	struct sched_domain *child;
	struct sched_group *group;

	WARN_ON(!sd || !sd->groups);

	if (cpu != first_cpu(sd->groups->cpumask))
		return;

	child = sd->child;

	sd->groups->__cpu_power = 0;

	/*
	 * For perf policy, if the groups in child domain share resources
	 * (for example cores sharing some portions of the cache hierarchy
	 * or SMT), then set this domain groups cpu_power such that each group
	 * can handle only one task, when there are other idle groups in the
	 * same sched domain.
	 */
	if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
		       (child->flags &
			(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
		sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
		return;
	}

	/*
	 * add cpu_power of each child group to this groups cpu_power
	 */
	group = child->groups;
	do {
		sg_inc_cpu_power(sd->groups, group->__cpu_power);
		group = group->next;
	} while (group != child->groups);
}
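
/*
 * Worked example (illustrative): on a package with two SMT siblings, the
 * sibling-level groups each get SCHED_LOAD_SCALE (the "one task per group"
 * case above).  If the package-level domain is allowed to sum its children
 * instead, its group ends up with 2 * SCHED_LOAD_SCALE and the load balancer
 * treats it as able to carry roughly twice the load of a single-CPU group.
 */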

/*
 * Build sched domains for a given set of cpus and attach the sched domains
 * to the individual cpus
 */
static int build_sched_domains(const cpumask_t *cpu_map)
{
	int i;
#ifdef CONFIG_NUMA
	struct sched_group **sched_group_nodes = NULL;
	int sd_allnodes = 0;

	/*
	 * Allocate the per-node list of sched groups
	 */
	sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
					   GFP_KERNEL);
	if (!sched_group_nodes) {
		printk(KERN_WARNING "Can not alloc sched group node list\n");
		return -ENOMEM;
	}
	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
#endif

	/*
	 * Set up domains for cpus specified by the cpu_map.
	 */
	for_each_cpu_mask(i, *cpu_map) {
		struct sched_domain *sd = NULL, *p;
		cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));

		cpus_and(nodemask, nodemask, *cpu_map);

#ifdef CONFIG_NUMA
		if (cpus_weight(*cpu_map) >
				SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
			sd = &per_cpu(allnodes_domains, i);
			*sd = SD_ALLNODES_INIT;
			sd->span = *cpu_map;
			cpu_to_allnodes_group(i, cpu_map, &sd->groups);
			sd_allnodes = 1;
		}

		p = sd;
		sd = &per_cpu(node_domains, i);
		*sd = SD_NODE_INIT;
		sd->span = sched_domain_node_span(cpu_to_node(i));
		sd->parent = p;
		cpus_and(sd->span, sd->span, *cpu_map);
#endif

		p = sd;
		sd = &per_cpu(phys_domains, i);
		*sd = SD_CPU_INIT;
		sd->span = nodemask;
		sd->parent = p;
		cpu_to_phys_group(i, cpu_map, &sd->groups);

#ifdef CONFIG_SCHED_MC
		p = sd;
		sd = &per_cpu(core_domains, i);
		*sd = SD_MC_INIT;
		sd->span = cpu_coregroup_map(i);
		cpus_and(sd->span, sd->span, *cpu_map);
		sd->parent = p;
		cpu_to_core_group(i, cpu_map, &sd->groups);
#endif

#ifdef CONFIG_SCHED_SMT
		p = sd;
		sd = &per_cpu(cpu_domains, i);
		*sd = SD_SIBLING_INIT;
		sd->span = cpu_sibling_map[i];
		cpus_and(sd->span, sd->span, *cpu_map);
		sd->parent = p;
		cpu_to_cpu_group(i, cpu_map, &sd->groups);
#endif
	}

#ifdef CONFIG_SCHED_SMT
	/* Set up CPU (sibling) groups */
	for_each_cpu_mask(i, *cpu_map) {
		cpumask_t this_sibling_map = cpu_sibling_map[i];
		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
		if (i != first_cpu(this_sibling_map))
			continue;

		init_sched_build_groups(this_sibling_map, cpu_map,
					&cpu_to_cpu_group);
	}
#endif

#ifdef CONFIG_SCHED_MC
	/* Set up multi-core groups */
	for_each_cpu_mask(i, *cpu_map) {
		cpumask_t this_core_map = cpu_coregroup_map(i);
		cpus_and(this_core_map, this_core_map, *cpu_map);
		if (i != first_cpu(this_core_map))
			continue;
		init_sched_build_groups(this_core_map, cpu_map,
					&cpu_to_core_group);
	}
#endif

	/* Set up physical groups */
	for (i = 0; i < MAX_NUMNODES; i++) {
		cpumask_t nodemask = node_to_cpumask(i);

		cpus_and(nodemask, nodemask, *cpu_map);
		if (cpus_empty(nodemask))
			continue;

		init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
	}

#ifdef CONFIG_NUMA
	/* Set up node groups */
	if (sd_allnodes)
		init_sched_build_groups(*cpu_map, cpu_map,
					&cpu_to_allnodes_group);

	for (i = 0; i < MAX_NUMNODES; i++) {
		/* Set up node groups */
		struct sched_group *sg, *prev;
		cpumask_t nodemask = node_to_cpumask(i);
		cpumask_t domainspan;
		cpumask_t covered = CPU_MASK_NONE;
		int j;

		cpus_and(nodemask, nodemask, *cpu_map);
		if (cpus_empty(nodemask)) {
			sched_group_nodes[i] = NULL;
			continue;
		}

		domainspan = sched_domain_node_span(i);
		cpus_and(domainspan, domainspan, *cpu_map);

		sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
		if (!sg) {
			printk(KERN_WARNING "Can not alloc domain group for "
				"node %d\n", i);
			goto error;
		}
		sched_group_nodes[i] = sg;
		for_each_cpu_mask(j, nodemask) {
			struct sched_domain *sd;

			sd = &per_cpu(node_domains, j);
			sd->groups = sg;
		}
		sg->__cpu_power = 0;
		sg->cpumask = nodemask;
		sg->next = sg;
		cpus_or(covered, covered, nodemask);
		prev = sg;

		for (j = 0; j < MAX_NUMNODES; j++) {
			cpumask_t tmp, notcovered;
			int n = (i + j) % MAX_NUMNODES;

			cpus_complement(notcovered, covered);
			cpus_and(tmp, notcovered, *cpu_map);
			cpus_and(tmp, tmp, domainspan);
			if (cpus_empty(tmp))
				break;

			nodemask = node_to_cpumask(n);
			cpus_and(tmp, tmp, nodemask);
			if (cpus_empty(tmp))
				continue;

			sg = kmalloc_node(sizeof(struct sched_group),
					  GFP_KERNEL, i);
			if (!sg) {
				printk(KERN_WARNING
				"Can not alloc domain group for node %d\n", j);
				goto error;
			}
			sg->__cpu_power = 0;
			sg->cpumask = tmp;
			sg->next = prev->next;
			cpus_or(covered, covered, tmp);
			prev->next = sg;
			prev = sg;
		}
	}
#endif

	/* Calculate CPU power for physical packages and nodes */
#ifdef CONFIG_SCHED_SMT
	for_each_cpu_mask(i, *cpu_map) {
		struct sched_domain *sd = &per_cpu(cpu_domains, i);

		init_sched_groups_power(i, sd);
	}
#endif
#ifdef CONFIG_SCHED_MC
	for_each_cpu_mask(i, *cpu_map) {
		struct sched_domain *sd = &per_cpu(core_domains, i);

		init_sched_groups_power(i, sd);
	}
#endif

	for_each_cpu_mask(i, *cpu_map) {
		struct sched_domain *sd = &per_cpu(phys_domains, i);

		init_sched_groups_power(i, sd);
	}

#ifdef CONFIG_NUMA
	for (i = 0; i < MAX_NUMNODES; i++)
		init_numa_sched_groups_power(sched_group_nodes[i]);

	if (sd_allnodes) {
		struct sched_group *sg;

		cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
		init_numa_sched_groups_power(sg);
	}
#endif

	/* Attach the domains */
	for_each_cpu_mask(i, *cpu_map) {
		struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
		sd = &per_cpu(cpu_domains, i);
#elif defined(CONFIG_SCHED_MC)
		sd = &per_cpu(core_domains, i);
#else
		sd = &per_cpu(phys_domains, i);
#endif
		cpu_attach_domain(sd, i);
	}

	return 0;

#ifdef CONFIG_NUMA
error:
	free_sched_groups(cpu_map);
	return -ENOMEM;
#endif
}

/*
 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
 */
static int arch_init_sched_domains(const cpumask_t *cpu_map)
{
	cpumask_t cpu_default_map;
	int err;

	/*
	 * Setup mask for cpus without special case scheduling requirements.
	 * For now this just excludes isolated cpus, but could be used to
	 * exclude other special cases in the future.
	 */
	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);

	err = build_sched_domains(&cpu_default_map);

	return err;
}

static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
	free_sched_groups(cpu_map);
}

/*
 * Detach sched domains from a group of cpus specified in cpu_map
 * These cpus will now be attached to the NULL domain
 */
static void detach_destroy_domains(const cpumask_t *cpu_map)
{
	int i;

	for_each_cpu_mask(i, *cpu_map)
		cpu_attach_domain(NULL, i);
	synchronize_sched();
	arch_destroy_sched_domains(cpu_map);
}

/*
 * Partition sched domains as specified by the cpumasks below.
 * This attaches all cpus from the cpumasks to the NULL domain,
 * waits for an RCU quiescent period, recalculates sched
 * domain information and then attaches them back to the
 * correct sched domains
 * Call with hotplug lock held
 */
int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
{
	cpumask_t change_map;
	int err = 0;

	cpus_and(*partition1, *partition1, cpu_online_map);
	cpus_and(*partition2, *partition2, cpu_online_map);
	cpus_or(change_map, *partition1, *partition2);

	/* Detach sched domains from all of the affected cpus */
	detach_destroy_domains(&change_map);
	if (!cpus_empty(*partition1))
		err = build_sched_domains(partition1);
	if (!err && !cpus_empty(*partition2))
		err = build_sched_domains(partition2);

	return err;
}
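
/*
 * Illustrative sketch (not part of this file): a caller such as the cpuset
 * code could rebuild domains for two disjoint partitions like this, with the
 * hotplug lock held; the masks are made up for the example.
 */
#if 0
	cpumask_t part1 = cpumask_of_cpu(0);
	cpumask_t part2 = cpumask_of_cpu(1);

	partition_sched_domains(&part1, &part2);
#endif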

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
int arch_reinit_sched_domains(void)
{
	int err;

	mutex_lock(&sched_hotcpu_mutex);
	detach_destroy_domains(&cpu_online_map);
	err = arch_init_sched_domains(&cpu_online_map);
	mutex_unlock(&sched_hotcpu_mutex);

	return err;
}

static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
{
	int ret;

	if (buf[0] != '0' && buf[0] != '1')
		return -EINVAL;

	if (smt)
		sched_smt_power_savings = (buf[0] == '1');
	else
		sched_mc_power_savings = (buf[0] == '1');

	ret = arch_reinit_sched_domains();

	return ret ? ret : count;
}

int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
{
	int err = 0;

#ifdef CONFIG_SCHED_SMT
	if (smt_capable())
		err = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_smt_power_savings.attr);
#endif
#ifdef CONFIG_SCHED_MC
	if (!err && mc_capable())
		err = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_mc_power_savings.attr);
#endif
	return err;
}

#ifdef CONFIG_SCHED_MC
static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
{
	return sprintf(page, "%u\n", sched_mc_power_savings);
}
static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
					    const char *buf, size_t count)
{
	return sched_power_savings_store(buf, count, 0);
}
SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
	    sched_mc_power_savings_store);
#endif

#ifdef CONFIG_SCHED_SMT
static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
{
	return sprintf(page, "%u\n", sched_smt_power_savings);
}
static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
					     const char *buf, size_t count)
{
	return sched_power_savings_store(buf, count, 1);
}
SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
	    sched_smt_power_savings_store);
#endif
#endif
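
/*
 * Illustrative userspace sketch (not part of this file): the attributes above
 * normally appear under /sys/devices/system/cpu/; writing '1' switches the
 * domains to the power-saving balancing policy via arch_reinit_sched_domains().
 */
#if 0
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/devices/system/cpu/sched_mc_power_savings", "w");

	if (f) {
		fputs("1", f);
		fclose(f);
	}
	return 0;
}
#endif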

/*
 * Force a reinitialization of the sched domains hierarchy. The domains
 * and groups cannot be updated in place without racing with the balancing
 * code, so we temporarily attach all running cpus to the NULL domain
 * which will prevent rebalancing while the sched domains are recalculated.
 */
static int update_sched_domains(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		detach_destroy_domains(&cpu_online_map);
		return NOTIFY_OK;

	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		/*
		 * Fall through and re-initialise the domains.
		 */
		break;
	default:
		return NOTIFY_DONE;
	}

	/* The hotplug lock is already held by cpu_up/cpu_down */
	arch_init_sched_domains(&cpu_online_map);

	return NOTIFY_OK;
}

void __init sched_init_smp(void)
{
	cpumask_t non_isolated_cpus;

	mutex_lock(&sched_hotcpu_mutex);
	arch_init_sched_domains(&cpu_online_map);
	cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
	if (cpus_empty(non_isolated_cpus))
		cpu_set(smp_processor_id(), non_isolated_cpus);
	mutex_unlock(&sched_hotcpu_mutex);
	/* XXX: Theoretical race here - CPU may be hotplugged now */
	hotcpu_notifier(update_sched_domains, 0);

	/* Move init over to a non-isolated CPU */
	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
		BUG();
	sched_init_granularity();
}
#else
void __init sched_init_smp(void)
{
	sched_init_granularity();
}
#endif /* CONFIG_SMP */

int in_sched_functions(unsigned long addr)
{
	/* Linker adds these: start and end of __sched functions */
	extern char __sched_text_start[], __sched_text_end[];

	return in_lock_functions(addr) ||
		(addr >= (unsigned long)__sched_text_start
		&& addr < (unsigned long)__sched_text_end);
}

static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
{
	cfs_rq->tasks_timeline = RB_ROOT;
	cfs_rq->fair_clock = 1;
#ifdef CONFIG_FAIR_GROUP_SCHED
	cfs_rq->rq = rq;
#endif
}

void __init sched_init(void)
{
	u64 now = sched_clock();
	int highest_cpu = 0;
	int i, j;

	/*
	 * Link up the scheduling class hierarchy:
	 */
	rt_sched_class.next = &fair_sched_class;
	fair_sched_class.next = &idle_sched_class;
	idle_sched_class.next = NULL;

	for_each_possible_cpu(i) {
		struct rt_prio_array *array;
		struct rq *rq;

		rq = cpu_rq(i);
		spin_lock_init(&rq->lock);
		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
		init_cfs_rq(&rq->cfs, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
		list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
#endif
		rq->ls.load_update_last = now;
		rq->ls.load_update_start = now;

		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
			rq->cpu_load[j] = 0;
#ifdef CONFIG_SMP
		rq->active_balance = 0;
		rq->next_balance = jiffies;
		rq->migration_thread = NULL;
		INIT_LIST_HEAD(&rq->migration_queue);
#endif
		atomic_set(&rq->nr_iowait, 0);

		array = &rq->rt.active;
		for (j = 0; j < MAX_RT_PRIO; j++) {
			INIT_LIST_HEAD(array->queue + j);
			__clear_bit(j, array->bitmap);
		}
		highest_cpu = i;
		/* delimiter for bitsearch: */
		__set_bit(MAX_RT_PRIO, array->bitmap);
	}

	set_load_weight(&init_task);

#ifdef CONFIG_SMP
	nr_cpu_ids = highest_cpu + 1;
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
#endif

#ifdef CONFIG_RT_MUTEXES
	plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
#endif

	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
	atomic_inc(&init_mm.mm_count);
	enter_lazy_tlb(&init_mm, current);

	/*
	 * Make us the idle thread. Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */
	init_idle(current, smp_processor_id());
	/*
	 * During early bootup we pretend to be a normal task:
	 */
	current->sched_class = &fair_sched_class;
}

#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
void __might_sleep(char *file, int line)
{
	static unsigned long prev_jiffy;	/* ratelimiting */

	if ((in_atomic() || irqs_disabled()) &&
	    system_state == SYSTEM_RUNNING && !oops_in_progress) {
		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
			return;
		prev_jiffy = jiffies;
		printk(KERN_ERR "BUG: sleeping function called from invalid"
				" context at %s:%d\n", file, line);
		printk("in_atomic():%d, irqs_disabled():%d\n",
			in_atomic(), irqs_disabled());
		debug_show_held_locks(current);
		if (irqs_disabled())
			print_irqtrace_events(current);
		dump_stack();
	}
}
EXPORT_SYMBOL(__might_sleep);
#endif
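
/*
 * Illustrative sketch (not part of this file): the check above fires when a
 * function that may sleep is called from atomic context, e.g. under a
 * spinlock with CONFIG_PREEMPT (so spin_lock() raises the preempt count):
 */
#if 0
	static DEFINE_MUTEX(m);
	static DEFINE_SPINLOCK(s);

	spin_lock(&s);
	mutex_lock(&m);		/* may sleep while atomic: __might_sleep() complains */
	mutex_unlock(&m);
	spin_unlock(&s);
#endif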

#ifdef CONFIG_MAGIC_SYSRQ
void normalize_rt_tasks(void)
{
	struct task_struct *g, *p;
	unsigned long flags;
	struct rq *rq;
	int on_rq;

	read_lock_irq(&tasklist_lock);
	do_each_thread(g, p) {
		p->se.wait_runtime = 0;
		p->se.wait_start_fair = 0;
		p->se.wait_start = 0;
		p->se.exec_start = 0;
		p->se.sleep_start = 0;
		p->se.sleep_start_fair = 0;
		p->se.block_start = 0;
		task_rq(p)->cfs.fair_clock = 0;
		task_rq(p)->clock = 0;

		if (!rt_task(p)) {
			/*
			 * Renice negative nice level userspace
			 * tasks back to 0:
			 */
			if (TASK_NICE(p) < 0 && p->mm)
				set_user_nice(p, 0);
			continue;
		}

		spin_lock_irqsave(&p->pi_lock, flags);
		rq = __task_rq_lock(p);

		/*
		 * Do not touch the migration thread:
		 */
		if (p == rq->migration_thread)
			goto out_unlock;

		on_rq = p->se.on_rq;
		if (on_rq)
			deactivate_task(task_rq(p), p, 0);
		__setscheduler(rq, p, SCHED_NORMAL, 0);
		if (on_rq) {
			activate_task(task_rq(p), p, 0);
			resched_task(rq->curr);
		}

out_unlock:
		__task_rq_unlock(rq);
		spin_unlock_irqrestore(&p->pi_lock, flags);
	} while_each_thread(g, p);

	read_unlock_irq(&tasklist_lock);
}

#endif /* CONFIG_MAGIC_SYSRQ */

/*
 * These functions are only useful for the IA64 MCA handling.
 *
 * They can only be called when the whole system has been
 * stopped - every CPU needs to be quiescent, and no scheduling
 * activity can take place. Using them for anything else would
 * be a serious bug, and as a result, they aren't even visible
 * under any other configuration.
 */

/**
 * curr_task - return the current task for a given cpu.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
struct task_struct *curr_task(int cpu)
{
	return cpu_curr(cpu);
}

/**
 * set_curr_task - set the current task for a given cpu.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack. It allows the architecture to switch the
 * notion of the current task on a cpu in a non-blocking manner. This function
 * must be called with all CPUs synchronized and interrupts disabled; the
 * caller must save the original value of the current task (see
 * curr_task() above) and restore that value before reenabling interrupts and
 * re-starting the system.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
void set_curr_task(int cpu, struct task_struct *p)