/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */

#ifdef CONFIG_SMP
static cpumask_t rt_overload_mask;
static atomic_t rto_count;

static inline int rt_overloaded(void)
{
        return atomic_read(&rto_count);
}

static inline cpumask_t *rt_overload(void)
{
        return &rt_overload_mask;
}

static inline void rt_set_overload(struct rq *rq)
{
        cpu_set(rq->cpu, rt_overload_mask);
        /*
         * Make sure the mask is visible before we set
         * the overload count. That is checked to determine
         * if we should look at the mask. It would be a shame
         * if we looked at the mask, but the mask was not
         * updated yet.
         */
        wmb();
        atomic_inc(&rto_count);
}

static inline void rt_clear_overload(struct rq *rq)
{
        /* the order here really doesn't matter */
        atomic_dec(&rto_count);
        cpu_clear(rq->cpu, rt_overload_mask);
}
#endif /* CONFIG_SMP */
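rt_set_overload() publishes in two steps: update the overload mask first, then, after a write barrier, bump the counter that readers use as a cheap "anything to look at?" test. The sketch below shows the same publish/consume ordering idiom in plain C11 atomics; the names and the stdatomic primitives are stand-ins for the kernel's rt_overload_mask, rto_count and wmb(), not the kernel API itself.

#include <stdatomic.h>

/* Hypothetical stand-ins for rt_overload_mask / rto_count. */
static atomic_ulong overload_mask;
static atomic_int  overload_count;

static void publish_overload(int cpu)
{
        /* 1. update the mask */
        atomic_fetch_or_explicit(&overload_mask, 1UL << cpu, memory_order_relaxed);
        /* 2. make the mask visible before the count (the wmb() above) */
        atomic_thread_fence(memory_order_release);
        /* 3. only then publish the count that readers check first */
        atomic_fetch_add_explicit(&overload_count, 1, memory_order_relaxed);
}

static int consume_overload(unsigned long *mask)
{
        if (!atomic_load_explicit(&overload_count, memory_order_relaxed))
                return 0;               /* cheap early-out, nothing to scan */
        atomic_thread_fence(memory_order_acquire);  /* pairs with the release fence */
        *mask = atomic_load_explicit(&overload_mask, memory_order_relaxed);
        return 1;
}

pull_rt_task() further down relies on exactly this guarantee when it tests rt_overloaded() before walking the mask returned by rt_overload().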
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
        struct task_struct *curr = rq->curr;
        u64 delta_exec;

        if (!task_has_rt_policy(curr))
                return;

        delta_exec = rq->clock - curr->se.exec_start;
        if (unlikely((s64)delta_exec < 0))
                delta_exec = 0;

        schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));

        curr->se.sum_exec_runtime += delta_exec;
        curr->se.exec_start = rq->clock;
        cpuacct_charge(curr, delta_exec);
}
static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
{
        WARN_ON(!rt_task(p));
        rq->rt.rt_nr_running++;
#ifdef CONFIG_SMP
        if (p->prio < rq->rt.highest_prio)
                rq->rt.highest_prio = p->prio;
        if (rq->rt.rt_nr_running > 1)
                rt_set_overload(rq);
#endif /* CONFIG_SMP */
}
static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
{
        WARN_ON(!rt_task(p));
        WARN_ON(!rq->rt.rt_nr_running);
        rq->rt.rt_nr_running--;
#ifdef CONFIG_SMP
        if (rq->rt.rt_nr_running) {
                struct rt_prio_array *array;

                WARN_ON(p->prio < rq->rt.highest_prio);
                if (p->prio == rq->rt.highest_prio) {
                        /* recalculate */
                        array = &rq->rt.active;
                        rq->rt.highest_prio =
                                sched_find_first_bit(array->bitmap);
                } /* otherwise leave rq->rt.highest_prio alone */
        } else
                rq->rt.highest_prio = MAX_RT_PRIO;
        if (rq->rt.rt_nr_running < 2)
                rt_clear_overload(rq);
#endif /* CONFIG_SMP */
}
static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
{
        struct rt_prio_array *array = &rq->rt.active;

        list_add_tail(&p->run_list, array->queue + p->prio);
        __set_bit(p->prio, array->bitmap);
        inc_cpu_load(rq, p->se.load.weight);

        inc_rt_tasks(p, rq);
}
/*
 * Adding/removing a task to/from a priority array:
 */
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
{
        struct rt_prio_array *array = &rq->rt.active;

        update_curr_rt(rq);

        list_del(&p->run_list);
        if (list_empty(array->queue + p->prio))
                __clear_bit(p->prio, array->bitmap);
        dec_cpu_load(rq, p->se.load.weight);

        dec_rt_tasks(p, rq);
}
/*
 * Put task to the end of the run list without the overhead of dequeue
 * followed by enqueue.
 */
static void requeue_task_rt(struct rq *rq, struct task_struct *p)
{
        struct rt_prio_array *array = &rq->rt.active;

        list_move_tail(&p->run_list, array->queue + p->prio);
}
static void
yield_task_rt(struct rq *rq)
{
        requeue_task_rt(rq, rq->curr);
}
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
{
        if (p->prio < rq->curr->prio)
                resched_task(rq->curr);
}
static struct task_struct *pick_next_task_rt(struct rq *rq)
{
        struct rt_prio_array *array = &rq->rt.active;
        struct task_struct *next;
        struct list_head *queue;
        int idx;

        idx = sched_find_first_bit(array->bitmap);
        if (idx >= MAX_RT_PRIO)
                return NULL;

        queue = array->queue + idx;
        next = list_entry(queue->next, struct task_struct, run_list);

        next->se.exec_start = rq->clock;

        return next;
}
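pick_next_task_rt() is O(1): the run queue keeps one FIFO list per priority level plus a bitmap of non-empty levels, so picking the next task is a find-first-bit followed by taking the head of that list. Below is a stripped-down, self-contained sketch of that structure; the names (prio_array, pick_first, __builtin_ctzl) are illustrative stand-ins for the kernel's rt_prio_array, sched_find_first_bit() and list_entry(), not the kernel code.

#include <stdio.h>

#define NPRIO 100                       /* prio 0 is highest, as in the kernel */
#define BITS  (sizeof(unsigned long) * 8)

struct task { int pid; int prio; struct task *next; };

struct prio_array {
        unsigned long bitmap[(NPRIO + BITS - 1) / BITS]; /* bit set => level non-empty */
        struct task *head[NPRIO], *tail[NPRIO];          /* one FIFO per priority     */
};

static void enqueue(struct prio_array *a, struct task *t)
{
        t->next = NULL;
        if (a->tail[t->prio])
                a->tail[t->prio]->next = t;
        else
                a->head[t->prio] = t;
        a->tail[t->prio] = t;
        a->bitmap[t->prio / BITS] |= 1UL << (t->prio % BITS);
}

/* Equivalent of sched_find_first_bit() + list_entry(queue->next, ...). */
static struct task *pick_first(struct prio_array *a)
{
        int w, b;

        for (w = 0; w < (int)((NPRIO + BITS - 1) / BITS); w++) {
                if (!a->bitmap[w])
                        continue;
                b = __builtin_ctzl(a->bitmap[w]);       /* lowest set bit = highest prio */
                return a->head[w * (int)BITS + b];
        }
        return NULL;
}

int main(void)
{
        static struct prio_array a;
        struct task t1 = { .pid = 1, .prio = 40 }, t2 = { .pid = 2, .prio = 5 };

        enqueue(&a, &t1);
        enqueue(&a, &t2);
        printf("next pid = %d\n", pick_first(&a)->pid);  /* prints 2: prio 5 beats 40 */
        return 0;
}

In the kernel, the same bitmap is what dec_rt_tasks() rescans to recompute highest_prio when the top-priority task leaves the queue.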
static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
        update_curr_rt(rq);
        p->se.exec_start = 0;
}
#ifdef CONFIG_SMP

/* Only try algorithms three times */
#define RT_MAX_TRIES 3

static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);

static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
        if (!task_running(rq, p) &&
            (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)))
                return 1;
        return 0;
}
/* Return the second highest RT task, NULL otherwise */
static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
                                                      int cpu)
{
        struct rt_prio_array *array = &rq->rt.active;
        struct task_struct *next;
        struct list_head *queue;
        int idx;

        assert_spin_locked(&rq->lock);

        if (likely(rq->rt.rt_nr_running < 2))
                return NULL;

        idx = sched_find_first_bit(array->bitmap);
        if (unlikely(idx >= MAX_RT_PRIO)) {
                WARN_ON(1); /* rt_nr_running is bad */
                return NULL;
        }

        queue = array->queue + idx;
        BUG_ON(list_empty(queue));

        next = list_entry(queue->next, struct task_struct, run_list);
        if (unlikely(pick_rt_task(rq, next, cpu)))
                goto out;

        if (queue->next->next != queue) {
                /* same prio task */
                next = list_entry(queue->next->next, struct task_struct, run_list);
                if (pick_rt_task(rq, next, cpu))
                        goto out;
        }

 retry:
        /* slower, but more flexible */
        idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
        if (unlikely(idx >= MAX_RT_PRIO))
                return NULL;

        queue = array->queue + idx;
        BUG_ON(list_empty(queue));

        list_for_each_entry(next, queue, run_list) {
                if (pick_rt_task(rq, next, cpu))
                        goto out;
        }

        goto retry;

 out:
        return next;
}
static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);

/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task,
                                      struct rq *this_rq)
{
        struct rq *lowest_rq = NULL;
        int cpu;
        int tries;
        cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);

        cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);

        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
                /*
                 * Scan each rq for the lowest prio.
                 */
                for_each_cpu_mask(cpu, *cpu_mask) {
                        struct rq *rq = &per_cpu(runqueues, cpu);

                        if (cpu == this_rq->cpu)
                                continue;

                        /* We look for lowest RT prio or non-rt CPU */
                        if (rq->rt.highest_prio >= MAX_RT_PRIO) {
                                lowest_rq = rq;
                                break;
                        }

                        /* no locking for now */
                        if (rq->rt.highest_prio > task->prio &&
                            (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
                                lowest_rq = rq;
                        }
                }

                if (!lowest_rq)
                        break;

                /* if the prio of this runqueue changed, try again */
                if (double_lock_balance(this_rq, lowest_rq)) {
                        /*
                         * We had to unlock the run queue. In
                         * the meantime, the task could have
                         * migrated already or had its affinity changed.
                         * Also make sure that it wasn't scheduled on its rq.
                         */
                        if (unlikely(task_rq(task) != this_rq ||
                                     !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
                                     task_running(this_rq, task) ||
                                     !task->se.on_rq)) {
                                spin_unlock(&lowest_rq->lock);
                                lowest_rq = NULL;
                                break;
                        }
                }

                /* If this rq is still suitable use it. */
                if (lowest_rq->rt.highest_prio > task->prio)
                        break;

                /* try again */
                spin_unlock(&lowest_rq->lock);
                lowest_rq = NULL;
        }

        return lowest_rq;
}
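The scan in find_lock_lowest_rq() encodes a simple target-selection rule: prefer a CPU that currently runs no RT task at all (highest_prio >= MAX_RT_PRIO), otherwise take the CPU whose highest queued RT priority is the numerically largest (weakest) one that is still below the pushed task's priority. A self-contained sketch of just that decision, with hypothetical names and plain arrays standing in for runqueues:

#include <stdio.h>

#define MAX_RT_PRIO 100   /* >= MAX_RT_PRIO means "no RT task queued there" */

/*
 * Return the index of the best push target for a task of priority
 * task_prio, or -1 if every other CPU is already running something at
 * least as urgent. Lower numbers mean higher priority, as in the kernel.
 */
static int pick_lowest_cpu(const int highest_prio[], int ncpus,
                           int this_cpu, int task_prio)
{
        int cpu, best = -1;

        for (cpu = 0; cpu < ncpus; cpu++) {
                if (cpu == this_cpu)
                        continue;
                if (highest_prio[cpu] >= MAX_RT_PRIO)
                        return cpu;                     /* no RT work there: take it */
                if (highest_prio[cpu] > task_prio &&
                    (best < 0 || highest_prio[cpu] > highest_prio[best]))
                        best = cpu;                     /* weakest RT load so far    */
        }
        return best;
}

int main(void)
{
        int prio[4] = { 10, 30, 55, 20 };               /* per-CPU highest RT prio */

        /* Pushing a prio-25 task from CPU 0: CPU 2 (prio 55) is the weakest. */
        printf("target cpu = %d\n", pick_lowest_cpu(prio, 4, 0, 25));
        return 0;
}

The real function then locks the chosen runqueue with double_lock_balance() and re-validates the snapshot, retrying up to RT_MAX_TRIES times because the priorities may have changed while the locks were juggled.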
/*
 * If the current CPU has more than one RT task, see if the non
 * running task can migrate over to a CPU that is running a task
 * of lesser priority.
 */
static int push_rt_task(struct rq *this_rq)
{
        struct task_struct *next_task;
        struct rq *lowest_rq;
        int ret = 0;
        int paranoid = RT_MAX_TRIES;

        assert_spin_locked(&this_rq->lock);

        next_task = pick_next_highest_task_rt(this_rq, -1);
        if (!next_task)
                return 0;

 retry:
        if (unlikely(next_task == this_rq->curr)) {
                WARN_ON(1);
                return 0;
        }

        /*
         * It's possible that the next_task slipped in with a
         * higher priority than current. If that's the case
         * just reschedule current.
         */
        if (unlikely(next_task->prio < this_rq->curr->prio)) {
                resched_task(this_rq->curr);
                return 0;
        }

        /* We might release this_rq lock */
        get_task_struct(next_task);

        /* find_lock_lowest_rq locks the rq if found */
        lowest_rq = find_lock_lowest_rq(next_task, this_rq);
        if (!lowest_rq) {
                struct task_struct *task;
                /*
                 * find_lock_lowest_rq releases this_rq->lock
                 * so it is possible that next_task has changed.
                 * If it has, then try again.
                 */
                task = pick_next_highest_task_rt(this_rq, -1);
                if (unlikely(task != next_task) && task && paranoid--) {
                        put_task_struct(next_task);
                        next_task = task;
                        goto retry;
                }
                goto out;
        }

        assert_spin_locked(&lowest_rq->lock);

        deactivate_task(this_rq, next_task, 0);
        set_task_cpu(next_task, lowest_rq->cpu);
        activate_task(lowest_rq, next_task, 0);

        resched_task(lowest_rq->curr);

        spin_unlock(&lowest_rq->lock);

        ret = 1;
 out:
        put_task_struct(next_task);

        return ret;
}
/*
 * TODO: Currently we just use the second highest prio task on
 *       the queue, and stop when it can't migrate (or there's
 *       no more RT tasks). There may be a case where a lower
 *       priority RT task has a different affinity than the
 *       higher RT task. In this case the lower RT task could
 *       possibly be able to migrate whereas the higher priority
 *       RT task could not. We currently ignore this issue.
 *       Enhancements are welcome!
 */
static void push_rt_tasks(struct rq *rq)
{
        /* push_rt_task will return true if it moved an RT */
        while (push_rt_task(rq))
                ;
}
static int pull_rt_task(struct rq *this_rq)
{
        struct task_struct *next;
        struct task_struct *p;
        struct rq *src_rq;
        cpumask_t *rto_cpumask;
        int this_cpu = this_rq->cpu;
        int cpu;
        int ret = 0;

        assert_spin_locked(&this_rq->lock);

        /*
         * If cpusets are used, and we have overlapping
         * run queue cpusets, then this algorithm may not catch all.
         * This is just the price you pay on trying to keep
         * dirtying caches down on large SMP machines.
         */
        if (likely(!rt_overloaded()))
                return 0;

        next = pick_next_task_rt(this_rq);

        rto_cpumask = rt_overload();

        for_each_cpu_mask(cpu, *rto_cpumask) {
                if (this_cpu == cpu)
                        continue;

                src_rq = cpu_rq(cpu);
                if (unlikely(src_rq->rt.rt_nr_running <= 1)) {
                        /*
                         * It is possible that overlapping cpusets
                         * will miss clearing a non-overloaded runqueue.
                         * But we would have to lock the runqueue to check.
                         */
                        if (double_lock_balance(this_rq, src_rq)) {
                                /* unlocked our runqueue lock */
                                struct task_struct *old_next = next;
                                next = pick_next_task_rt(this_rq);
                                if (next != old_next)
                                        ret = 1;
                        }
                        if (likely(src_rq->rt.rt_nr_running <= 1))
                                /*
                                 * Small chance that this_rq->curr changed
                                 * but it's really harmless here.
                                 */
                                rt_clear_overload(this_rq);
                        else
                                /*
                                 * Heh, the src_rq is now overloaded, since
                                 * we already have the src_rq lock, go straight
                                 * to pulling tasks from it.
                                 */
                                goto try_pulling;
                        spin_unlock(&src_rq->lock);
                        continue;
                }

                /*
                 * We can potentially drop this_rq's lock in
                 * double_lock_balance, and another CPU could
                 * steal our next task - hence we must cause
                 * the caller to recalculate the next task
                 * in that case:
                 */
                if (double_lock_balance(this_rq, src_rq)) {
                        struct task_struct *old_next = next;
                        next = pick_next_task_rt(this_rq);
                        if (next != old_next)
                                ret = 1;
                }

                /*
                 * Are there still pullable RT tasks?
                 */
                if (src_rq->rt.rt_nr_running <= 1) {
                        spin_unlock(&src_rq->lock);
                        continue;
                }

 try_pulling:
                p = pick_next_highest_task_rt(src_rq, this_cpu);

                /*
                 * Do we have an RT task that preempts
                 * the to-be-scheduled task?
                 */
                if (p && (!next || (p->prio < next->prio))) {
                        WARN_ON(p == src_rq->curr);
                        WARN_ON(!p->se.on_rq);

                        /*
                         * There's a chance that p is higher in priority
                         * than what's currently running on its cpu.
                         * This is just that p is waking up and hasn't
                         * had a chance to schedule. We only pull
                         * p if it is lower in priority than the
                         * current task on the run queue or
                         * this_rq's next task is lower in prio than
                         * the current task on that rq.
                         */
                        if (p->prio < src_rq->curr->prio ||
                            (next && next->prio < src_rq->curr->prio))
                                goto skip;

                        ret = 1;

                        deactivate_task(src_rq, p, 0);
                        set_task_cpu(p, this_cpu);
                        activate_task(this_rq, p, 0);

                        /*
                         * We continue with the search, just in
                         * case there's an even higher prio task
                         * in another runqueue. (low likelihood
                         * but possible)
                         */

                        /*
                         * Update next so that we won't pick a task
                         * on another cpu with a priority lower (or equal)
                         * than the one we just picked.
                         */
                        next = p;
                }
 skip:
                spin_unlock(&src_rq->lock);
        }

        return ret;
}
static void schedule_balance_rt(struct rq *rq,
                                struct task_struct *prev)
{
        /* Try to pull RT tasks here if we lower this rq's prio */
        if (unlikely(rt_task(prev)) &&
            rq->rt.highest_prio > prev->prio)
                pull_rt_task(rq);
}

static void schedule_tail_balance_rt(struct rq *rq)
{
        /*
         * If we have more than one rt_task queued, then
         * see if we can push the other rt_tasks off to other CPUs.
         * Note we may release the rq lock, and since
         * the lock was owned by prev, we need to release it
         * first via finish_lock_switch and then reacquire it here.
         */
        if (unlikely(rq->rt.rt_nr_running > 1)) {
                spin_lock_irq(&rq->lock);
                push_rt_tasks(rq);
                spin_unlock_irq(&rq->lock);
        }
}
/*
 * Load-balancing iterator. Note: while the runqueue stays locked
 * during the whole iteration, the current task might be
 * dequeued so the iterator has to be dequeue-safe. Here we
 * achieve that by always pre-iterating before returning
 * the current task:
 */
static struct task_struct *load_balance_start_rt(void *arg)
{
        struct rq *rq = arg;
        struct rt_prio_array *array = &rq->rt.active;
        struct list_head *head, *curr;
        struct task_struct *p;
        int idx;

        idx = sched_find_first_bit(array->bitmap);
        if (idx >= MAX_RT_PRIO)
                return NULL;

        head = array->queue + idx;
        curr = head->prev;

        p = list_entry(curr, struct task_struct, run_list);

        curr = curr->prev;

        rq->rt.rt_load_balance_idx = idx;
        rq->rt.rt_load_balance_head = head;
        rq->rt.rt_load_balance_curr = curr;

        return p;
}

static struct task_struct *load_balance_next_rt(void *arg)
{
        struct rq *rq = arg;
        struct rt_prio_array *array = &rq->rt.active;
        struct list_head *head, *curr;
        struct task_struct *p;
        int idx;

        idx = rq->rt.rt_load_balance_idx;
        head = rq->rt.rt_load_balance_head;
        curr = rq->rt.rt_load_balance_curr;

        /*
         * If we arrived back to the head again then
         * iterate to the next queue (if any):
         */
        if (unlikely(head == curr)) {
                int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);

                if (next_idx >= MAX_RT_PRIO)
                        return NULL;

                idx = next_idx;
                head = array->queue + idx;
                curr = head->prev;

                rq->rt.rt_load_balance_idx = idx;
                rq->rt.rt_load_balance_head = head;
        }

        p = list_entry(curr, struct task_struct, run_list);

        curr = curr->prev;

        rq->rt.rt_load_balance_curr = curr;

        return p;
}
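The "pre-iterate before returning" note above matters because the task just handed to the balancer may be migrated, and thus unlinked from this list, before the iterator is called again; by already holding a pointer to the previous element, next() never has to touch the possibly-unlinked node. A minimal doubly-linked-list sketch of the same pattern (hypothetical names, not the kernel's list.h):

#include <stddef.h>
#include <stdio.h>

struct node { struct node *prev, *next; int val; };

struct iter {
        struct node *head;   /* list head sentinel               */
        struct node *curr;   /* element to hand out on next call */
};

/* Return the current element and step to its predecessor first,
 * so unlinking the returned element cannot break the iteration. */
static struct node *iter_next(struct iter *it)
{
        struct node *n;

        if (it->curr == it->head)
                return NULL;            /* wrapped around: done */
        n = it->curr;
        it->curr = it->curr->prev;      /* pre-iterate          */
        return n;
}

static void unlink(struct node *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
}

int main(void)
{
        struct node head, a = { .val = 1 }, b = { .val = 2 };
        struct iter it;
        struct node *n;

        /* head <-> a <-> b circular list */
        head.next = &a; a.prev = &head; a.next = &b;
        b.prev = &a; b.next = &head; head.prev = &b;

        it.head = &head;
        it.curr = head.prev;            /* start at the tail, like the RT iterator */

        while ((n = iter_next(&it))) {
                printf("%d\n", n->val); /* prints 2 then 1 */
                unlink(n);              /* safe: iterator already moved past n */
        }
        return 0;
}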
static unsigned long
load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
                unsigned long max_load_move,
                struct sched_domain *sd, enum cpu_idle_type idle,
                int *all_pinned, int *this_best_prio)
{
        struct rq_iterator rt_rq_iterator;

        rt_rq_iterator.start = load_balance_start_rt;
        rt_rq_iterator.next = load_balance_next_rt;
        /* pass 'busiest' rq argument into
         * load_balance_[start|next]_rt iterators
         */
        rt_rq_iterator.arg = busiest;

        return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd,
                             idle, all_pinned, this_best_prio, &rt_rq_iterator);
}

static int
move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
                 struct sched_domain *sd, enum cpu_idle_type idle)
{
        struct rq_iterator rt_rq_iterator;

        rt_rq_iterator.start = load_balance_start_rt;
        rt_rq_iterator.next = load_balance_next_rt;
        rt_rq_iterator.arg = busiest;

        return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
                                  &rt_rq_iterator);
}

#else /* CONFIG_SMP */
# define schedule_tail_balance_rt(rq)   do { } while (0)
# define schedule_balance_rt(rq, prev)  do { } while (0)
#endif /* CONFIG_SMP */
static void task_tick_rt(struct rq *rq, struct task_struct *p)
{
        update_curr_rt(rq);

        /*
         * RR tasks need a special form of timeslice management.
         * FIFO tasks have no timeslices.
         */
        if (p->policy != SCHED_RR)
                return;

        if (--p->time_slice)
                return;

        p->time_slice = DEF_TIMESLICE;

        /*
         * Requeue to the end of queue if we are not the only element
         * on the queue:
         */
        if (p->run_list.prev != p->run_list.next) {
                requeue_task_rt(rq, p);
                set_tsk_need_resched(p);
        }
}
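task_tick_rt() only decrements the slice for SCHED_RR; once it reaches zero the slice is refilled from DEF_TIMESLICE and the task is rotated to the tail of its own priority list. DEF_TIMESLICE itself is defined elsewhere (in kernel/sched.c); the snippet below is only an illustration of the refill arithmetic under the assumption of the usual 100 ms value:

#include <stdio.h>

/* Illustrative only: the real DEF_TIMESLICE lives in kernel/sched.c. */
#define HZ            1000                      /* assumed tick rate         */
#define DEF_TIMESLICE (100 * HZ / 1000)         /* 100 ms expressed in ticks */

int main(void)
{
        /* A SCHED_RR task at HZ=1000 is moved to the tail of its priority
         * list every DEF_TIMESLICE ticks; SCHED_FIFO tasks never take
         * this path at all. */
        printf("RR slice = %d ticks (%d ms)\n",
               DEF_TIMESLICE, DEF_TIMESLICE * 1000 / HZ);
        return 0;
}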
static void set_curr_task_rt(struct rq *rq)
{
        struct task_struct *p = rq->curr;

        p->se.exec_start = rq->clock;
}
const struct sched_class rt_sched_class = {
        .next                   = &fair_sched_class,
        .enqueue_task           = enqueue_task_rt,
        .dequeue_task           = dequeue_task_rt,
        .yield_task             = yield_task_rt,

        .check_preempt_curr     = check_preempt_curr_rt,

        .pick_next_task         = pick_next_task_rt,
        .put_prev_task          = put_prev_task_rt,

#ifdef CONFIG_SMP
        .load_balance           = load_balance_rt,
        .move_one_task          = move_one_task_rt,
#endif

        .set_curr_task          = set_curr_task_rt,
        .task_tick              = task_tick_rt,
};
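Everything above is selected once a task carries an RT policy. From user space that is done with sched_setscheduler(); a minimal example that moves the calling thread into this class under SCHED_FIFO (it needs CAP_SYS_NICE or root to succeed):

#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 50 };   /* 1..99 for RT policies */

        /* After this call the task is enqueued by enqueue_task_rt() and
         * picked by pick_next_task_rt(). */
        if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
                perror("sched_setscheduler");
                return 1;
        }
        printf("now running as SCHED_FIFO, priority %d\n", sp.sched_priority);
        return 0;
}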