[mirror_ubuntu-zesty-kernel.git] / kernel / sched_rt.c

/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */

#ifdef CONFIG_SMP
/* One bit per CPU; set by rt_set_overload(), cleared by rt_clear_overload(). */
static cpumask_t rt_overload_mask;
/* Incremented by rt_set_overload(), decremented by rt_clear_overload(). */
static atomic_t rto_count;
/*
 * Return the current value of rto_count. Callers treat any non-zero
 * result as "some runqueue may be overloaded".
 */
static inline int rt_overloaded(void)
{
	int nr_overloaded = atomic_read(&rto_count);

	return nr_overloaded;
}
/* Hand out a pointer to the global mask of overloaded CPUs. */
static inline cpumask_t *rt_overload(void)
{
	cpumask_t *mask = &rt_overload_mask;

	return mask;
}
/*
 * Mark @rq's CPU in rt_overload_mask and increment rto_count.
 *
 * Not idempotent: every call increments rto_count, whether or not the
 * CPU's bit was already set.
 */
static inline void rt_set_overload(struct rq *rq)
{
	cpu_set(rq->cpu, rt_overload_mask);
	/*
	 * Make sure the mask is visible before we set
	 * the overload count. That is checked to determine
	 * if we should look at the mask. It would be a shame
	 * if we looked at the mask, but the mask was not
	 * updated yet.
	 */
	wmb();
	atomic_inc(&rto_count);
}
/*
 * Decrement rto_count and clear @rq's CPU in rt_overload_mask.
 *
 * Not idempotent: every call decrements rto_count, whether or not the
 * CPU's bit was set.
 */
static inline void rt_clear_overload(struct rq *rq)
{
	/* the order here really doesn't matter */
	atomic_dec(&rto_count);
	cpu_clear(rq->cpu, rt_overload_mask);
}

static void update_rt_migration(struct rq *rq)
{
	if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1))
		rt_set_overload(rq);
	else
		rt_clear_overload(rq);
}
#endif /* CONFIG_SMP */

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	u64 delta_exec;

	if (!task_has_rt_policy(curr))
		return;

	delta_exec = rq->clock - curr->se.exec_start;
	if (unlikely((s64)delta_exec < 0))
		delta_exec = 0;

	schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));

	curr->se.sum_exec_runtime += delta_exec;
	curr->se.exec_start = rq->clock;
	cpuacct_charge(curr, delta_exec);
}

/*
 * Account for RT task @p having been added to @rq: bump the running count
 * and, on SMP, refresh the cached highest priority, the migratory count
 * and the overload state.
 */
static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
{
	WARN_ON(!rt_task(p));
	rq->rt.rt_nr_running++;
#ifdef CONFIG_SMP
	/* A task allowed on more than one CPU is a migration candidate. */
	if (p->nr_cpus_allowed > 1) {
		rq->rt.rt_nr_migratory++;
	}

	/* Numerically lower prio means higher priority. */
	if (p->prio < rq->rt.highest_prio) {
		rq->rt.highest_prio = p->prio;
	}

	update_rt_migration(rq);
#endif /* CONFIG_SMP */
}

/*
 * Account for RT task @p having been removed from @rq: drop the running
 * count and, on SMP, refresh the cached highest priority, the migratory
 * count and the overload state.
 *
 * The priority bitmap is consulted here, so the caller must already have
 * updated it for the removal (dequeue_task_rt() does so before calling).
 */
static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
{
	WARN_ON(!rt_task(p));
	WARN_ON(!rq->rt.rt_nr_running);
	rq->rt.rt_nr_running--;
#ifdef CONFIG_SMP
	if (!rq->rt.rt_nr_running) {
		/* No RT task left at all. */
		rq->rt.highest_prio = MAX_RT_PRIO;
	} else {
		WARN_ON(p->prio < rq->rt.highest_prio);
		/*
		 * Only a task that sat at the cached top priority can
		 * change it; anything else leaves highest_prio alone.
		 */
		if (p->prio == rq->rt.highest_prio)
			rq->rt.highest_prio =
				sched_find_first_bit(rq->rt.active.bitmap);
	}

	if (p->nr_cpus_allowed > 1) {
		rq->rt.rt_nr_migratory--;
	}

	update_rt_migration(rq);
#endif /* CONFIG_SMP */
}

/*
 * Append @p to the tail of the list for its priority on @rq, mark that
 * priority as populated, and update load and RT accounting.
 */
static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
{
	struct list_head *prio_queue = rq->rt.active.queue + p->prio;

	list_add_tail(&p->run_list, prio_queue);
	__set_bit(p->prio, rq->rt.active.bitmap);
	inc_cpu_load(rq, p->se.load.weight);

	inc_rt_tasks(p, rq);
}

/*
 * Remove @p from its priority list on @rq. The priority's bitmap bit is
 * cleared when the list becomes empty; load and RT accounting follow.
 */
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
{
	struct list_head *prio_queue = rq->rt.active.queue + p->prio;

	/* Bring the running task's runtime statistics up to date first. */
	update_curr_rt(rq);

	list_del(&p->run_list);
	if (list_empty(prio_queue)) {
		__clear_bit(p->prio, rq->rt.active.bitmap);
	}
	dec_cpu_load(rq, p->se.load.weight);

	dec_rt_tasks(p, rq);
}

/*
 * Move @p to the tail of the list for its priority. Cheaper than a
 * dequeue followed by an enqueue: no bitmap or accounting changes.
 */
static void requeue_task_rt(struct rq *rq, struct task_struct *p)
{
	list_move_tail(&p->run_list, rq->rt.active.queue + p->prio);
}

/* Yield: send the running task to the back of its priority list. */
static void
yield_task_rt(struct rq *rq)
{
	struct task_struct *curr = rq->curr;

	requeue_task_rt(rq, curr);
}

#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);

/*
 * Choose the CPU on which @p should be activated.
 *
 * Returns the CPU found by find_lowest_rq() when @p would not preempt its
 * current runqueue and is allowed on more than one CPU; otherwise (or when
 * no candidate is found) returns the CPU @p is already on.
 */
static int select_task_rq_rt(struct task_struct *p, int sync)
{
	struct rq *rq = task_rq(p);
	int cpu;

	/*
	 * The task would preempt this runqueue, or it cannot move anyway:
	 * let it ride on the affined RQ and the post-schedule router will
	 * push the preempted task away.
	 */
	if (p->prio < rq->rt.highest_prio || p->nr_cpus_allowed <= 1)
		return task_cpu(p);

	/* Try to find a better RQ before we even activate the task. */
	cpu = find_lowest_rq(p);
	if (cpu == -1)
		return task_cpu(p);

	return cpu;
}
#endif /* CONFIG_SMP */

/*
 * Ask the running task to reschedule when @p has a higher priority
 * (numerically lower prio) than it.
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
{
	if (p->prio >= rq->curr->prio)
		return;

	resched_task(rq->curr);
}

/*
 * Return the first task on the highest-priority non-empty list of @rq,
 * stamping its exec_start with the runqueue clock, or NULL when no RT
 * priority is populated.
 */
static struct task_struct *pick_next_task_rt(struct rq *rq)
{
	struct rt_prio_array *prio_array = &rq->rt.active;
	struct task_struct *picked;
	int top = sched_find_first_bit(prio_array->bitmap);

	if (top >= MAX_RT_PRIO)
		return NULL;

	picked = list_entry(prio_array->queue[top].next,
			    struct task_struct, run_list);
	picked->se.exec_start = rq->clock;

	return picked;
}

/*
 * Called for the task being switched out: account the runtime of the
 * running task, then reset @p's exec_start to 0.
 */
static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
	update_curr_rt(rq);
	p->se.exec_start = 0;
}

#ifdef CONFIG_SMP
/*
 * Only try algorithms three times: upper bound for the retry loops in
 * find_lock_lowest_rq() and push_rt_task().
 */
#define RT_MAX_TRIES 3

/* Forward declarations; the definitions are not in this part of the file. */
static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);

/*
 * Decide whether @p may be migrated away from @rq.
 *
 * Returns 1 when @p is not running on @rq, is allowed on @cpu (any CPU
 * when @cpu is negative) and is allowed on more than one CPU; 0 otherwise.
 */
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
	if (task_running(rq, p))
		return 0;

	if (cpu >= 0 && !cpu_isset(cpu, p->cpus_allowed))
		return 0;

	return p->nr_cpus_allowed > 1;
}

/*
 * Return the highest-priority RT task on @rq that may be migrated, or
 * NULL when there is none.
 *
 * @rq:  runqueue to search; its lock must be held.
 * @cpu: destination CPU the task must be allowed on, or a negative value
 *       to accept any task that passes pick_rt_task().
 *
 * Priority levels are visited from highest to lowest, and every task
 * queued on a level is examined in list order. (Looking at only the first
 * two entries of the top level would skip a migratable third task there
 * in favour of a lower-priority one.)
 */
static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
						     int cpu)
{
	struct rt_prio_array *array = &rq->rt.active;
	struct task_struct *next;
	struct list_head *queue;
	int idx;

	assert_spin_locked(&rq->lock);

	/* With fewer than two RT tasks there is nothing besides current. */
	if (likely(rq->rt.rt_nr_running < 2))
		return NULL;

	idx = sched_find_first_bit(array->bitmap);
	if (unlikely(idx >= MAX_RT_PRIO)) {
		WARN_ON(1); /* rt_nr_running is bad */
		return NULL;
	}

	while (idx < MAX_RT_PRIO) {
		queue = array->queue + idx;
		/* A set bitmap bit promises a non-empty list. */
		BUG_ON(list_empty(queue));

		list_for_each_entry(next, queue, run_list) {
			if (pick_rt_task(rq, next, cpu))
				return next;
		}

		/* Nothing usable at this level, move on to the next one. */
		idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
	}

	return NULL;
}

static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);

static int find_lowest_rq(struct task_struct *task)
{
	int cpu;
	cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);
	struct rq *lowest_rq = NULL;

	cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);

	/*
	 * Scan each rq for the lowest prio.
	 */
	for_each_cpu_mask(cpu, *cpu_mask) {
		struct rq *rq = cpu_rq(cpu);

		/* We look for lowest RT prio or non-rt CPU */
		if (rq->rt.highest_prio >= MAX_RT_PRIO) {
			lowest_rq = rq;
			break;
		}

		/* no locking for now */
		if (rq->rt.highest_prio > task->prio &&
		    (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
			lowest_rq = rq;
		}
	}

	return lowest_rq ? lowest_rq->cpu : -1;
}

/*
 * Find a runqueue that @task can be pushed to and return it locked.
 *
 * @task: the task to place; expected to be queued on @rq.
 * @rq:   the runqueue @task currently sits on (push_rt_task() calls this
 *        with rq->lock held).
 *
 * Makes at most RT_MAX_TRIES attempts. Returns the target runqueue with
 * its lock held, or NULL when none was found, in which case no target
 * lock is held on return.
 */
static struct rq *find_lock_lowest_rq(struct task_struct *task,
				      struct rq *rq)
{
	struct rq *lowest_rq = NULL;
	int cpu;
	int tries;

	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
		cpu = find_lowest_rq(task);

		/* No candidate, or the candidate is the runqueue we are on. */
		if ((cpu == -1) || (cpu == rq->cpu))
			break;

		lowest_rq = cpu_rq(cpu);

		/* Take the target's lock; a non-zero return means rq->lock was dropped. */
		if (double_lock_balance(rq, lowest_rq)) {
			/*
			 * We had to unlock the run queue. In
			 * the mean time, task could have
			 * migrated already or had its affinity changed.
			 * Also make sure that it wasn't scheduled on its rq.
			 */
			if (unlikely(task_rq(task) != rq ||
				     !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
				     task_running(rq, task) ||
				     !task->se.on_rq)) {
				spin_unlock(&lowest_rq->lock);
				lowest_rq = NULL;
				break;
			}
		}

		/* If this rq is still suitable use it. */
		if (lowest_rq->rt.highest_prio > task->prio)
			break;

		/* The target's priority changed under us: unlock and try again. */
		spin_unlock(&lowest_rq->lock);
		lowest_rq = NULL;
	}

	return lowest_rq;
}

/*
 * If the current CPU has more than one RT task, see if the non
 * running task can migrate over to a CPU that is running a task
 * of lesser priority.
 *
 * Must be called with rq->lock held. Returns 1 when a task was moved to
 * another runqueue, 0 otherwise.
 */
static int push_rt_task(struct rq *rq)
{
	struct task_struct *next_task;
	struct rq *lowest_rq;
	int ret = 0;
	/* Bounds how often we restart with a different candidate task. */
	int paranoid = RT_MAX_TRIES;

	assert_spin_locked(&rq->lock);

	next_task = pick_next_highest_task_rt(rq, -1);
	if (!next_task)
		return 0;

 retry:
	/* No reference on next_task is held at this point. */
	if (unlikely(next_task == rq->curr)) {
		WARN_ON(1);
		return 0;
	}

	/*
	 * It's possible that the next_task slipped in of
	 * higher priority than current. If that's the case
	 * just reschedule current.
	 */
	if (unlikely(next_task->prio < rq->curr->prio)) {
		resched_task(rq->curr);
		return 0;
	}

	/* We might release rq lock, so pin next_task with a reference. */
	get_task_struct(next_task);

	/* find_lock_lowest_rq locks the rq if found */
	lowest_rq = find_lock_lowest_rq(next_task, rq);
	if (!lowest_rq) {
		struct task_struct *task;
		/*
		 * find_lock_lowest_rq may release rq->lock
		 * so it is possible that next_task has changed.
		 * If it has, then try again.
		 */
		task = pick_next_highest_task_rt(rq, -1);
		if (unlikely(task != next_task) && task && paranoid--) {
			/* Drop the reference on the old candidate before retrying. */
			put_task_struct(next_task);
			next_task = task;
			goto retry;
		}
		goto out;
	}

	assert_spin_locked(&lowest_rq->lock);

	/* Move next_task from rq to lowest_rq; both locks are held. */
	deactivate_task(rq, next_task, 0);
	set_task_cpu(next_task, lowest_rq->cpu);
	activate_task(lowest_rq, next_task, 0);

	/* Ask the target CPU's running task to reschedule. */
	resched_task(lowest_rq->curr);

	spin_unlock(&lowest_rq->lock);

	ret = 1;
out:
	put_task_struct(next_task);

	return ret;
}

/*
 * TODO: Currently we just use the second highest prio task on
 *       the queue, and stop when it can't migrate (or there's
 *       no more RT tasks).  There may be a case where a lower
 *       priority RT task has a different affinity than the
 *       higher RT task. In this case the lower RT task could
 *       possibly be able to migrate where as the higher priority
 *       RT task could not.  We currently ignore this issue.
 *       Enhancements are welcome!
 */
static void push_rt_tasks(struct rq *rq)
{
	int moved;

	/* Keep pushing for as long as push_rt_task() reports a move. */
	do {
		moved = push_rt_task(rq);
	} while (moved);
}

/*
 * Try to pull RT tasks from overloaded runqueues onto @this_rq.
 *
 * Must be called with this_rq->lock held; the lock may be dropped and
 * re-taken inside double_lock_balance().
 *
 * Returns 1 when a task was pulled, or when the task this runqueue would
 * pick next changed while the lock was dropped, so the caller has to
 * recalculate its next task. Returns 0 otherwise.
 */
static int pull_rt_task(struct rq *this_rq)
{
	struct task_struct *next;
	struct task_struct *p;
	struct rq *src_rq;
	cpumask_t *rto_cpumask;
	int this_cpu = this_rq->cpu;
	int cpu;
	int ret = 0;

	assert_spin_locked(&this_rq->lock);

	/*
	 * If cpusets are used, and we have overlapping
	 * run queue cpusets, then this algorithm may not catch all.
	 * This is just the price you pay on trying to keep
	 * dirtying caches down on large SMP machines.
	 */
	if (likely(!rt_overloaded()))
		return 0;

	next = pick_next_task_rt(this_rq);

	rto_cpumask = rt_overload();

	for_each_cpu_mask(cpu, *rto_cpumask) {
		if (this_cpu == cpu)
			continue;

		src_rq = cpu_rq(cpu);
		if (unlikely(src_rq->rt.rt_nr_running <= 1)) {
			/*
			 * It is possible that overlapping cpusets
			 * will miss clearing a non overloaded runqueue.
			 * Clear it now.
			 */
			if (double_lock_balance(this_rq, src_rq)) {
				/* unlocked our runqueue lock */
				struct task_struct *old_next = next;
				next = pick_next_task_rt(this_rq);
				if (next != old_next)
					ret = 1;
			}
			if (likely(src_rq->rt.rt_nr_running <= 1)) {
				/*
				 * The stale flag belongs to src_rq, the
				 * runqueue we just found not to be
				 * overloaded and whose lock we hold; this_rq
				 * must not be touched here. Re-check the bit
				 * under the lock so rto_count is decremented
				 * at most once per set bit.
				 */
				if (cpu_isset(src_rq->cpu, *rto_cpumask))
					rt_clear_overload(src_rq);
			} else {
				/*
				 * Heh, the src_rq is now overloaded, since
				 * we already have the src_rq lock, go straight
				 * to pulling tasks from it.
				 */
				goto try_pulling;
			}
			spin_unlock(&src_rq->lock);
			continue;
		}

		/*
		 * We can potentially drop this_rq's lock in
		 * double_lock_balance, and another CPU could
		 * steal our next task - hence we must cause
		 * the caller to recalculate the next task
		 * in that case:
		 */
		if (double_lock_balance(this_rq, src_rq)) {
			struct task_struct *old_next = next;
			next = pick_next_task_rt(this_rq);
			if (next != old_next)
				ret = 1;
		}

		/*
		 * Are there still pullable RT tasks?
		 */
		if (src_rq->rt.rt_nr_running <= 1) {
			spin_unlock(&src_rq->lock);
			continue;
		}

 try_pulling:
		p = pick_next_highest_task_rt(src_rq, this_cpu);

		/*
		 * Do we have an RT task that preempts
		 * the to-be-scheduled task?
		 */
		if (p && (!next || (p->prio < next->prio))) {
			WARN_ON(p == src_rq->curr);
			WARN_ON(!p->se.on_rq);

			/*
			 * There's a chance that p is higher in priority
			 * than what's currently running on its cpu.
			 * This is just that p is waking up and hasn't
			 * had a chance to schedule. We only pull
			 * p if it is lower in priority than the
			 * current task on the run queue or
			 * this_rq next task is lower in prio than
			 * the current task on that rq.
			 */
			if (p->prio < src_rq->curr->prio ||
			    (next && next->prio < src_rq->curr->prio))
				goto bail;

			ret = 1;

			deactivate_task(src_rq, p, 0);
			set_task_cpu(p, this_cpu);
			activate_task(this_rq, p, 0);
			/*
			 * We continue with the search, just in
			 * case there's an even higher prio task
			 * in another runqueue. (low likelihood
			 * but possible)
			 */

			/*
			 * Update next so that we won't pick a task
			 * on another cpu with a priority lower (or equal)
			 * than the one we just picked.
			 */
			next = p;

		}
 bail:
		spin_unlock(&src_rq->lock);
	}

	return ret;
}

/*
 * Pull RT tasks towards @rq when the outgoing task @prev was an RT task
 * and the runqueue's top RT priority is now lower than @prev's.
 */
static void schedule_balance_rt(struct rq *rq,
				struct task_struct *prev)
{
	if (!unlikely(rt_task(prev)))
		return;

	/* Only worth trying if we lower this rq's prio. */
	if (rq->rt.highest_prio <= prev->prio)
		return;

	pull_rt_task(rq);
}

/*
 * When more than one RT task is queued on @rq, try to push the extra
 * ones off to other CPUs.
 */
static void schedule_tail_balance_rt(struct rq *rq)
{
	if (!unlikely(rq->rt.rt_nr_running > 1))
		return;

	/*
	 * Pushing may release the rq lock, and since the lock was owned
	 * by prev, it has to be released first via finish_lock_switch
	 * and then reacquired here.
	 */
	spin_lock_irq(&rq->lock);
	push_rt_tasks(rq);
	spin_unlock_irq(&rq->lock);
}


/*
 * After waking RT task @p on @rq: when @p is not running and does not
 * outrank the task currently running there, try to push tasks to other
 * CPUs.
 */
static void wakeup_balance_rt(struct rq *rq, struct task_struct *p)
{
	if (!unlikely(rt_task(p)))
		return;

	if (task_running(rq, p))
		return;

	/* @p would preempt current: nothing to push. */
	if (p->prio < rq->curr->prio)
		return;

	push_rt_tasks(rq);
}

/*
 * load_balance hook of the RT class. It moves nothing and always returns
 * 0; all parameters are ignored.
 */
static unsigned long
load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
		unsigned long max_load_move,
		struct sched_domain *sd, enum cpu_idle_type idle,
		int *all_pinned, int *this_best_prio)
{
	/* don't touch RT tasks */
	return 0;
}

/*
 * move_one_task hook of the RT class. It moves nothing and always returns
 * 0; all parameters are ignored.
 */
static int
move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
		 struct sched_domain *sd, enum cpu_idle_type idle)
{
	/* don't touch RT tasks */
	return 0;
}
/*
 * Install @new_mask as the affinity of RT task @p.
 *
 * When @p is queued and the change moves it between "pinned to one CPU"
 * and "allowed on several", the runqueue's migratory count and overload
 * state are adjusted before the new mask and weight are stored.
 */
static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
{
	int weight = cpus_weight(*new_mask);

	BUG_ON(!rt_task(p));

	if (p->se.on_rq && (weight != p->nr_cpus_allowed)) {
		struct rq *rq = task_rq(p);
		int was_migratory = p->nr_cpus_allowed > 1;
		int now_migratory = weight > 1;

		if (!was_migratory && now_migratory) {
			rq->rt.rt_nr_migratory++;
		} else if (was_migratory && !now_migratory) {
			BUG_ON(!rq->rt.rt_nr_migratory);
			rq->rt.rt_nr_migratory--;
		}

		update_rt_migration(rq);
	}

	p->cpus_allowed    = *new_mask;
	p->nr_cpus_allowed = weight;
}
#else /* CONFIG_SMP */
/* !CONFIG_SMP: the RT balancing hooks expand to empty statements. */
# define schedule_tail_balance_rt(rq)	do { } while (0)
# define schedule_balance_rt(rq, prev)	do { } while (0)
# define wakeup_balance_rt(rq, p)	do { } while (0)
#endif /* CONFIG_SMP */

/*
 * Timer tick for RT task @p: account its runtime and, for SCHED_RR tasks
 * only, manage the timeslice.
 */
static void task_tick_rt(struct rq *rq, struct task_struct *p)
{
	update_curr_rt(rq);

	/*
	 * FIFO tasks have no timeslices; for RR tasks the slice is only
	 * decremented here and nothing else happens until it reaches zero.
	 */
	if (p->policy != SCHED_RR || --p->time_slice)
		return;

	p->time_slice = DEF_TIMESLICE;

	/* Alone on the queue: nothing to rotate with. */
	if (p->run_list.prev == p->run_list.next)
		return;

	/* Go to the end of the queue and give the others a turn. */
	requeue_task_rt(rq, p);
	set_tsk_need_resched(p);
}

/* Restart runtime accounting for @rq's current task at the rq clock. */
static void set_curr_task_rt(struct rq *rq)
{
	rq->curr->se.exec_start = rq->clock;
}

/*
 * Method table of the real-time scheduling class. The class chained after
 * it via .next is fair_sched_class.
 */
const struct sched_class rt_sched_class = {
	.next			= &fair_sched_class,
	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,
#ifdef CONFIG_SMP
	/* CPU selection is only provided on SMP builds. */
	.select_task_rq		= select_task_rq_rt,
#endif /* CONFIG_SMP */

	.check_preempt_curr	= check_preempt_curr_rt,

	.pick_next_task		= pick_next_task_rt,
	.put_prev_task		= put_prev_task_rt,

#ifdef CONFIG_SMP
	/* Both balancing hooks are stubs that never move RT tasks. */
	.load_balance		= load_balance_rt,
	.move_one_task		= move_one_task_rt,
	.set_cpus_allowed       = set_cpus_allowed_rt,
#endif

	.set_curr_task          = set_curr_task_rt,
	.task_tick		= task_tick_rt,
};
Commit	Line	Data
bb44e5d1 IM	1	/*
	2	* Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
	3	* policies)
	4	*/
	5
4fd29176 SR	6	#ifdef CONFIG_SMP
	7	static cpumask_t rt_overload_mask;
	8	static atomic_t rto_count;
	9	static inline int rt_overloaded(void)
	10	{
	11	return atomic_read(&rto_count);
	12	}
	13	static inline cpumask_t *rt_overload(void)
	14	{
	15	return &rt_overload_mask;
	16	}
	17	static inline void rt_set_overload(struct rq *rq)
	18	{
	19	cpu_set(rq->cpu, rt_overload_mask);
	20	/*
	21	* Make sure the mask is visible before we set
	22	* the overload count. That is checked to determine
	23	* if we should look at the mask. It would be a shame
	24	* if we looked at the mask, but the mask was not
	25	* updated yet.
	26	*/
	27	wmb();
	28	atomic_inc(&rto_count);
	29	}
	30	static inline void rt_clear_overload(struct rq *rq)
	31	{
	32	/* the order here really doesn't matter */
	33	atomic_dec(&rto_count);
	34	cpu_clear(rq->cpu, rt_overload_mask);
	35	}
73fe6aae GH	36
	37	static void update_rt_migration(struct rq *rq)
	38	{
	39	if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1))
	40	rt_set_overload(rq);
	41	else
	42	rt_clear_overload(rq);
	43	}
4fd29176 SR	44	#endif /* CONFIG_SMP */
4fd29176 SR	45
bb44e5d1 IM	46	/*
	47	* Update the current task's runtime statistics. Skip current tasks that
	48	* are not in our scheduling class.
	49	*/
a9957449	50	static void update_curr_rt(struct rq *rq)
bb44e5d1 IM	51	{
	52	struct task_struct *curr = rq->curr;
	53	u64 delta_exec;
	54
	55	if (!task_has_rt_policy(curr))
	56	return;
	57
d281918d	58	delta_exec = rq->clock - curr->se.exec_start;
bb44e5d1 IM	59	if (unlikely((s64)delta_exec < 0))
bb44e5d1 IM	60	delta_exec = 0;
6cfb0d5d IM	61
6cfb0d5d IM	62	schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
bb44e5d1 IM	63
bb44e5d1 IM	64	curr->se.sum_exec_runtime += delta_exec;
d281918d	65	curr->se.exec_start = rq->clock;
d842de87	66	cpuacct_charge(curr, delta_exec);
bb44e5d1 IM	67	}
bb44e5d1 IM	68
63489e45 SR	69	static inline void inc_rt_tasks(struct task_struct p, struct rq rq)
	70	{
	71	WARN_ON(!rt_task(p));
	72	rq->rt.rt_nr_running++;
764a9d6f SR	73	#ifdef CONFIG_SMP
	74	if (p->prio < rq->rt.highest_prio)
	75	rq->rt.highest_prio = p->prio;
73fe6aae GH	76	if (p->nr_cpus_allowed > 1)
	77	rq->rt.rt_nr_migratory++;
	78
	79	update_rt_migration(rq);
764a9d6f	80	#endif /* CONFIG_SMP */
63489e45 SR	81	}
	82
	83	static inline void dec_rt_tasks(struct task_struct p, struct rq rq)
	84	{
	85	WARN_ON(!rt_task(p));
	86	WARN_ON(!rq->rt.rt_nr_running);
	87	rq->rt.rt_nr_running--;
764a9d6f SR	88	#ifdef CONFIG_SMP
	89	if (rq->rt.rt_nr_running) {
	90	struct rt_prio_array *array;
	91
	92	WARN_ON(p->prio < rq->rt.highest_prio);
	93	if (p->prio == rq->rt.highest_prio) {
	94	/* recalculate */
	95	array = &rq->rt.active;
	96	rq->rt.highest_prio =
	97	sched_find_first_bit(array->bitmap);
	98	} /* otherwise leave rq->highest prio alone */
	99	} else
	100	rq->rt.highest_prio = MAX_RT_PRIO;
73fe6aae GH	101	if (p->nr_cpus_allowed > 1)
	102	rq->rt.rt_nr_migratory--;
	103
	104	update_rt_migration(rq);
764a9d6f	105	#endif /* CONFIG_SMP */
63489e45 SR	106	}
63489e45 SR	107
fd390f6a	108	static void enqueue_task_rt(struct rq rq, struct task_struct p, int wakeup)
bb44e5d1 IM	109	{
	110	struct rt_prio_array *array = &rq->rt.active;
	111
	112	list_add_tail(&p->run_list, array->queue + p->prio);
	113	__set_bit(p->prio, array->bitmap);
58e2d4ca	114	inc_cpu_load(rq, p->se.load.weight);
63489e45 SR	115
63489e45 SR	116	inc_rt_tasks(p, rq);
bb44e5d1 IM	117	}
	118
	119	/*
	120	* Adding/removing a task to/from a priority array:
	121	*/
f02231e5	122	static void dequeue_task_rt(struct rq rq, struct task_struct p, int sleep)
bb44e5d1 IM	123	{
	124	struct rt_prio_array *array = &rq->rt.active;
	125
f1e14ef6	126	update_curr_rt(rq);
bb44e5d1 IM	127
	128	list_del(&p->run_list);
	129	if (list_empty(array->queue + p->prio))
	130	__clear_bit(p->prio, array->bitmap);
58e2d4ca	131	dec_cpu_load(rq, p->se.load.weight);
63489e45 SR	132
63489e45 SR	133	dec_rt_tasks(p, rq);
bb44e5d1 IM	134	}
	135
	136	/*
	137	* Put task to the end of the run list without the overhead of dequeue
	138	* followed by enqueue.
	139	*/
	140	static void requeue_task_rt(struct rq rq, struct task_struct p)
	141	{
	142	struct rt_prio_array *array = &rq->rt.active;
	143
	144	list_move_tail(&p->run_list, array->queue + p->prio);
	145	}
	146
	147	static void
4530d7ab	148	yield_task_rt(struct rq *rq)
bb44e5d1	149	{
4530d7ab	150	requeue_task_rt(rq, rq->curr);
bb44e5d1 IM	151	}
bb44e5d1 IM	152
e7693a36	153	#ifdef CONFIG_SMP
318e0893 GH	154	static int find_lowest_rq(struct task_struct *task);
318e0893 GH	155
e7693a36 GH	156	static int select_task_rq_rt(struct task_struct *p, int sync)
e7693a36 GH	157	{
318e0893 GH	158	struct rq *rq = task_rq(p);
	159
	160	/*
	161	* If the task will not preempt the RQ, try to find a better RQ
	162	* before we even activate the task
	163	*/
	164	if ((p->prio >= rq->rt.highest_prio)
	165	&& (p->nr_cpus_allowed > 1)) {
	166	int cpu = find_lowest_rq(p);
	167
	168	return (cpu == -1) ? task_cpu(p) : cpu;
	169	}
	170
	171	/*
	172	* Otherwise, just let it ride on the affined RQ and the
	173	* post-schedule router will push the preempted task away
	174	*/
e7693a36 GH	175	return task_cpu(p);
	176	}
	177	#endif /* CONFIG_SMP */
	178
bb44e5d1 IM	179	/*
	180	* Preempt the current task with a newly woken task if needed:
	181	*/
	182	static void check_preempt_curr_rt(struct rq rq, struct task_struct p)
	183	{
	184	if (p->prio < rq->curr->prio)
	185	resched_task(rq->curr);
	186	}
	187
fb8d4724	188	static struct task_struct pick_next_task_rt(struct rq rq)
bb44e5d1 IM	189	{
	190	struct rt_prio_array *array = &rq->rt.active;
	191	struct task_struct *next;
	192	struct list_head *queue;
	193	int idx;
	194
	195	idx = sched_find_first_bit(array->bitmap);
	196	if (idx >= MAX_RT_PRIO)
	197	return NULL;
	198
	199	queue = array->queue + idx;
	200	next = list_entry(queue->next, struct task_struct, run_list);
	201
d281918d	202	next->se.exec_start = rq->clock;
bb44e5d1 IM	203
	204	return next;
	205	}
	206
31ee529c	207	static void put_prev_task_rt(struct rq rq, struct task_struct p)
bb44e5d1	208	{
f1e14ef6	209	update_curr_rt(rq);
bb44e5d1 IM	210	p->se.exec_start = 0;
	211	}
	212
681f3e68	213	#ifdef CONFIG_SMP
e8fa1362 SR	214	/* Only try algorithms three times */
	215	#define RT_MAX_TRIES 3
	216
	217	static int double_lock_balance(struct rq this_rq, struct rq busiest);
	218	static void deactivate_task(struct rq rq, struct task_struct p, int sleep);
	219
f65eda4f SR	220	static int pick_rt_task(struct rq rq, struct task_struct p, int cpu)
	221	{
	222	if (!task_running(rq, p) &&
73fe6aae GH	223	(cpu < 0 \|\| cpu_isset(cpu, p->cpus_allowed)) &&
73fe6aae GH	224	(p->nr_cpus_allowed > 1))
f65eda4f SR	225	return 1;
	226	return 0;
	227	}
	228
e8fa1362	229	/* Return the second highest RT task, NULL otherwise */
f65eda4f SR	230	static struct task_struct pick_next_highest_task_rt(struct rq rq,
f65eda4f SR	231	int cpu)
e8fa1362 SR	232	{
	233	struct rt_prio_array *array = &rq->rt.active;
	234	struct task_struct *next;
	235	struct list_head *queue;
	236	int idx;
	237
	238	assert_spin_locked(&rq->lock);
	239
	240	if (likely(rq->rt.rt_nr_running < 2))
	241	return NULL;
	242
	243	idx = sched_find_first_bit(array->bitmap);
	244	if (unlikely(idx >= MAX_RT_PRIO)) {
	245	WARN_ON(1); /* rt_nr_running is bad */
	246	return NULL;
	247	}
	248
	249	queue = array->queue + idx;
f65eda4f SR	250	BUG_ON(list_empty(queue));
f65eda4f SR	251
e8fa1362	252	next = list_entry(queue->next, struct task_struct, run_list);
f65eda4f SR	253	if (unlikely(pick_rt_task(rq, next, cpu)))
f65eda4f SR	254	goto out;
e8fa1362 SR	255
	256	if (queue->next->next != queue) {
	257	/* same prio task */
	258	next = list_entry(queue->next->next, struct task_struct, run_list);
f65eda4f SR	259	if (pick_rt_task(rq, next, cpu))
f65eda4f SR	260	goto out;
e8fa1362 SR	261	}
e8fa1362 SR	262
f65eda4f	263	retry:
e8fa1362 SR	264	/* slower, but more flexible */
e8fa1362 SR	265	idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
f65eda4f	266	if (unlikely(idx >= MAX_RT_PRIO))
e8fa1362	267	return NULL;
e8fa1362 SR	268
e8fa1362 SR	269	queue = array->queue + idx;
f65eda4f SR	270	BUG_ON(list_empty(queue));
	271
	272	list_for_each_entry(next, queue, run_list) {
	273	if (pick_rt_task(rq, next, cpu))
	274	goto out;
	275	}
	276
	277	goto retry;
e8fa1362	278
f65eda4f	279	out:
e8fa1362 SR	280	return next;
	281	}
	282
	283	static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
	284
07b4032c	285	static int find_lowest_rq(struct task_struct *task)
e8fa1362	286	{
e8fa1362	287	int cpu;
e8fa1362	288	cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);
07b4032c	289	struct rq *lowest_rq = NULL;
e8fa1362 SR	290
	291	cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);
	292
07b4032c GH	293	/*
	294	* Scan each rq for the lowest prio.
	295	*/
	296	for_each_cpu_mask(cpu, *cpu_mask) {
	297	struct rq *rq = cpu_rq(cpu);
e8fa1362	298
07b4032c GH	299	/* We look for lowest RT prio or non-rt CPU */
	300	if (rq->rt.highest_prio >= MAX_RT_PRIO) {
	301	lowest_rq = rq;
	302	break;
	303	}
	304
	305	/* no locking for now */
	306	if (rq->rt.highest_prio > task->prio &&
	307	(!lowest_rq \|\| rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
	308	lowest_rq = rq;
e8fa1362	309	}
07b4032c GH	310	}
	311
	312	return lowest_rq ? lowest_rq->cpu : -1;
	313	}
	314
	315	/* Will lock the rq it finds */
	316	static struct rq find_lock_lowest_rq(struct task_struct task,
	317	struct rq *rq)
	318	{
	319	struct rq *lowest_rq = NULL;
	320	int cpu;
	321	int tries;
e8fa1362	322
07b4032c GH	323	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
	324	cpu = find_lowest_rq(task);
	325
2de0b463	326	if ((cpu == -1) \|\| (cpu == rq->cpu))
e8fa1362 SR	327	break;
e8fa1362 SR	328
07b4032c GH	329	lowest_rq = cpu_rq(cpu);
07b4032c GH	330
e8fa1362	331	/* if the prio of this runqueue changed, try again */
07b4032c	332	if (double_lock_balance(rq, lowest_rq)) {
e8fa1362 SR	333	/*
	334	* We had to unlock the run queue. In
	335	* the mean time, task could have
	336	* migrated already or had its affinity changed.
	337	* Also make sure that it wasn't scheduled on its rq.
	338	*/
07b4032c	339	if (unlikely(task_rq(task) != rq \|\|
e8fa1362	340	!cpu_isset(lowest_rq->cpu, task->cpus_allowed) \|\|
07b4032c	341	task_running(rq, task) \|\|
e8fa1362 SR	342	!task->se.on_rq)) {
	343	spin_unlock(&lowest_rq->lock);
	344	lowest_rq = NULL;
	345	break;
	346	}
	347	}
	348
	349	/* If this rq is still suitable use it. */
	350	if (lowest_rq->rt.highest_prio > task->prio)
	351	break;
	352
	353	/* try again */
	354	spin_unlock(&lowest_rq->lock);
	355	lowest_rq = NULL;
	356	}
	357
	358	return lowest_rq;
	359	}
	360
	361	/*
	362	* If the current CPU has more than one RT task, see if the non
	363	* running task can migrate over to a CPU that is running a task
	364	* of lesser priority.
	365	*/
697f0a48	366	static int push_rt_task(struct rq *rq)
e8fa1362 SR	367	{
	368	struct task_struct *next_task;
	369	struct rq *lowest_rq;
	370	int ret = 0;
	371	int paranoid = RT_MAX_TRIES;
	372
697f0a48	373	assert_spin_locked(&rq->lock);
e8fa1362	374
697f0a48	375	next_task = pick_next_highest_task_rt(rq, -1);
e8fa1362 SR	376	if (!next_task)
	377	return 0;
	378
	379	retry:
697f0a48	380	if (unlikely(next_task == rq->curr)) {
f65eda4f	381	WARN_ON(1);
e8fa1362	382	return 0;
f65eda4f	383	}
e8fa1362 SR	384
	385	/*
	386	* It's possible that the next_task slipped in of
	387	* higher priority than current. If that's the case
	388	* just reschedule current.
	389	*/
697f0a48 GH	390	if (unlikely(next_task->prio < rq->curr->prio)) {
697f0a48 GH	391	resched_task(rq->curr);
e8fa1362 SR	392	return 0;
	393	}
	394
697f0a48	395	/* We might release rq lock */
e8fa1362 SR	396	get_task_struct(next_task);
	397
	398	/* find_lock_lowest_rq locks the rq if found */
697f0a48	399	lowest_rq = find_lock_lowest_rq(next_task, rq);
e8fa1362 SR	400	if (!lowest_rq) {
	401	struct task_struct *task;
	402	/*
697f0a48	403	* find lock_lowest_rq releases rq->lock
e8fa1362 SR	404	* so it is possible that next_task has changed.
	405	* If it has, then try again.
	406	*/
697f0a48	407	task = pick_next_highest_task_rt(rq, -1);
e8fa1362 SR	408	if (unlikely(task != next_task) && task && paranoid--) {
	409	put_task_struct(next_task);
	410	next_task = task;
	411	goto retry;
	412	}
	413	goto out;
	414	}
	415
	416	assert_spin_locked(&lowest_rq->lock);
	417
697f0a48	418	deactivate_task(rq, next_task, 0);
e8fa1362 SR	419	set_task_cpu(next_task, lowest_rq->cpu);
	420	activate_task(lowest_rq, next_task, 0);
	421
	422	resched_task(lowest_rq->curr);
	423
	424	spin_unlock(&lowest_rq->lock);
	425
	426	ret = 1;
	427	out:
	428	put_task_struct(next_task);
	429
	430	return ret;
	431	}
	432
	433	/*
	434	* TODO: Currently we just use the second highest prio task on
	435	* the queue, and stop when it can't migrate (or there's
	436	* no more RT tasks). There may be a case where a lower
	437	* priority RT task has a different affinity than the
	438	* higher RT task. In this case the lower RT task could
	439	* possibly be able to migrate where as the higher priority
	440	* RT task could not. We currently ignore this issue.
	441	* Enhancements are welcome!
	442	*/
	443	static void push_rt_tasks(struct rq *rq)
	444	{
	445	/* push_rt_task will return true if it moved an RT */
	446	while (push_rt_task(rq))
	447	;
	448	}
	449
f65eda4f SR	450	static int pull_rt_task(struct rq *this_rq)
	451	{
	452	struct task_struct *next;
	453	struct task_struct *p;
	454	struct rq *src_rq;
	455	cpumask_t *rto_cpumask;
	456	int this_cpu = this_rq->cpu;
	457	int cpu;
	458	int ret = 0;
	459
	460	assert_spin_locked(&this_rq->lock);
	461
	462	/*
	463	* If cpusets are used, and we have overlapping
	464	* run queue cpusets, then this algorithm may not catch all.
	465	* This is just the price you pay on trying to keep
	466	* dirtying caches down on large SMP machines.
	467	*/
	468	if (likely(!rt_overloaded()))
	469	return 0;
	470
	471	next = pick_next_task_rt(this_rq);
	472
	473	rto_cpumask = rt_overload();
	474
	475	for_each_cpu_mask(cpu, *rto_cpumask) {
	476	if (this_cpu == cpu)
	477	continue;
	478
	479	src_rq = cpu_rq(cpu);
	480	if (unlikely(src_rq->rt.rt_nr_running <= 1)) {
	481	/*
	482	* It is possible that overlapping cpusets
	483	* will miss clearing a non overloaded runqueue.
	484	* Clear it now.
	485	*/
	486	if (double_lock_balance(this_rq, src_rq)) {
	487	/* unlocked our runqueue lock */
	488	struct task_struct *old_next = next;
	489	next = pick_next_task_rt(this_rq);
	490	if (next != old_next)
	491	ret = 1;
	492	}
	493	if (likely(src_rq->rt.rt_nr_running <= 1))
	494	/*
	495	* Small chance that this_rq->curr changed
	496	* but it's really harmless here.
	497	*/
	498	rt_clear_overload(this_rq);
	499	else
	500	/*
	501	* Heh, the src_rq is now overloaded, since
	502	* we already have the src_rq lock, go straight
	503	* to pulling tasks from it.
	504	*/
	505	goto try_pulling;
	506	spin_unlock(&src_rq->lock);
	507	continue;
	508	}
	509
	510	/*
	511	* We can potentially drop this_rq's lock in
	512	* double_lock_balance, and another CPU could
	513	* steal our next task - hence we must cause
514	* the caller to recalculate the next task
515	* in that case:
516	*/
517	if (double_lock_balance(this_rq, src_rq)) {
518	struct task_struct *old_next = next;
519	next = pick_next_task_rt(this_rq);
520	if (next != old_next)
521	ret = 1;
522	}
523
524	/*
525	* Are there still pullable RT tasks?
526	*/
527	if (src_rq->rt.rt_nr_running <= 1) {
528	spin_unlock(&src_rq->lock);
529	continue;
530	}
531
532	try_pulling:
533	p = pick_next_highest_task_rt(src_rq, this_cpu);
534
535	/*
536	* Do we have an RT task that preempts
537	* the to-be-scheduled task?
538	*/
539	if (p && (!next \|\| (p->prio < next->prio))) {
540	WARN_ON(p == src_rq->curr);
541	WARN_ON(!p->se.on_rq);
542
543	/*
544	* There's a chance that p is higher in priority
545	* than what's currently running on its cpu.
546	* This is just that p is wakeing up and hasn't
547	* had a chance to schedule. We only pull
548	* p if it is lower in priority than the
549	* current task on the run queue or
550	* this_rq next task is lower in prio than
551	* the current task on that rq.
552	*/
553	if (p->prio < src_rq->curr->prio \|\|
554	(next && next->prio < src_rq->curr->prio))
555	goto bail;
556
557	ret = 1;
558
559	deactivate_task(src_rq, p, 0);
560	set_task_cpu(p, this_cpu);
561	activate_task(this_rq, p, 0);
562	/*
563	* We continue with the search, just in
564	* case there's an even higher prio task
565	* in another runqueue. (low likelyhood
566	* but possible)
567	*/
568
569	/*
570	* Update next so that we won't pick a task
571	* on another cpu with a priority lower (or equal)
572	* than the one we just picked.
573	*/
574	next = p;
575
576	}
577	bail:
578	spin_unlock(&src_rq->lock);
579	}
580
581	return ret;
582	}
583
584	static void schedule_balance_rt(struct rq *rq,
585	struct task_struct *prev)
586	{
587	/* Try to pull RT tasks here if we lower this rq's prio */
588	if (unlikely(rt_task(prev)) &&
589	rq->rt.highest_prio > prev->prio)
590	pull_rt_task(rq);
591	}
592
e8fa1362 SR	593	static void schedule_tail_balance_rt(struct rq *rq)
	594	{
	595	/*
	596	* If we have more than one rt_task queued, then
	597	* see if we can push the other rt_tasks off to other CPUS.
	598	* Note we may release the rq lock, and since
	599	* the lock was owned by prev, we need to release it
	600	* first via finish_lock_switch and then reaquire it here.
	601	*/
	602	if (unlikely(rq->rt.rt_nr_running > 1)) {
	603	spin_lock_irq(&rq->lock);
	604	push_rt_tasks(rq);
	605	spin_unlock_irq(&rq->lock);
	606	}
	607	}
	608
4642dafd SR	609
	610	static void wakeup_balance_rt(struct rq rq, struct task_struct p)
	611	{
	612	if (unlikely(rt_task(p)) &&
	613	!task_running(rq, p) &&
	614	(p->prio >= rq->curr->prio))
	615	push_rt_tasks(rq);
	616	}
	617
43010659	618	static unsigned long
bb44e5d1	619	load_balance_rt(struct rq this_rq, int this_cpu, struct rq busiest,
e1d1484f PW	620	unsigned long max_load_move,
	621	struct sched_domain *sd, enum cpu_idle_type idle,
	622	int all_pinned, int this_best_prio)
bb44e5d1	623	{
c7a1e46a SR	624	/* don't touch RT tasks */
c7a1e46a SR	625	return 0;
e1d1484f PW	626	}
	627
	628	static int
	629	move_one_task_rt(struct rq this_rq, int this_cpu, struct rq busiest,
	630	struct sched_domain *sd, enum cpu_idle_type idle)
	631	{
c7a1e46a SR	632	/* don't touch RT tasks */
c7a1e46a SR	633	return 0;
bb44e5d1	634	}
73fe6aae GH	635	static void set_cpus_allowed_rt(struct task_struct p, cpumask_t new_mask)
	636	{
	637	int weight = cpus_weight(*new_mask);
	638
	639	BUG_ON(!rt_task(p));
	640
	641	/*
	642	* Update the migration status of the RQ if we have an RT task
	643	* which is running AND changing its weight value.
	644	*/
	645	if (p->se.on_rq && (weight != p->nr_cpus_allowed)) {
	646	struct rq *rq = task_rq(p);
	647
	648	if ((p->nr_cpus_allowed <= 1) && (weight > 1))
	649	rq->rt.rt_nr_migratory++;
	650	else if((p->nr_cpus_allowed > 1) && (weight <= 1)) {
	651	BUG_ON(!rq->rt.rt_nr_migratory);
	652	rq->rt.rt_nr_migratory--;
	653	}
	654
	655	update_rt_migration(rq);
	656	}
	657
	658	p->cpus_allowed = *new_mask;
	659	p->nr_cpus_allowed = weight;
	660	}
e8fa1362 SR	661	#else /* CONFIG_SMP */
e8fa1362 SR	662	# define schedule_tail_balance_rt(rq) do { } while (0)
f65eda4f	663	# define schedule_balance_rt(rq, prev) do { } while (0)
4642dafd	664	# define wakeup_balance_rt(rq, p) do { } while (0)
e8fa1362	665	#endif /* CONFIG_SMP */
bb44e5d1 IM	666
	667	static void task_tick_rt(struct rq rq, struct task_struct p)
	668	{
67e2be02 PZ	669	update_curr_rt(rq);
67e2be02 PZ	670
bb44e5d1 IM	671	/*
	672	* RR tasks need a special form of timeslice management.
	673	* FIFO tasks have no timeslices.
	674	*/
	675	if (p->policy != SCHED_RR)
	676	return;
	677
	678	if (--p->time_slice)
	679	return;
	680
a4ec24b4	681	p->time_slice = DEF_TIMESLICE;
bb44e5d1	682
98fbc798 DA	683	/*
	684	* Requeue to the end of queue if we are not the only element
	685	* on the queue:
	686	*/
	687	if (p->run_list.prev != p->run_list.next) {
	688	requeue_task_rt(rq, p);
	689	set_tsk_need_resched(p);
	690	}
bb44e5d1 IM	691	}
bb44e5d1 IM	692
83b699ed SV	693	static void set_curr_task_rt(struct rq *rq)
	694	{
	695	struct task_struct *p = rq->curr;
	696
	697	p->se.exec_start = rq->clock;
	698	}
	699
5522d5d5 IM	700	const struct sched_class rt_sched_class = {
5522d5d5 IM	701	.next = &fair_sched_class,
bb44e5d1 IM	702	.enqueue_task = enqueue_task_rt,
	703	.dequeue_task = dequeue_task_rt,
	704	.yield_task = yield_task_rt,
e7693a36 GH	705	#ifdef CONFIG_SMP
	706	.select_task_rq = select_task_rq_rt,
	707	#endif /* CONFIG_SMP */
bb44e5d1 IM	708
	709	.check_preempt_curr = check_preempt_curr_rt,
	710
	711	.pick_next_task = pick_next_task_rt,
	712	.put_prev_task = put_prev_task_rt,
	713
681f3e68	714	#ifdef CONFIG_SMP
bb44e5d1	715	.load_balance = load_balance_rt,
e1d1484f	716	.move_one_task = move_one_task_rt,
73fe6aae	717	.set_cpus_allowed = set_cpus_allowed_rt,
681f3e68	718	#endif
bb44e5d1	719
83b699ed	720	.set_curr_task = set_curr_task_rt,
bb44e5d1	721	.task_tick = task_tick_rt,
bb44e5d1	722	};