sched/deadline: Always enqueue on previous rq when dl_task_timer() fires
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 13049aac05a6242e44a59b9ec424f1004448b3d4..4c49e75ca24dfdc1c4777cbe5960cbd5e08e3cde 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -306,66 +306,6 @@ __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
-/*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
-       __acquires(rq->lock)
-{
-       struct rq *rq;
-
-       lockdep_assert_held(&p->pi_lock);
-
-       for (;;) {
-               rq = task_rq(p);
-               raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
-                       return rq;
-               raw_spin_unlock(&rq->lock);
-
-               while (unlikely(task_on_rq_migrating(p)))
-                       cpu_relax();
-       }
-}
-
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
-       __acquires(p->pi_lock)
-       __acquires(rq->lock)
-{
-       struct rq *rq;
-
-       for (;;) {
-               raw_spin_lock_irqsave(&p->pi_lock, *flags);
-               rq = task_rq(p);
-               raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
-                       return rq;
-               raw_spin_unlock(&rq->lock);
-               raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-
-               while (unlikely(task_on_rq_migrating(p)))
-                       cpu_relax();
-       }
-}
-
-static void __task_rq_unlock(struct rq *rq)
-       __releases(rq->lock)
-{
-       raw_spin_unlock(&rq->lock);
-}
-
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
-       __releases(rq->lock)
-       __releases(p->pi_lock)
-{
-       raw_spin_unlock(&rq->lock);
-       raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-}
-
 /*
  * this_rq_lock - lock this runqueue and disable interrupts.
  */
@@ -749,6 +689,23 @@ static inline bool got_nohz_idle_kick(void)
 #ifdef CONFIG_NO_HZ_FULL
 bool sched_can_stop_tick(void)
 {
+       /*
+        * FIFO realtime policy runs the highest priority task. Other runnable
+        * tasks are of a lower priority. The scheduler tick does nothing.
+        */
+       if (current->policy == SCHED_FIFO)
+               return true;
+
+       /*
+        * Round-robin realtime tasks time slice with other tasks at the same
+        * realtime priority. Is this task the only one at this priority?
+        */
+       if (current->policy == SCHED_RR) {
+               struct sched_rt_entity *rt_se = &current->rt;
+
+               return rt_se->run_list.prev == rt_se->run_list.next;
+       }
+
        /*
         * More than one running task need preemption.
         * nr_running update is assumed to be visible
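The SCHED_RR test added above asks whether current is the only entity queued
at its realtime priority: on the circular run_list a node whose prev and next
point at the same element can only be linked head <-> node <-> head, so any
second task at that priority makes the test fail and keeps the tick running
for round-robin slicing. A minimal user-space sketch of that list invariant
(a toy list modelled on the kernel's list_head; not kernel code):

#include <assert.h>

struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h)
{
        h->prev = h->next = h;
}

static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev;
        n->next = h;
        h->prev->next = n;
        h->prev = n;
}

int main(void)
{
        struct list_head queue, a, b;

        list_init(&queue);

        list_add_tail(&a, &queue);
        assert(a.prev == a.next);       /* alone on the list: both are &queue */

        list_add_tail(&b, &queue);
        assert(a.prev != a.next);       /* a second entry breaks the test */

        return 0;
}
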
@@ -2899,7 +2856,7 @@ void __sched schedule_preempt_disabled(void)
        preempt_disable();
 }
 
-static void preempt_schedule_common(void)
+static void __sched notrace preempt_schedule_common(void)
 {
        do {
                __preempt_count_add(PREEMPT_ACTIVE);
@@ -3094,6 +3051,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        } else {
                if (dl_prio(oldprio))
                        p->dl.dl_boosted = 0;
+               if (rt_prio(oldprio))
+                       p->rt.timeout = 0;
                p->sched_class = &fair_sched_class;
        }
 
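The added p->rt.timeout = 0 clears the realtime watchdog count when a
PI-deboosted task drops back to the fair class; that counter is what the RT
class compares against the task's RLIMIT_RTTIME budget, so a stale value
could otherwise trigger a spurious SIGXCPU the next time the task is boosted.
For reference, a small user-space sketch of setting that budget (values are
in microseconds):

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
        struct rlimit rl = {
                .rlim_cur = 500000,     /* soft limit: 500 ms of RT CPU time, then SIGXCPU */
                .rlim_max = 1000000,    /* hard limit: 1 s, then SIGKILL */
        };

        if (setrlimit(RLIMIT_RTTIME, &rl) != 0) {
                perror("setrlimit(RLIMIT_RTTIME)");
                return 1;
        }
        printf("RLIMIT_RTTIME: soft=%llu us, hard=%llu us\n",
               (unsigned long long)rl.rlim_cur,
               (unsigned long long)rl.rlim_max);
        return 0;
}
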
@@ -4418,36 +4377,29 @@ EXPORT_SYMBOL_GPL(yield_to);
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
  */
-void __sched io_schedule(void)
-{
-       struct rq *rq = raw_rq();
-
-       delayacct_blkio_start();
-       atomic_inc(&rq->nr_iowait);
-       blk_flush_plug(current);
-       current->in_iowait = 1;
-       schedule();
-       current->in_iowait = 0;
-       atomic_dec(&rq->nr_iowait);
-       delayacct_blkio_end();
-}
-EXPORT_SYMBOL(io_schedule);
-
 long __sched io_schedule_timeout(long timeout)
 {
-       struct rq *rq = raw_rq();
+       int old_iowait = current->in_iowait;
+       struct rq *rq;
        long ret;
 
+       current->in_iowait = 1;
+       if (old_iowait)
+               blk_schedule_flush_plug(current);
+       else
+               blk_flush_plug(current);
+
        delayacct_blkio_start();
+       rq = raw_rq();
        atomic_inc(&rq->nr_iowait);
-       blk_flush_plug(current);
-       current->in_iowait = 1;
        ret = schedule_timeout(timeout);
-       current->in_iowait = 0;
+       current->in_iowait = old_iowait;
        atomic_dec(&rq->nr_iowait);
        delayacct_blkio_end();
+
        return ret;
 }
+EXPORT_SYMBOL(io_schedule_timeout);
 
 /**
  * sys_sched_get_priority_max - return maximum RT priority.
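io_schedule() disappears from core.c in this hunk; io_schedule_timeout() now
saves and restores in_iowait instead of forcing it back to zero, skips the
blocking plug flush when the task was already in iowait, and only picks up
the runqueue after the flush (which may sleep and migrate the task), so the
nr_iowait accounting lands on the runqueue the task actually sleeps on. A
sketch of how the old call can then be expressed in terms of the surviving
one (assumption: the tree moves io_schedule() into a header as a static
inline; shown only to illustrate the relationship between the two calls):

static inline void io_schedule(void)
{
        io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
}
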
@@ -5495,17 +5447,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                        break;
                }
 
-               /*
-                * Even though we initialize ->capacity to something semi-sane,
-                * we leave capacity_orig unset. This allows us to detect if
-                * domain iteration is still funny without causing /0 traps.
-                */
-               if (!group->sgc->capacity_orig) {
-                       printk(KERN_CONT "\n");
-                       printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
-                       break;
-               }
-
                if (!cpumask_weight(sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: empty group\n");
@@ -5989,7 +5930,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                 * die on a /0 trap.
                 */
                sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
-               sg->sgc->capacity_orig = sg->sgc->capacity;
 
                /*
                 * Make sure the first group of this domain contains the
@@ -6300,6 +6240,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
         */
 
        if (sd->flags & SD_SHARE_CPUCAPACITY) {
+               sd->flags |= SD_PREFER_SIBLING;
                sd->imbalance_pct = 110;
                sd->smt_gain = 1178; /* ~15% */
 
@@ -7223,8 +7164,8 @@ void __init sched_init(void)
                rq->calc_load_active = 0;
                rq->calc_load_update = jiffies + LOAD_FREQ;
                init_cfs_rq(&rq->cfs);
-               init_rt_rq(&rq->rt, rq);
-               init_dl_rq(&rq->dl, rq);
+               init_rt_rq(&rq->rt);
+               init_dl_rq(&rq->dl);
 #ifdef CONFIG_FAIR_GROUP_SCHED
                root_task_group.shares = ROOT_TASK_GROUP_LOAD;
                INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -7264,7 +7205,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
                rq->sd = NULL;
                rq->rd = NULL;
-               rq->cpu_capacity = SCHED_CAPACITY_SCALE;
+               rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
                rq->post_schedule = 0;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
@@ -7642,6 +7583,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
 {
        struct task_struct *g, *p;
 
+       /*
+        * Autogroups do not have RT tasks; see autogroup_create().
+        */
+       if (task_group_is_autogroup(tg))
+               return 0;
+
        for_each_process_thread(g, p) {
                if (rt_task(p) && task_group(p) == tg)
                        return 1;
@@ -7734,6 +7681,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 {
        int i, err = 0;
 
+       /*
+        * Disallowing the root group RT runtime is BAD, it would disallow the
+        * kernel creating (and or operating) RT threads.
+        */
+       if (tg == &root_task_group && rt_runtime == 0)
+               return -EINVAL;
+
+       /* No period doesn't make any sense. */
+       if (rt_period == 0)
+               return -EINVAL;
+
        mutex_lock(&rt_constraints_mutex);
        read_lock(&tasklist_lock);
        err = __rt_schedulable(tg, rt_period, rt_runtime);
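The two new checks reject a zero period outright and a zero runtime on the
root group, which would leave the kernel unable to run its own realtime
kthreads; the zero-period check that used to live only in
sched_group_set_rt_period() is removed further down because it is now
enforced here for both callers. With CONFIG_RT_GROUP_SCHED these values are
what cgroup v1 exposes as cpu.rt_period_us and cpu.rt_runtime_us. A
user-space sketch (the /sys/fs/cgroup/cpu mount point and the "rtgroup"
name are assumptions):

#include <stdio.h>

static int write_val(const char *path, long val)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        fprintf(f, "%ld\n", val);
        return fclose(f);
}

int main(void)
{
        /* Give the group 300 ms of realtime runtime per 1 s period. */
        if (write_val("/sys/fs/cgroup/cpu/rtgroup/cpu.rt_period_us", 1000000) ||
            write_val("/sys/fs/cgroup/cpu/rtgroup/cpu.rt_runtime_us", 300000)) {
                perror("set rt bandwidth");
                return 1;
        }
        return 0;
}
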
@@ -7790,9 +7748,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
        rt_period = (u64)rt_period_us * NSEC_PER_USEC;
        rt_runtime = tg->rt_bandwidth.rt_runtime;
 
-       if (rt_period == 0)
-               return -EINVAL;
-
        return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
@@ -7849,7 +7804,7 @@ static int sched_rt_global_constraints(void)
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-static int sched_dl_global_constraints(void)
+static int sched_dl_global_validate(void)
 {
        u64 runtime = global_rt_runtime();
        u64 period = global_rt_period();
@@ -7950,11 +7905,11 @@ int sched_rt_handler(struct ctl_table *table, int write,
                if (ret)
                        goto undo;
 
-               ret = sched_rt_global_constraints();
+               ret = sched_dl_global_validate();
                if (ret)
                        goto undo;
 
-               ret = sched_dl_global_constraints();
+               ret = sched_rt_global_constraints();
                if (ret)
                        goto undo;
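The rename makes the ordering explicit: sched_rt_handler() first checks that
the deadline-class bandwidth still fits under the new global
rt_period/rt_runtime via sched_dl_global_validate(), and only then applies
the RT constraints; on any failure it jumps to undo and restores the old
sysctl values. This path runs when the global bandwidth knobs are written,
for example (the runtime default is 950000 us per sysctl_sched_rt_runtime
near the top of the file, against a 1000000 us period):

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/sched_rt_runtime_us", "w");

        if (!f) {
                perror("open sched_rt_runtime_us");
                return 1;
        }
        /* Let RT tasks use 900 ms of every 1 s period. */
        fprintf(f, "%d\n", 900000);
        if (fclose(f) != 0) {
                perror("write sched_rt_runtime_us");
                return 1;
        }
        return 0;
}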