git.proxmox.com Git - mirror_ubuntu-artful-kernel.git / commitdiff
Merge tag 'v4.11-rc6' into sched/core, to pick up fixes
author    Ingo Molnar <mingo@kernel.org>
          Tue, 11 Apr 2017 07:05:36 +0000 (09:05 +0200)
committer Ingo Molnar <mingo@kernel.org>
          Tue, 11 Apr 2017 07:05:36 +0000 (09:05 +0200)
Signed-off-by: Ingo Molnar <mingo@kernel.org>
kernel/sched/core.c
kernel/sched/fair.c
kernel/sched/features.h
kernel/sched/rt.c
kernel/sched/sched.h

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3b31fc05a0f1e45be5985b860a5fde95ee969832..ab9f6ac099a71368ccb22c857c82239e2183fb28 100644
@@ -85,21 +85,6 @@ int sysctl_sched_rt_runtime = 950000;
 /* CPUs with isolated domains */
 cpumask_var_t cpu_isolated_map;
 
-/*
- * this_rq_lock - lock this runqueue and disable interrupts.
- */
-static struct rq *this_rq_lock(void)
-       __acquires(rq->lock)
-{
-       struct rq *rq;
-
-       local_irq_disable();
-       rq = this_rq();
-       raw_spin_lock(&rq->lock);
-
-       return rq;
-}
-
 /*
  * __task_rq_lock - lock the rq @p resides on.
  */
@@ -233,8 +218,11 @@ void update_rq_clock(struct rq *rq)
                return;
 
 #ifdef CONFIG_SCHED_DEBUG
+       if (sched_feat(WARN_DOUBLE_CLOCK))
+               SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
        rq->clock_update_flags |= RQCF_UPDATED;
 #endif
+
        delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
        if (delta < 0)
                return;
@@ -261,13 +249,14 @@ static void hrtick_clear(struct rq *rq)
 static enum hrtimer_restart hrtick(struct hrtimer *timer)
 {
        struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+       struct rq_flags rf;
 
        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
 
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, &rf);
        update_rq_clock(rq);
        rq->curr->sched_class->task_tick(rq, rq->curr, 1);
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
 
        return HRTIMER_NORESTART;
 }
@@ -287,11 +276,12 @@ static void __hrtick_restart(struct rq *rq)
 static void __hrtick_start(void *arg)
 {
        struct rq *rq = arg;
+       struct rq_flags rf;
 
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, &rf);
        __hrtick_restart(rq);
        rq->hrtick_csd_pending = 0;
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
 }
 
 /*
@@ -762,17 +752,23 @@ static void set_load_weight(struct task_struct *p)
 
 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
-       update_rq_clock(rq);
+       if (!(flags & ENQUEUE_NOCLOCK))
+               update_rq_clock(rq);
+
        if (!(flags & ENQUEUE_RESTORE))
                sched_info_queued(rq, p);
+
        p->sched_class->enqueue_task(rq, p, flags);
 }
 
 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
-       update_rq_clock(rq);
+       if (!(flags & DEQUEUE_NOCLOCK))
+               update_rq_clock(rq);
+
        if (!(flags & DEQUEUE_SAVE))
                sched_info_dequeued(rq, p);
+
        p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -946,18 +942,19 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
  *
  * Returns (locked) new rq. Old rq's lock is released.
  */
-static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
+static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
+                                  struct task_struct *p, int new_cpu)
 {
        lockdep_assert_held(&rq->lock);
 
        p->on_rq = TASK_ON_RQ_MIGRATING;
-       dequeue_task(rq, p, 0);
+       dequeue_task(rq, p, DEQUEUE_NOCLOCK);
        set_task_cpu(p, new_cpu);
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, rf);
 
        rq = cpu_rq(new_cpu);
 
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, rf);
        BUG_ON(task_cpu(p) != new_cpu);
        enqueue_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
@@ -980,7 +977,8 @@ struct migration_arg {
  * So we race with normal scheduler movements, but that's OK, as long
  * as the task is no longer on this CPU.
  */
-static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
+static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
+                                struct task_struct *p, int dest_cpu)
 {
        if (unlikely(!cpu_active(dest_cpu)))
                return rq;
@@ -989,7 +987,8 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
        if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                return rq;
 
-       rq = move_queued_task(rq, p, dest_cpu);
+       update_rq_clock(rq);
+       rq = move_queued_task(rq, rf, p, dest_cpu);
 
        return rq;
 }
@@ -1004,6 +1003,7 @@ static int migration_cpu_stop(void *data)
        struct migration_arg *arg = data;
        struct task_struct *p = arg->task;
        struct rq *rq = this_rq();
+       struct rq_flags rf;
 
        /*
         * The original target CPU might have gone down and we might
@@ -1018,7 +1018,7 @@ static int migration_cpu_stop(void *data)
        sched_ttwu_pending();
 
        raw_spin_lock(&p->pi_lock);
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, &rf);
        /*
         * If task_rq(p) != rq, it cannot be migrated here, because we're
         * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
@@ -1026,11 +1026,11 @@ static int migration_cpu_stop(void *data)
         */
        if (task_rq(p) == rq) {
                if (task_on_rq_queued(p))
-                       rq = __migrate_task(rq, p, arg->dest_cpu);
+                       rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
                else
                        p->wake_cpu = arg->dest_cpu;
        }
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
        raw_spin_unlock(&p->pi_lock);
 
        local_irq_enable();
@@ -1063,7 +1063,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
                 * holding rq->lock.
                 */
                lockdep_assert_held(&rq->lock);
-               dequeue_task(rq, p, DEQUEUE_SAVE);
+               dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
        }
        if (running)
                put_prev_task(rq, p);
@@ -1071,7 +1071,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
        p->sched_class->set_cpus_allowed(p, new_mask);
 
        if (queued)
-               enqueue_task(rq, p, ENQUEUE_RESTORE);
+               enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
                set_curr_task(rq, p);
 }
@@ -1150,9 +1150,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
                 * OK, since we're going to drop the lock immediately
                 * afterwards anyway.
                 */
-               rq_unpin_lock(rq, &rf);
-               rq = move_queued_task(rq, p, dest_cpu);
-               rq_repin_lock(rq, &rf);
+               rq = move_queued_task(rq, &rf, p, dest_cpu);
        }
 out:
        task_rq_unlock(rq, p, &rf);
@@ -1217,16 +1215,24 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
 {
        if (task_on_rq_queued(p)) {
                struct rq *src_rq, *dst_rq;
+               struct rq_flags srf, drf;
 
                src_rq = task_rq(p);
                dst_rq = cpu_rq(cpu);
 
+               rq_pin_lock(src_rq, &srf);
+               rq_pin_lock(dst_rq, &drf);
+
                p->on_rq = TASK_ON_RQ_MIGRATING;
                deactivate_task(src_rq, p, 0);
                set_task_cpu(p, cpu);
                activate_task(dst_rq, p, 0);
                p->on_rq = TASK_ON_RQ_QUEUED;
                check_preempt_curr(dst_rq, p, 0);
+
+               rq_unpin_lock(dst_rq, &drf);
+               rq_unpin_lock(src_rq, &srf);
+
        } else {
                /*
                 * Task isn't running anymore; make it appear like we migrated
@@ -1680,7 +1686,7 @@ static void
 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
                 struct rq_flags *rf)
 {
-       int en_flags = ENQUEUE_WAKEUP;
+       int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
 
        lockdep_assert_held(&rq->lock);
 
@@ -1726,14 +1732,13 @@ void sched_ttwu_pending(void)
        struct rq *rq = this_rq();
        struct llist_node *llist = llist_del_all(&rq->wake_list);
        struct task_struct *p;
-       unsigned long flags;
        struct rq_flags rf;
 
        if (!llist)
                return;
 
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       rq_pin_lock(rq, &rf);
+       rq_lock_irqsave(rq, &rf);
+       update_rq_clock(rq);
 
        while (llist) {
                int wake_flags = 0;
@@ -1747,8 +1752,7 @@ void sched_ttwu_pending(void)
                ttwu_do_activate(rq, p, wake_flags, &rf);
        }
 
-       rq_unpin_lock(rq, &rf);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       rq_unlock_irqrestore(rq, &rf);
 }
 
 void scheduler_ipi(void)
@@ -1806,7 +1810,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
 void wake_up_if_idle(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
+       struct rq_flags rf;
 
        rcu_read_lock();
 
@@ -1816,11 +1820,11 @@ void wake_up_if_idle(int cpu)
        if (set_nr_if_polling(rq->idle)) {
                trace_sched_wake_idle_without_ipi(cpu);
        } else {
-               raw_spin_lock_irqsave(&rq->lock, flags);
+               rq_lock_irqsave(rq, &rf);
                if (is_idle_task(rq->curr))
                        smp_send_reschedule(cpu);
                /* Else CPU is not idle, do nothing here: */
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
+               rq_unlock_irqrestore(rq, &rf);
        }
 
 out:
@@ -1846,11 +1850,10 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
        }
 #endif
 
-       raw_spin_lock(&rq->lock);
-       rq_pin_lock(rq, &rf);
+       rq_lock(rq, &rf);
+       update_rq_clock(rq);
        ttwu_do_activate(rq, p, wake_flags, &rf);
-       rq_unpin_lock(rq, &rf);
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
 }
 
 /*
@@ -2097,11 +2100,9 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
                 * disabled avoiding further scheduler activity on it and we've
                 * not yet picked a replacement task.
                 */
-               rq_unpin_lock(rq, rf);
-               raw_spin_unlock(&rq->lock);
+               rq_unlock(rq, rf);
                raw_spin_lock(&p->pi_lock);
-               raw_spin_lock(&rq->lock);
-               rq_repin_lock(rq, rf);
+               rq_relock(rq, rf);
        }
 
        if (!(p->state & TASK_NORMAL))
@@ -2114,7 +2115,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
                        delayacct_blkio_end();
                        atomic_dec(&rq->nr_iowait);
                }
-               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+               ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
        }
 
        ttwu_do_wakeup(rq, p, 0, rf);
@@ -2555,7 +2556,7 @@ void wake_up_new_task(struct task_struct *p)
        update_rq_clock(rq);
        post_init_entity_util_avg(&p->se);
 
-       activate_task(rq, p, 0);
+       activate_task(rq, p, ENQUEUE_NOCLOCK);
        p->on_rq = TASK_ON_RQ_QUEUED;
        trace_sched_wakeup_new(p);
        check_preempt_curr(rq, p, WF_FORK);
@@ -3093,15 +3094,18 @@ void scheduler_tick(void)
        int cpu = smp_processor_id();
        struct rq *rq = cpu_rq(cpu);
        struct task_struct *curr = rq->curr;
+       struct rq_flags rf;
 
        sched_clock_tick();
 
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, &rf);
+
        update_rq_clock(rq);
        curr->sched_class->task_tick(rq, curr, 0);
        cpu_load_update_active(rq);
        calc_global_load_tick(rq);
-       raw_spin_unlock(&rq->lock);
+
+       rq_unlock(rq, &rf);
 
        perf_event_task_tick();
 
@@ -3386,18 +3390,18 @@ static void __sched notrace __schedule(bool preempt)
         * done by the caller to avoid the race with signal_wake_up().
         */
        smp_mb__before_spinlock();
-       raw_spin_lock(&rq->lock);
-       rq_pin_lock(rq, &rf);
+       rq_lock(rq, &rf);
 
        /* Promote REQ to ACT */
        rq->clock_update_flags <<= 1;
+       update_rq_clock(rq);
 
        switch_count = &prev->nivcsw;
        if (!preempt && prev->state) {
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        prev->state = TASK_RUNNING;
                } else {
-                       deactivate_task(rq, prev, DEQUEUE_SLEEP);
+                       deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
                        prev->on_rq = 0;
 
                        if (prev->in_iowait) {
@@ -3421,9 +3425,6 @@ static void __sched notrace __schedule(bool preempt)
                switch_count = &prev->nvcsw;
        }
 
-       if (task_on_rq_queued(prev))
-               update_rq_clock(rq);
-
        next = pick_next_task(rq, prev, &rf);
        clear_tsk_need_resched(prev);
        clear_preempt_need_resched();
@@ -3439,8 +3440,7 @@ static void __sched notrace __schedule(bool preempt)
                rq = context_switch(rq, prev, next, &rf);
        } else {
                rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-               rq_unpin_lock(rq, &rf);
-               raw_spin_unlock_irq(&rq->lock);
+               rq_unlock_irq(rq, &rf);
        }
 
        balance_callback(rq);
@@ -3684,7 +3684,8 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-       int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
+       int oldprio, queued, running, queue_flag =
+               DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
        const struct sched_class *prev_class;
        struct rq_flags rf;
        struct rq *rq;
@@ -3805,7 +3806,7 @@ void set_user_nice(struct task_struct *p, long nice)
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, DEQUEUE_SAVE);
+               dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
        if (running)
                put_prev_task(rq, p);
 
@@ -3816,7 +3817,7 @@ void set_user_nice(struct task_struct *p, long nice)
        delta = p->prio - old_prio;
 
        if (queued) {
-               enqueue_task(rq, p, ENQUEUE_RESTORE);
+               enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@ -4126,7 +4127,7 @@ static int __sched_setscheduler(struct task_struct *p,
        const struct sched_class *prev_class;
        struct rq_flags rf;
        int reset_on_fork;
-       int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+       int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
        struct rq *rq;
 
        /* May grab non-irq protected spin_locks: */
@@ -4923,7 +4924,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
  */
 SYSCALL_DEFINE0(sched_yield)
 {
-       struct rq *rq = this_rq_lock();
+       struct rq_flags rf;
+       struct rq *rq;
+
+       local_irq_disable();
+       rq = this_rq();
+       rq_lock(rq, &rf);
 
        schedstat_inc(rq->yld_count);
        current->sched_class->yield_task(rq);
@@ -4932,9 +4938,8 @@ SYSCALL_DEFINE0(sched_yield)
         * Since we are going to call schedule() anyway, there's
         * no need to preempt or enable interrupts:
         */
-       __release(rq->lock);
-       spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
-       do_raw_spin_unlock(&rq->lock);
+       preempt_disable();
+       rq_unlock(rq, &rf);
        sched_preempt_enable_no_resched();
 
        schedule();
@@ -5514,7 +5519,7 @@ void sched_setnuma(struct task_struct *p, int nid)
        p->numa_preferred_nid = nid;
 
        if (queued)
-               enqueue_task(rq, p, ENQUEUE_RESTORE);
+               enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
                set_curr_task(rq, p);
        task_rq_unlock(rq, p, &rf);
@@ -5579,11 +5584,11 @@ static struct task_struct fake_task = {
  * there's no concurrency possible, we hold the required locks anyway
  * because of lock validation efforts.
  */
-static void migrate_tasks(struct rq *dead_rq)
+static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
 {
        struct rq *rq = dead_rq;
        struct task_struct *next, *stop = rq->stop;
-       struct rq_flags rf;
+       struct rq_flags orf = *rf;
        int dest_cpu;
 
        /*
@@ -5602,9 +5607,7 @@ static void migrate_tasks(struct rq *dead_rq)
         * class method both need to have an up-to-date
         * value of rq->clock[_task]
         */
-       rq_pin_lock(rq, &rf);
        update_rq_clock(rq);
-       rq_unpin_lock(rq, &rf);
 
        for (;;) {
                /*
@@ -5617,8 +5620,7 @@ static void migrate_tasks(struct rq *dead_rq)
                /*
                 * pick_next_task() assumes pinned rq->lock:
                 */
-               rq_repin_lock(rq, &rf);
-               next = pick_next_task(rq, &fake_task, &rf);
+               next = pick_next_task(rq, &fake_task, rf);
                BUG_ON(!next);
                next->sched_class->put_prev_task(rq, next);
 
@@ -5631,10 +5633,9 @@ static void migrate_tasks(struct rq *dead_rq)
                 * because !cpu_active at this point, which means load-balance
                 * will not interfere. Also, stop-machine.
                 */
-               rq_unpin_lock(rq, &rf);
-               raw_spin_unlock(&rq->lock);
+               rq_unlock(rq, rf);
                raw_spin_lock(&next->pi_lock);
-               raw_spin_lock(&rq->lock);
+               rq_relock(rq, rf);
 
                /*
                 * Since we're inside stop-machine, _nothing_ should have
@@ -5648,12 +5649,12 @@ static void migrate_tasks(struct rq *dead_rq)
 
                /* Find suitable destination for @next, with force if needed. */
                dest_cpu = select_fallback_rq(dead_rq->cpu, next);
-
-               rq = __migrate_task(rq, next, dest_cpu);
+               rq = __migrate_task(rq, rf, next, dest_cpu);
                if (rq != dead_rq) {
-                       raw_spin_unlock(&rq->lock);
+                       rq_unlock(rq, rf);
                        rq = dead_rq;
-                       raw_spin_lock(&rq->lock);
+                       *rf = orf;
+                       rq_relock(rq, rf);
                }
                raw_spin_unlock(&next->pi_lock);
        }
@@ -5766,7 +5767,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
 int sched_cpu_activate(unsigned int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
+       struct rq_flags rf;
 
        set_cpu_active(cpu, true);
 
@@ -5784,12 +5785,12 @@ int sched_cpu_activate(unsigned int cpu)
         * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
         *    domains.
         */
-       raw_spin_lock_irqsave(&rq->lock, flags);
+       rq_lock_irqsave(rq, &rf);
        if (rq->rd) {
                BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                set_rq_online(rq);
        }
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       rq_unlock_irqrestore(rq, &rf);
 
        update_max_interval();
 
@@ -5847,18 +5848,20 @@ int sched_cpu_starting(unsigned int cpu)
 int sched_cpu_dying(unsigned int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
+       struct rq_flags rf;
 
        /* Handle pending wakeups and then migrate everything off */
        sched_ttwu_pending();
-       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       rq_lock_irqsave(rq, &rf);
        if (rq->rd) {
                BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                set_rq_offline(rq);
        }
-       migrate_tasks(rq);
+       migrate_tasks(rq, &rf);
        BUG_ON(rq->nr_running != 1);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       rq_unlock_irqrestore(rq, &rf);
+
        calc_load_migrate(rq);
        update_max_interval();
        nohz_balance_exit_idle(cpu);
@@ -6412,7 +6415,8 @@ static void sched_change_group(struct task_struct *tsk, int type)
  */
 void sched_move_task(struct task_struct *tsk)
 {
-       int queued, running;
+       int queued, running, queue_flags =
+               DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
        struct rq_flags rf;
        struct rq *rq;
 
@@ -6423,14 +6427,14 @@ void sched_move_task(struct task_struct *tsk)
        queued = task_on_rq_queued(tsk);
 
        if (queued)
-               dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+               dequeue_task(rq, tsk, queue_flags);
        if (running)
                put_prev_task(rq, tsk);
 
        sched_change_group(tsk, TASK_MOVE_GROUP);
 
        if (queued)
-               enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
+               enqueue_task(rq, tsk, queue_flags);
        if (running)
                set_curr_task(rq, tsk);
 
@@ -7008,14 +7012,15 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
        for_each_online_cpu(i) {
                struct cfs_rq *cfs_rq = tg->cfs_rq[i];
                struct rq *rq = cfs_rq->rq;
+               struct rq_flags rf;
 
-               raw_spin_lock_irq(&rq->lock);
+               rq_lock_irq(rq, &rf);
                cfs_rq->runtime_enabled = runtime_enabled;
                cfs_rq->runtime_remaining = 0;
 
                if (cfs_rq->throttled)
                        unthrottle_cfs_rq(cfs_rq);
-               raw_spin_unlock_irq(&rq->lock);
+               rq_unlock_irq(rq, &rf);
        }
        if (runtime_was_enabled && !runtime_enabled)
                cfs_bandwidth_usage_dec();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dea138964b9107b3e22542a8b80f5cf1d43c1dee..76f67b3e34d6d3f80264e309954ef59083829f45 100644
@@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[] = {
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
  */
-static __always_inline u64 decay_load(u64 val, u64 n)
+static u64 decay_load(u64 val, u64 n)
 {
        unsigned int local_n;
 
@@ -2795,31 +2795,112 @@ static __always_inline u64 decay_load(u64 val, u64 n)
        return val;
 }
 
-/*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
- *
- * We can compute this reasonably efficiently by combining:
- *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
- */
-static u32 __compute_runnable_contrib(u64 n)
+static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
 {
-       u32 contrib = 0;
+       u32 c1, c2, c3 = remainder; /* y^0 == 1 */
+
+       if (!periods)
+               return remainder - period_contrib;
 
-       if (likely(n <= LOAD_AVG_PERIOD))
-               return runnable_avg_yN_sum[n];
-       else if (unlikely(n >= LOAD_AVG_MAX_N))
+       if (unlikely(periods >= LOAD_AVG_MAX_N))
                return LOAD_AVG_MAX;
 
-       /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
-       contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
-       n %= LOAD_AVG_PERIOD;
-       contrib = decay_load(contrib, n);
-       return contrib + runnable_avg_yN_sum[n];
+       /*
+        * c1 = d1 y^(p+1)
+        */
+       c1 = decay_load((u64)(1024 - period_contrib), periods);
+
+       periods -= 1;
+       /*
+        * For updates fully spanning n periods, the contribution to runnable
+        * average will be:
+        *
+        *   c2 = 1024 \Sum y^n
+        *
+        * We can compute this reasonably efficiently by combining:
+        *
+        *   y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
+        */
+       if (likely(periods <= LOAD_AVG_PERIOD)) {
+               c2 = runnable_avg_yN_sum[periods];
+       } else {
+               c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
+               periods %= LOAD_AVG_PERIOD;
+               c2 = decay_load(c2, periods);
+               c2 += runnable_avg_yN_sum[periods];
+       }
+
+       return c1 + c2 + c3;
 }
 
 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 
+/*
+ * Accumulate the three separate parts of the sum; d1 the remainder
+ * of the last (incomplete) period, d2 the span of full periods and d3
+ * the remainder of the (incomplete) current period.
+ *
+ *           d1          d2           d3
+ *           ^           ^            ^
+ *           |           |            |
+ *         |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ *                                p
+ * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0
+ *                               n=1
+ *
+ *    = u y^(p+1) +                            (Step 1)
+ *
+ *                          p
+ *      d1 y^(p+1) + 1024 \Sum y^n + d3 y^0    (Step 2)
+ *                         n=1
+ */
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+              unsigned long weight, int running, struct cfs_rq *cfs_rq)
+{
+       unsigned long scale_freq, scale_cpu;
+       u64 periods;
+       u32 contrib;
+
+       scale_freq = arch_scale_freq_capacity(NULL, cpu);
+       scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+       delta += sa->period_contrib;
+       periods = delta / 1024; /* A period is 1024us (~1ms) */
+
+       /*
+        * Step 1: decay old *_sum if we crossed period boundaries.
+        */
+       if (periods) {
+               sa->load_sum = decay_load(sa->load_sum, periods);
+               if (cfs_rq) {
+                       cfs_rq->runnable_load_sum =
+                               decay_load(cfs_rq->runnable_load_sum, periods);
+               }
+               sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+       }
+
+       /*
+        * Step 2
+        */
+       delta %= 1024;
+       contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+       sa->period_contrib = delta;
+
+       contrib = cap_scale(contrib, scale_freq);
+       if (weight) {
+               sa->load_sum += weight * contrib;
+               if (cfs_rq)
+                       cfs_rq->runnable_load_sum += weight * contrib;
+       }
+       if (running)
+               sa->util_sum += contrib * scale_cpu;
+
+       return periods;
+}
+
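To make the arithmetic above concrete: with a 1024us period and y chosen so that y^32 == 1/2 (the decay_load() comment earlier in this file), the d1/d2/d3 picture reduces to the single formula u' = (u + d1) y^(p+1) + 1024 \Sum_{n=1..p} y^n + d3. A minimal floating-point sketch of that formula follows; it only illustrates the documented math, while the kernel stays in fixed point and uses the precomputed runnable_avg_yN_sum / __accumulated_sum_N32 tables instead of calling pow().

#include <math.h>
#include <stdio.h>

/* y^32 == 0.5: a contribution halves roughly every 32 periods (~32ms). */
#define Y pow(0.5, 1.0 / 32.0)

/* u' = (u + d1) y^(p+1) + 1024 \Sum_{n=1..p} y^n + d3, per the figure above. */
static double accumulate(double u, double d1, unsigned long p, double d3)
{
	double full_periods = 0.0;

	for (unsigned long n = 1; n <= p; n++)
		full_periods += 1024.0 * pow(Y, n);

	return (u + d1) * pow(Y, p + 1) + full_periods + d3;
}

int main(void)
{
	/* Old sum 20000, 300us closing out the old period (d1),
	 * 4 full periods in between (d2), 200us into the new one (d3). */
	printf("u' = %.1f\n", accumulate(20000.0, 300.0, 4, 200.0));
	return 0;
}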
 /*
  * We can represent the historical contribution to runnable average as the
  * coefficients of a geometric series.  To do this we sub-divide our runnable
@@ -2849,13 +2930,10 @@ static u32 __compute_runnable_contrib(u64 n)
  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
  */
 static __always_inline int
-__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                  unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-       u64 delta, scaled_delta, periods;
-       u32 contrib;
-       unsigned int delta_w, scaled_delta_w, decayed = 0;
-       unsigned long scale_freq, scale_cpu;
+       u64 delta;
 
        delta = now - sa->last_update_time;
        /*
@@ -2876,81 +2954,49 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                return 0;
        sa->last_update_time = now;
 
-       scale_freq = arch_scale_freq_capacity(NULL, cpu);
-       scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
-       /* delta_w is the amount already accumulated against our next period */
-       delta_w = sa->period_contrib;
-       if (delta + delta_w >= 1024) {
-               decayed = 1;
-
-               /* how much left for next period will start over, we don't know yet */
-               sa->period_contrib = 0;
-
-               /*
-                * Now that we know we're crossing a period boundary, figure
-                * out how much from delta we need to complete the current
-                * period and accrue it.
-                */
-               delta_w = 1024 - delta_w;
-               scaled_delta_w = cap_scale(delta_w, scale_freq);
-               if (weight) {
-                       sa->load_sum += weight * scaled_delta_w;
-                       if (cfs_rq) {
-                               cfs_rq->runnable_load_sum +=
-                                               weight * scaled_delta_w;
-                       }
-               }
-               if (running)
-                       sa->util_sum += scaled_delta_w * scale_cpu;
-
-               delta -= delta_w;
-
-               /* Figure out how many additional periods this update spans */
-               periods = delta / 1024;
-               delta %= 1024;
+       /*
+        * Now we know we crossed measurement unit boundaries. The *_avg
+        * accrues by two steps:
+        *
+        * Step 1: accumulate *_sum since last_update_time. If we haven't
+        * crossed period boundaries, finish.
+        */
+       if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+               return 0;
 
-               sa->load_sum = decay_load(sa->load_sum, periods + 1);
-               if (cfs_rq) {
-                       cfs_rq->runnable_load_sum =
-                               decay_load(cfs_rq->runnable_load_sum, periods + 1);
-               }
-               sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
-
-               /* Efficiently calculate \sum (1..n_period) 1024*y^i */
-               contrib = __compute_runnable_contrib(periods);
-               contrib = cap_scale(contrib, scale_freq);
-               if (weight) {
-                       sa->load_sum += weight * contrib;
-                       if (cfs_rq)
-                               cfs_rq->runnable_load_sum += weight * contrib;
-               }
-               if (running)
-                       sa->util_sum += contrib * scale_cpu;
+       /*
+        * Step 2: update *_avg.
+        */
+       sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+       if (cfs_rq) {
+               cfs_rq->runnable_load_avg =
+                       div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
        }
+       sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 
-       /* Remainder of delta accrued against u_0` */
-       scaled_delta = cap_scale(delta, scale_freq);
-       if (weight) {
-               sa->load_sum += weight * scaled_delta;
-               if (cfs_rq)
-                       cfs_rq->runnable_load_sum += weight * scaled_delta;
-       }
-       if (running)
-               sa->util_sum += scaled_delta * scale_cpu;
+       return 1;
+}
 
-       sa->period_contrib += delta;
+static int
+__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
+{
+       return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
+}
 
-       if (decayed) {
-               sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
-               if (cfs_rq) {
-                       cfs_rq->runnable_load_avg =
-                               div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
-               }
-               sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
-       }
+static int
+__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       return ___update_load_avg(now, cpu, &se->avg,
+                                 se->on_rq * scale_load_down(se->load.weight),
+                                 cfs_rq->curr == se, NULL);
+}
 
-       return decayed;
+static int
+__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
+{
+       return ___update_load_avg(now, cpu, &cfs_rq->avg,
+                       scale_load_down(cfs_rq->load.weight),
+                       cfs_rq->curr != NULL, cfs_rq);
 }
 
 /*
@@ -3014,6 +3060,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 void set_task_rq_fair(struct sched_entity *se,
                      struct cfs_rq *prev, struct cfs_rq *next)
 {
+       u64 p_last_update_time;
+       u64 n_last_update_time;
+
        if (!sched_feat(ATTACH_AGE_LOAD))
                return;
 
@@ -3024,11 +3073,11 @@ void set_task_rq_fair(struct sched_entity *se,
         * time. This will result in the wakee task is less decayed, but giving
         * the wakee more load sounds not bad.
         */
-       if (se->avg.last_update_time && prev) {
-               u64 p_last_update_time;
-               u64 n_last_update_time;
+       if (!(se->avg.last_update_time && prev))
+               return;
 
 #ifndef CONFIG_64BIT
+       {
                u64 p_last_update_time_copy;
                u64 n_last_update_time_copy;
 
@@ -3043,14 +3092,13 @@ void set_task_rq_fair(struct sched_entity *se,
 
                } while (p_last_update_time != p_last_update_time_copy ||
                         n_last_update_time != n_last_update_time_copy);
+       }
 #else
-               p_last_update_time = prev->avg.last_update_time;
-               n_last_update_time = next->avg.last_update_time;
+       p_last_update_time = prev->avg.last_update_time;
+       n_last_update_time = next->avg.last_update_time;
 #endif
-               __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
-                                 &se->avg, 0, 0, NULL);
-               se->avg.last_update_time = n_last_update_time;
-       }
+       __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
+       se->avg.last_update_time = n_last_update_time;
 }
 
 /* Take into account change of utilization of a child task group */
@@ -3173,6 +3221,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
        return 1;
 }
 
+/*
+ * Check if we need to update the load and the utilization of a blocked
+ * group_entity:
+ */
+static inline bool skip_blocked_update(struct sched_entity *se)
+{
+       struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+
+       /*
+        * If sched_entity still have not zero load or utilization, we have to
+        * decay it:
+        */
+       if (se->avg.load_avg || se->avg.util_avg)
+               return false;
+
+       /*
+        * If there is a pending propagation, we have to update the load and
+        * the utilization of the sched_entity:
+        */
+       if (gcfs_rq->propagate_avg)
+               return false;
+
+       /*
+        * Otherwise, the load and the utilization of the sched_entity is
+        * already zero and there is no pending propagation, so it will be a
+        * waste of time to try to decay it:
+        */
+       return true;
+}
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
@@ -3265,8 +3343,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
                set_tg_cfs_propagate(cfs_rq);
        }
 
-       decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
-               scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
+       decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
 
 #ifndef CONFIG_64BIT
        smp_wmb();
@@ -3298,11 +3375,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
         * Track task load average for carrying it to new CPU after migrated, and
         * track group sched_entity load average for task_h_load calc in migration
         */
-       if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
-               __update_load_avg(now, cpu, &se->avg,
-                         se->on_rq * scale_load_down(se->load.weight),
-                         cfs_rq->curr == se, NULL);
-       }
+       if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+               __update_load_avg_se(now, cpu, cfs_rq, se);
 
        decayed  = update_cfs_rq_load_avg(now, cfs_rq, true);
        decayed |= propagate_entity_load_avg(se);
@@ -3407,7 +3481,7 @@ void sync_entity_load_avg(struct sched_entity *se)
        u64 last_update_time;
 
        last_update_time = cfs_rq_last_update_time(cfs_rq);
-       __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+       __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
 }
 
 /*
@@ -4271,8 +4345,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
        list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
                                throttled_list) {
                struct rq *rq = rq_of(cfs_rq);
+               struct rq_flags rf;
 
-               raw_spin_lock(&rq->lock);
+               rq_lock(rq, &rf);
                if (!cfs_rq_throttled(cfs_rq))
                        goto next;
 
@@ -4289,7 +4364,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
                        unthrottle_cfs_rq(cfs_rq);
 
 next:
-               raw_spin_unlock(&rq->lock);
+               rq_unlock(rq, &rf);
 
                if (!remaining)
                        break;
@@ -5097,15 +5172,16 @@ void cpu_load_update_nohz_stop(void)
        unsigned long curr_jiffies = READ_ONCE(jiffies);
        struct rq *this_rq = this_rq();
        unsigned long load;
+       struct rq_flags rf;
 
        if (curr_jiffies == this_rq->last_load_update_tick)
                return;
 
        load = weighted_cpuload(cpu_of(this_rq));
-       raw_spin_lock(&this_rq->lock);
+       rq_lock(this_rq, &rf);
        update_rq_clock(this_rq);
        cpu_load_update_nohz(this_rq, curr_jiffies, load);
-       raw_spin_unlock(&this_rq->lock);
+       rq_unlock(this_rq, &rf);
 }
 #else /* !CONFIG_NO_HZ_COMMON */
 static inline void cpu_load_update_nohz(struct rq *this_rq,
@@ -6769,7 +6845,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
        lockdep_assert_held(&env->src_rq->lock);
 
        p->on_rq = TASK_ON_RQ_MIGRATING;
-       deactivate_task(env->src_rq, p, 0);
+       deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
        set_task_cpu(p, env->dst_cpu);
 }
 
@@ -6902,7 +6978,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
        lockdep_assert_held(&rq->lock);
 
        BUG_ON(task_rq(p) != rq);
-       activate_task(rq, p, 0);
+       activate_task(rq, p, ENQUEUE_NOCLOCK);
        p->on_rq = TASK_ON_RQ_QUEUED;
        check_preempt_curr(rq, p, 0);
 }
@@ -6913,9 +6989,12 @@ static void attach_task(struct rq *rq, struct task_struct *p)
  */
 static void attach_one_task(struct rq *rq, struct task_struct *p)
 {
-       raw_spin_lock(&rq->lock);
+       struct rq_flags rf;
+
+       rq_lock(rq, &rf);
+       update_rq_clock(rq);
        attach_task(rq, p);
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
 }
 
 /*
@@ -6926,8 +7005,10 @@ static void attach_tasks(struct lb_env *env)
 {
        struct list_head *tasks = &env->tasks;
        struct task_struct *p;
+       struct rq_flags rf;
 
-       raw_spin_lock(&env->dst_rq->lock);
+       rq_lock(env->dst_rq, &rf);
+       update_rq_clock(env->dst_rq);
 
        while (!list_empty(tasks)) {
                p = list_first_entry(tasks, struct task_struct, se.group_node);
@@ -6936,7 +7017,7 @@ static void attach_tasks(struct lb_env *env)
                attach_task(env->dst_rq, p);
        }
 
-       raw_spin_unlock(&env->dst_rq->lock);
+       rq_unlock(env->dst_rq, &rf);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6944,9 +7025,9 @@ static void update_blocked_averages(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
        struct cfs_rq *cfs_rq;
-       unsigned long flags;
+       struct rq_flags rf;
 
-       raw_spin_lock_irqsave(&rq->lock, flags);
+       rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);
 
        /*
@@ -6954,6 +7035,8 @@ static void update_blocked_averages(int cpu)
         * list_add_leaf_cfs_rq() for details.
         */
        for_each_leaf_cfs_rq(rq, cfs_rq) {
+               struct sched_entity *se;
+
                /* throttled entities do not contribute to load */
                if (throttled_hierarchy(cfs_rq))
                        continue;
@@ -6961,11 +7044,12 @@ static void update_blocked_averages(int cpu)
                if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
                        update_tg_load_avg(cfs_rq, 0);
 
-               /* Propagate pending load changes to the parent */
-               if (cfs_rq->tg->se[cpu])
-                       update_load_avg(cfs_rq->tg->se[cpu], 0);
+               /* Propagate pending load changes to the parent, if any: */
+               se = cfs_rq->tg->se[cpu];
+               if (se && !skip_blocked_update(se))
+                       update_load_avg(se, 0);
        }
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       rq_unlock_irqrestore(rq, &rf);
 }
 
 /*
@@ -7019,12 +7103,12 @@ static inline void update_blocked_averages(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
        struct cfs_rq *cfs_rq = &rq->cfs;
-       unsigned long flags;
+       struct rq_flags rf;
 
-       raw_spin_lock_irqsave(&rq->lock, flags);
+       rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);
        update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       rq_unlock_irqrestore(rq, &rf);
 }
 
 static unsigned long task_h_load(struct task_struct *p)
@@ -7525,6 +7609,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 {
        struct sched_domain *child = env->sd->child;
        struct sched_group *sg = env->sd->groups;
+       struct sg_lb_stats *local = &sds->local_stat;
        struct sg_lb_stats tmp_sgs;
        int load_idx, prefer_sibling = 0;
        bool overload = false;
@@ -7541,7 +7626,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
                if (local_group) {
                        sds->local = sg;
-                       sgs = &sds->local_stat;
+                       sgs = local;
 
                        if (env->idle != CPU_NEWLY_IDLE ||
                            time_after_eq(jiffies, sg->sgc->next_update))
@@ -7565,8 +7650,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                 * the tasks on the system).
                 */
                if (prefer_sibling && sds->local &&
-                   group_has_capacity(env, &sds->local_stat) &&
-                   (sgs->sum_nr_running > 1)) {
+                   group_has_capacity(env, local) &&
+                   (sgs->sum_nr_running > local->sum_nr_running + 1)) {
                        sgs->group_no_capacity = 1;
                        sgs->group_type = group_classify(sg, sgs);
                }
@@ -8042,7 +8127,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
        struct sched_domain *sd_parent = sd->parent;
        struct sched_group *group;
        struct rq *busiest;
-       unsigned long flags;
+       struct rq_flags rf;
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
 
        struct lb_env env = {
@@ -8105,7 +8190,7 @@ redo:
                env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
 more_balance:
-               raw_spin_lock_irqsave(&busiest->lock, flags);
+               rq_lock_irqsave(busiest, &rf);
                update_rq_clock(busiest);
 
                /*
@@ -8122,14 +8207,14 @@ more_balance:
                 * See task_rq_lock() family for the details.
                 */
 
-               raw_spin_unlock(&busiest->lock);
+               rq_unlock(busiest, &rf);
 
                if (cur_ld_moved) {
                        attach_tasks(&env);
                        ld_moved += cur_ld_moved;
                }
 
-               local_irq_restore(flags);
+               local_irq_restore(rf.flags);
 
                if (env.flags & LBF_NEED_BREAK) {
                        env.flags &= ~LBF_NEED_BREAK;
@@ -8207,6 +8292,8 @@ more_balance:
                        sd->nr_balance_failed++;
 
                if (need_active_balance(&env)) {
+                       unsigned long flags;
+
                        raw_spin_lock_irqsave(&busiest->lock, flags);
 
                        /* don't kick the active_load_balance_cpu_stop,
@@ -8444,8 +8531,9 @@ static int active_load_balance_cpu_stop(void *data)
        struct rq *target_rq = cpu_rq(target_cpu);
        struct sched_domain *sd;
        struct task_struct *p = NULL;
+       struct rq_flags rf;
 
-       raw_spin_lock_irq(&busiest_rq->lock);
+       rq_lock_irq(busiest_rq, &rf);
 
        /* make sure the requested cpu hasn't gone down in the meantime */
        if (unlikely(busiest_cpu != smp_processor_id() ||
@@ -8496,7 +8584,7 @@ static int active_load_balance_cpu_stop(void *data)
        rcu_read_unlock();
 out_unlock:
        busiest_rq->active_balance = 0;
-       raw_spin_unlock(&busiest_rq->lock);
+       rq_unlock(busiest_rq, &rf);
 
        if (p)
                attach_one_task(target_rq, p);
@@ -8794,10 +8882,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
                 * do the balance.
                 */
                if (time_after_eq(jiffies, rq->next_balance)) {
-                       raw_spin_lock_irq(&rq->lock);
+                       struct rq_flags rf;
+
+                       rq_lock_irq(rq, &rf);
                        update_rq_clock(rq);
                        cpu_load_update_idle(rq);
-                       raw_spin_unlock_irq(&rq->lock);
+                       rq_unlock_irq(rq, &rf);
+
                        rebalance_domains(rq, CPU_IDLE);
                }
 
@@ -8988,8 +9079,9 @@ static void task_fork_fair(struct task_struct *p)
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se, *curr;
        struct rq *rq = this_rq();
+       struct rq_flags rf;
 
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, &rf);
        update_rq_clock(rq);
 
        cfs_rq = task_cfs_rq(current);
@@ -9010,7 +9102,7 @@ static void task_fork_fair(struct task_struct *p)
        }
 
        se->vruntime -= cfs_rq->min_vruntime;
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
 }
 
 /*
@@ -9372,7 +9464,6 @@ static DEFINE_MUTEX(shares_mutex);
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 {
        int i;
-       unsigned long flags;
 
        /*
         * We can't change the weight of the root cgroup.
@@ -9389,19 +9480,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
        tg->shares = shares;
        for_each_possible_cpu(i) {
                struct rq *rq = cpu_rq(i);
-               struct sched_entity *se;
+               struct sched_entity *se = tg->se[i];
+               struct rq_flags rf;
 
-               se = tg->se[i];
                /* Propagate contribution to hierarchy */
-               raw_spin_lock_irqsave(&rq->lock, flags);
-
-               /* Possible calls to update_curr() need rq clock */
+               rq_lock_irqsave(rq, &rf);
                update_rq_clock(rq);
                for_each_sched_entity(se) {
                        update_load_avg(se, UPDATE_TG);
                        update_cfs_shares(se);
                }
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
+               rq_unlock_irqrestore(rq, &rf);
        }
 
 done:
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1b3c8189b28656d2644a714ff60ceab7d015d97b..11192e0cb122c72066a2f46417246464d909ea7c 100644
@@ -56,6 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true)
  */
 SCHED_FEAT(SIS_AVG_CPU, false)
 
+/*
+ * Issue a WARN when we do multiple update_rq_clock() calls
+ * in a single rq->lock section. Default disabled because the
+ * annotations are not complete.
+ */
+SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
+
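A toy model of what this feature catches, kept in userspace C so it is self-contained; the names toy_rq, toy_lock() and TOY_UPDATED are invented for the illustration. The real check lives in update_rq_clock() (see the core.c hunk above) against rq->clock_update_flags; where exactly the flag is reset in the real code is simplified here to the lock call.

#include <stdbool.h>
#include <stdio.h>

#define TOY_UPDATED	0x04		/* stand-in for RQCF_UPDATED */

static bool warn_double_clock = true;	/* stand-in for sched_feat(WARN_DOUBLE_CLOCK) */

struct toy_rq {
	unsigned int clock_update_flags;
	unsigned long long clock;
};

static void toy_lock(struct toy_rq *rq)
{
	/* Taking the lock opens a fresh section: no clock update seen yet. */
	rq->clock_update_flags &= ~TOY_UPDATED;
}

static void toy_update_clock(struct toy_rq *rq)
{
	if (warn_double_clock && (rq->clock_update_flags & TOY_UPDATED))
		fprintf(stderr, "WARN: second clock update in one lock section\n");

	rq->clock_update_flags |= TOY_UPDATED;
	rq->clock++;			/* pretend we sampled sched_clock_cpu() */
}

int main(void)
{
	struct toy_rq rq = { 0 };

	toy_lock(&rq);
	toy_update_clock(&rq);
	toy_update_clock(&rq);		/* redundant update -> warning fires */
	return 0;
}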
 #ifdef HAVE_RT_PUSH_IPI
 /*
  * In order to avoid a thundering herd attack of CPUs that are
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9f3e40226dec875c7b318b4a9e6a2c01a89604ac..979b7341008afc94f839f38b3fd84a12b7a3fb16 100644
@@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq)
 #define RT_PUSH_IPI_EXECUTING          1
 #define RT_PUSH_IPI_RESTART            2
 
+/*
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * On large CPU boxes, there's the case that several CPUs could schedule
+ * a lower priority task at the same time, in which case it will look for
+ * any overloaded CPUs that it could pull a task from. To do this, the runqueue
+ * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
+ * for a single overloaded CPU's runqueue lock can produce a large latency.
+ * (This has actually been observed on large boxes running cyclictest).
+ * Instead of taking the runqueue lock of the overloaded CPU, each of the
+ * CPUs that scheduled a lower priority task simply sends an IPI to the
+ * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
+ * lots of contention. The overloaded CPU will look to push its non-running
+ * RT task off, and if it does, it can then ignore the other IPIs coming
+ * in, and just pass those IPIs off to any other overloaded CPU.
+ *
+ * When a CPU schedules a lower priority task, it only sends an IPI to
+ * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
+ * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
+ * RT overloaded tasks, would cause 100 IPIs to go out at once.
+ *
+ * The overloaded RT CPU, when receiving an IPI, will try to push off its
+ * overloaded RT tasks and then send an IPI to the next CPU that has
+ * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
+ * have completed. Just because a CPU may have pushed off its own overloaded
+ * RT task does not mean it should stop sending the IPI around to other
+ * overloaded CPUs. There may be another RT task waiting to run on one of
+ * those CPUs that are of higher priority than the one that was just
+ * pushed.
+ *
+ * An optimization that could possibly be made is to make a CPU array similar
+ * to the cpupri array mask of all running RT tasks, but for the overloaded
+ * case, then the IPI could be sent to only the CPU with the highest priority
+ * RT task waiting, and that CPU could send off further IPIs to the CPU with
+ * the next highest waiting task. Since the overloaded case is much less likely
+ * to happen, the complexity of this implementation may not be worth it.
+ * Instead, just send an IPI around to all overloaded CPUs.
+ *
+ * The rq->rt.push_flags holds the status of the IPI that is going around.
+ * A run queue can only send out a single IPI at a time. The possible flags
+ * for rq->rt.push_flags are:
+ *
+ *    (None or zero):          No IPI is going around for the current rq
+ *    RT_PUSH_IPI_EXECUTING:   An IPI for the rq is being passed around
+ *    RT_PUSH_IPI_RESTART:     The priority of the running task for the rq
+ *                             has changed, and the IPI should restart
+ *                             circulating the overloaded CPUs again.
+ *
+ * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
+ * before sending to the next CPU.
+ *
+ * Instead of having all CPUs that schedule a lower priority task send
+ * an IPI to the same "first" CPU in the RT overload mask, they send it
+ * to the next overloaded CPU after their own CPU. This helps distribute
+ * the work when there's more than one overloaded CPU and multiple CPUs
+ * scheduling in lower priority tasks.
+ *
+ * When a rq schedules a lower priority task than what was currently
+ * running, the next CPU with overloaded RT tasks is examined first.
+ * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
+ * priority task, it will send an IPI first to CPU 5, then CPU 5 will
+ * send to CPU 1 if it is still overloaded. CPU 1 will clear the
+ * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
+ *
+ * The first CPU to notice IPI_RESTART is set, will clear that flag and then
+ * send an IPI to the next overloaded CPU after the rq->cpu and not the next
+ * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
+ * schedules a lower priority task, and the IPI_RESTART gets set while the
+ * handling is being done on CPU 5, it will clear the flag and send it back to
+ * CPU 4 instead of CPU 1.
+ *
+ * Note, the above logic can be disabled by turning off the sched_feature
+ * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
+ * taken by the CPU requesting a pull and the waiting RT task will be pulled
+ * by that CPU. This may be fine for machines with few CPUs.
+ */
 static void tell_cpu_to_push(struct rq *rq)
 {
        int cpu;
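The traversal described in the comment above can be modelled in a few lines of ordinary C: a source CPU that just scheduled a lower-priority task starts the chain, and each overloaded CPU pushes a task and forwards the IPI to the next overloaded CPU after itself, until the scan wraps back to the source. This is only a sketch of the ordering; it ignores RT_PUSH_IPI_RESTART, priority checks, locking and the real IPI machinery, and every identifier in it is made up for the example.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

/* CPUs that currently have more than one queued RT task ("overloaded"). */
static bool rt_overloaded[NR_CPUS] = {
	[1] = true, [4] = true, [5] = true,
};

/*
 * Next overloaded CPU strictly after @cpu, scanning circularly;
 * -1 once the scan reaches @src again (the CPU that started the chain).
 */
static int next_push_cpu(int cpu, int src)
{
	for (int i = 1; i <= NR_CPUS; i++) {
		int c = (cpu + i) % NR_CPUS;

		if (c == src)
			return -1;	/* full circle: all overloaded CPUs visited */
		if (rt_overloaded[c])
			return c;
	}
	return -1;
}

int main(void)
{
	int src = 3;	/* CPU 3 just scheduled a lower-priority task */

	for (int cpu = next_push_cpu(src, src); cpu != -1;
	     cpu = next_push_cpu(cpu, src))
		printf("IPI -> CPU %d: push a queued RT task, then forward\n", cpu);

	return 0;
}

With CPUs 1, 4 and 5 overloaded and CPU 3 as the source, the chain visits 4, then 5, then 1, following the next-after-own-CPU ordering described above.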
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cbf92214ad89287d111ab8300e5b55923d83ffe..de4b934ba9743da0cb8ee4515178f6102ddcc87e 100644
@@ -1331,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40];
 #define DEQUEUE_SLEEP          0x01
 #define DEQUEUE_SAVE           0x02 /* matches ENQUEUE_RESTORE */
 #define DEQUEUE_MOVE           0x04 /* matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK                0x08 /* matches ENQUEUE_NOCLOCK */
 
 #define ENQUEUE_WAKEUP         0x01
 #define ENQUEUE_RESTORE                0x02
 #define ENQUEUE_MOVE           0x04
+#define ENQUEUE_NOCLOCK                0x08
 
-#define ENQUEUE_HEAD           0x08
-#define ENQUEUE_REPLENISH      0x10
+#define ENQUEUE_HEAD           0x10
+#define ENQUEUE_REPLENISH      0x20
 #ifdef CONFIG_SMP
-#define ENQUEUE_MIGRATED       0x20
+#define ENQUEUE_MIGRATED       0x40
 #else
 #define ENQUEUE_MIGRATED       0x00
 #endif
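The flag renumbering above preserves two invariants that the comments rely on: every DEQUEUE_* bit with an ENQUEUE_* counterpart keeps the same value, and the new NOCLOCK bit does not overlap the enqueue-only flags that were shifted up to make room for it. A compile-time check of those invariants, with the values copied from this hunk and CONFIG_SMP assumed:

#include <assert.h>

#define DEQUEUE_SLEEP		0x01
#define DEQUEUE_SAVE		0x02	/* matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE		0x04	/* matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK		0x08	/* matches ENQUEUE_NOCLOCK */

#define ENQUEUE_WAKEUP		0x01
#define ENQUEUE_RESTORE		0x02
#define ENQUEUE_MOVE		0x04
#define ENQUEUE_NOCLOCK		0x08
#define ENQUEUE_HEAD		0x10
#define ENQUEUE_REPLENISH	0x20
#define ENQUEUE_MIGRATED	0x40	/* CONFIG_SMP case */

static_assert(DEQUEUE_SAVE == ENQUEUE_RESTORE, "SAVE must pair with RESTORE");
static_assert(DEQUEUE_MOVE == ENQUEUE_MOVE, "MOVE must pair with MOVE");
static_assert(DEQUEUE_NOCLOCK == ENQUEUE_NOCLOCK, "NOCLOCK must pair with NOCLOCK");
static_assert((ENQUEUE_NOCLOCK &
	       (ENQUEUE_HEAD | ENQUEUE_REPLENISH | ENQUEUE_MIGRATED)) == 0,
	      "NOCLOCK must not collide with the shifted enqueue-only flags");

int main(void) { return 0; }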
@@ -1624,6 +1626,7 @@ static inline void sched_avg_update(struct rq *rq) { }
 
 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
        __acquires(rq->lock);
+
 struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
        __acquires(p->pi_lock)
        __acquires(rq->lock);
@@ -1645,6 +1648,62 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
        raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 }
 
+static inline void
+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
+       __acquires(rq->lock)
+{
+       raw_spin_lock_irqsave(&rq->lock, rf->flags);
+       rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock_irq(struct rq *rq, struct rq_flags *rf)
+       __acquires(rq->lock)
+{
+       raw_spin_lock_irq(&rq->lock);
+       rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock(struct rq *rq, struct rq_flags *rf)
+       __acquires(rq->lock)
+{
+       raw_spin_lock(&rq->lock);
+       rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_relock(struct rq *rq, struct rq_flags *rf)
+       __acquires(rq->lock)
+{
+       raw_spin_lock(&rq->lock);
+       rq_repin_lock(rq, rf);
+}
+
+static inline void
+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
+       __releases(rq->lock)
+{
+       rq_unpin_lock(rq, rf);
+       raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
+}
+
+static inline void
+rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
+       __releases(rq->lock)
+{
+       rq_unpin_lock(rq, rf);
+       raw_spin_unlock_irq(&rq->lock);
+}
+
+static inline void
+rq_unlock(struct rq *rq, struct rq_flags *rf)
+       __releases(rq->lock)
+{
+       rq_unpin_lock(rq, rf);
+       raw_spin_unlock(&rq->lock);
+}
+
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PREEMPT
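The point of the helper family added above is that the saved IRQ state and the lockdep pin travel together in one struct rq_flags, so each call site pairs a single rq_lock*() with a single rq_unlock*() instead of the three-line open-coded sequences that the core.c and fair.c hunks replace. A rough userspace analogy of that shape follows, a guard object carried alongside a mutex; all names are invented for the illustration and the saved state field is only a placeholder for what rf->flags and the pin cookie hold in the kernel.

#include <pthread.h>
#include <stdio.h>

/* Analogue of struct rq_flags: everything that must be restored at unlock
 * time travels in one object, so a caller cannot forget half the sequence. */
struct guard_flags {
	int saved_state;	/* placeholder for rf->flags (saved IRQ state) */
	int pinned;		/* placeholder for the lockdep pin cookie */
};

struct toy_rq {
	pthread_mutex_t lock;
	int clock;
};

static void toy_rq_lock(struct toy_rq *rq, struct guard_flags *gf)
{
	pthread_mutex_lock(&rq->lock);
	gf->saved_state = 0;	/* pretend we saved interrupt state here */
	gf->pinned = 1;		/* analogue of rq_pin_lock() */
}

static void toy_rq_unlock(struct toy_rq *rq, struct guard_flags *gf)
{
	gf->pinned = 0;		/* analogue of rq_unpin_lock() */
	pthread_mutex_unlock(&rq->lock);
}

int main(void)
{
	struct toy_rq rq = { .lock = PTHREAD_MUTEX_INITIALIZER, .clock = 0 };
	struct guard_flags gf;

	toy_rq_lock(&rq, &gf);
	rq.clock++;		/* the critical section, e.g. update_rq_clock() */
	toy_rq_unlock(&rq, &gf);

	printf("clock = %d\n", rq.clock);
	return 0;
}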