]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/commitdiff
sched: Fix yet more sched_fork() races
authorPeter Zijlstra <peterz@infradead.org>
Mon, 14 Feb 2022 09:16:57 +0000 (10:16 +0100)
committerPaolo Pisati <paolo.pisati@canonical.com>
Wed, 9 Mar 2022 14:17:55 +0000 (15:17 +0100)
BugLink: https://bugs.launchpad.net/bugs/1964361
commit b1e8206582f9d680cff7d04828708c8b6ab32957 upstream.

Where commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an
invalid sched_task_group") fixed a fork race vs cgroup, it opened up a
race vs syscalls by not placing the task on the runqueue before it
gets exposed through the pidhash.

Commit 13765de8148f ("sched/fair: Fix fault in reweight_entity") is
trying to fix a single instance of this, instead fix the whole class
of issues, effectively reverting this commit.

Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group")
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Tadeusz Struk <tadeusz.struk@linaro.org>
Tested-by: Zhang Qiao <zhangqiao22@huawei.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lkml.kernel.org/r/YgoeCbwj5mbCR0qA@hirez.programming.kicks-ass.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Paolo Pisati <paolo.pisati@canonical.com>
include/linux/sched/task.h
kernel/fork.c
kernel/sched/core.c

index 058d7f371e25af6fb2226bc9646909239af96b23..8db25fcc1eba7c4ce32d610446d0196811249ac4 100644 (file)
@@ -54,8 +54,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
 extern void init_idle(struct task_struct *idle, int cpu);
 
 extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
-extern void sched_post_fork(struct task_struct *p,
-                           struct kernel_clone_args *kargs);
+extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+extern void sched_post_fork(struct task_struct *p);
 extern void sched_dead(struct task_struct *p);
 
 void __noreturn do_task_dead(void);
index 05c8c4df847c2fff25426c835ee87747449a883e..4ab3b1dd12c3c4cad8b0ba003ed5ba8b6185f3db 100644 (file)
@@ -2306,6 +2306,17 @@ static __latent_entropy struct task_struct *copy_process(
        if (retval)
                goto bad_fork_put_pidfd;
 
+       /*
+        * Now that the cgroups are pinned, re-clone the parent cgroup and put
+        * the new task on the correct runqueue. All this *before* the task
+        * becomes visible.
+        *
+        * This isn't part of ->can_fork() because while the re-cloning is
+        * cgroup specific, it unconditionally needs to place the task on a
+        * runqueue.
+        */
+       sched_cgroup_fork(p, args);
+
        /*
         * From this point on we must avoid any synchronous user-space
         * communication until we take the tasklist-lock. In particular, we do
@@ -2415,7 +2426,7 @@ static __latent_entropy struct task_struct *copy_process(
                fd_install(pidfd, pidfile);
 
        proc_fork_connector(p);
-       sched_post_fork(p, args);
+       sched_post_fork(p);
        cgroup_post_fork(p, args);
        perf_event_fork(p);
 
index 4959ed985c301c7edad4850d0b64c27aa4b3a44d..aa09853d15620a0c7f821ddfb5931d66c2bf46d8 100644 (file)
@@ -1199,9 +1199,8 @@ int tg_nop(struct task_group *tg, void *data)
 }
 #endif
 
-static void set_load_weight(struct task_struct *p)
+static void set_load_weight(struct task_struct *p, bool update_load)
 {
-       bool update_load = !(READ_ONCE(p->__state) & TASK_NEW);
        int prio = p->static_prio - MAX_RT_PRIO;
        struct load_weight *load = &p->se.load;
 
@@ -4359,7 +4358,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
                        p->static_prio = NICE_TO_PRIO(0);
 
                p->prio = p->normal_prio = p->static_prio;
-               set_load_weight(p);
+               set_load_weight(p, false);
 
                /*
                 * We don't need the reset flag anymore after the fork. It has
@@ -4377,6 +4376,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 
        init_entity_runnable_average(&p->se);
 
+
 #ifdef CONFIG_SCHED_INFO
        if (likely(sched_info_on()))
                memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -4392,18 +4392,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
        return 0;
 }
 
-void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
 {
        unsigned long flags;
-#ifdef CONFIG_CGROUP_SCHED
-       struct task_group *tg;
-#endif
 
+       /*
+        * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
+        * required yet, but lockdep gets upset if rules are violated.
+        */
        raw_spin_lock_irqsave(&p->pi_lock, flags);
 #ifdef CONFIG_CGROUP_SCHED
-       tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
-                         struct task_group, css);
-       p->sched_task_group = autogroup_task_group(p, tg);
+       if (1) {
+               struct task_group *tg;
+               tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+                                 struct task_group, css);
+               tg = autogroup_task_group(p, tg);
+               p->sched_task_group = tg;
+       }
 #endif
        rseq_migrate(p);
        /*
@@ -4414,7 +4419,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
        if (p->sched_class->task_fork)
                p->sched_class->task_fork(p);
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
 
+void sched_post_fork(struct task_struct *p)
+{
        uclamp_post_fork(p);
 }
 
@@ -6903,7 +6911,7 @@ void set_user_nice(struct task_struct *p, long nice)
                put_prev_task(rq, p);
 
        p->static_prio = NICE_TO_PRIO(nice);
-       set_load_weight(p);
+       set_load_weight(p, true);
        old_prio = p->prio;
        p->prio = effective_prio(p);
 
@@ -7195,7 +7203,7 @@ static void __setscheduler_params(struct task_struct *p,
         */
        p->rt_priority = attr->sched_priority;
        p->normal_prio = normal_prio(p);
-       set_load_weight(p);
+       set_load_weight(p, true);
 }
 
 /*
@@ -9433,7 +9441,7 @@ void __init sched_init(void)
 #endif
        }
 
-       set_load_weight(&init_task);
+       set_load_weight(&init_task, false);
 
        /*
         * The boot idle thread does lazy MMU switching as well: