diff --git a/kernel/fork.c b/kernel/fork.c
index 2295fc69717f6c3d877ef3cac15b55336d7746c6..e5d9d405ae4e55ce862318a152609cd80e7e0c6e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -77,6 +77,7 @@
 #include <linux/blkdev.h>
 #include <linux/fs_struct.h>
 #include <linux/magic.h>
+#include <linux/sched/mm.h>
 #include <linux/perf_event.h>
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
@@ -282,8 +283,9 @@ static void free_thread_stack(struct task_struct *tsk)
 
 void thread_stack_cache_init(void)
 {
-       thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
-                                             THREAD_SIZE, 0, NULL);
+       thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
+                                       THREAD_SIZE, THREAD_SIZE, 0, 0,
+                                       THREAD_SIZE, NULL);
        BUG_ON(thread_stack_cache == NULL);
 }
 # endif
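
For context on the kmem_cache_create_usercopy() conversion above: the two extra arguments (useroffset, usersize) delimit the only byte range of each object that copy_to_user()/copy_from_user() may touch under CONFIG_HARDENED_USERCOPY. A minimal sketch of the pattern follows; the struct, field, and cache names are made up for illustration and are not part of this patch.

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>

struct demo_obj {
	spinlock_t lock;	/* kernel-internal, never exposed to user space */
	char user_buf[128];	/* the only region copy_{to,from}_user may touch */
};

static struct kmem_cache *demo_cachep;

static int __init demo_cache_init(void)
{
	demo_cachep = kmem_cache_create_usercopy("demo_obj",
			sizeof(struct demo_obj), 0, 0,
			offsetof(struct demo_obj, user_buf),	 /* useroffset */
			sizeof_field(struct demo_obj, user_buf), /* usersize */
			NULL);
	return demo_cachep ? 0 : -ENOMEM;
}
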
@@ -390,6 +392,234 @@ void free_task(struct task_struct *tsk)
 }
 EXPORT_SYMBOL(free_task);
 
+#ifdef CONFIG_MMU
+static __latent_entropy int dup_mmap(struct mm_struct *mm,
+                                       struct mm_struct *oldmm)
+{
+       struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
+       struct rb_node **rb_link, *rb_parent;
+       int retval;
+       unsigned long charge;
+       LIST_HEAD(uf);
+
+       uprobe_start_dup_mmap();
+       if (down_write_killable(&oldmm->mmap_sem)) {
+               retval = -EINTR;
+               goto fail_uprobe_end;
+       }
+       flush_cache_dup_mm(oldmm);
+       uprobe_dup_mmap(oldmm, mm);
+       /*
+        * Not linked in yet - no deadlock potential:
+        */
+       down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
+
+       /* No ordering required: file already has been exposed. */
+       RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+
+       mm->total_vm = oldmm->total_vm;
+       mm->data_vm = oldmm->data_vm;
+       mm->exec_vm = oldmm->exec_vm;
+       mm->stack_vm = oldmm->stack_vm;
+
+       rb_link = &mm->mm_rb.rb_node;
+       rb_parent = NULL;
+       pprev = &mm->mmap;
+       retval = ksm_fork(mm, oldmm);
+       if (retval)
+               goto out;
+       retval = khugepaged_fork(mm, oldmm);
+       if (retval)
+               goto out;
+
+       prev = NULL;
+       for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+               struct file *file;
+
+               if (mpnt->vm_flags & VM_DONTCOPY) {
+                       vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
+                       continue;
+               }
+               charge = 0;
+               if (mpnt->vm_flags & VM_ACCOUNT) {
+                       unsigned long len = vma_pages(mpnt);
+
+                       if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
+                               goto fail_nomem;
+                       charge = len;
+               }
+               tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+               if (!tmp)
+                       goto fail_nomem;
+               *tmp = *mpnt;
+               INIT_LIST_HEAD(&tmp->anon_vma_chain);
+               retval = vma_dup_policy(mpnt, tmp);
+               if (retval)
+                       goto fail_nomem_policy;
+               tmp->vm_mm = mm;
+               retval = dup_userfaultfd(tmp, &uf);
+               if (retval)
+                       goto fail_nomem_anon_vma_fork;
+               if (tmp->vm_flags & VM_WIPEONFORK) {
+                       /* VM_WIPEONFORK gets a clean slate in the child. */
+                       tmp->anon_vma = NULL;
+                       if (anon_vma_prepare(tmp))
+                               goto fail_nomem_anon_vma_fork;
+               } else if (anon_vma_fork(tmp, mpnt))
+                       goto fail_nomem_anon_vma_fork;
+               tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
+               tmp->vm_next = tmp->vm_prev = NULL;
+               file = tmp->vm_file;
+               if (file) {
+                       struct inode *inode = file_inode(file);
+                       struct address_space *mapping = file->f_mapping;
+
+                       get_file(file);
+                       if (tmp->vm_flags & VM_DENYWRITE)
+                               atomic_dec(&inode->i_writecount);
+                       i_mmap_lock_write(mapping);
+                       if (tmp->vm_flags & VM_SHARED)
+                               atomic_inc(&mapping->i_mmap_writable);
+                       flush_dcache_mmap_lock(mapping);
+                       /* insert tmp into the share list, just after mpnt */
+                       vma_interval_tree_insert_after(tmp, mpnt,
+                                       &mapping->i_mmap);
+                       flush_dcache_mmap_unlock(mapping);
+                       i_mmap_unlock_write(mapping);
+               }
+
+               /*
+                * Clear hugetlb-related page reserves for children. This only
+                * affects MAP_PRIVATE mappings. Faults generated by the child
+                * are not guaranteed to succeed, even if read-only
+                */
+               if (is_vm_hugetlb_page(tmp))
+                       reset_vma_resv_huge_pages(tmp);
+
+               /*
+                * Link in the new vma and copy the page table entries.
+                */
+               *pprev = tmp;
+               pprev = &tmp->vm_next;
+               tmp->vm_prev = prev;
+               prev = tmp;
+
+               __vma_link_rb(mm, tmp, rb_link, rb_parent);
+               rb_link = &tmp->vm_rb.rb_right;
+               rb_parent = &tmp->vm_rb;
+
+               mm->map_count++;
+               if (!(tmp->vm_flags & VM_WIPEONFORK))
+                       retval = copy_page_range(mm, oldmm, mpnt);
+
+               if (tmp->vm_ops && tmp->vm_ops->open)
+                       tmp->vm_ops->open(tmp);
+
+               if (retval)
+                       goto out;
+       }
+       /* a new mm has just been created */
+       retval = arch_dup_mmap(oldmm, mm);
+out:
+       up_write(&mm->mmap_sem);
+       flush_tlb_mm(oldmm);
+       up_write(&oldmm->mmap_sem);
+       dup_userfaultfd_complete(&uf);
+fail_uprobe_end:
+       uprobe_end_dup_mmap();
+       return retval;
+fail_nomem_anon_vma_fork:
+       mpol_put(vma_policy(tmp));
+fail_nomem_policy:
+       kmem_cache_free(vm_area_cachep, tmp);
+fail_nomem:
+       retval = -ENOMEM;
+       vm_unacct_memory(charge);
+       goto out;
+}
+
+static inline int mm_alloc_pgd(struct mm_struct *mm)
+{
+       mm->pgd = pgd_alloc(mm);
+       if (unlikely(!mm->pgd))
+               return -ENOMEM;
+       return 0;
+}
+
+static inline void mm_free_pgd(struct mm_struct *mm)
+{
+       pgd_free(mm, mm->pgd);
+}
+#else
+static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+       down_write(&oldmm->mmap_sem);
+       RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+       up_write(&oldmm->mmap_sem);
+       return 0;
+}
+#define mm_alloc_pgd(mm)       (0)
+#define mm_free_pgd(mm)
+#endif /* CONFIG_MMU */
+
+static void check_mm(struct mm_struct *mm)
+{
+       int i;
+
+       for (i = 0; i < NR_MM_COUNTERS; i++) {
+               long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+               if (unlikely(x))
+                       printk(KERN_ALERT "BUG: Bad rss-counter state "
+                                         "mm:%p idx:%d val:%ld\n", mm, i, x);
+       }
+
+       if (mm_pgtables_bytes(mm))
+               pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+                               mm_pgtables_bytes(mm));
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+       VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
+#endif
+}
+
+#define allocate_mm()  (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
+#define free_mm(mm)    (kmem_cache_free(mm_cachep, (mm)))
+
+/*
+ * Called when the last reference to the mm
+ * is dropped: either by a lazy thread or by
+ * mmput. Free the page directory and the mm.
+ */
+void __mmdrop(struct mm_struct *mm)
+{
+       BUG_ON(mm == &init_mm);
+       mm_free_pgd(mm);
+       destroy_context(mm);
+       hmm_mm_destroy(mm);
+       mmu_notifier_mm_destroy(mm);
+       check_mm(mm);
+       put_user_ns(mm->user_ns);
+       free_mm(mm);
+}
+EXPORT_SYMBOL_GPL(__mmdrop);
+
+static void mmdrop_async_fn(struct work_struct *work)
+{
+       struct mm_struct *mm;
+
+       mm = container_of(work, struct mm_struct, async_put_work);
+       __mmdrop(mm);
+}
+
+static void mmdrop_async(struct mm_struct *mm)
+{
+       if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
+               INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
+               schedule_work(&mm->async_put_work);
+       }
+}
+
 static inline void free_signal_struct(struct signal_struct *sig)
 {
        taskstats_tgid_free(sig);
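
The __mmdrop()/mmdrop_async() pair moved above releases the mm_count reference, as opposed to mm_users: mm_users pins the address space (VMAs, page tables), mm_count pins only struct mm_struct itself. A minimal sketch of how a caller typically converts one into the other follows; demo_pin_mm() is a hypothetical helper, not something introduced by this patch.

#include <linux/sched/mm.h>

static struct mm_struct *demo_pin_mm(struct task_struct *task)
{
	struct mm_struct *mm = get_task_mm(task);	/* +1 mm_users */

	if (!mm)
		return NULL;
	mmgrab(mm);	/* +1 mm_count: struct mm_struct stays valid long-term */
	mmput(mm);	/* -1 mm_users: the address space may now be torn down */
	return mm;	/* caller releases with mmdrop(mm) */
}
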
@@ -457,6 +688,21 @@ static void set_max_threads(unsigned int max_threads_suggested)
 int arch_task_struct_size __read_mostly;
 #endif
 
+static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
+{
+       /* Fetch thread_struct whitelist for the architecture. */
+       arch_thread_struct_whitelist(offset, size);
+
+       /*
+        * Handle zero-sized whitelist or empty thread_struct, otherwise
+        * adjust offset to position of thread_struct in task_struct.
+        */
+       if (unlikely(*size == 0))
+               *offset = 0;
+       else
+               *offset += offsetof(struct task_struct, thread);
+}
+
 void __init fork_init(void)
 {
        int i;
@@ -465,11 +711,14 @@ void __init fork_init(void)
 #define ARCH_MIN_TASKALIGN     0
 #endif
        int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
+       unsigned long useroffset, usersize;
 
        /* create a slab on which task_structs can be allocated */
-       task_struct_cachep = kmem_cache_create("task_struct",
+       task_struct_whitelist(&useroffset, &usersize);
+       task_struct_cachep = kmem_cache_create_usercopy("task_struct",
                        arch_task_struct_size, align,
-                       SLAB_PANIC|SLAB_ACCOUNT, NULL);
+                       SLAB_PANIC|SLAB_ACCOUNT,
+                       useroffset, usersize, NULL);
 #endif
 
        /* do the arch specific task caches init */
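
task_struct_whitelist() above delegates to arch_thread_struct_whitelist() and then rebases the reported window onto offsetof(struct task_struct, thread). As a purely hypothetical illustration (not taken from this patch or any particular architecture), an arch that wanted its whole thread_struct to be user-copyable could provide something like the following, e.g. in its asm/processor.h:

/* Hypothetical arch hook: whitelist the entire thread_struct. */
static inline void arch_thread_struct_whitelist(unsigned long *offset,
						unsigned long *size)
{
	*offset = 0;
	*size = sizeof(struct thread_struct);
}
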
@@ -594,181 +843,8 @@ free_tsk:
        return NULL;
 }
 
-#ifdef CONFIG_MMU
-static __latent_entropy int dup_mmap(struct mm_struct *mm,
-                                       struct mm_struct *oldmm)
-{
-       struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
-       struct rb_node **rb_link, *rb_parent;
-       int retval;
-       unsigned long charge;
-       LIST_HEAD(uf);
-
-       uprobe_start_dup_mmap();
-       if (down_write_killable(&oldmm->mmap_sem)) {
-               retval = -EINTR;
-               goto fail_uprobe_end;
-       }
-       flush_cache_dup_mm(oldmm);
-       uprobe_dup_mmap(oldmm, mm);
-       /*
-        * Not linked in yet - no deadlock potential:
-        */
-       down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
-
-       /* No ordering required: file already has been exposed. */
-       RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
-
-       mm->total_vm = oldmm->total_vm;
-       mm->data_vm = oldmm->data_vm;
-       mm->exec_vm = oldmm->exec_vm;
-       mm->stack_vm = oldmm->stack_vm;
-
-       rb_link = &mm->mm_rb.rb_node;
-       rb_parent = NULL;
-       pprev = &mm->mmap;
-       retval = ksm_fork(mm, oldmm);
-       if (retval)
-               goto out;
-       retval = khugepaged_fork(mm, oldmm);
-       if (retval)
-               goto out;
-
-       prev = NULL;
-       for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
-               struct file *file;
-
-               if (mpnt->vm_flags & VM_DONTCOPY) {
-                       vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
-                       continue;
-               }
-               charge = 0;
-               if (mpnt->vm_flags & VM_ACCOUNT) {
-                       unsigned long len = vma_pages(mpnt);
-
-                       if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
-                               goto fail_nomem;
-                       charge = len;
-               }
-               tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-               if (!tmp)
-                       goto fail_nomem;
-               *tmp = *mpnt;
-               INIT_LIST_HEAD(&tmp->anon_vma_chain);
-               retval = vma_dup_policy(mpnt, tmp);
-               if (retval)
-                       goto fail_nomem_policy;
-               tmp->vm_mm = mm;
-               retval = dup_userfaultfd(tmp, &uf);
-               if (retval)
-                       goto fail_nomem_anon_vma_fork;
-               if (tmp->vm_flags & VM_WIPEONFORK) {
-                       /* VM_WIPEONFORK gets a clean slate in the child. */
-                       tmp->anon_vma = NULL;
-                       if (anon_vma_prepare(tmp))
-                               goto fail_nomem_anon_vma_fork;
-               } else if (anon_vma_fork(tmp, mpnt))
-                       goto fail_nomem_anon_vma_fork;
-               tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
-               tmp->vm_next = tmp->vm_prev = NULL;
-               file = tmp->vm_file;
-               if (file) {
-                       struct inode *inode = file_inode(file);
-                       struct address_space *mapping = file->f_mapping;
-
-                       get_file(file);
-                       if (tmp->vm_flags & VM_DENYWRITE)
-                               atomic_dec(&inode->i_writecount);
-                       i_mmap_lock_write(mapping);
-                       if (tmp->vm_flags & VM_SHARED)
-                               atomic_inc(&mapping->i_mmap_writable);
-                       flush_dcache_mmap_lock(mapping);
-                       /* insert tmp into the share list, just after mpnt */
-                       vma_interval_tree_insert_after(tmp, mpnt,
-                                       &mapping->i_mmap);
-                       flush_dcache_mmap_unlock(mapping);
-                       i_mmap_unlock_write(mapping);
-               }
-
-               /*
-                * Clear hugetlb-related page reserves for children. This only
-                * affects MAP_PRIVATE mappings. Faults generated by the child
-                * are not guaranteed to succeed, even if read-only
-                */
-               if (is_vm_hugetlb_page(tmp))
-                       reset_vma_resv_huge_pages(tmp);
-
-               /*
-                * Link in the new vma and copy the page table entries.
-                */
-               *pprev = tmp;
-               pprev = &tmp->vm_next;
-               tmp->vm_prev = prev;
-               prev = tmp;
-
-               __vma_link_rb(mm, tmp, rb_link, rb_parent);
-               rb_link = &tmp->vm_rb.rb_right;
-               rb_parent = &tmp->vm_rb;
-
-               mm->map_count++;
-               if (!(tmp->vm_flags & VM_WIPEONFORK))
-                       retval = copy_page_range(mm, oldmm, mpnt);
-
-               if (tmp->vm_ops && tmp->vm_ops->open)
-                       tmp->vm_ops->open(tmp);
-
-               if (retval)
-                       goto out;
-       }
-       /* a new mm has just been created */
-       retval = arch_dup_mmap(oldmm, mm);
-out:
-       up_write(&mm->mmap_sem);
-       flush_tlb_mm(oldmm);
-       up_write(&oldmm->mmap_sem);
-       dup_userfaultfd_complete(&uf);
-fail_uprobe_end:
-       uprobe_end_dup_mmap();
-       return retval;
-fail_nomem_anon_vma_fork:
-       mpol_put(vma_policy(tmp));
-fail_nomem_policy:
-       kmem_cache_free(vm_area_cachep, tmp);
-fail_nomem:
-       retval = -ENOMEM;
-       vm_unacct_memory(charge);
-       goto out;
-}
-
-static inline int mm_alloc_pgd(struct mm_struct *mm)
-{
-       mm->pgd = pgd_alloc(mm);
-       if (unlikely(!mm->pgd))
-               return -ENOMEM;
-       return 0;
-}
-
-static inline void mm_free_pgd(struct mm_struct *mm)
-{
-       pgd_free(mm, mm->pgd);
-}
-#else
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
-{
-       down_write(&oldmm->mmap_sem);
-       RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
-       up_write(&oldmm->mmap_sem);
-       return 0;
-}
-#define mm_alloc_pgd(mm)       (0)
-#define mm_free_pgd(mm)
-#endif /* CONFIG_MMU */
-
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 
-#define allocate_mm()  (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
-#define free_mm(mm)    (kmem_cache_free(mm_cachep, (mm)))
-
 static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
 
 static int __init coredump_filter_setup(char *s)
@@ -858,27 +934,6 @@ fail_nopgd:
        return NULL;
 }
 
-static void check_mm(struct mm_struct *mm)
-{
-       int i;
-
-       for (i = 0; i < NR_MM_COUNTERS; i++) {
-               long x = atomic_long_read(&mm->rss_stat.count[i]);
-
-               if (unlikely(x))
-                       printk(KERN_ALERT "BUG: Bad rss-counter state "
-                                         "mm:%p idx:%d val:%ld\n", mm, i, x);
-       }
-
-       if (mm_pgtables_bytes(mm))
-               pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
-                               mm_pgtables_bytes(mm));
-
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
-       VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
-#endif
-}
-
 /*
  * Allocate and initialize an mm_struct.
  */
@@ -894,24 +949,6 @@ struct mm_struct *mm_alloc(void)
        return mm_init(mm, current, current_user_ns());
 }
 
-/*
- * Called when the last reference to the mm
- * is dropped: either by a lazy thread or by
- * mmput. Free the page directory and the mm.
- */
-void __mmdrop(struct mm_struct *mm)
-{
-       BUG_ON(mm == &init_mm);
-       mm_free_pgd(mm);
-       destroy_context(mm);
-       hmm_mm_destroy(mm);
-       mmu_notifier_mm_destroy(mm);
-       check_mm(mm);
-       put_user_ns(mm->user_ns);
-       free_mm(mm);
-}
-EXPORT_SYMBOL_GPL(__mmdrop);
-
 static inline void __mmput(struct mm_struct *mm)
 {
        VM_BUG_ON(atomic_read(&mm->mm_users));
@@ -1544,6 +1581,10 @@ static __latent_entropy struct task_struct *copy_process(
        int retval;
        struct task_struct *p;
 
+       /*
+        * Don't allow sharing the root directory with processes in a different
+        * namespace
+        */
        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                return ERR_PTR(-EINVAL);
 
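
The comment added above documents the existing CLONE_NEWNS|CLONE_FS rejection. A small user-space check (illustrative only, not part of this patch) shows the effect; since the test sits at the very top of copy_process(), before any capability checks, it fails with EINVAL even for unprivileged callers.

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>

static char child_stack[64 * 1024];

static int child_fn(void *arg)
{
	return 0;
}

int main(void)
{
	/* Rejected up front by copy_process(). */
	if (clone(child_fn, child_stack + sizeof(child_stack),
		  CLONE_NEWNS | CLONE_FS | SIGCHLD, NULL) == -1)
		printf("clone: %s\n", strerror(errno));	/* "Invalid argument" */
	return 0;
}
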
@@ -2019,6 +2060,8 @@ long _do_fork(unsigned long clone_flags,
              int __user *child_tidptr,
              unsigned long tls)
 {
+       struct completion vfork;
+       struct pid *pid;
        struct task_struct *p;
        int trace = 0;
        long nr;
@@ -2044,43 +2087,40 @@ long _do_fork(unsigned long clone_flags,
        p = copy_process(clone_flags, stack_start, stack_size,
                         child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
        add_latent_entropy();
+
+       if (IS_ERR(p))
+               return PTR_ERR(p);
+
        /*
         * Do this prior waking up the new thread - the thread pointer
         * might get invalid after that point, if the thread exits quickly.
         */
-       if (!IS_ERR(p)) {
-               struct completion vfork;
-               struct pid *pid;
+       trace_sched_process_fork(current, p);
 
-               trace_sched_process_fork(current, p);
+       pid = get_task_pid(p, PIDTYPE_PID);
+       nr = pid_vnr(pid);
 
-               pid = get_task_pid(p, PIDTYPE_PID);
-               nr = pid_vnr(pid);
+       if (clone_flags & CLONE_PARENT_SETTID)
+               put_user(nr, parent_tidptr);
 
-               if (clone_flags & CLONE_PARENT_SETTID)
-                       put_user(nr, parent_tidptr);
-
-               if (clone_flags & CLONE_VFORK) {
-                       p->vfork_done = &vfork;
-                       init_completion(&vfork);
-                       get_task_struct(p);
-               }
+       if (clone_flags & CLONE_VFORK) {
+               p->vfork_done = &vfork;
+               init_completion(&vfork);
+               get_task_struct(p);
+       }
 
-               wake_up_new_task(p);
+       wake_up_new_task(p);
 
-               /* forking complete and child started to run, tell ptracer */
-               if (unlikely(trace))
-                       ptrace_event_pid(trace, pid);
+       /* forking complete and child started to run, tell ptracer */
+       if (unlikely(trace))
+               ptrace_event_pid(trace, pid);
 
-               if (clone_flags & CLONE_VFORK) {
-                       if (!wait_for_vfork_done(p, &vfork))
-                               ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
-               }
-
-               put_pid(pid);
-       } else {
-               nr = PTR_ERR(p);
+       if (clone_flags & CLONE_VFORK) {
+               if (!wait_for_vfork_done(p, &vfork))
+                       ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
        }
+
+       put_pid(pid);
        return nr;
 }
 
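
The CLONE_VFORK path above parks the parent on a stack-allocated completion that the child signals when it execs or exits (via complete_vfork_done() from mm_release()). The underlying primitive works like this minimal sketch; the names are hypothetical and the real code uses a killable, freezer-aware wait.

#include <linux/completion.h>

static struct completion demo_done;

static void demo_waiter(void)
{
	init_completion(&demo_done);
	/* ... start the other party ... */
	wait_for_completion(&demo_done);	/* waiter blocks here */
}

static void demo_finisher(void)
{
	complete(&demo_done);			/* wakes the waiter exactly once */
}
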
@@ -2224,9 +2264,11 @@ void __init proc_caches_init(void)
         * maximum number of CPU's we can ever have.  The cpumask_allocation
         * is at the end of the structure, exactly for that reason.
         */
-       mm_cachep = kmem_cache_create("mm_struct",
+       mm_cachep = kmem_cache_create_usercopy("mm_struct",
                        sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
+                       offsetof(struct mm_struct, saved_auxv),
+                       sizeof_field(struct mm_struct, saved_auxv),
                        NULL);
        vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
        mmap_init();
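
The mm_struct whitelist added above is sized to saved_auxv via offsetof()/sizeof_field(), since that array is the only part of mm_struct that legitimately travels to user space. A simplified sketch (not from this patch) of the kind of consumer that needs it, roughly what a /proc/<pid>/auxv reader does, with locking and partial reads omitted:

#include <linux/errno.h>
#include <linux/mm_types.h>
#include <linux/uaccess.h>

static long demo_read_auxv(struct mm_struct *mm, void __user *buf)
{
	/* Only bytes inside the whitelisted saved_auxv window are copied. */
	if (copy_to_user(buf, mm->saved_auxv, sizeof(mm->saved_auxv)))
		return -EFAULT;
	return sizeof(mm->saved_auxv);
}
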