sched/core: Free the stack early if CONFIG_THREAD_INFO_IN_TASK

[mirror_ubuntu-zesty-kernel.git] / kernel / fork.c
diff --git a/kernel/fork.c b/kernel/fork.c

index 42451aeb245f9aed29539ff75f648a2386d795b3..5dd0a516626d9d13ad09c6ad06a01105caa40193 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -148,57 +148,69 @@ static inline void free_task_struct(struct task_struct *tsk)
  }
  #endif
  
-void __weak arch_release_thread_info(struct thread_info *ti)
+void __weak arch_release_thread_stack(unsigned long *stack)
  {
  }
  
-#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
  
  /*
   * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
   * kmemcache based allocator.
   */
-# if THREAD_SIZE >= PAGE_SIZE
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
-                                                 int node)
+# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
  {
-       struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
-                                                 THREAD_SIZE_ORDER);
+#ifdef CONFIG_VMAP_STACK
+       void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
+                                          VMALLOC_START, VMALLOC_END,
+                                          THREADINFO_GFP | __GFP_HIGHMEM,
+                                          PAGE_KERNEL,
+                                          0, node,
+                                          __builtin_return_address(0));
  
-       if (page)
-               memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
-                                           1 << THREAD_SIZE_ORDER);
+       /*
+        * We can't call find_vm_area() in interrupt context, and
+        * free_thread_stack() can be called in interrupt context,
+        * so cache the vm_struct.
+        */
+       if (stack)
+               tsk->stack_vm_area = find_vm_area(stack);
+       return stack;
+#else
+       struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+                                            THREAD_SIZE_ORDER);
  
         return page ? page_address(page) : NULL;
+#endif
  }
  
-static inline void free_thread_info(struct thread_info *ti)
+static inline void free_thread_stack(struct task_struct *tsk)
  {
-       struct page *page = virt_to_page(ti);
-
-       memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
-                                   -(1 << THREAD_SIZE_ORDER));
-       __free_kmem_pages(page, THREAD_SIZE_ORDER);
+       if (task_stack_vm_area(tsk))
+               vfree(tsk->stack);
+       else
+               __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
  }
  # else
-static struct kmem_cache *thread_info_cache;
+static struct kmem_cache *thread_stack_cache;
  
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
                                                   int node)
  {
-       return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+       return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
  }
  
-static void free_thread_info(struct thread_info *ti)
+static void free_thread_stack(struct task_struct *tsk)
  {
-       kmem_cache_free(thread_info_cache, ti);
+       kmem_cache_free(thread_stack_cache, tsk->stack);
  }
  
-void thread_info_cache_init(void)
+void thread_stack_cache_init(void)
  {
-       thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+       thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
                                               THREAD_SIZE, 0, NULL);
-       BUG_ON(thread_info_cache == NULL);
+       BUG_ON(thread_stack_cache == NULL);
  }
  # endif
  #endif
@@ -221,18 +233,76 @@ struct kmem_cache *vm_area_cachep;
  /* SLAB cache for mm_struct structures (tsk->mm) */
  static struct kmem_cache *mm_cachep;
  
-static void account_kernel_stack(struct thread_info *ti, int account)
+static void account_kernel_stack(struct task_struct *tsk, int account)
  {
-       struct zone *zone = page_zone(virt_to_page(ti));
+       void *stack = task_stack_page(tsk);
+       struct vm_struct *vm = task_stack_vm_area(tsk);
+
+       BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
+       if (vm) {
+               int i;
+
+               BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+                       mod_zone_page_state(page_zone(vm->pages[i]),
+                                           NR_KERNEL_STACK_KB,
+                                           PAGE_SIZE / 1024 * account);
+               }
+
+               /* All stack pages belong to the same memcg. */
+               memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+                                           account * (THREAD_SIZE / 1024));
+       } else {
+               /*
+                * All stack pages are in the same zone and belong to the
+                * same memcg.
+                */
+               struct page *first_page = virt_to_page(stack);
  
-       mod_zone_page_state(zone, NR_KERNEL_STACK, account);
+               mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+                                   THREAD_SIZE / 1024 * account);
+
+               memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
+                                           account * (THREAD_SIZE / 1024));
+       }
+}
+
+static void release_task_stack(struct task_struct *tsk)
+{
+       account_kernel_stack(tsk, -1);
+       arch_release_thread_stack(tsk->stack);
+       free_thread_stack(tsk);
+       tsk->stack = NULL;
+#ifdef CONFIG_VMAP_STACK
+       tsk->stack_vm_area = NULL;
+#endif
  }
  
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+void put_task_stack(struct task_struct *tsk)
+{
+       if (atomic_dec_and_test(&tsk->stack_refcount))
+               release_task_stack(tsk);
+}
+#endif
+
  void free_task(struct task_struct *tsk)
  {
-       account_kernel_stack(tsk->stack, -1);
-       arch_release_thread_info(tsk->stack);
-       free_thread_info(tsk->stack);
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+       /*
+        * The task is finally done with both the stack and thread_info,
+        * so free both.
+        */
+       release_task_stack(tsk);
+#else
+       /*
+        * If the task had a separate stack allocation, it should be gone
+        * by now.
+        */
+       WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
+#endif
         rt_mutex_debug_task_free(tsk);
         ftrace_graph_exit_task(tsk);
         put_seccomp_filter(tsk);
@@ -340,26 +410,43 @@ void set_task_stack_end_magic(struct task_struct *tsk)
         *stackend = STACK_END_MAGIC;    /* for overflow detection */
  }
  
-static struct task_struct *dup_task_struct(struct task_struct *orig)
+static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
  {
         struct task_struct *tsk;
-       struct thread_info *ti;
-       int node = tsk_fork_get_node(orig);
+       unsigned long *stack;
+       struct vm_struct *stack_vm_area;
         int err;
  
+       if (node == NUMA_NO_NODE)
+               node = tsk_fork_get_node(orig);
         tsk = alloc_task_struct_node(node);
         if (!tsk)
                 return NULL;
  
-       ti = alloc_thread_info_node(tsk, node);
-       if (!ti)
+       stack = alloc_thread_stack_node(tsk, node);
+       if (!stack)
                 goto free_tsk;
  
+       stack_vm_area = task_stack_vm_area(tsk);
+
         err = arch_dup_task_struct(tsk, orig);
+
+       /*
+        * arch_dup_task_struct() clobbers the stack-related fields.  Make
+        * sure they're properly initialized before using any stack-related
+        * functions again.
+        */
+       tsk->stack = stack;
+#ifdef CONFIG_VMAP_STACK
+       tsk->stack_vm_area = stack_vm_area;
+#endif
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+       atomic_set(&tsk->stack_refcount, 1);
+#endif
+
         if (err)
-               goto free_ti;
+               goto free_stack;
  
-       tsk->stack = ti;
  #ifdef CONFIG_SECCOMP
         /*
          * We must handle setting up seccomp filters once we're under
@@ -391,14 +478,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
         tsk->task_frag.page = NULL;
         tsk->wake_q.next = NULL;
  
-       account_kernel_stack(ti, 1);
+       account_kernel_stack(tsk, 1);
  
         kcov_task_init(tsk);
  
         return tsk;
  
-free_ti:
-       free_thread_info(ti);
+free_stack:
+       free_thread_stack(tsk);
  free_tsk:
         free_task_struct(tsk);
         return NULL;
@@ -413,7 +500,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
         unsigned long charge;
  
         uprobe_start_dup_mmap();
-       down_write(&oldmm->mmap_sem);
+       if (down_write_killable(&oldmm->mmap_sem)) {
+               retval = -EINTR;
+               goto fail_uprobe_end;
+       }
         flush_cache_dup_mm(oldmm);
         uprobe_dup_mmap(oldmm, mm);
         /*
@@ -525,6 +615,7 @@ out:
         up_write(&mm->mmap_sem);
         flush_tlb_mm(oldmm);
         up_write(&oldmm->mmap_sem);
+fail_uprobe_end:
         uprobe_end_dup_mmap();
         return retval;
  fail_nomem_anon_vma_fork:
@@ -699,6 +790,26 @@ void __mmdrop(struct mm_struct *mm)
  }
  EXPORT_SYMBOL_GPL(__mmdrop);
  
+static inline void __mmput(struct mm_struct *mm)
+{
+       VM_BUG_ON(atomic_read(&mm->mm_users));
+
+       uprobe_clear_state(mm);
+       exit_aio(mm);
+       ksm_exit(mm);
+       khugepaged_exit(mm); /* must run before exit_mmap */
+       exit_mmap(mm);
+       set_mm_exe_file(mm, NULL);
+       if (!list_empty(&mm->mmlist)) {
+               spin_lock(&mmlist_lock);
+               list_del(&mm->mmlist);
+               spin_unlock(&mmlist_lock);
+       }
+       if (mm->binfmt)
+               module_put(mm->binfmt->module);
+       mmdrop(mm);
+}
+
  /*
   * Decrement the use count and release all resources for an mm.
   */
@@ -706,24 +817,26 @@ void mmput(struct mm_struct *mm)
  {
         might_sleep();
  
+       if (atomic_dec_and_test(&mm->mm_users))
+               __mmput(mm);
+}
+EXPORT_SYMBOL_GPL(mmput);
+
+#ifdef CONFIG_MMU
+static void mmput_async_fn(struct work_struct *work)
+{
+       struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+       __mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
         if (atomic_dec_and_test(&mm->mm_users)) {
-               uprobe_clear_state(mm);
-               exit_aio(mm);
-               ksm_exit(mm);
-               khugepaged_exit(mm); /* must run before exit_mmap */
-               exit_mmap(mm);
-               set_mm_exe_file(mm, NULL);
-               if (!list_empty(&mm->mmlist)) {
-                       spin_lock(&mmlist_lock);
-                       list_del(&mm->mmlist);
-                       spin_unlock(&mmlist_lock);
-               }
-               if (mm->binfmt)
-                       module_put(mm->binfmt->module);
-               mmdrop(mm);
+               INIT_WORK(&mm->async_put_work, mmput_async_fn);
+               schedule_work(&mm->async_put_work);
         }
  }
-EXPORT_SYMBOL_GPL(mmput);
+#endif
  
  /**
   * set_mm_exe_file - change a reference to the mm's executable file
@@ -911,14 +1024,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
         deactivate_mm(tsk, mm);
  
         /*
-        * If we're exiting normally, clear a user-space tid field if
-        * requested.  We leave this alone when dying by signal, to leave
-        * the value intact in a core dump, and to save the unnecessary
-        * trouble, say, a killed vfork parent shouldn't touch this mm.
-        * Userland only wants this done for a sys_exit.
+        * Signal userspace unless we're exiting with a core dump;
+        * in that case, leave the value intact for debugging
+        * purposes.
          */
         if (tsk->clear_child_tid) {
-               if (!(tsk->flags & PF_SIGNALED) &&
+               if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
                     atomic_read(&mm->mm_users) > 1) {
                         /*
                          * We don't check the error code - if userspace has
@@ -1279,7 +1390,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                                         int __user *child_tidptr,
                                         struct pid *pid,
                                         int trace,
-                                       unsigned long tls)
+                                       unsigned long tls,
+                                       int node)
  {
         int retval;
         struct task_struct *p;
@@ -1331,7 +1443,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                 goto fork_out;
  
         retval = -ENOMEM;
-       p = dup_task_struct(current);
+       p = dup_task_struct(current, node);
         if (!p)
                 goto fork_out;
  
@@ -1401,7 +1513,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         p->real_start_time = ktime_get_boot_ns();
         p->io_context = NULL;
         p->audit_context = NULL;
-       threadgroup_change_begin(current);
         cgroup_fork(p);
  #ifdef CONFIG_NUMA
         p->mempolicy = mpol_dup(p->mempolicy);
@@ -1493,7 +1604,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                 pid = alloc_pid(p->nsproxy->pid_ns_for_children);
                 if (IS_ERR(pid)) {
                         retval = PTR_ERR(pid);
-                       goto bad_fork_cleanup_io;
+                       goto bad_fork_cleanup_thread;
                 }
         }
  
@@ -1517,7 +1628,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
          * sigaltstack should be cleared when sharing the same VM
          */
         if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
-               p->sas_ss_sp = p->sas_ss_size = 0;
+               sas_ss_reset(p);
  
         /*
          * Syscall tracing and stepping should be turned off in the
@@ -1553,6 +1664,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         INIT_LIST_HEAD(&p->thread_group);
         p->task_works = NULL;
  
+       threadgroup_change_begin(current);
         /*
          * Ensure that the cgroup subsystem policies allow the new process to be
         * forked. It should be noted that the new process's css_set can be changed
@@ -1653,8 +1765,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
  bad_fork_cancel_cgroup:
         cgroup_cancel_fork(p);
  bad_fork_free_pid:
+       threadgroup_change_end(current);
         if (pid != &init_struct_pid)
                 free_pid(pid);
+bad_fork_cleanup_thread:
+       exit_thread(p);
  bad_fork_cleanup_io:
         if (p->io_context)
                 exit_io_context(p);
@@ -1683,12 +1798,12 @@ bad_fork_cleanup_policy:
         mpol_put(p->mempolicy);
  bad_fork_cleanup_threadgroup_lock:
  #endif
-       threadgroup_change_end(current);
         delayacct_tsk_free(p);
  bad_fork_cleanup_count:
         atomic_dec(&p->cred->user->processes);
         exit_creds(p);
  bad_fork_free:
+       put_task_stack(p);
         free_task(p);
  fork_out:
         return ERR_PTR(retval);
@@ -1707,7 +1822,8 @@ static inline void init_idle_pids(struct pid_link *links)
  struct task_struct *fork_idle(int cpu)
  {
         struct task_struct *task;
-       task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
+       task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
+                           cpu_to_node(cpu));
         if (!IS_ERR(task)) {
                 init_idle_pids(task->pids);
                 init_idle(task, cpu);
@@ -1752,7 +1868,7 @@ long _do_fork(unsigned long clone_flags,
         }
  
         p = copy_process(clone_flags, stack_start, stack_size,
-                        child_tidptr, NULL, trace, tls);
+                        child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
         /*
          * Do this prior waking up the new thread - the thread pointer
          * might get invalid after that point, if the thread exits quickly.