mm: add new mmgrab() helper
[mirror_ubuntu-bionic-kernel.git] / mm / memory.c
index 6bf2b471e30ca566a55160e4131bf7e7b9c3c4ea..14fc0b40f0bb6cf3ee50cfff8e7db865ad442cdd 100644 (file)
@@ -30,7 +30,7 @@
 
 /*
  * 05.04.94  -  Multi-page memory management added for v1.1.
- *             Idea by Alex Bligh (alex@cconcepts.co.uk)
+ *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  *
  * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
  *             (Gerhard.Wichert@pdb.siemens.de)
@@ -82,9 +82,9 @@
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
-struct page *mem_map;
-
 EXPORT_SYMBOL(max_mapnr);
+
+struct page *mem_map;
 EXPORT_SYMBOL(mem_map);
 #endif
 
@@ -95,8 +95,7 @@ EXPORT_SYMBOL(mem_map);
  * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
  * and ZONE_HIGHMEM.
  */
-void * high_memory;
-
+void *high_memory;
 EXPORT_SYMBOL(high_memory);
 
 /*
@@ -120,10 +119,10 @@ static int __init disable_randmaps(char *s)
 __setup("norandmaps", disable_randmaps);
 
 unsigned long zero_pfn __read_mostly;
-unsigned long highest_memmap_pfn __read_mostly;
-
 EXPORT_SYMBOL(zero_pfn);
 
+unsigned long highest_memmap_pfn __read_mostly;
+
 /*
  * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
  */
@@ -556,7 +555,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
                if (is_vm_hugetlb_page(vma)) {
                        hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
-                               floor, next? next->vm_start: ceiling);
+                               floor, next ? next->vm_start : ceiling);
                } else {
                        /*
                         * Optimization: gather nearby vmas into one call down
@@ -569,7 +568,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
                                unlink_file_vma(vma);
                        }
                        free_pgd_range(tlb, addr, vma->vm_end,
-                               floor, next? next->vm_start: ceiling);
+                               floor, next ? next->vm_start : ceiling);
                }
                vma = next;
        }
@@ -1001,7 +1000,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
                next = pmd_addr_end(addr, end);
                if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
                        int err;
-                       VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+                       VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
                        err = copy_huge_pmd(dst_mm, src_mm,
                                            dst_pmd, src_pmd, addr, vma);
                        if (err == -ENOMEM)
@@ -1032,6 +1031,18 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
        src_pud = pud_offset(src_pgd, addr);
        do {
                next = pud_addr_end(addr, end);
+               if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
+                       int err;
+
+                       VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
+                       err = copy_huge_pud(dst_mm, src_mm,
+                                           dst_pud, src_pud, addr, vma);
+                       if (err == -ENOMEM)
+                               return -ENOMEM;
+                       if (!err)
+                               continue;
+                       /* fall through */
+               }
                if (pud_none_or_clear_bad(src_pud))
                        continue;
                if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
@@ -1129,9 +1140,8 @@ again:
        arch_enter_lazy_mmu_mode();
        do {
                pte_t ptent = *pte;
-               if (pte_none(ptent)) {
+               if (pte_none(ptent))
                        continue;
-               }
 
                if (pte_present(ptent)) {
                        struct page *page;
@@ -1155,12 +1165,6 @@ again:
 
                        if (!PageAnon(page)) {
                                if (pte_dirty(ptent)) {
-                                       /*
-                                        * oom_reaper cannot tear down dirty
-                                        * pages
-                                        */
-                                       if (unlikely(details && details->ignore_dirty))
-                                               continue;
                                        force_flush = 1;
                                        set_page_dirty(page);
                                }
@@ -1179,8 +1183,8 @@ again:
                        }
                        continue;
                }
-               /* only check swap_entries if explicitly asked for in details */
-               if (unlikely(details && !details->check_swap_entries))
+               /* If details->check_mapping, we leave swap entries. */
+               if (unlikely(details))
                        continue;
 
                entry = pte_to_swp_entry(ptent);
@@ -1269,9 +1273,19 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
+               if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
+                       if (next - addr != HPAGE_PUD_SIZE) {
+                               VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
+                               split_huge_pud(vma, pud, addr);
+                       } else if (zap_huge_pud(tlb, vma, pud, addr))
+                               goto next;
+                       /* fall through */
+               }
                if (pud_none_or_clear_bad(pud))
                        continue;
                next = zap_pmd_range(tlb, vma, pud, addr, next, details);
+next:
+               cond_resched();
        } while (pud++, addr = next, addr != end);
 
        return addr;
@@ -1376,12 +1390,11 @@ void unmap_vmas(struct mmu_gather *tlb,
  * @vma: vm_area_struct holding the applicable pages
  * @start: starting address of pages to zap
  * @size: number of bytes to zap
- * @details: details of shared cache invalidation
  *
  * Caller must protect the VMA list
  */
 void zap_page_range(struct vm_area_struct *vma, unsigned long start,
-               unsigned long size, struct zap_details *details)
+               unsigned long size)
 {
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_gather tlb;
@@ -1392,7 +1405,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
        update_hiwater_rss(mm);
        mmu_notifier_invalidate_range_start(mm, start, end);
        for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
-               unmap_single_vma(&tlb, vma, start, end, details);
+               unmap_single_vma(&tlb, vma, start, end, NULL);
        mmu_notifier_invalidate_range_end(mm, start, end);
        tlb_finish_mmu(&tlb, start, end);
 }
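With the details argument gone from zap_page_range(), the function now always passes NULL down to unmap_single_vma(), so external callers simply drop the trailing argument. A minimal caller sketch under the new signature (the helper name is illustrative, not from this patch):

/* Sketch only: the old call was zap_page_range(vma, start, size, NULL). */
static void example_zap_whole_vma(struct vm_area_struct *vma)
{
	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}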
@@ -1448,10 +1461,10 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes);
 pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                        spinlock_t **ptl)
 {
-       pgd_t * pgd = pgd_offset(mm, addr);
-       pud_t * pud = pud_alloc(mm, pgd, addr);
+       pgd_t *pgd = pgd_offset(mm, addr);
+       pud_t *pud = pud_alloc(mm, pgd, addr);
        if (pud) {
-               pmd_t * pmd = pmd_alloc(mm, pud, addr);
+               pmd_t *pmd = pmd_alloc(mm, pud, addr);
                if (pmd) {
                        VM_BUG_ON(pmd_trans_huge(*pmd));
                        return pte_alloc_map_lock(mm, pmd, addr, ptl);
@@ -2042,7 +2055,7 @@ static int do_page_mkwrite(struct vm_fault *vmf)
 
        vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
 
-       ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf);
+       ret = vmf->vma->vm_ops->page_mkwrite(vmf);
        /* Restore original flags so that caller is not surprised */
        vmf->flags = old_flags;
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
@@ -2314,7 +2327,7 @@ static int wp_pfn_shared(struct vm_fault *vmf)
 
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                vmf->flags |= FAULT_FLAG_MKWRITE;
-               ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
+               ret = vma->vm_ops->pfn_mkwrite(vmf);
                if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
                        return ret;
                return finish_mkwrite_fault(vmf);
@@ -2510,7 +2523,7 @@ void unmap_mapping_range(struct address_space *mapping,
                        hlen = ULONG_MAX - hba + 1;
        }
 
-       details.check_mapping = even_cows? NULL: mapping;
+       details.check_mapping = even_cows ? NULL : mapping;
        details.first_index = hba;
        details.last_index = hba + hlen - 1;
        if (details.last_index < details.first_index)
@@ -2868,7 +2881,7 @@ static int __do_fault(struct vm_fault *vmf)
        struct vm_area_struct *vma = vmf->vma;
        int ret;
 
-       ret = vma->vm_ops->fault(vma, vmf);
+       ret = vma->vm_ops->fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
                            VM_FAULT_DONE_COW)))
                return ret;
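Together with the page_mkwrite and pfn_mkwrite hunks above, this completes the switch to vm_operations callbacks that take only the struct vm_fault; the VMA is reached through vmf->vma. A hedged sketch of what a handler looks like after the conversion (names are illustrative, not part of this patch):

/* Illustrative only: the explicit vma parameter is gone, use vmf->vma. */
static int example_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;	/* no separate vma argument any more */

	pr_debug("fault at %#lx in [%#lx, %#lx)\n",
		 vmf->address, vma->vm_start, vma->vm_end);
	/* ... look up or allocate the backing page here ... */
	return VM_FAULT_SIGBUS;
}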
@@ -2905,7 +2918,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf)
                atomic_long_inc(&vma->vm_mm->nr_ptes);
                pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
                spin_unlock(vmf->ptl);
-               vmf->prealloc_pte = 0;
+               vmf->prealloc_pte = NULL;
        } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
                return VM_FAULT_OOM;
        }
@@ -2953,7 +2966,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
         * count that as nr_ptes.
         */
        atomic_long_inc(&vma->vm_mm->nr_ptes);
-       vmf->prealloc_pte = 0;
+       vmf->prealloc_pte = NULL;
 }
 
 static int do_set_pmd(struct vm_fault *vmf, struct page *page)
@@ -3359,7 +3372,7 @@ static int do_fault(struct vm_fault *vmf)
        /* preallocated pagetable is unused: free it */
        if (vmf->prealloc_pte) {
                pte_free(vma->vm_mm, vmf->prealloc_pte);
-               vmf->prealloc_pte = 0;
+               vmf->prealloc_pte = NULL;
        }
        return ret;
 }
@@ -3387,32 +3400,32 @@ static int do_numa_page(struct vm_fault *vmf)
        int last_cpupid;
        int target_nid;
        bool migrated = false;
-       pte_t pte = vmf->orig_pte;
-       bool was_writable = pte_write(pte);
+       pte_t pte;
+       bool was_writable = pte_savedwrite(vmf->orig_pte);
        int flags = 0;
 
        /*
-       * The "pte" at this point cannot be used safely without
-       * validation through pte_unmap_same(). It's of NUMA type but
-       * the pfn may be screwed if the read is non atomic.
-       *
-       * We can safely just do a "set_pte_at()", because the old
-       * page table entry is not accessible, so there would be no
-       * concurrent hardware modifications to the PTE.
-       */
+        * The "pte" at this point cannot be used safely without
+        * validation through pte_unmap_same(). It's of NUMA type but
+        * the pfn may be screwed if the read is non atomic.
+        */
        vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
        spin_lock(vmf->ptl);
-       if (unlikely(!pte_same(*vmf->pte, pte))) {
+       if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                goto out;
        }
 
-       /* Make it present again */
+       /*
+        * Make it present again. Depending on how the arch implements
+        * non-accessible ptes, some can allow access by kernel mode.
+        */
+       pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
        pte = pte_modify(pte, vma->vm_page_prot);
        pte = pte_mkyoung(pte);
        if (was_writable)
                pte = pte_mkwrite(pte);
-       set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+       ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
        update_mmu_cache(vma, vmf->address, vmf->pte);
 
        page = vm_normal_page(vma, vmf->address, pte);
@@ -3471,12 +3484,10 @@ out:
 
 static int create_huge_pmd(struct vm_fault *vmf)
 {
-       struct vm_area_struct *vma = vmf->vma;
-       if (vma_is_anonymous(vma))
+       if (vma_is_anonymous(vmf->vma))
                return do_huge_pmd_anonymous_page(vmf);
-       if (vma->vm_ops->pmd_fault)
-               return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd,
-                               vmf->flags);
+       if (vmf->vma->vm_ops->huge_fault)
+               return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
        return VM_FAULT_FALLBACK;
 }
 
@@ -3484,9 +3495,8 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 {
        if (vma_is_anonymous(vmf->vma))
                return do_huge_pmd_wp_page(vmf, orig_pmd);
-       if (vmf->vma->vm_ops->pmd_fault)
-               return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address,
-                                                  vmf->pmd, vmf->flags);
+       if (vmf->vma->vm_ops->huge_fault)
+               return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
 
        /* COW handled on pte level: split pmd */
        VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
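These two hunks replace the old ->pmd_fault(vma, address, pmd, flags) hook with the unified ->huge_fault(vmf, pe_size) callback, and the hunk below adds the PUD-sized counterparts. A hedged sketch of a handler dispatching on the requested entry size (purely illustrative; returning VM_FAULT_FALLBACK lets the core fall back to smaller mappings):

/* Illustrative only: one callback now covers PMD- and PUD-sized faults. */
static int example_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size)
{
	switch (pe_size) {
	case PE_SIZE_PMD:
		/* ... try to install a PMD-sized mapping at vmf->address ... */
		return VM_FAULT_FALLBACK;
	case PE_SIZE_PUD:
		/* ... try to install a PUD-sized mapping ... */
		return VM_FAULT_FALLBACK;
	default:
		return VM_FAULT_FALLBACK;
	}
}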
@@ -3500,6 +3510,30 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
        return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
 }
 
+static int create_huge_pud(struct vm_fault *vmf)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       /* No support for anonymous transparent PUD pages yet */
+       if (vma_is_anonymous(vmf->vma))
+               return VM_FAULT_FALLBACK;
+       if (vmf->vma->vm_ops->huge_fault)
+               return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+       return VM_FAULT_FALLBACK;
+}
+
+static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       /* No support for anonymous transparent PUD pages yet */
+       if (vma_is_anonymous(vmf->vma))
+               return VM_FAULT_FALLBACK;
+       if (vmf->vma->vm_ops->huge_fault)
+               return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+       return VM_FAULT_FALLBACK;
+}
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3615,22 +3649,46 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        };
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
-       pud_t *pud;
+       int ret;
 
        pgd = pgd_offset(mm, address);
-       pud = pud_alloc(mm, pgd, address);
-       if (!pud)
+
+       vmf.pud = pud_alloc(mm, pgd, address);
+       if (!vmf.pud)
                return VM_FAULT_OOM;
-       vmf.pmd = pmd_alloc(mm, pud, address);
+       if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
+               ret = create_huge_pud(&vmf);
+               if (!(ret & VM_FAULT_FALLBACK))
+                       return ret;
+       } else {
+               pud_t orig_pud = *vmf.pud;
+
+               barrier();
+               if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
+                       unsigned int dirty = flags & FAULT_FLAG_WRITE;
+
+                       /* NUMA case for anonymous PUDs would go here */
+
+                       if (dirty && !pud_write(orig_pud)) {
+                               ret = wp_huge_pud(&vmf, orig_pud);
+                               if (!(ret & VM_FAULT_FALLBACK))
+                                       return ret;
+                       } else {
+                               huge_pud_set_accessed(&vmf, orig_pud);
+                               return 0;
+                       }
+               }
+       }
+
+       vmf.pmd = pmd_alloc(mm, vmf.pud, address);
        if (!vmf.pmd)
                return VM_FAULT_OOM;
        if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
-               int ret = create_huge_pmd(&vmf);
+               ret = create_huge_pmd(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
                pmd_t orig_pmd = *vmf.pmd;
-               int ret;
 
                barrier();
                if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
@@ -3690,14 +3748,14 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 
        if (flags & FAULT_FLAG_USER) {
                mem_cgroup_oom_disable();
-                /*
-                 * The task may have entered a memcg OOM situation but
-                 * if the allocation error was handled gracefully (no
-                 * VM_FAULT_OOM), there is no need to kill anything.
-                 * Just clean up the OOM state peacefully.
-                 */
-                if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
-                        mem_cgroup_oom_synchronize(false);
+               /*
+                * The task may have entered a memcg OOM situation but
+                * if the allocation error was handled gracefully (no
+                * VM_FAULT_OOM), there is no need to kill anything.
+                * Just clean up the OOM state peacefully.
+                */
+               if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+                       mem_cgroup_oom_synchronize(false);
        }
 
        /*
@@ -3747,13 +3805,14 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
  */
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
+       spinlock_t *ptl;
        pmd_t *new = pmd_alloc_one(mm, address);
        if (!new)
                return -ENOMEM;
 
        smp_wmb(); /* See comment in __pte_alloc */
 
-       spin_lock(&mm->page_table_lock);
+       ptl = pud_lock(mm, pud);
 #ifndef __ARCH_HAS_4LEVEL_HACK
        if (!pud_present(*pud)) {
                mm_inc_nr_pmds(mm);
@@ -3767,7 +3826,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
        } else /* Another has populated it */
                pmd_free(mm, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
        return 0;
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
@@ -4155,6 +4214,38 @@ void copy_user_huge_page(struct page *dst, struct page *src,
                copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
        }
 }
+
+long copy_huge_page_from_user(struct page *dst_page,
+                               const void __user *usr_src,
+                               unsigned int pages_per_huge_page,
+                               bool allow_pagefault)
+{
+       void *src = (void *)usr_src;
+       void *page_kaddr;
+       unsigned long i, rc = 0;
+       unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
+
+       for (i = 0; i < pages_per_huge_page; i++) {
+               if (allow_pagefault)
+                       page_kaddr = kmap(dst_page + i);
+               else
+                       page_kaddr = kmap_atomic(dst_page + i);
+               rc = copy_from_user(page_kaddr,
+                               (const void __user *)(src + i * PAGE_SIZE),
+                               PAGE_SIZE);
+               if (allow_pagefault)
+                       kunmap(dst_page + i);
+               else
+                       kunmap_atomic(page_kaddr);
+
+               ret_val -= (PAGE_SIZE - rc);
+               if (rc)
+                       break;
+
+               cond_resched();
+       }
+       return ret_val;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
 #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
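copy_huge_page_from_user() returns the number of bytes it failed to copy (zero on complete success), in the spirit of copy_from_user(). A hedged usage sketch, assuming a caller that tries the atomic (kmap_atomic) path first and retries with page faults allowed once it may sleep; the helper name and retry policy are illustrative, not from this patch:

/* Sketch only: first attempt must not fault, second attempt may sleep. */
static int example_fill_huge_page(struct page *page, const void __user *usr_src,
				  unsigned int pages_per_huge_page)
{
	/* Atomic attempt: allow_pagefault=false, copy under kmap_atomic(). */
	if (!copy_huge_page_from_user(page, usr_src, pages_per_huge_page, false))
		return 0;

	/* Retry with allow_pagefault=true after dropping locks that forbid sleeping. */
	if (copy_huge_page_from_user(page, usr_src, pages_per_huge_page, true))
		return -EFAULT;

	return 0;
}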