mm: add new mmgrab() helper
[mirror_ubuntu-bionic-kernel.git] / mm / memory.c
index 6bf2b471e30ca566a55160e4131bf7e7b9c3c4ea..14fc0b40f0bb6cf3ee50cfff8e7db865ad442cdd 100644 (file)
@@ -30,7 +30,7 @@
 
 /*
  * 05.04.94  -  Multi-page memory management added for v1.1.
- *             Idea by Alex Bligh (alex@cconcepts.co.uk)
+ *              Idea by Alex Bligh (alex@cconcepts.co.uk)
  *
  * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
  *             (Gerhard.Wichert@pdb.siemens.de)
@@ -82,9 +82,9 @@
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
-struct page *mem_map;
-
 EXPORT_SYMBOL(max_mapnr);
+
+struct page *mem_map;
 EXPORT_SYMBOL(mem_map);
 #endif
 
@@ -95,8 +95,7 @@ EXPORT_SYMBOL(mem_map);
  * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
  * and ZONE_HIGHMEM.
  */
-void * high_memory;
-
+void *high_memory;
 EXPORT_SYMBOL(high_memory);
 
 /*
@@ -120,10 +119,10 @@ static int __init disable_randmaps(char *s)
 __setup("norandmaps", disable_randmaps);
 
 unsigned long zero_pfn __read_mostly;
-unsigned long highest_memmap_pfn __read_mostly;
-
 EXPORT_SYMBOL(zero_pfn);
 
+unsigned long highest_memmap_pfn __read_mostly;
+
 /*
  * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
  */
@@ -556,7 +555,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
                if (is_vm_hugetlb_page(vma)) {
                        hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
-                               floor, next? next->vm_start: ceiling);
+                               floor, next ? next->vm_start : ceiling);
                } else {
                        /*
                         * Optimization: gather nearby vmas into one call down
@@ -569,7 +568,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
                                unlink_file_vma(vma);
                        }
                        free_pgd_range(tlb, addr, vma->vm_end,
-                               floor, next? next->vm_start: ceiling);
+                               floor, next ? next->vm_start : ceiling);
                }
                vma = next;
        }
@@ -1001,7 +1000,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
                next = pmd_addr_end(addr, end);
                if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
                        int err;
-                       VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+                       VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
                        err = copy_huge_pmd(dst_mm, src_mm,
                                            dst_pmd, src_pmd, addr, vma);
                        if (err == -ENOMEM)
@@ -1032,6 +1031,18 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
        src_pud = pud_offset(src_pgd, addr);
        do {
                next = pud_addr_end(addr, end);
+               if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
+                       int err;
+
+                       VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
+                       err = copy_huge_pud(dst_mm, src_mm,
+                                           dst_pud, src_pud, addr, vma);
+                       if (err == -ENOMEM)
+                               return -ENOMEM;
+                       if (!err)
+                               continue;
+                       /* fall through */
+               }
                if (pud_none_or_clear_bad(src_pud))
                        continue;
                if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
@@ -1129,9 +1140,8 @@ again:
        arch_enter_lazy_mmu_mode();
        do {
                pte_t ptent = *pte;
-               if (pte_none(ptent)) {
+               if (pte_none(ptent))
                        continue;
-               }
 
                if (pte_present(ptent)) {
                        struct page *page;
@@ -1155,12 +1165,6 @@ again:
 
                        if (!PageAnon(page)) {
                                if (pte_dirty(ptent)) {
-                                       /*
-                                        * oom_reaper cannot tear down dirty
-                                        * pages
-                                        */
-                                       if (unlikely(details && details->ignore_dirty))
-                                               continue;
                                        force_flush = 1;
                                        set_page_dirty(page);
                                }
@@ -1179,8 +1183,8 @@ again:
                        }
                        continue;
                }
-               /* only check swap_entries if explicitly asked for in details */
-               if (unlikely(details && !details->check_swap_entries))
+               /* If details->check_mapping, we leave swap entries. */
+               if (unlikely(details))
                        continue;
 
                entry = pte_to_swp_entry(ptent);
@@ -1269,9 +1273,19 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
+               if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
+                       if (next - addr != HPAGE_PUD_SIZE) {
+                               VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
+                               split_huge_pud(vma, pud, addr);
+                       } else if (zap_huge_pud(tlb, vma, pud, addr))
+                               goto next;
+                       /* fall through */
+               }
                if (pud_none_or_clear_bad(pud))
                        continue;
                next = zap_pmd_range(tlb, vma, pud, addr, next, details);
+next:
+               cond_resched();
        } while (pud++, addr = next, addr != end);
 
        return addr;
@@ -1376,12 +1390,11 @@ void unmap_vmas(struct mmu_gather *tlb,
  * @vma: vm_area_struct holding the applicable pages
  * @start: starting address of pages to zap
  * @size: number of bytes to zap
- * @details: details of shared cache invalidation
  *
  * Caller must protect the VMA list
  */
 void zap_page_range(struct vm_area_struct *vma, unsigned long start,
-               unsigned long size, struct zap_details *details)
+               unsigned long size)
 {
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_gather tlb;
@@ -1392,7 +1405,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
        update_hiwater_rss(mm);
        mmu_notifier_invalidate_range_start(mm, start, end);
        for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
-               unmap_single_vma(&tlb, vma, start, end, details);
+               unmap_single_vma(&tlb, vma, start, end, NULL);
        mmu_notifier_invalidate_range_end(mm, start, end);
        tlb_finish_mmu(&tlb, start, end);
 }
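With the details argument gone from zap_page_range(), the function now always passes NULL down to unmap_single_vma(), so external callers simply drop the trailing argument. A minimal caller sketch under the new signature (the helper name is illustrative, not from this patch):

/* Sketch only: the old call was zap_page_range(vma, start, size, NULL). */
static void example_zap_whole_vma(struct vm_area_struct *vma)
{
	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
}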
@@ -1448,10 +1461,10 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes);
 pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                        spinlock_t **ptl)
 {
-       pgd_t * pgd = pgd_offset(mm, addr);
-       pud_t * pud = pud_alloc(mm, pgd, addr);
+       pgd_t *pgd = pgd_offset(mm, addr);
+       pud_t *pud = pud_alloc(mm, pgd, addr);
        if (pud) {
-               pmd_t * pmd = pmd_alloc(mm, pud, addr);
+               pmd_t *pmd = pmd_alloc(mm, pud, addr);
                if (pmd) {
                        VM_BUG_ON(pmd_trans_huge(*pmd));
                        return pte_alloc_map_lock(mm, pmd, addr, ptl);
@@ -2042,7 +2055,7 @@ static int do_page_mkwrite(struct vm_fault *vmf)
 
        vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
 
-       ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf);
+       ret = vmf->vma->vm_ops->page_mkwrite(vmf);
        /* Restore original flags so that caller is not surprised */
        vmf->flags = old_flags;
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
@@ -2314,7 +2327,7 @@ static int wp_pfn_shared(struct vm_fault *vmf)
 
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                vmf->flags |= FAULT_FLAG_MKWRITE;
-               ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
+               ret = vma->vm_ops->pfn_mkwrite(vmf);
                if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
                        return ret;
                return finish_mkwrite_fault(vmf);
@@ -2510,7 +2523,7 @@ void unmap_mapping_range(struct address_space *mapping,
                        hlen = ULONG_MAX - hba + 1;
        }
 
-       details.check_mapping = even_cows? NULL: mapping;
+       details.check_mapping = even_cows ? NULL : mapping;
        details.first_index = hba;
        details.last_index = hba + hlen - 1;
        if (details.last_index < details.first_index)
@@ -2868,7 +2881,7 @@ static int __do_fault(struct vm_fault *vmf)
        struct vm_area_struct *vma = vmf->vma;
        int ret;
 
-       ret = vma->vm_ops->fault(vma, vmf);
+       ret = vma->vm_ops->fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
                            VM_FAULT_DONE_COW)))
                return ret;
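Together with the page_mkwrite and pfn_mkwrite hunks above, this completes the switch to vm_operations callbacks that take only the struct vm_fault; the VMA is reached through vmf->vma. A hedged sketch of what a handler looks like after the conversion (names are illustrative, not part of this patch):

/* Illustrative only: the explicit vma parameter is gone, use vmf->vma. */
static int example_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;	/* no separate vma argument any more */

	pr_debug("fault at %#lx in [%#lx, %#lx)\n",
		 vmf->address, vma->vm_start, vma->vm_end);
	/* ... look up or allocate the backing page here ... */
	return VM_FAULT_SIGBUS;
}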
@@ -2905,7 +2918,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf)
                atomic_long_inc(&vma->vm_mm->nr_ptes);
                pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
                spin_unlock(vmf->ptl);
-               vmf->prealloc_pte = 0;
+               vmf->prealloc_pte = NULL;
        } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
                return VM_FAULT_OOM;
        }
@@ -2953,7 +2966,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
         * count that as nr_ptes.
         */
        atomic_long_inc(&vma->vm_mm->nr_ptes);
-       vmf->prealloc_pte = 0;
+       vmf->prealloc_pte = NULL;
 }
 
 static int do_set_pmd(struct vm_fault *vmf, struct page *page)
@@ -3359,7 +3372,7 @@ static int do_fault(struct vm_fault *vmf)
        /* preallocated pagetable is unused: free it */
        if (vmf->prealloc_pte) {
                pte_free(vma->vm_mm, vmf->prealloc_pte);
-               vmf->prealloc_pte = 0;
+               vmf->prealloc_pte = NULL;
        }
        return ret;
 }
@@ -3387,32 +3400,32 @@ static int do_numa_page(struct vm_fault *vmf)
        int last_cpupid;
        int target_nid;
        bool migrated = false;
-       pte_t pte = vmf->orig_pte;
-       bool was_writable = pte_write(pte);
+       pte_t pte;
+       bool was_writable = pte_savedwrite(vmf->orig_pte);
        int flags = 0;
 
        /*
-       * The "pte" at this point cannot be used safely without
-       * validation through pte_unmap_same(). It's of NUMA type but
-       * the pfn may be screwed if the read is non atomic.
-       *
-       * We can safely just do a "set_pte_at()", because the old
-       * page table entry is not accessible, so there would be no
-       * concurrent hardware modifications to the PTE.
-       */
+        * The "pte" at this point cannot be used safely without
+        * validation through pte_unmap_same(). It's of NUMA type but
+        * the pfn may be screwed if the read is non atomic.
+        */
        vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
        spin_lock(vmf->ptl);
-       if (unlikely(!pte_same(*vmf->pte, pte))) {
+       if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                goto out;
        }
 
-       /* Make it present again */
+       /*
+        * Make it present again. Depending on how the arch implements
+        * non-accessible ptes, some can allow access by kernel mode.
+        */
+       pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
        pte = pte_modify(pte, vma->vm_page_prot);
        pte = pte_mkyoung(pte);
        if (was_writable)
                pte = pte_mkwrite(pte);
-       set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+       ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
        update_mmu_cache(vma, vmf->address, vmf->pte);
 
        page = vm_normal_page(vma, vmf->address, pte);
@@ -3471,12 +3484,10 @@ out:
 
 static int create_huge_pmd(struct vm_fault *vmf)
 {
-       struct vm_area_struct *vma = vmf->vma;
-       if (vma_is_anonymous(vma))
+       if (vma_is_anonymous(vmf->vma))
                return do_huge_pmd_anonymous_page(vmf);
-       if (vma->vm_ops->pmd_fault)
-               return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd,
-                               vmf->flags);
+       if (vmf->vma->vm_ops->huge_fault)
+               return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
        return VM_FAULT_FALLBACK;
 }
 
@@ -3484,9 +3495,8 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 {
        if (vma_is_anonymous(vmf->vma))
                return do_huge_pmd_wp_page(vmf, orig_pmd);
-       if (vmf->vma->vm_ops->pmd_fault)
-               return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address,
-                                                  vmf->pmd, vmf->flags);
+       if (vmf->vma->vm_ops->huge_fault)
+               return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
 
        /* COW handled on pte level: split pmd */
        VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
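These two hunks replace the old ->pmd_fault(vma, address, pmd, flags) hook with the unified ->huge_fault(vmf, pe_size) callback, and the hunk below adds the PUD-sized counterparts. A hedged sketch of a handler dispatching on the requested entry size (purely illustrative; returning VM_FAULT_FALLBACK lets the core fall back to smaller mappings):

/* Illustrative only: one callback now covers PMD- and PUD-sized faults. */
static int example_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size)
{
	switch (pe_size) {
	case PE_SIZE_PMD:
		/* ... try to install a PMD-sized mapping at vmf->address ... */
		return VM_FAULT_FALLBACK;
	case PE_SIZE_PUD:
		/* ... try to install a PUD-sized mapping ... */
		return VM_FAULT_FALLBACK;
	default:
		return VM_FAULT_FALLBACK;
	}
}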
@@ -3500,6 +3510,30 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
        return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
 }
 
+static int create_huge_pud(struct vm_fault *vmf)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       /* No support for anonymous transparent PUD pages yet */
+       if (vma_is_anonymous(vmf->vma))
+               return VM_FAULT_FALLBACK;
+       if (vmf->vma->vm_ops->huge_fault)
+               return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+       return VM_FAULT_FALLBACK;
+}
+
+static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       /* No support for anonymous transparent PUD pages yet */
+       if (vma_is_anonymous(vmf->vma))
+               return VM_FAULT_FALLBACK;
+       if (vmf->vma->vm_ops->huge_fault)
+               return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+       return VM_FAULT_FALLBACK;
+}
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3615,22 +3649,46 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        };
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
-       pud_t *pud;
+       int ret;
 
        pgd = pgd_offset(mm, address);
-       pud = pud_alloc(mm, pgd, address);
-       if (!pud)
+
+       vmf.pud = pud_alloc(mm, pgd, address);
+       if (!vmf.pud)
                return VM_FAULT_OOM;
-       vmf.pmd = pmd_alloc(mm, pud, address);
+       if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
+               ret = create_huge_pud(&vmf);
+               if (!(ret & VM_FAULT_FALLBACK))
+                       return ret;
+       } else {
+               pud_t orig_pud = *vmf.pud;
+
+               barrier();
+               if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
+                       unsigned int dirty = flags & FAULT_FLAG_WRITE;
+
+                       /* NUMA case for anonymous PUDs would go here */
+
+                       if (dirty && !pud_write(orig_pud)) {
+                               ret = wp_huge_pud(&vmf, orig_pud);
+                               if (!(ret & VM_FAULT_FALLBACK))
+                                       return ret;
+                       } else {
+                               huge_pud_set_accessed(&vmf, orig_pud);
+                               return 0;
+                       }
+               }
+       }
+
+       vmf.pmd = pmd_alloc(mm, vmf.pud, address);
        if (!vmf.pmd)
                return VM_FAULT_OOM;
        if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
-               int ret = create_huge_pmd(&vmf);
+               ret = create_huge_pmd(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
                pmd_t orig_pmd = *vmf.pmd;
-               int ret;
 
                barrier();
                if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
@@ -3690,14 +3748,14 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 
        if (flags & FAULT_FLAG_USER) {
                mem_cgroup_oom_disable();
-                /*
-                 * The task may have entered a memcg OOM situation but
-                 * if the allocation error was handled gracefully (no
-                 * VM_FAULT_OOM), there is no need to kill anything.
-                 * Just clean up the OOM state peacefully.
-                 */
-                if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
-                        mem_cgroup_oom_synchronize(false);
+               /*
+                * The task may have entered a memcg OOM situation but
+                * if the allocation error was handled gracefully (no
+                * VM_FAULT_OOM), there is no need to kill anything.
+                * Just clean up the OOM state peacefully.
+                */
+               if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+                       mem_cgroup_oom_synchronize(false);
        }
 
        /*
@@ -3747,13 +3805,14 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
  */
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
+       spinlock_t *ptl;
        pmd_t *new = pmd_alloc_one(mm, address);
        if (!new)
                return -ENOMEM;
 
        smp_wmb(); /* See comment in __pte_alloc */
 
-       spin_lock(&mm->page_table_lock);
+       ptl = pud_lock(mm, pud);
 #ifndef __ARCH_HAS_4LEVEL_HACK
        if (!pud_present(*pud)) {
                mm_inc_nr_pmds(mm);
@@ -3767,7 +3826,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
        } else /* Another has populated it */
                pmd_free(mm, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
        return 0;
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
@@ -4155,6 +4214,38 @@ void copy_user_huge_page(struct page *dst, struct page *src,
                copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
        }
 }
+
+long copy_huge_page_from_user(struct page *dst_page,
+                               const void __user *usr_src,
+                               unsigned int pages_per_huge_page,
+                               bool allow_pagefault)
+{
+       void *src = (void *)usr_src;
+       void *page_kaddr;
+       unsigned long i, rc = 0;
+       unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
+
+       for (i = 0; i < pages_per_huge_page; i++) {
+               if (allow_pagefault)
+                       page_kaddr = kmap(dst_page + i);
+               else
+                       page_kaddr = kmap_atomic(dst_page + i);
+               rc = copy_from_user(page_kaddr,
+                               (const void __user *)(src + i * PAGE_SIZE),
+                               PAGE_SIZE);
+               if (allow_pagefault)
+                       kunmap(dst_page + i);
+               else
+                       kunmap_atomic(page_kaddr);
+
+               ret_val -= (PAGE_SIZE - rc);
+               if (rc)
+                       break;
+
+               cond_resched();
+       }
+       return ret_val;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
 #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
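copy_huge_page_from_user() returns the number of bytes it failed to copy (zero on complete success), in the spirit of copy_from_user(). A hedged usage sketch, assuming a caller that tries the atomic (kmap_atomic) path first and retries with page faults allowed once it may sleep; the helper name and retry policy are illustrative, not from this patch:

/* Sketch only: first attempt must not fault, second attempt may sleep. */
static int example_fill_huge_page(struct page *page, const void __user *usr_src,
				  unsigned int pages_per_huge_page)
{
	/* Atomic attempt: allow_pagefault=false, copy under kmap_atomic(). */
	if (!copy_huge_page_from_user(page, usr_src, pages_per_huge_page, false))
		return 0;

	/* Retry with allow_pagefault=true after dropping locks that forbid sleeping. */
	if (copy_huge_page_from_user(page, usr_src, pages_per_huge_page, true))
		return -EFAULT;

	return 0;
}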