mm/cow: don't bother write protecting already write-protected pages

[mirror_ubuntu-jammy-kernel.git] / mm / memory.c
diff --git a/mm/memory.c b/mm/memory.c

index 348279ff6e51283770f4aea735bfa8092cfc669d..c467102a5cbc5de5ab2ea4dff740f2611c1124a4 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -238,23 +238,13 @@ void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
         __tlb_reset_range(tlb);
  }
  
-static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
+static void tlb_flush_mmu_free(struct mmu_gather *tlb)
  {
-       if (!tlb->end)
-               return;
+       struct mmu_gather_batch *batch;
  
-       tlb_flush(tlb);
-       mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
  #ifdef CONFIG_HAVE_RCU_TABLE_FREE
         tlb_table_flush(tlb);
  #endif
-       __tlb_reset_range(tlb);
-}
-
-static void tlb_flush_mmu_free(struct mmu_gather *tlb)
-{
-       struct mmu_gather_batch *batch;
-
         for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
                 free_pages_and_swap_cache(batch->pages, batch->nr);
                 batch->nr = 0;
@@ -326,20 +316,31 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
  
  #ifdef CONFIG_HAVE_RCU_TABLE_FREE
  
-static void tlb_remove_table_smp_sync(void *arg)
+/*
+ * See the comment near struct mmu_table_batch.
+ */
+
+/*
+ * If we want tlb_remove_table() to imply TLB invalidates.
+ */
+static inline void tlb_table_invalidate(struct mmu_gather *tlb)
  {
-       struct mm_struct __maybe_unused *mm = arg;
+#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
         /*
-        * On most architectures this does nothing. Simply delivering the
-        * interrupt is enough to prevent races with software page table
-        * walking like that done in get_user_pages_fast.
-        *
-        * See the comment near struct mmu_table_batch.
+        * Invalidate page-table caches used by hardware walkers. Then we still
+        * need to RCU-sched wait while freeing the pages because software
+        * walkers can still be in-flight.
          */
-       tlb_flush_remove_tables_local(mm);
+       tlb_flush_mmu_tlbonly(tlb);
+#endif
+}
+
+static void tlb_remove_table_smp_sync(void *arg)
+{
+       /* Simply deliver the interrupt */
  }
  
-static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
+static void tlb_remove_table_one(void *table)
  {
         /*
          * This isn't an RCU grace period and hence the page-tables cannot be
@@ -348,7 +349,7 @@ static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
          * It is however sufficient for software page-table walkers that rely on
          * IRQ disabling. See the comment near struct mmu_table_batch.
          */
-       smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1);
+       smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
         __tlb_remove_table(table);
  }
  
@@ -369,9 +370,8 @@ void tlb_table_flush(struct mmu_gather *tlb)
  {
         struct mmu_table_batch **batch = &tlb->batch;
  
-       tlb_flush_remove_tables(tlb->mm);
-
         if (*batch) {
+               tlb_table_invalidate(tlb);
                 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
                 *batch = NULL;
         }
@@ -381,23 +381,16 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
  {
         struct mmu_table_batch **batch = &tlb->batch;
  
-       /*
-        * When there's less then two users of this mm there cannot be a
-        * concurrent page-table walk.
-        */
-       if (atomic_read(&tlb->mm->mm_users) < 2) {
-               __tlb_remove_table(table);
-               return;
-       }
-
         if (*batch == NULL) {
                 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                 if (*batch == NULL) {
-                       tlb_remove_table_one(table, tlb);
+                       tlb_table_invalidate(tlb);
+                       tlb_remove_table_one(table);
                         return;
                 }
                 (*batch)->nr = 0;
         }
+
         (*batch)->tables[(*batch)->nr++] = table;
         if ((*batch)->nr == MAX_TABLE_BATCH)
                 tlb_table_flush(tlb);
@@ -859,6 +852,10 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                                 return NULL;
                         }
                 }
+
+               if (pte_devmap(pte))
+                       return NULL;
+
                 print_bad_pte(vma, addr, pte, NULL);
                 return NULL;
         }
@@ -923,6 +920,8 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
                 }
         }
  
+       if (pmd_devmap(pmd))
+               return NULL;
         if (is_zero_pfn(pfn))
                 return NULL;
         if (unlikely(pfn > highest_memmap_pfn))
@@ -1023,7 +1022,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
          * If it's a COW mapping, write protect it both
          * in the parent and the child
          */
-       if (is_cow_mapping(vm_flags)) {
+       if (is_cow_mapping(vm_flags) && pte_write(pte)) {
                 ptep_set_wrprotect(src_mm, addr, src_pte);
                 pte = pte_wrprotect(pte);
         }
@@ -1607,20 +1606,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
         tlb_gather_mmu(&tlb, mm, start, end);
         update_hiwater_rss(mm);
         mmu_notifier_invalidate_range_start(mm, start, end);
-       for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
+       for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
                 unmap_single_vma(&tlb, vma, start, end, NULL);
-
-               /*
-                * zap_page_range does not specify whether mmap_sem should be
-                * held for read or write. That allows parallel zap_page_range
-                * operations to unmap a PTE and defer a flush meaning that
-                * this call observes pte_none and fails to flush the TLB.
-                * Rather than adding a complex API, ensure that no stale
-                * TLB entries exist when this call returns.
-                */
-               flush_tlb_range(vma, start, end);
-       }
-
         mmu_notifier_invalidate_range_end(mm, start, end);
         tlb_finish_mmu(&tlb, start, end);
  }
@@ -2390,9 +2377,9 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
   *
   * We do this without the lock held, so that it can sleep if it needs to.
   */
-static int do_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
  {
-       int ret;
+       vm_fault_t ret;
         struct page *page = vmf->page;
         unsigned int old_flags = vmf->flags;
  
@@ -2496,7 +2483,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
   *   held to the old page, as well as updating the rmap.
   * - In any case, unlock the PTL and drop the reference we took to the old page.
   */
-static int wp_page_copy(struct vm_fault *vmf)
+static vm_fault_t wp_page_copy(struct vm_fault *vmf)
  {
         struct vm_area_struct *vma = vmf->vma;
         struct mm_struct *mm = vma->vm_mm;
@@ -2644,7 +2631,7 @@ oom:
   * The function expects the page to be locked or other protection against
   * concurrent faults / writeback (such as DAX radix tree locks).
   */
-int finish_mkwrite_fault(struct vm_fault *vmf)
+vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
  {
         WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
         vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
@@ -2665,12 +2652,12 @@ int finish_mkwrite_fault(struct vm_fault *vmf)
   * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
   * mapping
   */
-static int wp_pfn_shared(struct vm_fault *vmf)
+static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
  {
         struct vm_area_struct *vma = vmf->vma;
  
         if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
-               int ret;
+               vm_fault_t ret;
  
                 pte_unmap_unlock(vmf->pte, vmf->ptl);
                 vmf->flags |= FAULT_FLAG_MKWRITE;
@@ -2683,7 +2670,7 @@ static int wp_pfn_shared(struct vm_fault *vmf)
         return VM_FAULT_WRITE;
  }
  
-static int wp_page_shared(struct vm_fault *vmf)
+static vm_fault_t wp_page_shared(struct vm_fault *vmf)
         __releases(vmf->ptl)
  {
         struct vm_area_struct *vma = vmf->vma;
@@ -2691,7 +2678,7 @@ static int wp_page_shared(struct vm_fault *vmf)
         get_page(vmf->page);
  
         if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
-               int tmp;
+               vm_fault_t tmp;
  
                 pte_unmap_unlock(vmf->pte, vmf->ptl);
                 tmp = do_page_mkwrite(vmf);
@@ -2734,7 +2721,7 @@ static int wp_page_shared(struct vm_fault *vmf)
   * but allow concurrent faults), with pte both mapped and locked.
   * We return with mmap_sem still held, but pte unmapped and unlocked.
   */
-static int do_wp_page(struct vm_fault *vmf)
+static vm_fault_t do_wp_page(struct vm_fault *vmf)
         __releases(vmf->ptl)
  {
         struct vm_area_struct *vma = vmf->vma;
@@ -2910,7 +2897,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
   * We return with the mmap_sem locked or unlocked in the same cases
   * as does filemap_fault().
   */
-int do_swap_page(struct vm_fault *vmf)
+vm_fault_t do_swap_page(struct vm_fault *vmf)
  {
         struct vm_area_struct *vma = vmf->vma;
         struct page *page = NULL, *swapcache;
@@ -2919,7 +2906,7 @@ int do_swap_page(struct vm_fault *vmf)
         pte_t pte;
         int locked;
         int exclusive = 0;
-       int ret = 0;
+       vm_fault_t ret = 0;
  
         if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
                 goto out;
@@ -3130,12 +3117,12 @@ out_release:
   * but allow concurrent faults), and pte mapped but not yet locked.
   * We return with mmap_sem still held, but pte unmapped and unlocked.
   */
-static int do_anonymous_page(struct vm_fault *vmf)
+static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
  {
         struct vm_area_struct *vma = vmf->vma;
         struct mem_cgroup *memcg;
         struct page *page;
-       int ret = 0;
+       vm_fault_t ret = 0;
         pte_t entry;
  
         /* File mapping without ->vm_ops ? */
@@ -3245,10 +3232,10 @@ oom:
   * released depending on flags and vma->vm_ops->fault() return value.
   * See filemap_fault() and __lock_page_retry().
   */
-static int __do_fault(struct vm_fault *vmf)
+static vm_fault_t __do_fault(struct vm_fault *vmf)
  {
         struct vm_area_struct *vma = vmf->vma;
-       int ret;
+       vm_fault_t ret;
  
         ret = vma->vm_ops->fault(vmf);
         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
@@ -3282,7 +3269,7 @@ static int pmd_devmap_trans_unstable(pmd_t *pmd)
         return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
  }
  
-static int pte_alloc_one_map(struct vm_fault *vmf)
+static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
  {
         struct vm_area_struct *vma = vmf->vma;
  
@@ -3358,13 +3345,14 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
         vmf->prealloc_pte = NULL;
  }
  
-static int do_set_pmd(struct vm_fault *vmf, struct page *page)
+static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
  {
         struct vm_area_struct *vma = vmf->vma;
         bool write = vmf->flags & FAULT_FLAG_WRITE;
         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
         pmd_t entry;
-       int i, ret;
+       int i;
+       vm_fault_t ret;
  
         if (!transhuge_vma_suitable(vma, haddr))
                 return VM_FAULT_FALLBACK;
@@ -3394,7 +3382,7 @@ static int do_set_pmd(struct vm_fault *vmf, struct page *page)
         if (write)
                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
  
-       add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
+       add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
         page_add_file_rmap(page, true);
         /*
          * deposit and withdraw with pmd lock held
@@ -3414,7 +3402,7 @@ out:
         return ret;
  }
  #else
-static int do_set_pmd(struct vm_fault *vmf, struct page *page)
+static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
  {
         BUILD_BUG();
         return 0;
@@ -3435,13 +3423,13 @@ static int do_set_pmd(struct vm_fault *vmf, struct page *page)
   * Target users are page handler itself and implementations of
   * vm_ops->map_pages.
   */
-int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
+vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
                 struct page *page)
  {
         struct vm_area_struct *vma = vmf->vma;
         bool write = vmf->flags & FAULT_FLAG_WRITE;
         pte_t entry;
-       int ret;
+       vm_fault_t ret;
  
         if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
                         IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
@@ -3500,10 +3488,10 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
   * The function expects the page to be locked and on success it consumes a
   * reference of a page being mapped (for the PTE which maps it).
   */
-int finish_fault(struct vm_fault *vmf)
+vm_fault_t finish_fault(struct vm_fault *vmf)
  {
         struct page *page;
-       int ret = 0;
+       vm_fault_t ret = 0;
  
         /* Did we COW the page? */
         if ((vmf->flags & FAULT_FLAG_WRITE) &&
@@ -3589,12 +3577,13 @@ late_initcall(fault_around_debugfs);
   * (and therefore to page order).  This way it's easier to guarantee
   * that we don't cross page table boundaries.
   */
-static int do_fault_around(struct vm_fault *vmf)
+static vm_fault_t do_fault_around(struct vm_fault *vmf)
  {
         unsigned long address = vmf->address, nr_pages, mask;
         pgoff_t start_pgoff = vmf->pgoff;
         pgoff_t end_pgoff;
-       int off, ret = 0;
+       int off;
+       vm_fault_t ret = 0;
  
         nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
         mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
@@ -3644,10 +3633,10 @@ out:
         return ret;
  }
  
-static int do_read_fault(struct vm_fault *vmf)
+static vm_fault_t do_read_fault(struct vm_fault *vmf)
  {
         struct vm_area_struct *vma = vmf->vma;
-       int ret = 0;
+       vm_fault_t ret = 0;
  
         /*
          * Let's call ->map_pages() first and use ->fault() as fallback
@@ -3671,10 +3660,10 @@ static int do_read_fault(struct vm_fault *vmf)
         return ret;
  }
  
-static int do_cow_fault(struct vm_fault *vmf)
+static vm_fault_t do_cow_fault(struct vm_fault *vmf)
  {
         struct vm_area_struct *vma = vmf->vma;
-       int ret;
+       vm_fault_t ret;
  
         if (unlikely(anon_vma_prepare(vma)))
                 return VM_FAULT_OOM;
@@ -3710,10 +3699,10 @@ uncharge_out:
         return ret;
  }
  
-static int do_shared_fault(struct vm_fault *vmf)
+static vm_fault_t do_shared_fault(struct vm_fault *vmf)
  {
         struct vm_area_struct *vma = vmf->vma;
-       int ret, tmp;
+       vm_fault_t ret, tmp;
  
         ret = __do_fault(vmf);
         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -3751,10 +3740,10 @@ static int do_shared_fault(struct vm_fault *vmf)
   * The mmap_sem may have been released depending on flags and our
   * return value.  See filemap_fault() and __lock_page_or_retry().
   */
-static int do_fault(struct vm_fault *vmf)
+static vm_fault_t do_fault(struct vm_fault *vmf)
  {
         struct vm_area_struct *vma = vmf->vma;
-       int ret;
+       vm_fault_t ret;
  
         /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
         if (!vma->vm_ops->fault)
@@ -3789,7 +3778,7 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
         return mpol_misplaced(page, vma, addr);
  }
  
-static int do_numa_page(struct vm_fault *vmf)
+static vm_fault_t do_numa_page(struct vm_fault *vmf)
  {
         struct vm_area_struct *vma = vmf->vma;
         struct page *page = NULL;
@@ -3879,7 +3868,7 @@ out:
         return 0;
  }
  
-static inline int create_huge_pmd(struct vm_fault *vmf)
+static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
  {
         if (vma_is_anonymous(vmf->vma))
                 return do_huge_pmd_anonymous_page(vmf);
@@ -3889,7 +3878,7 @@ static inline int create_huge_pmd(struct vm_fault *vmf)
  }
  
  /* `inline' is required to avoid gcc 4.1.2 build error */
-static inline int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
+static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
  {
         if (vma_is_anonymous(vmf->vma))
                 return do_huge_pmd_wp_page(vmf, orig_pmd);
@@ -3908,7 +3897,7 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
         return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
  }
  
-static int create_huge_pud(struct vm_fault *vmf)
+static vm_fault_t create_huge_pud(struct vm_fault *vmf)
  {
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
         /* No support for anonymous transparent PUD pages yet */
@@ -3920,7 +3909,7 @@ static int create_huge_pud(struct vm_fault *vmf)
         return VM_FAULT_FALLBACK;
  }
  
-static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
+static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
  {
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
         /* No support for anonymous transparent PUD pages yet */
@@ -3947,7 +3936,7 @@ static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
   * The mmap_sem may have been released depending on flags and our return value.
   * See filemap_fault() and __lock_page_or_retry().
   */
-static int handle_pte_fault(struct vm_fault *vmf)
+static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
  {
         pte_t entry;
  
@@ -4035,8 +4024,8 @@ unlock:
   * The mmap_sem may have been released depending on flags and our
   * return value.  See filemap_fault() and __lock_page_or_retry().
   */
-static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
-               unsigned int flags)
+static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
+               unsigned long address, unsigned int flags)
  {
         struct vm_fault vmf = {
                 .vma = vma,
@@ -4049,7 +4038,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
         struct mm_struct *mm = vma->vm_mm;
         pgd_t *pgd;
         p4d_t *p4d;
-       int ret;
+       vm_fault_t ret;
  
         pgd = pgd_offset(mm, address);
         p4d = p4d_alloc(mm, pgd, address);
@@ -4124,10 +4113,10 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
   * The mmap_sem may have been released depending on flags and our
   * return value.  See filemap_fault() and __lock_page_or_retry().
   */
-int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
+vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                 unsigned int flags)
  {
-       int ret;
+       vm_fault_t ret;
  
         __set_current_state(TASK_RUNNING);
  
@@ -4147,7 +4136,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
          * space.  Kernel faults are handled more gracefully.
          */
         if (flags & FAULT_FLAG_USER)
-               mem_cgroup_oom_enable();
+               mem_cgroup_enter_user_fault();
  
         if (unlikely(is_vm_hugetlb_page(vma)))
                 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
@@ -4155,7 +4144,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                 ret = __handle_mm_fault(vma, address, flags);
  
         if (flags & FAULT_FLAG_USER) {
-               mem_cgroup_oom_disable();
+               mem_cgroup_exit_user_fault();
                 /*
                  * The task may have entered a memcg OOM situation but
                  * if the allocation error was handled gracefully (no
@@ -4593,71 +4582,93 @@ EXPORT_SYMBOL(__might_fault);
  #endif
  
  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
-static void clear_gigantic_page(struct page *page,
-                               unsigned long addr,
-                               unsigned int pages_per_huge_page)
-{
-       int i;
-       struct page *p = page;
-
-       might_sleep();
-       for (i = 0; i < pages_per_huge_page;
-            i++, p = mem_map_next(p, page, i)) {
-               cond_resched();
-               clear_user_highpage(p, addr + i * PAGE_SIZE);
-       }
-}
-void clear_huge_page(struct page *page,
-                    unsigned long addr_hint, unsigned int pages_per_huge_page)
+/*
+ * Process all subpages of the specified huge page with the specified
+ * operation.  The target subpage will be processed last to keep its
+ * cache lines hot.
+ */
+static inline void process_huge_page(
+       unsigned long addr_hint, unsigned int pages_per_huge_page,
+       void (*process_subpage)(unsigned long addr, int idx, void *arg),
+       void *arg)
  {
         int i, n, base, l;
         unsigned long addr = addr_hint &
                 ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
  
-       if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
-               clear_gigantic_page(page, addr, pages_per_huge_page);
-               return;
-       }
-
-       /* Clear sub-page to access last to keep its cache lines hot */
+       /* Process target subpage last to keep its cache lines hot */
         might_sleep();
         n = (addr_hint - addr) / PAGE_SIZE;
         if (2 * n <= pages_per_huge_page) {
-               /* If sub-page to access in first half of huge page */
+               /* If target subpage in first half of huge page */
                 base = 0;
                 l = n;
-               /* Clear sub-pages at the end of huge page */
+               /* Process subpages at the end of huge page */
                 for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
                         cond_resched();
-                       clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+                       process_subpage(addr + i * PAGE_SIZE, i, arg);
                 }
         } else {
-               /* If sub-page to access in second half of huge page */
+               /* If target subpage in second half of huge page */
                 base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
                 l = pages_per_huge_page - n;
-               /* Clear sub-pages at the begin of huge page */
+               /* Process subpages at the begin of huge page */
                 for (i = 0; i < base; i++) {
                         cond_resched();
-                       clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+                       process_subpage(addr + i * PAGE_SIZE, i, arg);
                 }
         }
         /*
-        * Clear remaining sub-pages in left-right-left-right pattern
-        * towards the sub-page to access
+        * Process remaining subpages in left-right-left-right pattern
+        * towards the target subpage
          */
         for (i = 0; i < l; i++) {
                 int left_idx = base + i;
                 int right_idx = base + 2 * l - 1 - i;
  
                 cond_resched();
-               clear_user_highpage(page + left_idx,
-                                   addr + left_idx * PAGE_SIZE);
+               process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
+               cond_resched();
+               process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
+       }
+}
+
+static void clear_gigantic_page(struct page *page,
+                               unsigned long addr,
+                               unsigned int pages_per_huge_page)
+{
+       int i;
+       struct page *p = page;
+
+       might_sleep();
+       for (i = 0; i < pages_per_huge_page;
+            i++, p = mem_map_next(p, page, i)) {
                 cond_resched();
-               clear_user_highpage(page + right_idx,
-                                   addr + right_idx * PAGE_SIZE);
+               clear_user_highpage(p, addr + i * PAGE_SIZE);
         }
  }
  
+static void clear_subpage(unsigned long addr, int idx, void *arg)
+{
+       struct page *page = arg;
+
+       clear_user_highpage(page + idx, addr);
+}
+
+void clear_huge_page(struct page *page,
+                    unsigned long addr_hint, unsigned int pages_per_huge_page)
+{
+       unsigned long addr = addr_hint &
+               ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
+
+       if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+               clear_gigantic_page(page, addr, pages_per_huge_page);
+               return;
+       }
+
+       process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
+}
+
  static void copy_user_gigantic_page(struct page *dst, struct page *src,
                                     unsigned long addr,
                                     struct vm_area_struct *vma,
@@ -4677,11 +4688,31 @@ static void copy_user_gigantic_page(struct page *dst, struct page *src,
         }
  }
  
+struct copy_subpage_arg {
+       struct page *dst;
+       struct page *src;
+       struct vm_area_struct *vma;
+};
+
+static void copy_subpage(unsigned long addr, int idx, void *arg)
+{
+       struct copy_subpage_arg *copy_arg = arg;
+
+       copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
+                          addr, copy_arg->vma);
+}
+
  void copy_user_huge_page(struct page *dst, struct page *src,
-                        unsigned long addr, struct vm_area_struct *vma,
+                        unsigned long addr_hint, struct vm_area_struct *vma,
                          unsigned int pages_per_huge_page)
  {
-       int i;
+       unsigned long addr = addr_hint &
+               ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
+       struct copy_subpage_arg arg = {
+               .dst = dst,
+               .src = src,
+               .vma = vma,
+       };
  
         if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
                 copy_user_gigantic_page(dst, src, addr, vma,
@@ -4689,11 +4720,7 @@ void copy_user_huge_page(struct page *dst, struct page *src,
                 return;
         }
  
-       might_sleep();
-       for (i = 0; i < pages_per_huge_page; i++) {
-               cond_resched();
-               copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
-       }
+       process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
  }
  
  long copy_huge_page_from_user(struct page *dst_page,