mm/khugepaged: take the right locks for page table retraction

author    Jann Horn <jannh@google.com>
          Tue, 6 Dec 2022 17:16:06 +0000 (18:16 +0100)
committer Thomas Lamprecht <t.lamprecht@proxmox.com>
          Wed, 14 Dec 2022 13:02:53 +0000 (14:02 +0100)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index dd069afd9cb9c4ae9876365d9ec189f9e2c242b6..fc02de08e912c5b555df2165086a4a69d2f90892 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1456,6 +1456,14 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
         if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
                 return;
  
+       /*
+        * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings
+        * that got written to. Without this, we'd have to also lock the
+        * anon_vma if one exists.
+        */
+       if (vma->anon_vma)
+               return;
+
         hpage = find_lock_page(vma->vm_file->f_mapping,
                                linear_page_index(vma, haddr));
         if (!hpage)
@@ -1468,6 +1476,19 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
         if (!pmd)
                 goto drop_hpage;
  
+       /*
+        * We need to lock the mapping so that from here on, only GUP-fast and
+        * hardware page walks can access the parts of the page tables that
+        * we're operating on.
+        */
+       i_mmap_lock_write(vma->vm_file->f_mapping);
+
+       /*
+        * This spinlock should be unnecessary: Nobody else should be accessing
+        * the page tables under spinlock protection here, only
+        * lockless_pages_from_mm() and the hardware page walker can access page
+        * tables while all the high-level locks are held in write mode.
+        */
         start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
  
         /* step 1: check all mapped PTEs are to the right huge page */
@@ -1514,12 +1535,12 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
         }
  
         /* step 4: collapse pmd */
-       ptl = pmd_lock(vma->vm_mm, pmd);
         _pmd = pmdp_collapse_flush(vma, haddr, pmd);
-       spin_unlock(ptl);
         mm_dec_nr_ptes(mm);
         pte_free(mm, pmd_pgtable(_pmd));
  
+       i_mmap_unlock_write(vma->vm_file->f_mapping);
+
  drop_hpage:
         unlock_page(hpage);
         put_page(hpage);
@@ -1527,6 +1548,7 @@ drop_hpage:
  
  abort:
         pte_unmap_unlock(start_pte, ptl);
+       i_mmap_unlock_write(vma->vm_file->f_mapping);
         goto drop_hpage;
  }
  
@@ -1575,7 +1597,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                  * An alternative would be drop the check, but check that page
                  * table is clear before calling pmdp_collapse_flush() under
                  * ptl. It has higher chance to recover THP for the VMA, but
-                * has higher cost too.
+                * has higher cost too. It would also probably require locking
+                * the anon_vma.
                  */
                 if (vma->anon_vma)
                         continue;
@@ -1597,10 +1620,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
                  */
                 if (mmap_write_trylock(mm)) {
                         if (!khugepaged_test_exit(mm)) {
-                               spinlock_t *ptl = pmd_lock(mm, pmd);
                                 /* assume page table is clear */
                                 _pmd = pmdp_collapse_flush(vma, addr, pmd);
-                               spin_unlock(ptl);
                                 mm_dec_nr_ptes(mm);
                                 pte_free(mm, pmd_pgtable(_pmd));
                         }
author	Jann Horn <jannh@google.com>
	Tue, 6 Dec 2022 17:16:06 +0000 (18:16 +0100)
committer	Thomas Lamprecht <t.lamprecht@proxmox.com>
	Wed, 14 Dec 2022 13:02:53 +0000 (14:02 +0100)