mm: close page_mkwrite races

author Nick Piggin <npiggin@suse.de>

Thu, 30 Apr 2009 22:08:16 +0000 (15:08 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 2 May 2009 22:36:09 +0000 (15:36 -0700)
author Nick Piggin <npiggin@suse.de>
Thu, 30 Apr 2009 22:08:16 +0000 (15:08 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 2 May 2009 22:36:09 +0000 (15:36 -0700)
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking

index 76efe5b71d7d8f744739a129f4b5333b898acf5c..3120f8dd2c316c66347051a95d3edce1ad090d44 100644 (file)
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -512,16 +512,24 @@ locking rules:
                 BKL     mmap_sem        PageLocked(page)
  open:          no      yes
  close:         no      yes
-fault:         no      yes
-page_mkwrite:  no      yes             no
+fault:         no      yes             can return with page locked
+page_mkwrite:  no      yes             can return with page locked
  access:                no      yes
  
-       ->page_mkwrite() is called when a previously read-only page is
-about to become writeable. The file system is responsible for
-protecting against truncate races. Once appropriate action has been
-taking to lock out truncate, the page range should be verified to be
-within i_size. The page mapping should also be checked that it is not
-NULL.
+       ->fault() is called when a previously not present pte is about
+to be faulted in. The filesystem must find and return the page associated
+with the passed in "pgoff" in the vm_fault structure. If it is possible that
+the page may be truncated and/or invalidated, then the filesystem must lock
+the page, then ensure it is not already truncated (the page lock will block
+subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
+locked. The VM will unlock the page.
+
+       ->page_mkwrite() is called when a previously read-only pte is
+about to become writeable. The filesystem again must ensure that there are
+no truncate/invalidate races, and then return with the page locked. If
+the page has been truncated, the filesystem should not look up a new page
+like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
+will cause the VM to retry the fault.
  
         ->access() is called when get_user_pages() fails in
  acces_process_vm(), typically used to debug a process through
diff --git a/fs/buffer.c b/fs/buffer.c

index b3e5be7514f5ce366b2a87262489bc88e1835a59..aed297739eb0796d8c6212842003fb2976c5fba0 100644 (file)
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2397,7 +2397,8 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
         if ((page->mapping != inode->i_mapping) ||
             (page_offset(page) > size)) {
                 /* page got truncated out from underneath us */
-               goto out_unlock;
+               unlock_page(page);
+               goto out;
         }
  
         /* page is wholly or partially inside EOF */
@@ -2411,14 +2412,15 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
                 ret = block_commit_write(page, 0, end);
  
         if (unlikely(ret)) {
+               unlock_page(page);
                 if (ret == -ENOMEM)
                         ret = VM_FAULT_OOM;
                 else /* -ENOSPC, -EIO, etc */
                         ret = VM_FAULT_SIGBUS;
-       }
+       } else
+               ret = VM_FAULT_LOCKED;
  
-out_unlock:
-       unlock_page(page);
+out:
         return ret;
  }
  
diff --git a/mm/memory.c b/mm/memory.c

index 6a4ef0fd0711c4551333be01320a3d635d6cd20e..4126dd16778c36595af938988c9d8ec144d0f8aa 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1971,6 +1971,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                 ret = tmp;
                                 goto unwritable_page;
                         }
+                       if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+                               lock_page(old_page);
+                               if (!old_page->mapping) {
+                                       ret = 0; /* retry the fault */
+                                       unlock_page(old_page);
+                                       goto unwritable_page;
+                               }
+                       } else
+                               VM_BUG_ON(!PageLocked(old_page));
  
                         /*
                          * Since we dropped the lock we need to revalidate
@@ -1980,9 +1989,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                          */
                         page_table = pte_offset_map_lock(mm, pmd, address,
                                                          &ptl);
-                       page_cache_release(old_page);
-                       if (!pte_same(*page_table, orig_pte))
+                       if (!pte_same(*page_table, orig_pte)) {
+                               unlock_page(old_page);
+                               page_cache_release(old_page);
                                 goto unlock;
+                       }
  
                         page_mkwrite = 1;
                 }
@@ -2094,9 +2105,6 @@ gotten:
  unlock:
         pte_unmap_unlock(page_table, ptl);
         if (dirty_page) {
-               if (vma->vm_file)
-                       file_update_time(vma->vm_file);
-
                 /*
                  * Yes, Virginia, this is actually required to prevent a race
                  * with clear_page_dirty_for_io() from clearing the page dirty
@@ -2105,16 +2113,41 @@ unlock:
                  *
                  * do_no_page is protected similarly.
                  */
-               wait_on_page_locked(dirty_page);
-               set_page_dirty_balance(dirty_page, page_mkwrite);
+               if (!page_mkwrite) {
+                       wait_on_page_locked(dirty_page);
+                       set_page_dirty_balance(dirty_page, page_mkwrite);
+               }
                 put_page(dirty_page);
+               if (page_mkwrite) {
+                       struct address_space *mapping = dirty_page->mapping;
+
+                       set_page_dirty(dirty_page);
+                       unlock_page(dirty_page);
+                       page_cache_release(dirty_page);
+                       if (mapping)    {
+                               /*
+                                * Some device drivers do not set page.mapping
+                                * but still dirty their pages
+                                */
+                               balance_dirty_pages_ratelimited(mapping);
+                       }
+               }
+
+               /* file_update_time outside page_lock */
+               if (vma->vm_file)
+                       file_update_time(vma->vm_file);
         }
         return ret;
  oom_free_new:
         page_cache_release(new_page);
  oom:
-       if (old_page)
+       if (old_page) {
+               if (page_mkwrite) {
+                       unlock_page(old_page);
+                       page_cache_release(old_page);
+               }
                 page_cache_release(old_page);
+       }
         return VM_FAULT_OOM;
  
  unwritable_page:
@@ -2664,27 +2697,22 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                 int tmp;
  
                                 unlock_page(page);
-                               vmf.flags |= FAULT_FLAG_MKWRITE;
+                               vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
                                 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
                                 if (unlikely(tmp &
                                           (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
                                         ret = tmp;
-                                       anon = 1; /* no anon but release vmf.page */
-                                       goto out_unlocked;
-                               }
-                               lock_page(page);
-                               /*
-                                * XXX: this is not quite right (racy vs
-                                * invalidate) to unlock and relock the page
-                                * like this, however a better fix requires
-                                * reworking page_mkwrite locking API, which
-                                * is better done later.
-                                */
-                               if (!page->mapping) {
-                                       ret = 0;
-                                       anon = 1; /* no anon but release vmf.page */
-                                       goto out;
+                                       goto unwritable_page;
                                 }
+                               if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+                                       lock_page(page);
+                                       if (!page->mapping) {
+                                               ret = 0; /* retry the fault */
+                                               unlock_page(page);
+                                               goto unwritable_page;
+                                       }
+                               } else
+                                       VM_BUG_ON(!PageLocked(page));
                                 page_mkwrite = 1;
                         }
                 }
@@ -2736,19 +2764,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         pte_unmap_unlock(page_table, ptl);
  
  out:
-       unlock_page(vmf.page);
-out_unlocked:
-       if (anon)
-               page_cache_release(vmf.page);
-       else if (dirty_page) {
-               if (vma->vm_file)
-                       file_update_time(vma->vm_file);
+       if (dirty_page) {
+               struct address_space *mapping = page->mapping;
  
-               set_page_dirty_balance(dirty_page, page_mkwrite);
+               if (set_page_dirty(dirty_page))
+                       page_mkwrite = 1;
+               unlock_page(dirty_page);
                 put_page(dirty_page);
+               if (page_mkwrite && mapping) {
+                       /*
+                        * Some device drivers do not set page.mapping but still
+                        * dirty their pages
+                        */
+                       balance_dirty_pages_ratelimited(mapping);
+               }
+
+               /* file_update_time outside page_lock */
+               if (vma->vm_file)
+                       file_update_time(vma->vm_file);
+       } else {
+               unlock_page(vmf.page);
+               if (anon)
+                       page_cache_release(vmf.page);
         }
  
         return ret;
+
+unwritable_page:
+       page_cache_release(page);
+       return ret;
  }
  
  static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
author	Nick Piggin <npiggin@suse.de>
	Thu, 30 Apr 2009 22:08:16 +0000 (15:08 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 2 May 2009 22:36:09 +0000 (15:36 -0700)
Documentation/filesystems/Locking		patch \| blob \| blame \| history
fs/buffer.c		patch \| blob \| blame \| history
mm/memory.c		patch \| blob \| blame \| history