if (ret)
return ERR_PTR(ret);
- em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
- ins.offset, ins.offset, ins.offset, 0);
- if (IS_ERR(em)) {
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
- return em;
- }
-
+ /*
+ * Create the ordered extent before the extent map. This is to avoid
+ * races with the fast fsync path that would lead to it logging file
+ * extent items that point to disk extents that were not yet written to.
+ * The fast fsync path collects ordered extents into a local list and
+ * then collects all the new extent maps, so we must create the ordered
+ * extent first and make sure the fast fsync path collects any new
+ * ordered extents after collecting new extent maps as well.
+ * The fsync path simply cannot rely on inode_dio_wait() because it
+ * causes a deadlock with AIO.
+ */
ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
ins.offset, ins.offset, 0);
if (ret) {
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
- free_extent_map(em);
return ERR_PTR(ret);
}
+ em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+ ins.offset, ins.offset, ins.offset, 0);
+ if (IS_ERR(em)) {
+ struct btrfs_ordered_extent *oe;
+
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
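+ /*
+ * The write for this extent will never be submitted now, so the
+ * ordered extent we created above has to be looked up and torn
+ * down by hand.
+ */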
+ oe = btrfs_lookup_ordered_extent(inode, start);
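+ /* ASSERT() is a no-op without CONFIG_BTRFS_ASSERT, hence the WARN_ON() too. */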
+ ASSERT(oe);
+ if (WARN_ON(!oe))
+ return em;
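+ /*
+ * No IO completion will ever happen for this ordered extent, so
+ * mark it as failed and done by hand before removing it from the
+ * ordered extents tree.
+ */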
+ set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
+ set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
+ btrfs_remove_ordered_extent(inode, oe);
+ /* Once for our lookup and once for the ordered extents tree. */
+ btrfs_put_ordered_extent(oe);
+ btrfs_put_ordered_extent(oe);
+ }
return em;
}
struct inode *inode,
struct btrfs_path *path,
struct list_head *logged_list,
- struct btrfs_log_ctx *ctx)
+ struct btrfs_log_ctx *ctx,
+ const u64 start,
+ const u64 end)
{
struct extent_map *em, *n;
struct list_head extents;
}
list_sort(NULL, &extents, extent_cmp);
-
+ /*
+ * Collect any new ordered extents within the range. This is to
+ * prevent logging file extent items before the disk locations they
+ * point to have been written to. We do this only to deal with races
+ * against concurrent lockless direct IO writes: that path creates the
+ * ordered extent before the extent map, so collecting ordered extents
+ * here, after collecting the extent maps above, cannot miss any
+ * ordered extent whose extent map we are about to log.
+ */
+ btrfs_get_logged_extents(inode, logged_list, start, end);
process:
while (!list_empty(&extents)) {
em = list_entry(extents.next, struct extent_map, list);
goto out_unlock;
}
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- &logged_list, ctx);
+ &logged_list, ctx, start, end);
if (ret) {
err = ret;
goto out_unlock;