if (ret)
return ERR_PTR(ret);
- em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
- ins.offset, ins.offset, ins.offset, 0);
- if (IS_ERR(em)) {
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
- return em;
- }
-
+ /*
+ * Create the ordered extent before the extent map. This is to avoid
+ * races with the fast fsync path that would lead to it logging file
+ * extent items that point to disk extents that were not yet written to.
+ * The fast fsync path collects ordered extents into a local list and
+ * then collects all the new extent maps, so we must create the ordered
+ * extent first and make sure the fast fsync path collects any new
+ * ordered extents after collecting new extent maps as well.
+ * The fsync path simply cannot rely on inode_dio_wait() because it
+ * causes a deadlock with AIO.
+ */
ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
ins.offset, ins.offset, 0);
if (ret) {
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
- free_extent_map(em);
return ERR_PTR(ret);
}
+ em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+ ins.offset, ins.offset, ins.offset, 0);
+ if (IS_ERR(em)) {
+ struct btrfs_ordered_extent *oe;
+
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
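+ /*
+ * The write for this extent will never be submitted now, so the
+ * ordered extent we created above has to be looked up and torn
+ * down by hand.
+ */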
+ oe = btrfs_lookup_ordered_extent(inode, start);
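+ /* ASSERT() is a no-op without CONFIG_BTRFS_ASSERT, hence the WARN_ON() too. */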
+ ASSERT(oe);
+ if (WARN_ON(!oe))
+ return em;
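+ /*
+ * No IO completion will ever happen for this ordered extent, so
+ * mark it as failed and done by hand before removing it from the
+ * ordered extents tree.
+ */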
+ set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
+ set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
+ btrfs_remove_ordered_extent(inode, oe);
+ /* Once for our lookup and once for the ordered extents tree. */
+ btrfs_put_ordered_extent(oe);
+ btrfs_put_ordered_extent(oe);
+ }
return em;
}
struct inode *inode,
struct btrfs_path *path,
struct list_head *logged_list,
- struct btrfs_log_ctx *ctx)
+ struct btrfs_log_ctx *ctx,
+ const u64 start,
+ const u64 end)
{
struct extent_map *em, *n;
struct list_head extents;
}
list_sort(NULL, &extents, extent_cmp);
-
+ /*
+ * Collect any new ordered extents within the range. This is to
+ * prevent logging file extent items before the disk locations they
+ * point to have been written to. We do this only to deal with races
+ * against concurrent lockless direct IO writes: that path creates the
+ * ordered extent before the extent map, so collecting ordered extents
+ * here, after collecting the extent maps above, cannot miss any
+ * ordered extent whose extent map we are about to log.
+ */
+ btrfs_get_logged_extents(inode, logged_list, start, end);
process:
while (!list_empty(&extents)) {
em = list_entry(extents.next, struct extent_map, list);
goto out_unlock;
}
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- &logged_list, ctx);
+ &logged_list, ctx, start, end);
if (ret) {
err = ret;
goto out_unlock;