]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blobdiff - fs/btrfs/inode.c
Btrfs: do not pin while under spin lock
[mirror_ubuntu-bionic-kernel.git] / fs / btrfs / inode.c
index ca405171363391f35ca58ebb66dcc6bb2600acac..c0e95b1554a00141f98dcb805d7c0040b49be656 100644 (file)
@@ -103,6 +103,8 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
                                           u64 orig_block_len, u64 ram_bytes,
                                           int type);
 
+static int btrfs_dirty_inode(struct inode *inode);
+
 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode,  struct inode *dir,
                                     const struct qstr *qstr)
@@ -712,8 +714,10 @@ retry:
                                        async_extent->ram_size - 1, 0);
 
                em = alloc_extent_map();
-               if (!em)
+               if (!em) {
+                       ret = -ENOMEM;
                        goto out_free_reserve;
+               }
                em->start = async_extent->start;
                em->len = async_extent->ram_size;
                em->orig_start = em->start;
@@ -920,7 +924,10 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
                }
 
                em = alloc_extent_map();
-               BUG_ON(!em); /* -ENOMEM */
+               if (!em) {
+                       ret = -ENOMEM;
+                       goto out_reserve;
+               }
                em->start = start;
                em->orig_start = em->start;
                ram_size = ins.offset;
@@ -947,11 +954,14 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
                        btrfs_drop_extent_cache(inode, start,
                                                start + ram_size - 1, 0);
                }
+               if (ret)
+                       goto out_reserve;
 
                cur_alloc_size = ins.offset;
                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
-               BUG_ON(ret); /* -ENOMEM */
+               if (ret)
+                       goto out_reserve;
 
                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
@@ -959,7 +969,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
                                                      cur_alloc_size);
                        if (ret) {
                                btrfs_abort_transaction(trans, root, ret);
-                               goto out_unlock;
+                               goto out_reserve;
                        }
                }
 
@@ -988,6 +998,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
 out:
        return ret;
 
+out_reserve:
+       btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
 out_unlock:
        extent_clear_unlock_delalloc(inode,
                     &BTRFS_I(inode)->io_tree,
@@ -1516,13 +1528,53 @@ static void btrfs_merge_extent_hook(struct inode *inode,
        spin_unlock(&BTRFS_I(inode)->lock);
 }
 
+static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
+                                     struct inode *inode)
+{
+       spin_lock(&root->delalloc_lock);
+       if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+               list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+                             &root->delalloc_inodes);
+               set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                       &BTRFS_I(inode)->runtime_flags);
+               root->nr_delalloc_inodes++;
+               if (root->nr_delalloc_inodes == 1) {
+                       spin_lock(&root->fs_info->delalloc_root_lock);
+                       BUG_ON(!list_empty(&root->delalloc_root));
+                       list_add_tail(&root->delalloc_root,
+                                     &root->fs_info->delalloc_roots);
+                       spin_unlock(&root->fs_info->delalloc_root_lock);
+               }
+       }
+       spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+                                    struct inode *inode)
+{
+       spin_lock(&root->delalloc_lock);
+       if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+               list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+               clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                         &BTRFS_I(inode)->runtime_flags);
+               root->nr_delalloc_inodes--;
+               if (!root->nr_delalloc_inodes) {
+                       spin_lock(&root->fs_info->delalloc_root_lock);
+                       BUG_ON(list_empty(&root->delalloc_root));
+                       list_del_init(&root->delalloc_root);
+                       spin_unlock(&root->fs_info->delalloc_root_lock);
+               }
+       }
+       spin_unlock(&root->delalloc_lock);
+}
+
 /*
  * extent_io.c set_bit_hook, used to track delayed allocation
  * bytes in this file, and to maintain the list of inodes that
  * have pending delalloc work to be done.
  */
 static void btrfs_set_bit_hook(struct inode *inode,
-                              struct extent_state *state, int *bits)
+                              struct extent_state *state, unsigned long *bits)
 {
 
        /*
@@ -1548,16 +1600,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
                spin_lock(&BTRFS_I(inode)->lock);
                BTRFS_I(inode)->delalloc_bytes += len;
                if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                                        &BTRFS_I(inode)->runtime_flags)) {
-                       spin_lock(&root->fs_info->delalloc_lock);
-                       if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-                               list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
-                                             &root->fs_info->delalloc_inodes);
-                               set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                                       &BTRFS_I(inode)->runtime_flags);
-                       }
-                       spin_unlock(&root->fs_info->delalloc_lock);
-               }
+                                        &BTRFS_I(inode)->runtime_flags))
+                       btrfs_add_delalloc_inodes(root, inode);
                spin_unlock(&BTRFS_I(inode)->lock);
        }
 }
@@ -1566,7 +1610,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
  * extent_io.c clear_bit_hook, see set_bit_hook for why
  */
 static void btrfs_clear_bit_hook(struct inode *inode,
-                                struct extent_state *state, int *bits)
+                                struct extent_state *state,
+                                unsigned long *bits)
 {
        /*
         * set_bit and clear bit hooks normally require _irqsave/restore
@@ -1599,15 +1644,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                BTRFS_I(inode)->delalloc_bytes -= len;
                if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
                    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                            &BTRFS_I(inode)->runtime_flags)) {
-                       spin_lock(&root->fs_info->delalloc_lock);
-                       if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-                               list_del_init(&BTRFS_I(inode)->delalloc_inodes);
-                               clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                                         &BTRFS_I(inode)->runtime_flags);
-                       }
-                       spin_unlock(&root->fs_info->delalloc_lock);
-               }
+                            &BTRFS_I(inode)->runtime_flags))
+                       btrfs_del_delalloc_inode(root, inode);
                spin_unlock(&BTRFS_I(inode)->lock);
        }
 }
@@ -2249,11 +2287,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
                        return 0;
                return PTR_ERR(root);
        }
-       if (btrfs_root_refs(&root->root_item) == 0) {
-               srcu_read_unlock(&fs_info->subvol_srcu, index);
-               /* parse ENOENT to 0 */
-               return 0;
-       }
 
        /* step 2: get inode */
        key.objectid = backref->inum;
@@ -3018,7 +3051,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
  * We have done the truncate/delete so we can go ahead and remove the orphan
  * item for this particular inode.
  */
-int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
+static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
+                           struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int delete_item = 0;
@@ -3645,53 +3679,20 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
        }
        return ret;
 }
-               
-
-/* helper to check if there is any shared block in the path */
-static int check_path_shared(struct btrfs_root *root,
-                            struct btrfs_path *path)
-{
-       struct extent_buffer *eb;
-       int level;
-       u64 refs = 1;
-
-       for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
-               int ret;
-
-               if (!path->nodes[level])
-                       break;
-               eb = path->nodes[level];
-               if (!btrfs_block_can_be_shared(root, eb))
-                       continue;
-               ret = btrfs_lookup_extent_info(NULL, root, eb->start, level, 1,
-                                              &refs, NULL);
-               if (refs > 1)
-                       return 1;
-       }
-       return 0;
-}
 
 /*
  * helper to start transaction for unlink and rmdir.
  *
- * unlink and rmdir are special in btrfs, they do not always free space.
- * so in enospc case, we should make sure they will free space before
- * allowing them to use the global metadata reservation.
+ * unlink and rmdir are special in btrfs, they do not always free space, so
+ * if we cannot make our reservations the normal way try and see if there is
+ * plenty of slack room in the global reserve to migrate, otherwise we cannot
+ * allow the unlink to occur.
  */
-static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
-                                                      struct dentry *dentry)
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
 {
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(dir)->root;
-       struct btrfs_path *path;
-       struct btrfs_dir_item *di;
-       struct inode *inode = dentry->d_inode;
-       u64 index;
-       int check_link = 1;
-       int err = -ENOSPC;
        int ret;
-       u64 ino = btrfs_ino(inode);
-       u64 dir_ino = btrfs_ino(dir);
 
        /*
         * 1 for the possible orphan item
@@ -3704,158 +3705,23 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
        if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
                return trans;
 
-       if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
-               return ERR_PTR(-ENOSPC);
-
-       /* check if there is someone else holds reference */
-       if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
-               return ERR_PTR(-ENOSPC);
-
-       if (atomic_read(&inode->i_count) > 2)
-               return ERR_PTR(-ENOSPC);
-
-       if (xchg(&root->fs_info->enospc_unlink, 1))
-               return ERR_PTR(-ENOSPC);
-
-       path = btrfs_alloc_path();
-       if (!path) {
-               root->fs_info->enospc_unlink = 0;
-               return ERR_PTR(-ENOMEM);
-       }
-
-       /* 1 for the orphan item */
-       trans = btrfs_start_transaction(root, 1);
-       if (IS_ERR(trans)) {
-               btrfs_free_path(path);
-               root->fs_info->enospc_unlink = 0;
-               return trans;
-       }
-
-       path->skip_locking = 1;
-       path->search_commit_root = 1;
-
-       ret = btrfs_lookup_inode(trans, root, path,
-                               &BTRFS_I(dir)->location, 0);
-       if (ret < 0) {
-               err = ret;
-               goto out;
-       }
-       if (ret == 0) {
-               if (check_path_shared(root, path))
-                       goto out;
-       } else {
-               check_link = 0;
-       }
-       btrfs_release_path(path);
-
-       ret = btrfs_lookup_inode(trans, root, path,
-                               &BTRFS_I(inode)->location, 0);
-       if (ret < 0) {
-               err = ret;
-               goto out;
-       }
-       if (ret == 0) {
-               if (check_path_shared(root, path))
-                       goto out;
-       } else {
-               check_link = 0;
-       }
-       btrfs_release_path(path);
+       if (PTR_ERR(trans) == -ENOSPC) {
+               u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
 
-       if (ret == 0 && S_ISREG(inode->i_mode)) {
-               ret = btrfs_lookup_file_extent(trans, root, path,
-                                              ino, (u64)-1, 0);
-               if (ret < 0) {
-                       err = ret;
-                       goto out;
+               trans = btrfs_start_transaction(root, 0);
+               if (IS_ERR(trans))
+                       return trans;
+               ret = btrfs_cond_migrate_bytes(root->fs_info,
+                                              &root->fs_info->trans_block_rsv,
+                                              num_bytes, 5);
+               if (ret) {
+                       btrfs_end_transaction(trans, root);
+                       return ERR_PTR(ret);
                }
-               BUG_ON(ret == 0); /* Corruption */
-               if (check_path_shared(root, path))
-                       goto out;
-               btrfs_release_path(path);
-       }
-
-       if (!check_link) {
-               err = 0;
-               goto out;
-       }
-
-       di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
-                               dentry->d_name.name, dentry->d_name.len, 0);
-       if (IS_ERR(di)) {
-               err = PTR_ERR(di);
-               goto out;
-       }
-       if (di) {
-               if (check_path_shared(root, path))
-                       goto out;
-       } else {
-               err = 0;
-               goto out;
-       }
-       btrfs_release_path(path);
-
-       ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
-                                       dentry->d_name.len, ino, dir_ino, 0,
-                                       &index);
-       if (ret) {
-               err = ret;
-               goto out;
-       }
-
-       if (check_path_shared(root, path))
-               goto out;
-
-       btrfs_release_path(path);
-
-       /*
-        * This is a commit root search, if we can lookup inode item and other
-        * relative items in the commit root, it means the transaction of
-        * dir/file creation has been committed, and the dir index item that we
-        * delay to insert has also been inserted into the commit root. So
-        * we needn't worry about the delayed insertion of the dir index item
-        * here.
-        */
-       di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
-                               dentry->d_name.name, dentry->d_name.len, 0);
-       if (IS_ERR(di)) {
-               err = PTR_ERR(di);
-               goto out;
-       }
-       BUG_ON(ret == -ENOENT);
-       if (check_path_shared(root, path))
-               goto out;
-
-       err = 0;
-out:
-       btrfs_free_path(path);
-       /* Migrate the orphan reservation over */
-       if (!err)
-               err = btrfs_block_rsv_migrate(trans->block_rsv,
-                               &root->fs_info->global_block_rsv,
-                               trans->bytes_reserved);
-
-       if (err) {
-               btrfs_end_transaction(trans, root);
-               root->fs_info->enospc_unlink = 0;
-               return ERR_PTR(err);
-       }
-
-       trans->block_rsv = &root->fs_info->global_block_rsv;
-       return trans;
-}
-
-static void __unlink_end_trans(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root)
-{
-       if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
-               btrfs_block_rsv_release(root, trans->block_rsv,
-                                       trans->bytes_reserved);
                trans->block_rsv = &root->fs_info->trans_block_rsv;
-               BUG_ON(!root->fs_info->enospc_unlink);
-               root->fs_info->enospc_unlink = 0;
+               trans->bytes_reserved = num_bytes;
        }
-       btrfs_end_transaction(trans, root);
+       return trans;
 }
 
 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3865,7 +3731,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
        struct inode *inode = dentry->d_inode;
        int ret;
 
-       trans = __unlink_start_trans(dir, dentry);
+       trans = __unlink_start_trans(dir);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
@@ -3883,7 +3749,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
        }
 
 out:
-       __unlink_end_trans(trans, root);
+       btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root);
        return ret;
 }
@@ -3980,7 +3846,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
                return -EPERM;
 
-       trans = __unlink_start_trans(dir, dentry);
+       trans = __unlink_start_trans(dir);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
@@ -4002,7 +3868,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        if (!err)
                btrfs_i_size_write(inode, 0);
 out:
-       __unlink_end_trans(trans, root);
+       btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root);
 
        return err;
@@ -4178,8 +4044,7 @@ search_again:
                                }
                                size =
                                    btrfs_file_extent_calc_inline_size(size);
-                               btrfs_truncate_item(trans, root, path,
-                                                   size, 1);
+                               btrfs_truncate_item(root, path, size, 1);
                        } else if (root->ref_cows) {
                                inode_sub_bytes(inode, item_end + 1 -
                                                found_key.offset);
@@ -4714,6 +4579,7 @@ void btrfs_evict_inode(struct inode *inode)
        btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root);
 no_delete:
+       btrfs_remove_delayed_node(inode);
        clear_inode(inode);
        return;
 }
@@ -4807,11 +4673,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
                goto out;
        }
 
-       if (btrfs_root_refs(&new_root->root_item) == 0) {
-               err = -ENOENT;
-               goto out;
-       }
-
        *sub_root = new_root;
        location->objectid = btrfs_root_dirid(&new_root->root_item);
        location->type = BTRFS_INODE_ITEM_KEY;
@@ -4829,14 +4690,13 @@ static void inode_tree_add(struct inode *inode)
        struct rb_node **p;
        struct rb_node *parent;
        u64 ino = btrfs_ino(inode);
-again:
-       p = &root->inode_tree.rb_node;
-       parent = NULL;
 
        if (inode_unhashed(inode))
                return;
-
+again:
+       parent = NULL;
        spin_lock(&root->inode_lock);
+       p = &root->inode_tree.rb_node;
        while (*p) {
                parent = *p;
                entry = rb_entry(parent, struct btrfs_inode, rb_node);
@@ -5337,7 +5197,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
  * FIXME, needs more benchmarking...there are no reasons other than performance
  * to keep or drop this code.
  */
-int btrfs_dirty_inode(struct inode *inode)
+static int btrfs_dirty_inode(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
@@ -6505,7 +6365,9 @@ out:
  * block must be cow'd
  */
 static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
-                                     struct inode *inode, u64 offset, u64 len)
+                                     struct inode *inode, u64 offset, u64 *len,
+                                     u64 *orig_start, u64 *orig_block_len,
+                                     u64 *ram_bytes)
 {
        struct btrfs_path *path;
        int ret;
@@ -6562,8 +6424,12 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
        backref_offset = btrfs_file_extent_offset(leaf, fi);
 
+       *orig_start = key.offset - backref_offset;
+       *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+       *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+
        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
-       if (extent_end < offset + len) {
+       if (extent_end < offset + *len) {
                /* extent doesn't include our full range, must cow */
                goto out;
        }
@@ -6587,13 +6453,14 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
         */
        disk_bytenr += backref_offset;
        disk_bytenr += offset - key.offset;
-       num_bytes = min(offset + len, extent_end) - offset;
+       num_bytes = min(offset + *len, extent_end) - offset;
        if (csum_exist_in_range(root, disk_bytenr, num_bytes))
                                goto out;
        /*
         * all of the above have passed, it is safe to overwrite this extent
         * without cow
         */
+       *len = num_bytes;
        ret = 1;
 out:
        btrfs_free_path(path);
@@ -6791,7 +6658,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
             em->block_start != EXTENT_MAP_HOLE)) {
                int type;
                int ret;
-               u64 block_start;
+               u64 block_start, orig_start, orig_block_len, ram_bytes;
 
                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
                        type = BTRFS_ORDERED_PREALLOC;
@@ -6809,11 +6676,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                if (IS_ERR(trans))
                        goto must_cow;
 
-               if (can_nocow_odirect(trans, inode, start, len) == 1) {
-                       u64 orig_start = em->orig_start;
-                       u64 orig_block_len = em->orig_block_len;
-                       u64 ram_bytes = em->ram_bytes;
-
+               if (can_nocow_odirect(trans, inode, start, &len, &orig_start,
+                                     &orig_block_len, &ram_bytes) == 1) {
                        if (type == BTRFS_ORDERED_PREALLOC) {
                                free_extent_map(em);
                                em = create_pinned_em(inode, start, len,
@@ -6914,7 +6778,11 @@ struct btrfs_dio_private {
        /* IO errors */
        int errors;
 
+       /* orig_bio is our btrfs_io_bio */
        struct bio *orig_bio;
+
+       /* dio_bio came from fs/direct-io.c */
+       struct bio *dio_bio;
 };
 
 static void btrfs_endio_direct_read(struct bio *bio, int err)
@@ -6924,6 +6792,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
        struct bio_vec *bvec = bio->bi_io_vec;
        struct inode *inode = dip->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct bio *dio_bio;
        u64 start;
 
        start = dip->logical_offset;
@@ -6963,14 +6832,15 @@ failed:
 
        unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
                      dip->logical_offset + dip->bytes - 1);
-       bio->bi_private = dip->private;
+       dio_bio = dip->dio_bio;
 
        kfree(dip);
 
        /* If we had a csum failure make sure to clear the uptodate flag */
        if (err)
-               clear_bit(BIO_UPTODATE, &bio->bi_flags);
-       dio_end_io(bio, err);
+               clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+       dio_end_io(dio_bio, err);
+       bio_put(bio);
 }
 
 static void btrfs_endio_direct_write(struct bio *bio, int err)
@@ -6981,6 +6851,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
        struct btrfs_ordered_extent *ordered = NULL;
        u64 ordered_offset = dip->logical_offset;
        u64 ordered_bytes = dip->bytes;
+       struct bio *dio_bio;
        int ret;
 
        if (err)
@@ -7008,14 +6879,15 @@ out_test:
                goto again;
        }
 out_done:
-       bio->bi_private = dip->private;
+       dio_bio = dip->dio_bio;
 
        kfree(dip);
 
        /* If we had an error make sure to clear the uptodate flag */
        if (err)
-               clear_bit(BIO_UPTODATE, &bio->bi_flags);
-       dio_end_io(bio, err);
+               clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+       dio_end_io(dio_bio, err);
+       bio_put(bio);
 }
 
 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
@@ -7051,10 +6923,10 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
        if (!atomic_dec_and_test(&dip->pending_bios))
                goto out;
 
-       if (dip->errors)
+       if (dip->errors) {
                bio_io_error(dip->orig_bio);
-       else {
-               set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
+       else {
+               set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
                bio_endio(dip->orig_bio, 0);
        }
 out:
@@ -7229,25 +7101,34 @@ out_err:
        return 0;
 }
 
-static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
-                               loff_t file_offset)
+static void btrfs_submit_direct(int rw, struct bio *dio_bio,
+                               struct inode *inode, loff_t file_offset)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_dio_private *dip;
-       struct bio_vec *bvec = bio->bi_io_vec;
+       struct bio_vec *bvec = dio_bio->bi_io_vec;
+       struct bio *io_bio;
        int skip_sum;
        int write = rw & REQ_WRITE;
        int ret = 0;
 
        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
+       io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
+
+       if (!io_bio) {
+               ret = -ENOMEM;
+               goto free_ordered;
+       }
+
        dip = kmalloc(sizeof(*dip), GFP_NOFS);
        if (!dip) {
                ret = -ENOMEM;
-               goto free_ordered;
+               goto free_io_bio;
        }
 
-       dip->private = bio->bi_private;
+       dip->private = dio_bio->bi_private;
+       io_bio->bi_private = dio_bio->bi_private;
        dip->inode = inode;
        dip->logical_offset = file_offset;
 
@@ -7255,22 +7136,27 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
        do {
                dip->bytes += bvec->bv_len;
                bvec++;
-       } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+       } while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1));
 
-       dip->disk_bytenr = (u64)bio->bi_sector << 9;
-       bio->bi_private = dip;
+       dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
+       io_bio->bi_private = dip;
        dip->errors = 0;
-       dip->orig_bio = bio;
+       dip->orig_bio = io_bio;
+       dip->dio_bio = dio_bio;
        atomic_set(&dip->pending_bios, 0);
 
        if (write)
-               bio->bi_end_io = btrfs_endio_direct_write;
+               io_bio->bi_end_io = btrfs_endio_direct_write;
        else
-               bio->bi_end_io = btrfs_endio_direct_read;
+               io_bio->bi_end_io = btrfs_endio_direct_read;
 
        ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
        if (!ret)
                return;
+
+free_io_bio:
+       bio_put(io_bio);
+
 free_ordered:
        /*
         * If this is a write, we need to clean up the reserved space and kill
@@ -7286,7 +7172,7 @@ free_ordered:
                btrfs_put_ordered_extent(ordered);
                btrfs_put_ordered_extent(ordered);
        }
-       bio_endio(bio, ret);
+       bio_endio(dio_bio, ret);
 }
 
 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
@@ -7428,8 +7314,8 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
        return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
 }
 
-int btrfs_writepages(struct address_space *mapping,
-                    struct writeback_control *wbc)
+static int btrfs_writepages(struct address_space *mapping,
+                           struct writeback_control *wbc)
 {
        struct extent_io_tree *tree;
 
@@ -7937,9 +7823,9 @@ void btrfs_destroy_inode(struct inode *inode)
         */
        smp_mb();
        if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
-               spin_lock(&root->fs_info->ordered_extent_lock);
+               spin_lock(&root->fs_info->ordered_root_lock);
                list_del_init(&BTRFS_I(inode)->ordered_operations);
-               spin_unlock(&root->fs_info->ordered_extent_lock);
+               spin_unlock(&root->fs_info->ordered_root_lock);
        }
 
        if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
@@ -7965,7 +7851,6 @@ void btrfs_destroy_inode(struct inode *inode)
        inode_tree_del(inode);
        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 free:
-       btrfs_remove_delayed_node(inode);
        call_rcu(&inode->i_rcu, btrfs_i_callback);
 }
 
@@ -7973,6 +7858,9 @@ int btrfs_drop_inode(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
 
+       if (root == NULL)
+               return 1;
+
        /* the snap/subvol tree is on deleting */
        if (btrfs_root_refs(&root->root_item) == 0 &&
            root != root->fs_info->tree_root)
@@ -8307,7 +8195,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 {
        struct btrfs_inode *binode;
        struct inode *inode;
@@ -8316,30 +8204,23 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
        struct list_head splice;
        int ret = 0;
 
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
-
        INIT_LIST_HEAD(&works);
        INIT_LIST_HEAD(&splice);
 
-       spin_lock(&root->fs_info->delalloc_lock);
-       list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+       spin_lock(&root->delalloc_lock);
+       list_splice_init(&root->delalloc_inodes, &splice);
        while (!list_empty(&splice)) {
                binode = list_entry(splice.next, struct btrfs_inode,
                                    delalloc_inodes);
 
-               list_del_init(&binode->delalloc_inodes);
-
+               list_move_tail(&binode->delalloc_inodes,
+                              &root->delalloc_inodes);
                inode = igrab(&binode->vfs_inode);
                if (!inode) {
-                       clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                                 &binode->runtime_flags);
+                       cond_resched_lock(&root->delalloc_lock);
                        continue;
                }
-
-               list_add_tail(&binode->delalloc_inodes,
-                             &root->fs_info->delalloc_inodes);
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_unlock(&root->delalloc_lock);
 
                work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
                if (unlikely(!work)) {
@@ -8351,16 +8232,39 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
                                   &work->work);
 
                cond_resched();
-               spin_lock(&root->fs_info->delalloc_lock);
+               spin_lock(&root->delalloc_lock);
        }
-       spin_unlock(&root->fs_info->delalloc_lock);
+       spin_unlock(&root->delalloc_lock);
 
        list_for_each_entry_safe(work, next, &works, list) {
                list_del_init(&work->list);
                btrfs_wait_and_free_delalloc_work(work);
        }
+       return 0;
+out:
+       list_for_each_entry_safe(work, next, &works, list) {
+               list_del_init(&work->list);
+               btrfs_wait_and_free_delalloc_work(work);
+       }
+
+       if (!list_empty_careful(&splice)) {
+               spin_lock(&root->delalloc_lock);
+               list_splice_tail(&splice, &root->delalloc_inodes);
+               spin_unlock(&root->delalloc_lock);
+       }
+       return ret;
+}
 
-       /* the filemap_flush will queue IO into the worker threads, but
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+{
+       int ret;
+
+       if (root->fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
+       ret = __start_delalloc_inodes(root, delay_iput);
+       /*
+        * the filemap_flush will queue IO into the worker threads, but
         * we have to make sure the IO is actually started and that
         * ordered extents get created before we return
         */
@@ -8372,17 +8276,55 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
                    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
        }
        atomic_dec(&root->fs_info->async_submit_draining);
-       return 0;
-out:
-       list_for_each_entry_safe(work, next, &works, list) {
-               list_del_init(&work->list);
-               btrfs_wait_and_free_delalloc_work(work);
+       return ret;
+}
+
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+                                   int delay_iput)
+{
+       struct btrfs_root *root;
+       struct list_head splice;
+       int ret;
+
+       if (fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
+       INIT_LIST_HEAD(&splice);
+
+       spin_lock(&fs_info->delalloc_root_lock);
+       list_splice_init(&fs_info->delalloc_roots, &splice);
+       while (!list_empty(&splice)) {
+               root = list_first_entry(&splice, struct btrfs_root,
+                                       delalloc_root);
+               root = btrfs_grab_fs_root(root);
+               BUG_ON(!root);
+               list_move_tail(&root->delalloc_root,
+                              &fs_info->delalloc_roots);
+               spin_unlock(&fs_info->delalloc_root_lock);
+
+               ret = __start_delalloc_inodes(root, delay_iput);
+               btrfs_put_fs_root(root);
+               if (ret)
+                       goto out;
+
+               spin_lock(&fs_info->delalloc_root_lock);
        }
+       spin_unlock(&fs_info->delalloc_root_lock);
 
+       atomic_inc(&fs_info->async_submit_draining);
+       while (atomic_read(&fs_info->nr_async_submits) ||
+             atomic_read(&fs_info->async_delalloc_pages)) {
+               wait_event(fs_info->async_submit_wait,
+                  (atomic_read(&fs_info->nr_async_submits) == 0 &&
+                   atomic_read(&fs_info->async_delalloc_pages) == 0));
+       }
+       atomic_dec(&fs_info->async_submit_draining);
+       return 0;
+out:
        if (!list_empty_careful(&splice)) {
-               spin_lock(&root->fs_info->delalloc_lock);
-               list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_lock(&fs_info->delalloc_root_lock);
+               list_splice_tail(&splice, &fs_info->delalloc_roots);
+               spin_unlock(&fs_info->delalloc_root_lock);
        }
        return ret;
 }