git.proxmox.com Git - mirror_ubuntu-eoan-kernel.git/blobdiff - fs/btrfs/extent-tree.c
Btrfs: fix heavy delalloc related deadlock
[mirror_ubuntu-eoan-kernel.git] / fs / btrfs / extent-tree.c
index 0236de711989097bbf5191dbb6871281d840de22..277d2c26b03438d6894b60bedc8da9836dd9d6f5 100644 (file)
@@ -113,7 +113,8 @@ static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
 {
        smp_mb();
-       return cache->cached == BTRFS_CACHE_FINISHED;
+       return cache->cached == BTRFS_CACHE_FINISHED ||
+               cache->cached == BTRFS_CACHE_ERROR;
 }
 
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
@@ -389,7 +390,7 @@ static noinline void caching_thread(struct btrfs_work *work)
        u64 total_found = 0;
        u64 last = 0;
        u32 nritems;
-       int ret = 0;
+       int ret = -ENOMEM;
 
        caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
@@ -420,6 +421,7 @@ again:
        /* need to make sure the commit_root doesn't disappear */
        down_read(&fs_info->extent_commit_sem);
 
+next:
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto err;
@@ -459,6 +461,16 @@ again:
                        continue;
                }
 
+               if (key.objectid < last) {
+                       key.objectid = last;
+                       key.offset = 0;
+                       key.type = BTRFS_EXTENT_ITEM_KEY;
+
+                       caching_ctl->progress = last;
+                       btrfs_release_path(path);
+                       goto next;
+               }
+
                if (key.objectid < block_group->key.objectid) {
                        path->slots[0]++;
                        continue;
@@ -506,6 +518,12 @@ err:
 
        mutex_unlock(&caching_ctl->mutex);
 out:
+       if (ret) {
+               spin_lock(&block_group->lock);
+               block_group->caching_ctl = NULL;
+               block_group->cached = BTRFS_CACHE_ERROR;
+               spin_unlock(&block_group->lock);
+       }
        wake_up(&caching_ctl->wait);
 
        put_caching_control(caching_ctl);
@@ -771,10 +789,23 @@ again:
                goto out_free;
 
        if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
-               key.type = BTRFS_EXTENT_ITEM_KEY;
-               key.offset = root->leafsize;
-               btrfs_release_path(path);
-               goto again;
+               metadata = 0;
+               if (path->slots[0]) {
+                       path->slots[0]--;
+                       btrfs_item_key_to_cpu(path->nodes[0], &key,
+                                             path->slots[0]);
+                       if (key.objectid == bytenr &&
+                           key.type == BTRFS_EXTENT_ITEM_KEY &&
+                           key.offset == root->leafsize)
+                               ret = 0;
+               }
+               if (ret) {
+                       key.objectid = bytenr;
+                       key.type = BTRFS_EXTENT_ITEM_KEY;
+                       key.offset = root->leafsize;
+                       btrfs_release_path(path);
+                       goto again;
+               }
        }
 
        if (ret == 0) {
@@ -2011,6 +2042,8 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
        ins.type = BTRFS_EXTENT_ITEM_KEY;
 
        ref = btrfs_delayed_node_to_data_ref(node);
+       trace_run_delayed_data_ref(node, ref, node->action);
+
        if (node->type == BTRFS_SHARED_DATA_REF_KEY)
                parent = ref->parent;
        else
@@ -2154,6 +2187,8 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
                                                 SKINNY_METADATA);
 
        ref = btrfs_delayed_node_to_tree_ref(node);
+       trace_run_delayed_tree_ref(node, ref, node->action);
+
        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
                parent = ref->parent;
        else
@@ -2212,6 +2247,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                 */
                BUG_ON(extent_op);
                head = btrfs_delayed_node_to_head(node);
+               trace_run_delayed_ref_head(node, head, node->action);
+
                if (insert_reserved) {
                        btrfs_pin_extent(root, node->bytenr,
                                         node->num_bytes, 1);
@@ -3799,8 +3836,12 @@ again:
        if (force < space_info->force_alloc)
                force = space_info->force_alloc;
        if (space_info->full) {
+               if (should_alloc_chunk(extent_root, space_info, force))
+                       ret = -ENOSPC;
+               else
+                       ret = 0;
                spin_unlock(&space_info->lock);
-               return 0;
+               return ret;
        }
 
        if (!should_alloc_chunk(extent_root, space_info, force)) {
@@ -4729,10 +4770,12 @@ void btrfs_orphan_release_metadata(struct inode *inode)
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                                     struct btrfs_block_rsv *rsv,
                                     int items,
-                                    u64 *qgroup_reserved)
+                                    u64 *qgroup_reserved,
+                                    bool use_global_rsv)
 {
        u64 num_bytes;
        int ret;
+       struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 
        if (root->fs_info->quota_enabled) {
                /* One for parent inode, two for dir entries */
@@ -4751,6 +4794,10 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                                            BTRFS_BLOCK_GROUP_METADATA);
        ret = btrfs_block_rsv_add(root, rsv, num_bytes,
                                  BTRFS_RESERVE_FLUSH_ALL);
+
+       if (ret == -ENOSPC && use_global_rsv)
+               ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
+
        if (ret) {
                if (*qgroup_reserved)
                        btrfs_qgroup_free(root, *qgroup_reserved);
@@ -5999,8 +6046,11 @@ static u64 stripe_align(struct btrfs_root *root,
  * for our min num_bytes.  Another option is to have it go ahead
  * and look in the rbtree for a free extent of a given size, but this
  * is a good start.
+ *
+ * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
+ * any of the information in this block group.
  */
-static noinline int
+static noinline void
 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
                                u64 num_bytes)
 {
@@ -6008,28 +6058,29 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
 
        caching_ctl = get_caching_control(cache);
        if (!caching_ctl)
-               return 0;
+               return;
 
        wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
                   (cache->free_space_ctl->free_space >= num_bytes));
 
        put_caching_control(caching_ctl);
-       return 0;
 }
 
 static noinline int
 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 {
        struct btrfs_caching_control *caching_ctl;
+       int ret = 0;
 
        caching_ctl = get_caching_control(cache);
        if (!caching_ctl)
-               return 0;
+               return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
 
        wait_event(caching_ctl->wait, block_group_cache_done(cache));
-
+       if (cache->cached == BTRFS_CACHE_ERROR)
+               ret = -EIO;
        put_caching_control(caching_ctl);
-       return 0;
+       return ret;
 }
 
 int __get_raid_index(u64 flags)
@@ -6212,6 +6263,8 @@ have_block_group:
                        ret = 0;
                }
 
+               if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
+                       goto loop;
                if (unlikely(block_group->ro))
                        goto loop;
 
@@ -7173,6 +7226,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                next = btrfs_find_create_tree_block(root, bytenr, blocksize);
                if (!next)
                        return -ENOMEM;
+               btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
+                                              level - 1);
                reada = 1;
        }
        btrfs_tree_lock(next);
@@ -7466,6 +7521,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        int err = 0;
        int ret;
        int level;
+       bool root_dropped = false;
 
        path = btrfs_alloc_path();
        if (!path) {
@@ -7523,6 +7579,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                while (1) {
                        btrfs_tree_lock(path->nodes[level]);
                        btrfs_set_lock_blocking(path->nodes[level]);
+                       path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
                        ret = btrfs_lookup_extent_info(trans, root,
                                                path->nodes[level]->start,
@@ -7538,6 +7595,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                                break;
 
                        btrfs_tree_unlock(path->nodes[level]);
+                       path->locks[level] = 0;
                        WARN_ON(wc->refs[level] != 1);
                        level--;
                }
@@ -7552,11 +7610,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
        while (1) {
-               if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
-                       pr_debug("btrfs: drop snapshot early exit\n");
-                       err = -EAGAIN;
-                       goto out_end_trans;
-               }
 
                ret = walk_down_tree(trans, root, path, wc);
                if (ret < 0) {
@@ -7584,7 +7637,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                }
 
                BUG_ON(wc->level == 0);
-               if (btrfs_should_end_transaction(trans, tree_root)) {
+               if (btrfs_should_end_transaction(trans, tree_root) ||
+                   (!for_reloc && btrfs_need_cleaner_sleep(root))) {
                        ret = btrfs_update_root(trans, tree_root,
                                                &root->root_key,
                                                root_item);
@@ -7595,6 +7649,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                        }
 
                        btrfs_end_transaction_throttle(trans, tree_root);
+                       if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
+                               pr_debug("btrfs: drop snapshot early exit\n");
+                               err = -EAGAIN;
+                               goto out_free;
+                       }
+
                        trans = btrfs_start_transaction(tree_root, 0);
                        if (IS_ERR(trans)) {
                                err = PTR_ERR(trans);
@@ -7639,12 +7699,22 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                free_extent_buffer(root->commit_root);
                btrfs_put_fs_root(root);
        }
+       root_dropped = true;
 out_end_trans:
        btrfs_end_transaction_throttle(trans, tree_root);
 out_free:
        kfree(wc);
        btrfs_free_path(path);
 out:
+       /*
+        * So if we need to stop dropping the snapshot for whatever reason we
+        * need to make sure to add it back to the dead root list so that we
+        * keep trying to do the work later.  This also cleans up roots if we
+        * don't have it in the radix (like when we recover after a power fail
+        * or unmount) so we don't leak memory.
+        */
+       if (!for_reloc && root_dropped == false)
+               btrfs_add_dead_root(root);
        if (err)
                btrfs_std_error(root->fs_info, err);
        return err;
@@ -8177,7 +8247,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
                 * We haven't cached this block group, which means we could
                 * possibly have excluded extents on this block group.
                 */
-               if (block_group->cached == BTRFS_CACHE_NO)
+               if (block_group->cached == BTRFS_CACHE_NO ||
+                   block_group->cached == BTRFS_CACHE_ERROR)
                        free_excluded_extents(info->extent_root, block_group);
 
                btrfs_remove_free_space_cache(block_group);
@@ -8394,9 +8465,13 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                 * avoid allocating from un-mirrored block group if there are
                 * mirrored block groups.
                 */
-               list_for_each_entry(cache, &space_info->block_groups[3], list)
+               list_for_each_entry(cache,
+                               &space_info->block_groups[BTRFS_RAID_RAID0],
+                               list)
                        set_block_group_ro(cache, 1);
-               list_for_each_entry(cache, &space_info->block_groups[4], list)
+               list_for_each_entry(cache,
+                               &space_info->block_groups[BTRFS_RAID_SINGLE],
+                               list)
                        set_block_group_ro(cache, 1);
        }