git.proxmox.com Git - mirror_ubuntu-eoan-kernel.git/blobdiff - fs/btrfs/extent-tree.c
Btrfs: fix heavy delalloc related deadlock
[mirror_ubuntu-eoan-kernel.git] / fs / btrfs / extent-tree.c
index 0236de711989097bbf5191dbb6871281d840de22..277d2c26b03438d6894b60bedc8da9836dd9d6f5 100644 (file)
@@ -113,7 +113,8 @@ static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
 {
        smp_mb();
-       return cache->cached == BTRFS_CACHE_FINISHED;
+       return cache->cached == BTRFS_CACHE_FINISHED ||
+               cache->cached == BTRFS_CACHE_ERROR;
 }
 
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
@@ -389,7 +390,7 @@ static noinline void caching_thread(struct btrfs_work *work)
        u64 total_found = 0;
        u64 last = 0;
        u32 nritems;
-       int ret = 0;
+       int ret = -ENOMEM;
 
        caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
@@ -420,6 +421,7 @@ again:
        /* need to make sure the commit_root doesn't disappear */
        down_read(&fs_info->extent_commit_sem);
 
+next:
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto err;
@@ -459,6 +461,16 @@ again:
                        continue;
                }
 
+               if (key.objectid < last) {
+                       key.objectid = last;
+                       key.offset = 0;
+                       key.type = BTRFS_EXTENT_ITEM_KEY;
+
+                       caching_ctl->progress = last;
+                       btrfs_release_path(path);
+                       goto next;
+               }
+
                if (key.objectid < block_group->key.objectid) {
                        path->slots[0]++;
                        continue;
@@ -506,6 +518,12 @@ err:
 
        mutex_unlock(&caching_ctl->mutex);
 out:
+       if (ret) {
+               spin_lock(&block_group->lock);
+               block_group->caching_ctl = NULL;
+               block_group->cached = BTRFS_CACHE_ERROR;
+               spin_unlock(&block_group->lock);
+       }
        wake_up(&caching_ctl->wait);
 
        put_caching_control(caching_ctl);
@@ -771,10 +789,23 @@ again:
                goto out_free;
 
        if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
-               key.type = BTRFS_EXTENT_ITEM_KEY;
-               key.offset = root->leafsize;
-               btrfs_release_path(path);
-               goto again;
+               metadata = 0;
+               if (path->slots[0]) {
+                       path->slots[0]--;
+                       btrfs_item_key_to_cpu(path->nodes[0], &key,
+                                             path->slots[0]);
+                       if (key.objectid == bytenr &&
+                           key.type == BTRFS_EXTENT_ITEM_KEY &&
+                           key.offset == root->leafsize)
+                               ret = 0;
+               }
+               if (ret) {
+                       key.objectid = bytenr;
+                       key.type = BTRFS_EXTENT_ITEM_KEY;
+                       key.offset = root->leafsize;
+                       btrfs_release_path(path);
+                       goto again;
+               }
        }
 
        if (ret == 0) {
@@ -2011,6 +2042,8 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
        ins.type = BTRFS_EXTENT_ITEM_KEY;
 
        ref = btrfs_delayed_node_to_data_ref(node);
+       trace_run_delayed_data_ref(node, ref, node->action);
+
        if (node->type == BTRFS_SHARED_DATA_REF_KEY)
                parent = ref->parent;
        else
@@ -2154,6 +2187,8 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
                                                 SKINNY_METADATA);
 
        ref = btrfs_delayed_node_to_tree_ref(node);
+       trace_run_delayed_tree_ref(node, ref, node->action);
+
        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
                parent = ref->parent;
        else
@@ -2212,6 +2247,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                 */
                BUG_ON(extent_op);
                head = btrfs_delayed_node_to_head(node);
+               trace_run_delayed_ref_head(node, head, node->action);
+
                if (insert_reserved) {
                        btrfs_pin_extent(root, node->bytenr,
                                         node->num_bytes, 1);
@@ -3799,8 +3836,12 @@ again:
        if (force < space_info->force_alloc)
                force = space_info->force_alloc;
        if (space_info->full) {
+               if (should_alloc_chunk(extent_root, space_info, force))
+                       ret = -ENOSPC;
+               else
+                       ret = 0;
                spin_unlock(&space_info->lock);
-               return 0;
+               return ret;
        }
 
        if (!should_alloc_chunk(extent_root, space_info, force)) {
@@ -4729,10 +4770,12 @@ void btrfs_orphan_release_metadata(struct inode *inode)
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                                     struct btrfs_block_rsv *rsv,
                                     int items,
-                                    u64 *qgroup_reserved)
+                                    u64 *qgroup_reserved,
+                                    bool use_global_rsv)
 {
        u64 num_bytes;
        int ret;
+       struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 
        if (root->fs_info->quota_enabled) {
                /* One for parent inode, two for dir entries */
@@ -4751,6 +4794,10 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                                            BTRFS_BLOCK_GROUP_METADATA);
        ret = btrfs_block_rsv_add(root, rsv, num_bytes,
                                  BTRFS_RESERVE_FLUSH_ALL);
+
+       if (ret == -ENOSPC && use_global_rsv)
+               ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
+
        if (ret) {
                if (*qgroup_reserved)
                        btrfs_qgroup_free(root, *qgroup_reserved);
@@ -5999,8 +6046,11 @@ static u64 stripe_align(struct btrfs_root *root,
  * for our min num_bytes.  Another option is to have it go ahead
  * and look in the rbtree for a free extent of a given size, but this
  * is a good start.
+ *
+ * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
+ * any of the information in this block group.
  */
-static noinline int
+static noinline void
 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
                                u64 num_bytes)
 {
@@ -6008,28 +6058,29 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
 
        caching_ctl = get_caching_control(cache);
        if (!caching_ctl)
-               return 0;
+               return;
 
        wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
                   (cache->free_space_ctl->free_space >= num_bytes));
 
        put_caching_control(caching_ctl);
-       return 0;
 }
 
 static noinline int
 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 {
        struct btrfs_caching_control *caching_ctl;
+       int ret = 0;
 
        caching_ctl = get_caching_control(cache);
        if (!caching_ctl)
-               return 0;
+               return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
 
        wait_event(caching_ctl->wait, block_group_cache_done(cache));
-
+       if (cache->cached == BTRFS_CACHE_ERROR)
+               ret = -EIO;
        put_caching_control(caching_ctl);
-       return 0;
+       return ret;
 }
 
 int __get_raid_index(u64 flags)
@@ -6212,6 +6263,8 @@ have_block_group:
                        ret = 0;
                }
 
+               if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
+                       goto loop;
                if (unlikely(block_group->ro))
                        goto loop;
 
@@ -7173,6 +7226,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                next = btrfs_find_create_tree_block(root, bytenr, blocksize);
                if (!next)
                        return -ENOMEM;
+               btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
+                                              level - 1);
                reada = 1;
        }
        btrfs_tree_lock(next);
@@ -7466,6 +7521,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        int err = 0;
        int ret;
        int level;
+       bool root_dropped = false;
 
        path = btrfs_alloc_path();
        if (!path) {
@@ -7523,6 +7579,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                while (1) {
                        btrfs_tree_lock(path->nodes[level]);
                        btrfs_set_lock_blocking(path->nodes[level]);
+                       path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
                        ret = btrfs_lookup_extent_info(trans, root,
                                                path->nodes[level]->start,
@@ -7538,6 +7595,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                                break;
 
                        btrfs_tree_unlock(path->nodes[level]);
+                       path->locks[level] = 0;
                        WARN_ON(wc->refs[level] != 1);
                        level--;
                }
@@ -7552,11 +7610,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
        while (1) {
-               if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
-                       pr_debug("btrfs: drop snapshot early exit\n");
-                       err = -EAGAIN;
-                       goto out_end_trans;
-               }
 
                ret = walk_down_tree(trans, root, path, wc);
                if (ret < 0) {
@@ -7584,7 +7637,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                }
 
                BUG_ON(wc->level == 0);
-               if (btrfs_should_end_transaction(trans, tree_root)) {
+               if (btrfs_should_end_transaction(trans, tree_root) ||
+                   (!for_reloc && btrfs_need_cleaner_sleep(root))) {
                        ret = btrfs_update_root(trans, tree_root,
                                                &root->root_key,
                                                root_item);
@@ -7595,6 +7649,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                        }
 
                        btrfs_end_transaction_throttle(trans, tree_root);
+                       if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
+                               pr_debug("btrfs: drop snapshot early exit\n");
+                               err = -EAGAIN;
+                               goto out_free;
+                       }
+
                        trans = btrfs_start_transaction(tree_root, 0);
                        if (IS_ERR(trans)) {
                                err = PTR_ERR(trans);
@@ -7639,12 +7699,22 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                free_extent_buffer(root->commit_root);
                btrfs_put_fs_root(root);
        }
+       root_dropped = true;
 out_end_trans:
        btrfs_end_transaction_throttle(trans, tree_root);
 out_free:
        kfree(wc);
        btrfs_free_path(path);
 out:
+       /*
+        * So if we need to stop dropping the snapshot for whatever reason we
+        * need to make sure to add it back to the dead root list so that we
+        * keep trying to do the work later.  This also cleans up roots if we
+        * don't have it in the radix (like when we recover after a power fail
+        * or unmount) so we don't leak memory.
+        */
+       if (!for_reloc && root_dropped == false)
+               btrfs_add_dead_root(root);
        if (err)
                btrfs_std_error(root->fs_info, err);
        return err;
@@ -8177,7 +8247,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
                 * We haven't cached this block group, which means we could
                 * possibly have excluded extents on this block group.
                 */
-               if (block_group->cached == BTRFS_CACHE_NO)
+               if (block_group->cached == BTRFS_CACHE_NO ||
+                   block_group->cached == BTRFS_CACHE_ERROR)
                        free_excluded_extents(info->extent_root, block_group);
 
                btrfs_remove_free_space_cache(block_group);
@@ -8394,9 +8465,13 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                 * avoid allocating from un-mirrored block group if there are
                 * mirrored block groups.
                 */
-               list_for_each_entry(cache, &space_info->block_groups[3], list)
+               list_for_each_entry(cache,
+                               &space_info->block_groups[BTRFS_RAID_RAID0],
+                               list)
                        set_block_group_ro(cache, 1);
-               list_for_each_entry(cache, &space_info->block_groups[4], list)
+               list_for_each_entry(cache,
+                               &space_info->block_groups[BTRFS_RAID_SINGLE],
+                               list)
                        set_block_group_ro(cache, 1);
        }