]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/commitdiff
btrfs: do not start relocation until in progress drops are done
authorJosef Bacik <josef@toxicpanda.com>
Fri, 18 Feb 2022 19:56:10 +0000 (14:56 -0500)
committerPaolo Pisati <paolo.pisati@canonical.com>
Wed, 9 Mar 2022 14:17:59 +0000 (15:17 +0100)
BugLink: https://bugs.launchpad.net/bugs/1964361
commit b4be6aefa73c9a6899ef3ba9c5faaa8a66e333ef upstream.

We hit a bug with a recovering relocation on mount for one of our file
systems in production.  I reproduced this locally by injecting errors
into snapshot delete with balance running at the same time.  This
presented as an error while looking up an extent item

  WARNING: CPU: 5 PID: 1501 at fs/btrfs/extent-tree.c:866 lookup_inline_extent_backref+0x647/0x680
  CPU: 5 PID: 1501 Comm: btrfs-balance Not tainted 5.16.0-rc8+ #8
  RIP: 0010:lookup_inline_extent_backref+0x647/0x680
  RSP: 0018:ffffae0a023ab960 EFLAGS: 00010202
  RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000000
  RDX: 0000000000000000 RSI: 000000000000000c RDI: 0000000000000000
  RBP: ffff943fd2a39b60 R08: 0000000000000000 R09: 0000000000000001
  R10: 0001434088152de0 R11: 0000000000000000 R12: 0000000001d05000
  R13: ffff943fd2a39b60 R14: ffff943fdb96f2a0 R15: ffff9442fc923000
  FS:  0000000000000000(0000) GS:ffff944e9eb40000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007f1157b1fca8 CR3: 000000010f092000 CR4: 0000000000350ee0
  Call Trace:
   <TASK>
   insert_inline_extent_backref+0x46/0xd0
   __btrfs_inc_extent_ref.isra.0+0x5f/0x200
   ? btrfs_merge_delayed_refs+0x164/0x190
   __btrfs_run_delayed_refs+0x561/0xfa0
   ? btrfs_search_slot+0x7b4/0xb30
   ? btrfs_update_root+0x1a9/0x2c0
   btrfs_run_delayed_refs+0x73/0x1f0
   ? btrfs_update_root+0x1a9/0x2c0
   btrfs_commit_transaction+0x50/0xa50
   ? btrfs_update_reloc_root+0x122/0x220
   prepare_to_merge+0x29f/0x320
   relocate_block_group+0x2b8/0x550
   btrfs_relocate_block_group+0x1a6/0x350
   btrfs_relocate_chunk+0x27/0xe0
   btrfs_balance+0x777/0xe60
   balance_kthread+0x35/0x50
   ? btrfs_balance+0xe60/0xe60
   kthread+0x16b/0x190
   ? set_kthread_struct+0x40/0x40
   ret_from_fork+0x22/0x30
   </TASK>

Normally snapshot deletion and relocation are excluded from running at
the same time by the fs_info->cleaner_mutex.  However if we had a
pending balance waiting to get the ->cleaner_mutex, and a snapshot
deletion was running, and then the box crashed, we would come up in a
state where we have a half deleted snapshot.

Again, in the normal case the snapshot deletion needs to complete before
relocation can start, but in this case relocation could very well start
before the snapshot deletion completes, as we simply add the root to the
dead roots list and wait for the next time the cleaner runs to clean up
the snapshot.

Fix this by setting a bit on the fs_info if we have any DEAD_ROOT's that
had a pending drop_progress key.  If they do then we know we were in the
middle of the drop operation and set a flag on the fs_info.  Then
balance can wait until this flag is cleared to start up again.

If there are DEAD_ROOT's that don't have a drop_progress set then we're
safe to start balance right away as we'll be properly protected by the
cleaner_mutex.

CC: stable@vger.kernel.org # 5.10+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Paolo Pisati <paolo.pisati@canonical.com>
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/relocation.c
fs/btrfs/root-tree.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h

index ae06ad5593535be524ef9ccf20156f7192c9da41..b46409801647b414cdde7303168a7b9c6b030662 100644 (file)
@@ -593,6 +593,9 @@ enum {
        /* Indicate whether there are any tree modification log users */
        BTRFS_FS_TREE_MOD_LOG_USERS,
 
+       /* Indicate we have half completed snapshot deletions pending. */
+       BTRFS_FS_UNFINISHED_DROPS,
+
 #if BITS_PER_LONG == 32
        /* Indicate if we have error/warn message printed on 32bit systems */
        BTRFS_FS_32BIT_ERROR,
@@ -1098,8 +1101,15 @@ enum {
        BTRFS_ROOT_HAS_LOG_TREE,
        /* Qgroup flushing is in progress */
        BTRFS_ROOT_QGROUP_FLUSHING,
+       /* This root has a drop operation that was started previously. */
+       BTRFS_ROOT_UNFINISHED_DROP,
 };
 
+static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+       clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+}
+
 /*
  * Record swapped tree blocks of a subvolume tree for delayed subtree trace
  * code. For detail check comment in fs/btrfs/qgroup.c.
index 2c3e106a02704c4a3efc38ec25be02c8c48690de..2180fcef56cabb646aa092dc9f1474f7eb3b2838 100644 (file)
@@ -3659,6 +3659,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 
        set_bit(BTRFS_FS_OPEN, &fs_info->flags);
 
+       /* Kick the cleaner thread so it'll start deleting snapshots. */
+       if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
+               wake_up_process(fs_info->cleaner_kthread);
+
 clear_oneshot:
        btrfs_clear_oneshot_options(fs_info);
        return 0;
@@ -4340,6 +4344,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
         */
        kthread_park(fs_info->cleaner_kthread);
 
+       /*
+        * If we had UNFINISHED_DROPS we could still be processing them, so
+        * clear that bit and wake up relocation so it can stop.
+        */
+       btrfs_wake_unfinished_drop(fs_info);
+
        /* wait for the qgroup rescan worker to stop */
        btrfs_qgroup_wait_for_completion(fs_info, false);
 
index 87c23c5c0f26da21f6d0cc04c9c5fd35ce479a21..514adc83577f0f626a3b1e03a43fd2438d08cd59 100644 (file)
@@ -5541,6 +5541,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
        int ret;
        int level;
        bool root_dropped = false;
+       bool unfinished_drop = false;
 
        btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
 
@@ -5583,6 +5584,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
         * already dropped.
         */
        set_bit(BTRFS_ROOT_DELETING, &root->state);
+       unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+
        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
                level = btrfs_header_level(root->node);
                path->nodes[level] = btrfs_lock_root_node(root);
@@ -5757,6 +5760,13 @@ out_free:
        kfree(wc);
        btrfs_free_path(path);
 out:
+       /*
+        * We were an unfinished drop root, check to see if there are any
+        * pending, and if not clear and wake up any waiters.
+        */
+       if (!err && unfinished_drop)
+               btrfs_maybe_wake_unfinished_drop(fs_info);
+
        /*
         * So if we need to stop dropping the snapshot for whatever reason we
         * need to make sure to add it back to the dead root list so that we
index d81bee621d3738f6015338cff7231bb49adc6a38..a050f9748fa7fc8908ba4138f803d61138d74fa5 100644 (file)
@@ -3967,6 +3967,19 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
        int rw = 0;
        int err = 0;
 
+       /*
+        * This only gets set if we had a half-deleted snapshot on mount.  We
+        * cannot allow relocation to start while we're still trying to clean up
+        * these pending deletions.
+        */
+       ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
+       if (ret)
+               return ret;
+
+       /* We may have been woken up by close_ctree, so bail if we're closing. */
+       if (btrfs_fs_closing(fs_info))
+               return -EINTR;
+
        bg = btrfs_lookup_block_group(fs_info, group_start);
        if (!bg)
                return -ENOENT;
index db37a37996497828fa6124df02547fc6e03a9a34..1fa0e5e2e3505e85f7fc2da0a65a520c553e6f47 100644 (file)
@@ -280,6 +280,21 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
 
                WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
                if (btrfs_root_refs(&root->root_item) == 0) {
+                       struct btrfs_key drop_key;
+
+                       btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress);
+                       /*
+                        * If we have a non-zero drop_progress then we know we
+                        * made it partly through deleting this snapshot, and
+                        * thus we need to make sure we block any balance from
+                        * happening until this snapshot is completely dropped.
+                        */
+                       if (drop_key.objectid != 0 || drop_key.type != 0 ||
+                           drop_key.offset != 0) {
+                               set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+                               set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+                       }
+
                        set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
                        btrfs_add_dead_root(root);
                }
index 3c5b1f72129acf05bb7c3bb0f60aef0f4794cc02..9a6009108ea5592c8da9ce4bd7b4d3bf141282d9 100644 (file)
@@ -1340,6 +1340,32 @@ again:
        return 0;
 }
 
+/*
+ * If we had a pending drop we need to see if there are any others left in our
+ * dead roots list, and if not clear our bit and wake any waiters.
+ */
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+       /*
+        * We put the drop in progress roots at the front of the list, so if the
+        * first entry doesn't have UNFINISHED_DROP set we can wake everybody
+        * up.
+        */
+       spin_lock(&fs_info->trans_lock);
+       if (!list_empty(&fs_info->dead_roots)) {
+               struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
+                                                          struct btrfs_root,
+                                                          root_list);
+               if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
+                       spin_unlock(&fs_info->trans_lock);
+                       return;
+               }
+       }
+       spin_unlock(&fs_info->trans_lock);
+
+       btrfs_wake_unfinished_drop(fs_info);
+}
+
 /*
  * dead roots are old snapshots that need to be deleted.  This allocates
  * a dirty root struct and adds it into the list of dead roots that need to
@@ -1352,7 +1378,12 @@ void btrfs_add_dead_root(struct btrfs_root *root)
        spin_lock(&fs_info->trans_lock);
        if (list_empty(&root->root_list)) {
                btrfs_grab_root(root);
-               list_add_tail(&root->root_list, &fs_info->dead_roots);
+
+               /* We want to process the partially complete drops first. */
+               if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
+                       list_add(&root->root_list, &fs_info->dead_roots);
+               else
+                       list_add_tail(&root->root_list, &fs_info->dead_roots);
        }
        spin_unlock(&fs_info->trans_lock);
 }
index eba07b8119bbd728369c272da74743382b3036ce..0ded32bbd001ec3a67a599183671b8a394bfda0b 100644 (file)
@@ -217,6 +217,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
 
 void btrfs_add_dead_root(struct btrfs_root *root);
 int btrfs_defrag_root(struct btrfs_root *root);
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
 int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
 int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);