]> git.proxmox.com Git - mirror_ubuntu-hirsute-kernel.git/commitdiff
Btrfs: throttle delayed refs better
authorJosef Bacik <jbacik@fb.com>
Thu, 23 Jan 2014 15:54:11 +0000 (10:54 -0500)
committerChris Mason <clm@fb.com>
Tue, 28 Jan 2014 21:20:26 +0000 (13:20 -0800)
On one of our gluster clusters we noticed some pretty big lag spikes.  This
turned out to be because our transaction commit was taking like 3 minutes to
complete.  This is because we have like 30 gigs of metadata, so our global
reserve would end up being the max which is like 512 mb.  So our throttling code
would allow a ridiculous amount of delayed refs to build up and then they'd all
get run at transaction commit time, and for a cold mounted file system that
could take up to 3 minutes to run.  So fix the throttling to be based on both
the size of the global reserve and how long it takes us to run delayed refs.
This patch tracks the time it takes to run delayed refs and then only allows 1
seconds worth of outstanding delayed refs at a time.  This way it will auto-tune
itself from cold cache up to when everything is in memory and it no longer has
to go to disk.  This makes our transaction commits take much less time to run.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/transaction.c

index 3cebb4aeddc72061518d3e5fd026185b956a41d3..ca6bcc33d033713d6994bd1963b27d34184771fc 100644 (file)
@@ -1360,6 +1360,7 @@ struct btrfs_fs_info {
 
        u64 generation;
        u64 last_trans_committed;
+       u64 avg_delayed_ref_runtime;
 
        /*
         * this is updated to the current trans every time a full commit
@@ -3172,6 +3173,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
 
 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root);
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root);
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, unsigned long count);
index ed23127a4b02e11b81327c24713a836ff60431bf..f0e7bbe148239188f528a3467330b51ba7c87113 100644 (file)
@@ -2185,7 +2185,7 @@ int open_ctree(struct super_block *sb,
        fs_info->free_chunk_space = 0;
        fs_info->tree_mod_log = RB_ROOT;
        fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
-
+       fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64);
        /* readahead state */
        INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
        spin_lock_init(&fs_info->reada_lock);
index c77156c77de7d21429fe3bf3eaa1fe313f3408b3..b5322596d60bbaa8e20871ccf18522fa68aff5e5 100644 (file)
@@ -2322,8 +2322,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_head *locked_ref = NULL;
        struct btrfs_delayed_extent_op *extent_op;
        struct btrfs_fs_info *fs_info = root->fs_info;
+       ktime_t start = ktime_get();
        int ret;
        unsigned long count = 0;
+       unsigned long actual_count = 0;
        int must_insert_reserved = 0;
 
        delayed_refs = &trans->transaction->delayed_refs;
@@ -2452,6 +2454,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                                 &delayed_refs->href_root);
                        spin_unlock(&delayed_refs->lock);
                } else {
+                       actual_count++;
                        ref->in_tree = 0;
                        rb_erase(&ref->rb_node, &locked_ref->ref_root);
                }
@@ -2502,6 +2505,26 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                count++;
                cond_resched();
        }
+
+       /*
+        * We don't want to include ref heads since we can have empty ref heads
+        * and those will drastically skew our runtime down since we just do
+        * accounting, no actual extent tree updates.
+        */
+       if (actual_count > 0) {
+               u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
+               u64 avg;
+
+               /*
+                * We weigh the current average higher than our current runtime
+                * to avoid large swings in the average.
+                */
+               spin_lock(&delayed_refs->lock);
+               avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
+               avg = div64_u64(avg, 4);
+               fs_info->avg_delayed_ref_runtime = avg;
+               spin_unlock(&delayed_refs->lock);
+       }
        return 0;
 }
 
@@ -2600,7 +2623,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
        return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
 }
 
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root)
 {
        struct btrfs_block_rsv *global_rsv;
@@ -2629,6 +2652,22 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
        return ret;
 }
 
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       u64 num_entries =
+               atomic_read(&trans->transaction->delayed_refs.num_entries);
+       u64 avg_runtime;
+
+       smp_mb();
+       avg_runtime = fs_info->avg_delayed_ref_runtime;
+       if (num_entries * avg_runtime >= NSEC_PER_SEC)
+               return 1;
+
+       return btrfs_check_space_for_delayed_refs(trans, root);
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
index fd1446496fe8da2b4c9ed8a65fa205d3bfeaba4d..5e2bfdaf8d14bc0c69e8330255454cf284b7d01b 100644 (file)
@@ -645,7 +645,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root)
 {
        if (root->fs_info->global_block_rsv.space_info->full &&
-           btrfs_should_throttle_delayed_refs(trans, root))
+           btrfs_check_space_for_delayed_refs(trans, root))
                return 1;
 
        return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
@@ -710,7 +710,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 
        trans->delayed_ref_updates = 0;
        if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) {
-               cur = max_t(unsigned long, cur, 1);
+               cur = max_t(unsigned long, cur, 32);
                trans->delayed_ref_updates = 0;
                btrfs_run_delayed_refs(trans, root, cur);
        }