Btrfs: allow clear_extent_dirty() to receive a cached extent state record

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a1febf155747e05c21212b65bf2d3ff758f37868..b32ccdbcc0bc9e812d21110c6710ddedfb4f8302 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -51,6 +51,24 @@ enum {
        CHUNK_ALLOC_FORCE = 2,
 };
 
+/*
+ * Declare a helper function to detect underflow of various space info members
+ */
+#define DECLARE_SPACE_INFO_UPDATE(name)                                        \
+static inline void update_##name(struct btrfs_space_info *sinfo,       \
+                                s64 bytes)                             \
+{                                                                      \
+       if (bytes < 0 && sinfo->name < -bytes) {                        \
+               WARN_ON(1);                                             \
+               sinfo->name = 0;                                        \
+               return;                                                 \
+       }                                                               \
+       sinfo->name += bytes;                                           \
+}
+
+DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
+DECLARE_SPACE_INFO_UPDATE(bytes_pinned);
+
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                               struct btrfs_delayed_ref_node *node, u64 parent,
                               u64 root_objectid, u64 owner_objectid,
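For illustration, here is a minimal user-space sketch of the helper that
DECLARE_SPACE_INFO_UPDATE(bytes_may_use) generates. struct space_info_stub and
the fprintf() diagnostic are stand-ins for the kernel's struct btrfs_space_info
and WARN_ON(1); they are not part of the patch.

    #include <stdio.h>
    #include <stdint.h>

    struct space_info_stub {            /* hypothetical stand-in */
            uint64_t bytes_may_use;
    };

    static inline void update_bytes_may_use(struct space_info_stub *sinfo,
                                            int64_t bytes)
    {
            /* Clamp to zero instead of wrapping around on underflow. */
            if (bytes < 0 && sinfo->bytes_may_use < (uint64_t)-bytes) {
                    fprintf(stderr, "warning: bytes_may_use underflow\n");
                    sinfo->bytes_may_use = 0;
                    return;
            }
            sinfo->bytes_may_use += bytes;
    }

    int main(void)
    {
            struct space_info_stub si = { .bytes_may_use = 4096 };

            update_bytes_may_use(&si, 8192);    /* reserve: 4096 -> 12288 */
            update_bytes_may_use(&si, -16384);  /* over-release: warns, clamps to 0 */
            printf("bytes_may_use = %llu\n",
                   (unsigned long long)si.bytes_may_use);
            return 0;
    }

The open-coded WARN_ON()/clamp sequence removed a few hunks below is exactly
what the generated update_bytes_may_use(data_sinfo, -len) now does.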
@@ -4256,7 +4274,7 @@ commit_trans:
                                              data_sinfo->flags, bytes, 1);
                return -ENOSPC;
        }
-       data_sinfo->bytes_may_use += bytes;
+       update_bytes_may_use(data_sinfo, bytes);
        trace_btrfs_space_reservation(fs_info, "space_info",
                                      data_sinfo->flags, bytes, 1);
        spin_unlock(&data_sinfo->lock);
@@ -4309,10 +4327,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
 
        data_sinfo = fs_info->data_sinfo;
        spin_lock(&data_sinfo->lock);
-       if (WARN_ON(data_sinfo->bytes_may_use < len))
-               data_sinfo->bytes_may_use = 0;
-       else
-               data_sinfo->bytes_may_use -= len;
+       update_bytes_may_use(data_sinfo, -len);
        trace_btrfs_space_reservation(fs_info, "space_info",
                                      data_sinfo->flags, len, 0);
        spin_unlock(&data_sinfo->lock);
@@ -5108,7 +5123,7 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
                list_del_init(&ticket->list);
        if (ticket->bytes && ticket->bytes < orig_bytes) {
                u64 num_bytes = orig_bytes - ticket->bytes;
-               space_info->bytes_may_use -= num_bytes;
+               update_bytes_may_use(space_info, -num_bytes);
                trace_btrfs_space_reservation(fs_info, "space_info",
                                              space_info->flags, num_bytes, 0);
        }
@@ -5154,13 +5169,13 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
         * If not things get more complicated.
         */
        if (used + orig_bytes <= space_info->total_bytes) {
-               space_info->bytes_may_use += orig_bytes;
+               update_bytes_may_use(space_info, orig_bytes);
                trace_btrfs_space_reservation(fs_info, "space_info",
                                              space_info->flags, orig_bytes, 1);
                ret = 0;
        } else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
                                  system_chunk)) {
-               space_info->bytes_may_use += orig_bytes;
+               update_bytes_may_use(space_info, orig_bytes);
                trace_btrfs_space_reservation(fs_info, "space_info",
                                              space_info->flags, orig_bytes, 1);
                ret = 0;
@@ -5223,7 +5238,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
        if (ticket.bytes) {
                if (ticket.bytes < orig_bytes) {
                        u64 num_bytes = orig_bytes - ticket.bytes;
-                       space_info->bytes_may_use -= num_bytes;
+                       update_bytes_may_use(space_info, -num_bytes);
                        trace_btrfs_space_reservation(fs_info, "space_info",
                                                      space_info->flags,
                                                      num_bytes, 0);
@@ -5407,7 +5422,7 @@ again:
                flush = BTRFS_RESERVE_FLUSH_ALL;
                goto again;
        }
-       space_info->bytes_may_use -= num_bytes;
+       update_bytes_may_use(space_info, -num_bytes);
        trace_btrfs_space_reservation(fs_info, "space_info",
                                      space_info->flags, num_bytes, 0);
        spin_unlock(&space_info->lock);
@@ -5435,7 +5450,7 @@ again:
                                                      ticket->bytes, 1);
                        list_del_init(&ticket->list);
                        num_bytes -= ticket->bytes;
-                       space_info->bytes_may_use += ticket->bytes;
+                       update_bytes_may_use(space_info, ticket->bytes);
                        ticket->bytes = 0;
                        space_info->tickets_id++;
                        wake_up(&ticket->wait);
@@ -5443,7 +5458,7 @@ again:
                        trace_btrfs_space_reservation(fs_info, "space_info",
                                                      space_info->flags,
                                                      num_bytes, 1);
-                       space_info->bytes_may_use += num_bytes;
+                       update_bytes_may_use(space_info, num_bytes);
                        ticket->bytes -= num_bytes;
                        num_bytes = 0;
                }
@@ -5750,14 +5765,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
                        num_bytes = min(num_bytes,
                                        block_rsv->size - block_rsv->reserved);
                        block_rsv->reserved += num_bytes;
-                       sinfo->bytes_may_use += num_bytes;
+                       update_bytes_may_use(sinfo, num_bytes);
                        trace_btrfs_space_reservation(fs_info, "space_info",
                                                      sinfo->flags, num_bytes,
                                                      1);
                }
        } else if (block_rsv->reserved > block_rsv->size) {
                num_bytes = block_rsv->reserved - block_rsv->size;
-               sinfo->bytes_may_use -= num_bytes;
+               update_bytes_may_use(sinfo, -num_bytes);
                trace_btrfs_space_reservation(fs_info, "space_info",
                                      sinfo->flags, num_bytes, 0);
                block_rsv->reserved = block_rsv->size;
@@ -6151,7 +6166,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                        old_val -= num_bytes;
                        btrfs_set_block_group_used(&cache->item, old_val);
                        cache->pinned += num_bytes;
-                       cache->space_info->bytes_pinned += num_bytes;
+                       update_bytes_pinned(cache->space_info, num_bytes);
                        cache->space_info->bytes_used -= num_bytes;
                        cache->space_info->disk_used -= num_bytes * factor;
                        spin_unlock(&cache->lock);
@@ -6222,7 +6237,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info,
        spin_lock(&cache->space_info->lock);
        spin_lock(&cache->lock);
        cache->pinned += num_bytes;
-       cache->space_info->bytes_pinned += num_bytes;
+       update_bytes_pinned(cache->space_info, num_bytes);
        if (reserved) {
                cache->reserved -= num_bytes;
                cache->space_info->bytes_reserved -= num_bytes;
@@ -6431,7 +6446,7 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
        } else {
                cache->reserved += num_bytes;
                space_info->bytes_reserved += num_bytes;
-               space_info->bytes_may_use -= ram_bytes;
+               update_bytes_may_use(space_info, -ram_bytes);
                if (delalloc)
                        cache->delalloc_bytes += num_bytes;
        }
@@ -6587,7 +6602,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
                spin_lock(&space_info->lock);
                spin_lock(&cache->lock);
                cache->pinned -= len;
-               space_info->bytes_pinned -= len;
+               update_bytes_pinned(space_info, -len);
 
                trace_btrfs_space_reservation(fs_info, "pinned",
                                              space_info->flags, len, 0);
@@ -6608,7 +6623,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
                                to_add = min(len, global_rsv->size -
                                             global_rsv->reserved);
                                global_rsv->reserved += to_add;
-                               space_info->bytes_may_use += to_add;
+                               update_bytes_may_use(space_info, to_add);
                                if (global_rsv->reserved >= global_rsv->size)
                                        global_rsv->full = 1;
                                trace_btrfs_space_reservation(fs_info,
@@ -6647,9 +6662,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
                unpin = &fs_info->freed_extents[0];
 
        while (!trans->aborted) {
+               struct extent_state *cached_state = NULL;
+
                mutex_lock(&fs_info->unused_bg_unpin_mutex);
                ret = find_first_extent_bit(unpin, 0, &start, &end,
-                                           EXTENT_DIRTY, NULL);
+                                           EXTENT_DIRTY, &cached_state);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                        break;
@@ -6659,9 +6676,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
                        ret = btrfs_discard_extent(fs_info, start,
                                                   end + 1 - start, NULL);
 
-               clear_extent_dirty(unpin, start, end);
+               clear_extent_dirty(unpin, start, end, &cached_state);
                unpin_extent_range(fs_info, start, end, true);
                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+               free_extent_state(cached_state);
                cond_resched();
        }
 
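The hunk above is the core of this patch: find_first_extent_bit() now hands back
the extent_state record it located, and clear_extent_dirty() reuses it rather
than searching the io tree from the root a second time. A condensed sketch of
the pattern, with the loop, locking and discard path trimmed; the wrapper name
unpin_one_range() is made up for illustration:

    static void unpin_one_range(struct btrfs_fs_info *fs_info,
                                struct extent_io_tree *unpin)
    {
            struct extent_state *cached_state = NULL;
            u64 start, end;

            /* The lookup caches a reference to the extent_state it found... */
            if (find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY,
                                      &cached_state))
                    return;

            /*
             * ...so clearing the dirty bit can start from that record instead
             * of walking the io tree again for the same range.
             */
            clear_extent_dirty(unpin, start, end, &cached_state);
            unpin_extent_range(fs_info, start, end, true);

            /* Drop the reference taken when the state was cached. */
            free_extent_state(cached_state);
    }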
@@ -7238,6 +7256,345 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache,
        btrfs_put_block_group(cache);
 }
 
+/*
+ * Structure used internally by find_free_extent().  Wraps the needed
+ * parameters.
+ */
+struct find_free_extent_ctl {
+       /* Basic allocation info */
+       u64 ram_bytes;
+       u64 num_bytes;
+       u64 empty_size;
+       u64 flags;
+       int delalloc;
+
+       /* Where to start the search inside the bg */
+       u64 search_start;
+
+       /* For clustered allocation */
+       u64 empty_cluster;
+
+       bool have_caching_bg;
+       bool orig_have_caching_bg;
+
+       /* RAID index, converted from flags */
+       int index;
+
+       /*
+        * Current loop number, check find_free_extent_update_loop() for details
+        */
+       int loop;
+
+       /*
+        * Whether we're refilling a cluster; if true we need to re-search the
+        * current block group but don't try to refill the cluster again.
+        */
+       bool retry_clustered;
+
+       /*
+        * Whether we're updating the free space cache; if true we need to
+        * re-search the current block group but don't try updating it again.
+        */
+       bool retry_unclustered;
+
+       /* If current block group is cached */
+       int cached;
+
+       /* Max contiguous hole found */
+       u64 max_extent_size;
+
+       /* Total free space from free space cache, not always contiguous */
+       u64 total_free_space;
+
+       /* Found result */
+       u64 found_offset;
+};
+
+
+/*
+ * Helper function for find_free_extent().
+ *
+ * Return -ENOENT to inform the caller to fall back to unclustered mode.
+ * Return -EAGAIN to inform the caller to re-search this block group.
+ * Return >0 to inform the caller that we found nothing.
+ * Return 0 when we have found a location and set ffe_ctl->found_offset.
+ */
+static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
+               struct btrfs_free_cluster *last_ptr,
+               struct find_free_extent_ctl *ffe_ctl,
+               struct btrfs_block_group_cache **cluster_bg_ret)
+{
+       struct btrfs_fs_info *fs_info = bg->fs_info;
+       struct btrfs_block_group_cache *cluster_bg;
+       u64 aligned_cluster;
+       u64 offset;
+       int ret;
+
+       cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
+       if (!cluster_bg)
+               goto refill_cluster;
+       if (cluster_bg != bg && (cluster_bg->ro ||
+           !block_group_bits(cluster_bg, ffe_ctl->flags)))
+               goto release_cluster;
+
+       offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
+                       ffe_ctl->num_bytes, cluster_bg->key.objectid,
+                       &ffe_ctl->max_extent_size);
+       if (offset) {
+               /* We have a block, we're done */
+               spin_unlock(&last_ptr->refill_lock);
+               trace_btrfs_reserve_extent_cluster(cluster_bg,
+                               ffe_ctl->search_start, ffe_ctl->num_bytes);
+               *cluster_bg_ret = cluster_bg;
+               ffe_ctl->found_offset = offset;
+               return 0;
+       }
+       WARN_ON(last_ptr->block_group != cluster_bg);
+
+release_cluster:
+       /*
+        * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
+        * let's just skip it and let the allocator find whatever block it can
+        * find. If we reach this point, we will have tried the cluster
+        * allocator plenty of times and not have found anything, so we are
+        * likely way too fragmented for the clustering stuff to find anything.
+        *
+        * However, if the cluster is taken from the current block group,
+        * release the cluster first, so that we stand a better chance of
+        * succeeding in the unclustered allocation.
+        */
+       if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
+               spin_unlock(&last_ptr->refill_lock);
+               btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
+               return -ENOENT;
+       }
+
+       /* This cluster didn't work out, free it and start over */
+       btrfs_return_cluster_to_free_space(NULL, last_ptr);
+
+       if (cluster_bg != bg)
+               btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
+
+refill_cluster:
+       if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
+               spin_unlock(&last_ptr->refill_lock);
+               return -ENOENT;
+       }
+
+       aligned_cluster = max_t(u64,
+                       ffe_ctl->empty_cluster + ffe_ctl->empty_size,
+                       bg->full_stripe_len);
+       ret = btrfs_find_space_cluster(fs_info, bg, last_ptr,
+                       ffe_ctl->search_start, ffe_ctl->num_bytes,
+                       aligned_cluster);
+       if (ret == 0) {
+               /* Now pull our allocation out of this cluster */
+               offset = btrfs_alloc_from_cluster(bg, last_ptr,
+                               ffe_ctl->num_bytes, ffe_ctl->search_start,
+                               &ffe_ctl->max_extent_size);
+               if (offset) {
+                       /* We found one, proceed */
+                       spin_unlock(&last_ptr->refill_lock);
+                       trace_btrfs_reserve_extent_cluster(bg,
+                                       ffe_ctl->search_start,
+                                       ffe_ctl->num_bytes);
+                       ffe_ctl->found_offset = offset;
+                       return 0;
+               }
+       } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
+                  !ffe_ctl->retry_clustered) {
+               spin_unlock(&last_ptr->refill_lock);
+
+               ffe_ctl->retry_clustered = true;
+               wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+                               ffe_ctl->empty_cluster + ffe_ctl->empty_size);
+               return -EAGAIN;
+       }
+       /*
+        * At this point we either didn't find a cluster or we weren't able to
+        * allocate a block from our cluster.  Free the cluster we've been
+        * trying to use, and go to the next block group.
+        */
+       btrfs_return_cluster_to_free_space(NULL, last_ptr);
+       spin_unlock(&last_ptr->refill_lock);
+       return 1;
+}
+
+/*
+ * Return >0 to inform the caller that we found nothing.
+ * Return 0 when we found a free extent and set ffe_ctl->found_offset.
+ * Return -EAGAIN to inform the caller to re-search this block group.
+ */
+static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
+               struct btrfs_free_cluster *last_ptr,
+               struct find_free_extent_ctl *ffe_ctl)
+{
+       u64 offset;
+
+       /*
+        * We are doing an unclustered allocation, set the fragmented flag so
+        * we don't bother trying to setup a cluster again until we get more
+        * space.
+        */
+       if (unlikely(last_ptr)) {
+               spin_lock(&last_ptr->lock);
+               last_ptr->fragmented = 1;
+               spin_unlock(&last_ptr->lock);
+       }
+       if (ffe_ctl->cached) {
+               struct btrfs_free_space_ctl *free_space_ctl;
+
+               free_space_ctl = bg->free_space_ctl;
+               spin_lock(&free_space_ctl->tree_lock);
+               if (free_space_ctl->free_space <
+                   ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
+                   ffe_ctl->empty_size) {
+                       ffe_ctl->total_free_space = max_t(u64,
+                                       ffe_ctl->total_free_space,
+                                       free_space_ctl->free_space);
+                       spin_unlock(&free_space_ctl->tree_lock);
+                       return 1;
+               }
+               spin_unlock(&free_space_ctl->tree_lock);
+       }
+
+       offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
+                       ffe_ctl->num_bytes, ffe_ctl->empty_size,
+                       &ffe_ctl->max_extent_size);
+
+       /*
+        * If we didn't find a chunk, and we haven't failed on this block group
+        * before, and this block group is in the middle of caching and we are
+        * ok with waiting, then go ahead and wait for progress to be made, and
+        * set @retry_unclustered to true.
+        *
+        * If @retry_unclustered is true then we've already waited on this
+        * block group once and should move on to the next block group.
+        */
+       if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
+           ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
+               wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+                                               ffe_ctl->empty_size);
+               ffe_ctl->retry_unclustered = true;
+               return -EAGAIN;
+       } else if (!offset) {
+               return 1;
+       }
+       ffe_ctl->found_offset = offset;
+       return 0;
+}
+
+/*
+ * Return >0 means the caller needs to re-search for a free extent.
+ * Return 0 means we have the needed free extent.
+ * Return <0 means we failed to locate any free extent.
+ */
+static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
+                                       struct btrfs_free_cluster *last_ptr,
+                                       struct btrfs_key *ins,
+                                       struct find_free_extent_ctl *ffe_ctl,
+                                       int full_search, bool use_cluster)
+{
+       struct btrfs_root *root = fs_info->extent_root;
+       int ret;
+
+       if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
+           ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
+               ffe_ctl->orig_have_caching_bg = true;
+
+       if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
+           ffe_ctl->have_caching_bg)
+               return 1;
+
+       if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
+               return 1;
+
+       if (ins->objectid) {
+               if (!use_cluster && last_ptr) {
+                       spin_lock(&last_ptr->lock);
+                       last_ptr->window_start = ins->objectid;
+                       spin_unlock(&last_ptr->lock);
+               }
+               return 0;
+       }
+
+       /*
+        * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
+        *                      caching kthreads as we move along
+        * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+        * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
+        * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
+        *                     again
+        */
+       if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
+               ffe_ctl->index = 0;
+               if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
+                       /*
+                        * We want to skip the LOOP_CACHING_WAIT step if we
+                        * don't have any uncached bgs and we've already done a
+                        * full search through.
+                        */
+                       if (ffe_ctl->orig_have_caching_bg || !full_search)
+                               ffe_ctl->loop = LOOP_CACHING_WAIT;
+                       else
+                               ffe_ctl->loop = LOOP_ALLOC_CHUNK;
+               } else {
+                       ffe_ctl->loop++;
+               }
+
+               if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
+                       struct btrfs_trans_handle *trans;
+                       int exist = 0;
+
+                       trans = current->journal_info;
+                       if (trans)
+                               exist = 1;
+                       else
+                               trans = btrfs_join_transaction(root);
+
+                       if (IS_ERR(trans)) {
+                               ret = PTR_ERR(trans);
+                               return ret;
+                       }
+
+                       ret = do_chunk_alloc(trans, ffe_ctl->flags,
+                                            CHUNK_ALLOC_FORCE);
+
+                       /*
+                        * If we can't allocate a new chunk we've already looped
+                        * through at least once, move on to the NO_EMPTY_SIZE
+                        * case.
+                        */
+                       if (ret == -ENOSPC)
+                               ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
+
+                       /* Do not bail out on ENOSPC since we can do more. */
+                       if (ret < 0 && ret != -ENOSPC)
+                               btrfs_abort_transaction(trans, ret);
+                       else
+                               ret = 0;
+                       if (!exist)
+                               btrfs_end_transaction(trans);
+                       if (ret)
+                               return ret;
+               }
+
+               if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
+                       /*
+                        * Don't loop again if we already have no empty_size and
+                        * no empty_cluster.
+                        */
+                       if (ffe_ctl->empty_size == 0 &&
+                           ffe_ctl->empty_cluster == 0)
+                               return -ENOSPC;
+                       ffe_ctl->empty_size = 0;
+                       ffe_ctl->empty_cluster = 0;
+               }
+               return 1;
+       }
+       return -ENOSPC;
+}
+
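The three helpers above encode a small return-value protocol that the rewritten
find_free_extent() further down dispatches on. The following toy user-space
model shows only that dispatch; the block-group behaviour, the struct toy_bg
type and the result values are fabricated for illustration (see the real loop
later in this diff for the kernel version):

    #include <stdio.h>
    #include <errno.h>

    /* Results a fake block group will report, fabricated for this demo. */
    struct toy_bg {
            int clustered_ret;       /* first clustered attempt */
            int clustered_retry_ret; /* clustered attempt after -EAGAIN */
            int unclustered_ret;
    };

    int main(void)
    {
            /*
             * bg0 has nothing to offer; bg1 asks for one re-search of the
             * same group (-EAGAIN), then falls back to unclustered mode
             * (-ENOENT), which succeeds.
             */
            static const struct toy_bg bgs[] = {
                    { 1,       1,       1 },
                    { -EAGAIN, -ENOENT, 0 },
            };
            int i, ret, retried, found = -1;

            for (i = 0; i < 2 && found < 0; i++) {
                    retried = 0;
    have_block_group:
                    ret = retried ? bgs[i].clustered_retry_ret
                                  : bgs[i].clustered_ret;
                    if (ret == -EAGAIN) {   /* re-search this block group */
                            retried = 1;
                            goto have_block_group;
                    }
                    if (ret > 0)            /* nothing here, next block group */
                            continue;
                    if (ret == -ENOENT)     /* fall back to unclustered mode */
                            ret = bgs[i].unclustered_ret;
                    if (ret == 0)           /* "checks:" -- take the extent */
                            found = i;
            }
            printf("allocated from block group %d\n", found);
            return 0;
    }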
 /*
  * walks the btree of allocated extents and find a hole of a given size.
  * The key ins is changed to record the hole:
@@ -7248,6 +7605,20 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache,
  *
  * If there is no suitable free space, we will record the max size of
  * the free space extent currently.
+ *
+ * The overall logic and call chain:
+ *
+ * find_free_extent()
+ * |- Iterate through all block groups
+ * |  |- Get a valid block group
+ * |  |- Try to do clustered allocation in that block group
+ * |  |- Try to do unclustered allocation in that block group
+ * |  |- Check if the result is valid
+ * |  |  |- If valid, then exit
+ * |  |- Jump to next block group
+ * |
+ * |- Push harder to find free extents
+ *    |- If not found, re-iterate all block groups
  */
 static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
                                u64 ram_bytes, u64 num_bytes, u64 empty_size,
@@ -7255,24 +7626,28 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
                                u64 flags, int delalloc)
 {
        int ret = 0;
-       struct btrfs_root *root = fs_info->extent_root;
        struct btrfs_free_cluster *last_ptr = NULL;
        struct btrfs_block_group_cache *block_group = NULL;
-       u64 search_start = 0;
-       u64 max_extent_size = 0;
-       u64 max_free_space = 0;
-       u64 empty_cluster = 0;
+       struct find_free_extent_ctl ffe_ctl = {0};
        struct btrfs_space_info *space_info;
-       int loop = 0;
-       int index = btrfs_bg_flags_to_raid_index(flags);
-       bool failed_cluster_refill = false;
-       bool failed_alloc = false;
        bool use_cluster = true;
-       bool have_caching_bg = false;
-       bool orig_have_caching_bg = false;
        bool full_search = false;
 
        WARN_ON(num_bytes < fs_info->sectorsize);
+
+       ffe_ctl.ram_bytes = ram_bytes;
+       ffe_ctl.num_bytes = num_bytes;
+       ffe_ctl.empty_size = empty_size;
+       ffe_ctl.flags = flags;
+       ffe_ctl.search_start = 0;
+       ffe_ctl.retry_clustered = false;
+       ffe_ctl.retry_unclustered = false;
+       ffe_ctl.delalloc = delalloc;
+       ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
+       ffe_ctl.have_caching_bg = false;
+       ffe_ctl.orig_have_caching_bg = false;
+       ffe_ctl.found_offset = 0;
+
        ins->type = BTRFS_EXTENT_ITEM_KEY;
        ins->objectid = 0;
        ins->offset = 0;
@@ -7308,7 +7683,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
                spin_unlock(&space_info->lock);
        }
 
-       last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
+       last_ptr = fetch_cluster_info(fs_info, space_info,
+                                     &ffe_ctl.empty_cluster);
        if (last_ptr) {
                spin_lock(&last_ptr->lock);
                if (last_ptr->block_group)
@@ -7325,10 +7701,12 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
                spin_unlock(&last_ptr->lock);
        }
 
-       search_start = max(search_start, first_logical_byte(fs_info, 0));
-       search_start = max(search_start, hint_byte);
-       if (search_start == hint_byte) {
-               block_group = btrfs_lookup_block_group(fs_info, search_start);
+       ffe_ctl.search_start = max(ffe_ctl.search_start,
+                                  first_logical_byte(fs_info, 0));
+       ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
+       if (ffe_ctl.search_start == hint_byte) {
+               block_group = btrfs_lookup_block_group(fs_info,
+                                                      ffe_ctl.search_start);
                /*
                 * we don't want to use the block group if it doesn't match our
                 * allocation bits, or if its not cached.
@@ -7350,7 +7728,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
                                btrfs_put_block_group(block_group);
                                up_read(&space_info->groups_sem);
                        } else {
-                               index = btrfs_bg_flags_to_raid_index(
+                               ffe_ctl.index = btrfs_bg_flags_to_raid_index(
                                                block_group->flags);
                                btrfs_lock_block_group(block_group, delalloc);
                                goto have_block_group;
@@ -7360,21 +7738,19 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
                }
        }
 search:
-       have_caching_bg = false;
-       if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
+       ffe_ctl.have_caching_bg = false;
+       if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
+           ffe_ctl.index == 0)
                full_search = true;
        down_read(&space_info->groups_sem);
-       list_for_each_entry(block_group, &space_info->block_groups[index],
-                           list) {
-               u64 offset;
-               int cached;
-
+       list_for_each_entry(block_group,
+                           &space_info->block_groups[ffe_ctl.index], list) {
                /* If the block group is read-only, we can skip it entirely. */
                if (unlikely(block_group->ro))
                        continue;
 
                btrfs_grab_block_group(block_group, delalloc);
-               search_start = block_group->key.objectid;
+               ffe_ctl.search_start = block_group->key.objectid;
 
                /*
                 * this can happen if we end up cycling through all the
@@ -7398,9 +7774,9 @@ search:
                }
 
 have_block_group:
-               cached = block_group_cache_done(block_group);
-               if (unlikely(!cached)) {
-                       have_caching_bg = true;
+               ffe_ctl.cached = block_group_cache_done(block_group);
+               if (unlikely(!ffe_ctl.cached)) {
+                       ffe_ctl.have_caching_bg = true;
                        ret = cache_block_group(block_group, 0);
                        BUG_ON(ret < 0);
                        ret = 0;
@@ -7414,322 +7790,92 @@ have_block_group:
                 * lets look there
                 */
                if (last_ptr && use_cluster) {
-                       struct btrfs_block_group_cache *used_block_group;
-                       unsigned long aligned_cluster;
-                       /*
-                        * the refill lock keeps out other
-                        * people trying to start a new cluster
-                        */
-                       used_block_group = btrfs_lock_cluster(block_group,
-                                                             last_ptr,
-                                                             delalloc);
-                       if (!used_block_group)
-                               goto refill_cluster;
-
-                       if (used_block_group != block_group &&
-                           (used_block_group->ro ||
-                            !block_group_bits(used_block_group, flags)))
-                               goto release_cluster;
-
-                       offset = btrfs_alloc_from_cluster(used_block_group,
-                                               last_ptr,
-                                               num_bytes,
-                                               used_block_group->key.objectid,
-                                               &max_extent_size);
-                       if (offset) {
-                               /* we have a block, we're done */
-                               spin_unlock(&last_ptr->refill_lock);
-                               trace_btrfs_reserve_extent_cluster(
-                                               used_block_group,
-                                               search_start, num_bytes);
-                               if (used_block_group != block_group) {
-                                       btrfs_release_block_group(block_group,
-                                                                 delalloc);
-                                       block_group = used_block_group;
-                               }
-                               goto checks;
-                       }
-
-                       WARN_ON(last_ptr->block_group != used_block_group);
-release_cluster:
-                       /* If we are on LOOP_NO_EMPTY_SIZE, we can't
-                        * set up a new clusters, so lets just skip it
-                        * and let the allocator find whatever block
-                        * it can find.  If we reach this point, we
-                        * will have tried the cluster allocator
-                        * plenty of times and not have found
-                        * anything, so we are likely way too
-                        * fragmented for the clustering stuff to find
-                        * anything.
-                        *
-                        * However, if the cluster is taken from the
-                        * current block group, release the cluster
-                        * first, so that we stand a better chance of
-                        * succeeding in the unclustered
-                        * allocation.  */
-                       if (loop >= LOOP_NO_EMPTY_SIZE &&
-                           used_block_group != block_group) {
-                               spin_unlock(&last_ptr->refill_lock);
-                               btrfs_release_block_group(used_block_group,
-                                                         delalloc);
-                               goto unclustered_alloc;
-                       }
+                       struct btrfs_block_group_cache *cluster_bg = NULL;
 
-                       /*
-                        * this cluster didn't work out, free it and
-                        * start over
-                        */
-                       btrfs_return_cluster_to_free_space(NULL, last_ptr);
+                       ret = find_free_extent_clustered(block_group, last_ptr,
+                                                        &ffe_ctl, &cluster_bg);
 
-                       if (used_block_group != block_group)
-                               btrfs_release_block_group(used_block_group,
-                                                         delalloc);
-refill_cluster:
-                       if (loop >= LOOP_NO_EMPTY_SIZE) {
-                               spin_unlock(&last_ptr->refill_lock);
-                               goto unclustered_alloc;
-                       }
-
-                       aligned_cluster = max_t(unsigned long,
-                                               empty_cluster + empty_size,
-                                             block_group->full_stripe_len);
-
-                       /* allocate a cluster in this block group */
-                       ret = btrfs_find_space_cluster(fs_info, block_group,
-                                                      last_ptr, search_start,
-                                                      num_bytes,
-                                                      aligned_cluster);
                        if (ret == 0) {
-                               /*
-                                * now pull our allocation out of this
-                                * cluster
-                                */
-                               offset = btrfs_alloc_from_cluster(block_group,
-                                                       last_ptr,
-                                                       num_bytes,
-                                                       search_start,
-                                                       &max_extent_size);
-                               if (offset) {
-                                       /* we found one, proceed */
-                                       spin_unlock(&last_ptr->refill_lock);
-                                       trace_btrfs_reserve_extent_cluster(
-                                               block_group, search_start,
-                                               num_bytes);
-                                       goto checks;
+                               if (cluster_bg && cluster_bg != block_group) {
+                                       btrfs_release_block_group(block_group,
+                                                                 delalloc);
+                                       block_group = cluster_bg;
                                }
-                       } else if (!cached && loop > LOOP_CACHING_NOWAIT
-                                  && !failed_cluster_refill) {
-                               spin_unlock(&last_ptr->refill_lock);
-
-                               failed_cluster_refill = true;
-                               wait_block_group_cache_progress(block_group,
-                                      num_bytes + empty_cluster + empty_size);
+                               goto checks;
+                       } else if (ret == -EAGAIN) {
                                goto have_block_group;
-                       }
-
-                       /*
-                        * at this point we either didn't find a cluster
-                        * or we weren't able to allocate a block from our
-                        * cluster.  Free the cluster we've been trying
-                        * to use, and go to the next block group
-                        */
-                       btrfs_return_cluster_to_free_space(NULL, last_ptr);
-                       spin_unlock(&last_ptr->refill_lock);
-                       goto loop;
-               }
-
-unclustered_alloc:
-               /*
-                * We are doing an unclustered alloc, set the fragmented flag so
-                * we don't bother trying to setup a cluster again until we get
-                * more space.
-                */
-               if (unlikely(last_ptr)) {
-                       spin_lock(&last_ptr->lock);
-                       last_ptr->fragmented = 1;
-                       spin_unlock(&last_ptr->lock);
-               }
-               if (cached) {
-                       struct btrfs_free_space_ctl *ctl =
-                               block_group->free_space_ctl;
-
-                       spin_lock(&ctl->tree_lock);
-                       if (ctl->free_space <
-                           num_bytes + empty_cluster + empty_size) {
-                               max_free_space = max(max_free_space,
-                                                    ctl->free_space);
-                               spin_unlock(&ctl->tree_lock);
+                       } else if (ret > 0) {
                                goto loop;
                        }
-                       spin_unlock(&ctl->tree_lock);
+                       /* ret == -ENOENT case falls through */
                }
 
-               offset = btrfs_find_space_for_alloc(block_group, search_start,
-                                                   num_bytes, empty_size,
-                                                   &max_extent_size);
-               /*
-                * If we didn't find a chunk, and we haven't failed on this
-                * block group before, and this block group is in the middle of
-                * caching and we are ok with waiting, then go ahead and wait
-                * for progress to be made, and set failed_alloc to true.
-                *
-                * If failed_alloc is true then we've already waited on this
-                * block group once and should move on to the next block group.
-                */
-               if (!offset && !failed_alloc && !cached &&
-                   loop > LOOP_CACHING_NOWAIT) {
-                       wait_block_group_cache_progress(block_group,
-                                               num_bytes + empty_size);
-                       failed_alloc = true;
+               ret = find_free_extent_unclustered(block_group, last_ptr,
+                                                  &ffe_ctl);
+               if (ret == -EAGAIN)
                        goto have_block_group;
-               } else if (!offset) {
+               else if (ret > 0)
                        goto loop;
-               }
+               /* ret == 0 case falls through */
 checks:
-               search_start = round_up(offset, fs_info->stripesize);
+               ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
+                                            fs_info->stripesize);
 
                /* move on to the next group */
-               if (search_start + num_bytes >
+               if (ffe_ctl.search_start + num_bytes >
                    block_group->key.objectid + block_group->key.offset) {
-                       btrfs_add_free_space(block_group, offset, num_bytes);
+                       btrfs_add_free_space(block_group, ffe_ctl.found_offset,
+                                            num_bytes);
                        goto loop;
                }
 
-               if (offset < search_start)
-                       btrfs_add_free_space(block_group, offset,
-                                            search_start - offset);
+               if (ffe_ctl.found_offset < ffe_ctl.search_start)
+                       btrfs_add_free_space(block_group, ffe_ctl.found_offset,
+                               ffe_ctl.search_start - ffe_ctl.found_offset);
 
                ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
                                num_bytes, delalloc);
                if (ret == -EAGAIN) {
-                       btrfs_add_free_space(block_group, offset, num_bytes);
+                       btrfs_add_free_space(block_group, ffe_ctl.found_offset,
+                                            num_bytes);
                        goto loop;
                }
                btrfs_inc_block_group_reservations(block_group);
 
                /* we are all good, lets return */
-               ins->objectid = search_start;
+               ins->objectid = ffe_ctl.search_start;
                ins->offset = num_bytes;
 
-               trace_btrfs_reserve_extent(block_group, search_start, num_bytes);
+               trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
+                                          num_bytes);
                btrfs_release_block_group(block_group, delalloc);
                break;
 loop:
-               failed_cluster_refill = false;
-               failed_alloc = false;
+               ffe_ctl.retry_clustered = false;
+               ffe_ctl.retry_unclustered = false;
                BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
-                      index);
+                      ffe_ctl.index);
                btrfs_release_block_group(block_group, delalloc);
                cond_resched();
        }
        up_read(&space_info->groups_sem);
 
-       if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
-               && !orig_have_caching_bg)
-               orig_have_caching_bg = true;
-
-       if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
-               goto search;
-
-       if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
+       ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
+                                          full_search, use_cluster);
+       if (ret > 0)
                goto search;
 
-       /*
-        * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
-        *                      caching kthreads as we move along
-        * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
-        * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
-        * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
-        *                      again
-        */
-       if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
-               index = 0;
-               if (loop == LOOP_CACHING_NOWAIT) {
-                       /*
-                        * We want to skip the LOOP_CACHING_WAIT step if we
-                        * don't have any uncached bgs and we've already done a
-                        * full search through.
-                        */
-                       if (orig_have_caching_bg || !full_search)
-                               loop = LOOP_CACHING_WAIT;
-                       else
-                               loop = LOOP_ALLOC_CHUNK;
-               } else {
-                       loop++;
-               }
-
-               if (loop == LOOP_ALLOC_CHUNK) {
-                       struct btrfs_trans_handle *trans;
-                       int exist = 0;
-
-                       trans = current->journal_info;
-                       if (trans)
-                               exist = 1;
-                       else
-                               trans = btrfs_join_transaction(root);
-
-                       if (IS_ERR(trans)) {
-                               ret = PTR_ERR(trans);
-                               goto out;
-                       }
-
-                       ret = do_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE);
-
-                       /*
-                        * If we can't allocate a new chunk we've already looped
-                        * through at least once, move on to the NO_EMPTY_SIZE
-                        * case.
-                        */
-                       if (ret == -ENOSPC)
-                               loop = LOOP_NO_EMPTY_SIZE;
-
-                       /*
-                        * Do not bail out on ENOSPC since we
-                        * can do more things.
-                        */
-                       if (ret < 0 && ret != -ENOSPC)
-                               btrfs_abort_transaction(trans, ret);
-                       else
-                               ret = 0;
-                       if (!exist)
-                               btrfs_end_transaction(trans);
-                       if (ret)
-                               goto out;
-               }
-
-               if (loop == LOOP_NO_EMPTY_SIZE) {
-                       /*
-                        * Don't loop again if we already have no empty_size and
-                        * no empty_cluster.
-                        */
-                       if (empty_size == 0 &&
-                           empty_cluster == 0) {
-                               ret = -ENOSPC;
-                               goto out;
-                       }
-                       empty_size = 0;
-                       empty_cluster = 0;
-               }
-
-               goto search;
-       } else if (!ins->objectid) {
-               ret = -ENOSPC;
-       } else if (ins->objectid) {
-               if (!use_cluster && last_ptr) {
-                       spin_lock(&last_ptr->lock);
-                       last_ptr->window_start = ins->objectid;
-                       spin_unlock(&last_ptr->lock);
-               }
-               ret = 0;
-       }
-out:
        if (ret == -ENOSPC) {
-               if (!max_extent_size)
-                       max_extent_size = max_free_space;
+               /*
+                * Use ffe_ctl->total_free_space as fallback if we can't find
+                * any contiguous hole.
+                */
+               if (!ffe_ctl.max_extent_size)
+                       ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
                spin_lock(&space_info->lock);
-               space_info->max_extent_size = max_extent_size;
+               space_info->max_extent_size = ffe_ctl.max_extent_size;
                spin_unlock(&space_info->lock);
-               ins->offset = max_extent_size;
+               ins->offset = ffe_ctl.max_extent_size;
        }
        return ret;
 }
@@ -8169,7 +8315,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        btrfs_set_header_generation(buf, trans->transid);
        btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
        btrfs_set_header_owner(buf, owner);
-       write_extent_buffer_fsid(buf, fs_info->fsid);
+       write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
        write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
                buf->log_index = root->log_transid % 2;
@@ -10698,7 +10844,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                spin_lock(&space_info->lock);
                spin_lock(&block_group->lock);
 
-               space_info->bytes_pinned -= block_group->pinned;
+               update_bytes_pinned(space_info, -block_group->pinned);
                space_info->bytes_readonly += block_group->pinned;
                percpu_counter_add_batch(&space_info->total_bytes_pinned,
                                   -block_group->pinned,