git.proxmox.com Git - mirror_ubuntu-hirsute-kernel.git / fs/btrfs/extent-tree.c (blame view)
btrfs: migrate the block-rsv code to block-rsv.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"

#undef SCRAMBLE_DELAYED_REFS

static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED ||
		cache->cached == BTRFS_CACHE_ERROR;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count)) {
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);

		/*
		 * If this is not empty, someone is still holding the mutex of
		 * full_stripe_lock, which can only be released by that caller.
		 * Freeing the cache now will cause a use-after-free when the
		 * caller tries to release the full stripe lock.
		 *
		 * There is no better way to resolve this, so just warn.
		 */
		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
		kfree(cache->free_space_ctl);
		kfree(cache);
	}
}

/*
 * This adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);

	if (info->first_logical_byte > block_group->key.objectid)
		info->first_logical_byte = block_group->key.objectid;

	spin_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
			      int contains)
{
	struct btrfs_block_group_cache *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group_cache,
				 cache_node);
		end = cache->key.objectid + cache->key.offset - 1;
		start = cache->key.objectid;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->key.objectid))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret) {
		btrfs_get_block_group(ret);
		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
			info->first_logical_byte = ret->key.objectid;
	}
	spin_unlock(&info->block_group_cache_lock);

	return ret;
}
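/*
 * Example, assuming a block group whose key is (objectid = 1M, offset = 256M):
 * a search with contains == 0 and bytenr == 0 returns this group (the first
 * group at or after bytenr); a search with contains == 1 and bytenr == 1M + 4K
 * returns it as well, since bytenr falls inside [objectid, objectid + offset);
 * and a search with contains == 1 and bytenr == 1M + 256M returns NULL.
 * Whatever is returned carries an extra reference taken with
 * btrfs_get_block_group() that the caller must drop with
 * btrfs_put_block_group().
 */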

static int add_excluded_extent(struct btrfs_fs_info *fs_info,
			       u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE);
	set_extent_bits(&fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE);
	return 0;
}

static void free_excluded_extents(struct btrfs_block_group_cache *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 start, end;

	start = cache->key.objectid;
	end = start + cache->key.offset - 1;

	clear_extent_bits(&fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE);
	clear_extent_bits(&fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE);
}

static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
		cache->bytes_super += stripe_len;
		ret = add_excluded_extent(fs_info, cache->key.objectid,
					  stripe_len);
		if (ret)
			return ret;
	}

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(fs_info, cache->key.objectid,
				       bytenr, &logical, &nr, &stripe_len);
		if (ret)
			return ret;

		while (nr--) {
			u64 start, len;

			if (logical[nr] > cache->key.objectid +
			    cache->key.offset)
				continue;

			if (logical[nr] + stripe_len <= cache->key.objectid)
				continue;

			start = logical[nr];
			if (start < cache->key.objectid) {
				start = cache->key.objectid;
				len = (logical[nr] + stripe_len) - start;
			} else {
				len = min_t(u64, stripe_len,
					    cache->key.objectid +
					    cache->key.offset - start);
			}

			cache->bytes_super += len;
			ret = add_excluded_extent(fs_info, start, len);
			if (ret) {
				kfree(logical);
				return ret;
			}
		}

		kfree(logical);
	}
	return 0;
}
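/*
 * A worked example of the clamping above, assuming stripe_len == 4K and a
 * block group covering [16M, 16M + 1G): a super block copy whose stripe
 * starts at 16M - 1K overlaps the group, so start is clamped to 16M and len
 * becomes 3K; a copy starting at 64M lies entirely inside the group and
 * contributes the full 4K.  Every byte accounted here is added to
 * cache->bytes_super and marked EXTENT_UPTODATE in both freed_extents trees
 * by add_excluded_extent(), so it is never handed out as free space.
 */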

static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	refcount_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
	if (refcount_dec_and_test(&ctl->count))
		kfree(ctl);
}

#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group_cache *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 start = block_group->key.objectid;
	u64 len = block_group->key.offset;
	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
		fs_info->nodesize : fs_info->sectorsize;
	u64 step = chunk << 1;

	while (len > chunk) {
		btrfs_remove_free_space(block_group, start, chunk);
		start += step;
		if (len < step)
			len = 0;
		else
			len -= step;
	}
}
#endif
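/*
 * For example, with a 16K nodesize metadata block group, fragment_free_space()
 * above removes every other 16K chunk (chunk = 16K, step = 32K) from the free
 * space cache, leaving a deliberately fragmented layout for testing.  This is
 * debug-only code and is compiled in only with CONFIG_BTRFS_DEBUG.
 */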

/*
 * This is only called by cache_block_group().  Since we could have freed
 * extents, we need to check the pinned_extents for any extents that can't
 * be used yet, because their free space will be released as soon as the
 * transaction commits.
 */
u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
		       u64 start, u64 end)
{
	struct btrfs_fs_info *info = block_group->fs_info;
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(info->pinned_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE,
					    NULL);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space(block_group, start,
						   size);
			BUG_ON(ret); /* -ENOMEM or logic error */
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space(block_group, start, size);
		BUG_ON(ret); /* -ENOMEM or logic error */
	}

	return total_added;
}
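/*
 * Rough example: caching the range [0, 100M) of a block group while
 * [30M, 40M) is marked in pinned_extents.  The loop above adds [0, 30M)
 * as free space, skips over the pinned range, and the final
 * "if (start < end)" branch adds [40M, 100M); total_added, the return
 * value, is then 90M.
 */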
348
73fa48b6 349static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
e37c9e69 350{
0b246afa
JM
351 struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
352 struct btrfs_fs_info *fs_info = block_group->fs_info;
353 struct btrfs_root *extent_root = fs_info->extent_root;
e37c9e69 354 struct btrfs_path *path;
5f39d397 355 struct extent_buffer *leaf;
11833d66 356 struct btrfs_key key;
817d52f8 357 u64 total_found = 0;
11833d66
YZ
358 u64 last = 0;
359 u32 nritems;
73fa48b6 360 int ret;
d0bd4560 361 bool wakeup = true;
f510cfec 362
e37c9e69
CM
363 path = btrfs_alloc_path();
364 if (!path)
73fa48b6 365 return -ENOMEM;
7d7d6068 366
817d52f8 367 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
11833d66 368
d0bd4560
JB
369#ifdef CONFIG_BTRFS_DEBUG
370 /*
371 * If we're fragmenting we don't want to make anybody think we can
372 * allocate from this block group until we've had a chance to fragment
373 * the free space.
374 */
2ff7e61e 375 if (btrfs_should_fragment_free_space(block_group))
d0bd4560
JB
376 wakeup = false;
377#endif
	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.  So we skip locking and search the commit
	 * root, since it's read-only.
	 */
384 path->skip_locking = 1;
817d52f8 385 path->search_commit_root = 1;
e4058b54 386 path->reada = READA_FORWARD;
817d52f8 387
e4404d6e 388 key.objectid = last;
e37c9e69 389 key.offset = 0;
11833d66 390 key.type = BTRFS_EXTENT_ITEM_KEY;
013f1b12 391
52ee28d2 392next:
11833d66 393 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
e37c9e69 394 if (ret < 0)
73fa48b6 395 goto out;
a512bbf8 396
11833d66
YZ
397 leaf = path->nodes[0];
398 nritems = btrfs_header_nritems(leaf);
399
d397712b 400 while (1) {
7841cb28 401 if (btrfs_fs_closing(fs_info) > 1) {
f25784b3 402 last = (u64)-1;
817d52f8 403 break;
f25784b3 404 }
817d52f8 405
11833d66
YZ
406 if (path->slots[0] < nritems) {
407 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
408 } else {
409 ret = find_next_key(path, 0, &key);
410 if (ret)
e37c9e69 411 break;
817d52f8 412
c9ea7b24 413 if (need_resched() ||
9e351cc8 414 rwsem_is_contended(&fs_info->commit_root_sem)) {
d0bd4560
JB
415 if (wakeup)
416 caching_ctl->progress = last;
ff5714cc 417 btrfs_release_path(path);
9e351cc8 418 up_read(&fs_info->commit_root_sem);
589d8ade 419 mutex_unlock(&caching_ctl->mutex);
11833d66 420 cond_resched();
73fa48b6
OS
421 mutex_lock(&caching_ctl->mutex);
422 down_read(&fs_info->commit_root_sem);
423 goto next;
589d8ade 424 }
0a3896d0
JB
425
426 ret = btrfs_next_leaf(extent_root, path);
427 if (ret < 0)
73fa48b6 428 goto out;
0a3896d0
JB
429 if (ret)
430 break;
589d8ade
JB
431 leaf = path->nodes[0];
432 nritems = btrfs_header_nritems(leaf);
433 continue;
11833d66 434 }
817d52f8 435
52ee28d2
LB
436 if (key.objectid < last) {
437 key.objectid = last;
438 key.offset = 0;
439 key.type = BTRFS_EXTENT_ITEM_KEY;
440
d0bd4560
JB
441 if (wakeup)
442 caching_ctl->progress = last;
52ee28d2
LB
443 btrfs_release_path(path);
444 goto next;
445 }
446
11833d66
YZ
447 if (key.objectid < block_group->key.objectid) {
448 path->slots[0]++;
817d52f8 449 continue;
e37c9e69 450 }
0f9dd46c 451
e37c9e69 452 if (key.objectid >= block_group->key.objectid +
0f9dd46c 453 block_group->key.offset)
e37c9e69 454 break;
7d7d6068 455
3173a18f
JB
456 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
457 key.type == BTRFS_METADATA_ITEM_KEY) {
4457c1c7 458 total_found += add_new_free_space(block_group, last,
817d52f8 459 key.objectid);
3173a18f
JB
460 if (key.type == BTRFS_METADATA_ITEM_KEY)
461 last = key.objectid +
da17066c 462 fs_info->nodesize;
3173a18f
JB
463 else
464 last = key.objectid + key.offset;
817d52f8 465
73fa48b6 466 if (total_found > CACHING_CTL_WAKE_UP) {
11833d66 467 total_found = 0;
d0bd4560
JB
468 if (wakeup)
469 wake_up(&caching_ctl->wait);
11833d66 470 }
817d52f8 471 }
e37c9e69
CM
472 path->slots[0]++;
473 }
817d52f8 474 ret = 0;
e37c9e69 475
4457c1c7 476 total_found += add_new_free_space(block_group, last,
817d52f8
JB
477 block_group->key.objectid +
478 block_group->key.offset);
11833d66 479 caching_ctl->progress = (u64)-1;
817d52f8 480
73fa48b6
OS
481out:
482 btrfs_free_path(path);
483 return ret;
484}
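/*
 * load_extent_tree_free() above walks the commit root of the extent tree, so
 * it can run without tree locks while a transaction is in flight; every gap
 * between EXTENT_ITEM/METADATA_ITEM keys inside the block group is fed to
 * add_new_free_space().  Waiters on caching_ctl->wait are woken roughly every
 * CACHING_CTL_WAKE_UP bytes of discovered free space, so allocators do not
 * have to wait for the whole block group to finish caching.
 */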
485
486static noinline void caching_thread(struct btrfs_work *work)
487{
488 struct btrfs_block_group_cache *block_group;
489 struct btrfs_fs_info *fs_info;
490 struct btrfs_caching_control *caching_ctl;
491 int ret;
492
493 caching_ctl = container_of(work, struct btrfs_caching_control, work);
494 block_group = caching_ctl->block_group;
495 fs_info = block_group->fs_info;
496
497 mutex_lock(&caching_ctl->mutex);
498 down_read(&fs_info->commit_root_sem);
499
1e144fb8
OS
500 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
501 ret = load_free_space_tree(caching_ctl);
502 else
503 ret = load_extent_tree_free(caching_ctl);
73fa48b6 504
817d52f8 505 spin_lock(&block_group->lock);
11833d66 506 block_group->caching_ctl = NULL;
73fa48b6 507 block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
817d52f8 508 spin_unlock(&block_group->lock);
0f9dd46c 509
d0bd4560 510#ifdef CONFIG_BTRFS_DEBUG
2ff7e61e 511 if (btrfs_should_fragment_free_space(block_group)) {
d0bd4560
JB
512 u64 bytes_used;
513
514 spin_lock(&block_group->space_info->lock);
515 spin_lock(&block_group->lock);
516 bytes_used = block_group->key.offset -
517 btrfs_block_group_used(&block_group->item);
518 block_group->space_info->bytes_used += bytes_used >> 1;
519 spin_unlock(&block_group->lock);
520 spin_unlock(&block_group->space_info->lock);
2ff7e61e 521 fragment_free_space(block_group);
d0bd4560
JB
522 }
523#endif
524
525 caching_ctl->progress = (u64)-1;
11833d66 526
9e351cc8 527 up_read(&fs_info->commit_root_sem);
9e715da8 528 free_excluded_extents(block_group);
11833d66 529 mutex_unlock(&caching_ctl->mutex);
73fa48b6 530
11833d66
YZ
531 wake_up(&caching_ctl->wait);
532
533 put_caching_control(caching_ctl);
11dfe35a 534 btrfs_put_block_group(block_group);
817d52f8
JB
535}
536
9d66e233 537static int cache_block_group(struct btrfs_block_group_cache *cache,
9d66e233 538 int load_cache_only)
817d52f8 539{
291c7d2f 540 DEFINE_WAIT(wait);
11833d66
YZ
541 struct btrfs_fs_info *fs_info = cache->fs_info;
542 struct btrfs_caching_control *caching_ctl;
817d52f8
JB
543 int ret = 0;
544
291c7d2f 545 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
79787eaa
JM
546 if (!caching_ctl)
547 return -ENOMEM;
291c7d2f
JB
548
549 INIT_LIST_HEAD(&caching_ctl->list);
550 mutex_init(&caching_ctl->mutex);
551 init_waitqueue_head(&caching_ctl->wait);
552 caching_ctl->block_group = cache;
553 caching_ctl->progress = cache->key.objectid;
1e4f4714 554 refcount_set(&caching_ctl->count, 1);
9e0af237
LB
555 btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
556 caching_thread, NULL, NULL);
291c7d2f
JB
557
558 spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but it can happen when one thread
	 * starts to load the space cache info and some other thread then
	 * starts a transaction commit which tries to do an allocation while
	 * the first thread is still loading the space cache info.  The
	 * previous loop should have kept us from choosing this block group,
	 * but if we've moved to the state where we will wait on caching
	 * block groups, we need to first check if we're doing a fast load
	 * here, so we can wait for it to finish; otherwise we could end up
	 * allocating from a block group whose cache gets evicted for one
	 * reason or another.
	 */
571 while (cache->cached == BTRFS_CACHE_FAST) {
572 struct btrfs_caching_control *ctl;
573
574 ctl = cache->caching_ctl;
1e4f4714 575 refcount_inc(&ctl->count);
291c7d2f
JB
576 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
577 spin_unlock(&cache->lock);
578
579 schedule();
580
581 finish_wait(&ctl->wait, &wait);
582 put_caching_control(ctl);
583 spin_lock(&cache->lock);
584 }
585
586 if (cache->cached != BTRFS_CACHE_NO) {
587 spin_unlock(&cache->lock);
588 kfree(caching_ctl);
11833d66 589 return 0;
291c7d2f
JB
590 }
591 WARN_ON(cache->caching_ctl);
592 cache->caching_ctl = caching_ctl;
593 cache->cached = BTRFS_CACHE_FAST;
594 spin_unlock(&cache->lock);
11833d66 595
d8953d69 596 if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
cb83b7b8 597 mutex_lock(&caching_ctl->mutex);
bb6cb1c5 598 ret = load_free_space_cache(cache);
9d66e233
JB
599
600 spin_lock(&cache->lock);
601 if (ret == 1) {
291c7d2f 602 cache->caching_ctl = NULL;
9d66e233
JB
603 cache->cached = BTRFS_CACHE_FINISHED;
604 cache->last_byte_to_unpin = (u64)-1;
cb83b7b8 605 caching_ctl->progress = (u64)-1;
9d66e233 606 } else {
291c7d2f
JB
607 if (load_cache_only) {
608 cache->caching_ctl = NULL;
609 cache->cached = BTRFS_CACHE_NO;
610 } else {
611 cache->cached = BTRFS_CACHE_STARTED;
4f69cb98 612 cache->has_caching_ctl = 1;
291c7d2f 613 }
9d66e233
JB
614 }
615 spin_unlock(&cache->lock);
d0bd4560
JB
616#ifdef CONFIG_BTRFS_DEBUG
617 if (ret == 1 &&
2ff7e61e 618 btrfs_should_fragment_free_space(cache)) {
d0bd4560
JB
619 u64 bytes_used;
620
621 spin_lock(&cache->space_info->lock);
622 spin_lock(&cache->lock);
623 bytes_used = cache->key.offset -
624 btrfs_block_group_used(&cache->item);
625 cache->space_info->bytes_used += bytes_used >> 1;
626 spin_unlock(&cache->lock);
627 spin_unlock(&cache->space_info->lock);
2ff7e61e 628 fragment_free_space(cache);
d0bd4560
JB
629 }
630#endif
cb83b7b8
JB
631 mutex_unlock(&caching_ctl->mutex);
632
291c7d2f 633 wake_up(&caching_ctl->wait);
3c14874a 634 if (ret == 1) {
291c7d2f 635 put_caching_control(caching_ctl);
9e715da8 636 free_excluded_extents(cache);
9d66e233 637 return 0;
3c14874a 638 }
291c7d2f
JB
639 } else {
		/*
		 * We're either using the free space tree or no caching at all.
		 * Set cached to the appropriate value and wake up any waiters.
		 */
644 spin_lock(&cache->lock);
645 if (load_cache_only) {
646 cache->caching_ctl = NULL;
647 cache->cached = BTRFS_CACHE_NO;
648 } else {
649 cache->cached = BTRFS_CACHE_STARTED;
4f69cb98 650 cache->has_caching_ctl = 1;
291c7d2f
JB
651 }
652 spin_unlock(&cache->lock);
653 wake_up(&caching_ctl->wait);
9d66e233
JB
654 }
655
291c7d2f
JB
656 if (load_cache_only) {
657 put_caching_control(caching_ctl);
11833d66 658 return 0;
817d52f8 659 }
817d52f8 660
9e351cc8 661 down_write(&fs_info->commit_root_sem);
1e4f4714 662 refcount_inc(&caching_ctl->count);
11833d66 663 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
9e351cc8 664 up_write(&fs_info->commit_root_sem);
11833d66 665
11dfe35a 666 btrfs_get_block_group(cache);
11833d66 667
e66f0bb1 668 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
817d52f8 669
ef8bbdfe 670 return ret;
e37c9e69
CM
671}
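/*
 * To summarize the paths through cache_block_group() above: with SPACE_CACHE
 * enabled it first tries load_free_space_cache(); on success the group goes
 * straight to BTRFS_CACHE_FINISHED.  Otherwise (or when the free space tree
 * or no caching is used) the group is set to BTRFS_CACHE_STARTED and
 * caching_thread() is queued on fs_info->caching_workers to fill in the free
 * space information in the background, unless load_cache_only was set, in
 * which case the group simply stays at BTRFS_CACHE_NO.
 */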
672
0f9dd46c
JB
673/*
674 * return the block group that starts at or after bytenr
675 */
d397712b
CM
676static struct btrfs_block_group_cache *
677btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
0ef3e66b 678{
e2c89907 679 return block_group_cache_tree_search(info, bytenr, 0);
0ef3e66b
CM
680}
681
0f9dd46c 682/*
9f55684c 683 * return the block group that contains the given bytenr
0f9dd46c 684 */
d397712b
CM
685struct btrfs_block_group_cache *btrfs_lookup_block_group(
686 struct btrfs_fs_info *info,
687 u64 bytenr)
be744175 688{
e2c89907 689 return block_group_cache_tree_search(info, bytenr, 1);
be744175 690}
0b86a832 691
78192442 692static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
0d9f824d 693{
ddf30cf0
QW
694 if (ref->type == BTRFS_REF_METADATA) {
695 if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
78192442 696 return BTRFS_BLOCK_GROUP_SYSTEM;
0d9f824d 697 else
78192442 698 return BTRFS_BLOCK_GROUP_METADATA;
0d9f824d 699 }
78192442
QW
700 return BTRFS_BLOCK_GROUP_DATA;
701}
702
703static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
704 struct btrfs_ref *ref)
705{
706 struct btrfs_space_info *space_info;
707 u64 flags = generic_ref_to_space_flags(ref);
708
280c2908 709 space_info = btrfs_find_space_info(fs_info, flags);
78192442
QW
710 ASSERT(space_info);
711 percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
712 BTRFS_TOTAL_BYTES_PINNED_BATCH);
713}
714
715static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
716 struct btrfs_ref *ref)
717{
718 struct btrfs_space_info *space_info;
719 u64 flags = generic_ref_to_space_flags(ref);
0d9f824d 720
280c2908 721 space_info = btrfs_find_space_info(fs_info, flags);
55e8196a 722 ASSERT(space_info);
78192442 723 percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
dec59fa3 724 BTRFS_TOTAL_BYTES_PINNED_BATCH);
0d9f824d
OS
725}
726
1a4ed8fd 727/* simple helper to search for an existing data extent at a given offset */
2ff7e61e 728int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
e02119d5
CM
729{
730 int ret;
731 struct btrfs_key key;
31840ae1 732 struct btrfs_path *path;
e02119d5 733
31840ae1 734 path = btrfs_alloc_path();
d8926bb3
MF
735 if (!path)
736 return -ENOMEM;
737
e02119d5
CM
738 key.objectid = start;
739 key.offset = len;
3173a18f 740 key.type = BTRFS_EXTENT_ITEM_KEY;
0b246afa 741 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
31840ae1 742 btrfs_free_path(path);
7bb86316
CM
743 return ret;
744}
745
/*
 * Helper function to look up the reference count and flags of a tree block.
 *
 * The head node for a delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree.  The head node may
 * also store the extent flags to set.  This way you can check what the
 * reference count and extent flags will be once all of the queued delayed
 * refs have been processed, without actually running them.
 */
755int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
2ff7e61e 756 struct btrfs_fs_info *fs_info, u64 bytenr,
3173a18f 757 u64 offset, int metadata, u64 *refs, u64 *flags)
a22285a6
YZ
758{
759 struct btrfs_delayed_ref_head *head;
760 struct btrfs_delayed_ref_root *delayed_refs;
761 struct btrfs_path *path;
762 struct btrfs_extent_item *ei;
763 struct extent_buffer *leaf;
764 struct btrfs_key key;
765 u32 item_size;
766 u64 num_refs;
767 u64 extent_flags;
768 int ret;
769
3173a18f
JB
770 /*
771 * If we don't have skinny metadata, don't bother doing anything
772 * different
773 */
0b246afa
JM
774 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
775 offset = fs_info->nodesize;
3173a18f
JB
776 metadata = 0;
777 }
778
a22285a6
YZ
779 path = btrfs_alloc_path();
780 if (!path)
781 return -ENOMEM;
782
a22285a6
YZ
783 if (!trans) {
784 path->skip_locking = 1;
785 path->search_commit_root = 1;
786 }
639eefc8
FDBM
787
788search_again:
789 key.objectid = bytenr;
790 key.offset = offset;
791 if (metadata)
792 key.type = BTRFS_METADATA_ITEM_KEY;
793 else
794 key.type = BTRFS_EXTENT_ITEM_KEY;
795
0b246afa 796 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
a22285a6
YZ
797 if (ret < 0)
798 goto out_free;
799
3173a18f 800 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
74be9510
FDBM
801 if (path->slots[0]) {
802 path->slots[0]--;
803 btrfs_item_key_to_cpu(path->nodes[0], &key,
804 path->slots[0]);
805 if (key.objectid == bytenr &&
806 key.type == BTRFS_EXTENT_ITEM_KEY &&
0b246afa 807 key.offset == fs_info->nodesize)
74be9510
FDBM
808 ret = 0;
809 }
3173a18f
JB
810 }
811
a22285a6
YZ
812 if (ret == 0) {
813 leaf = path->nodes[0];
814 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
815 if (item_size >= sizeof(*ei)) {
816 ei = btrfs_item_ptr(leaf, path->slots[0],
817 struct btrfs_extent_item);
818 num_refs = btrfs_extent_refs(leaf, ei);
819 extent_flags = btrfs_extent_flags(leaf, ei);
820 } else {
ba3c2b19
NB
821 ret = -EINVAL;
822 btrfs_print_v0_err(fs_info);
823 if (trans)
824 btrfs_abort_transaction(trans, ret);
825 else
826 btrfs_handle_fs_error(fs_info, ret, NULL);
827
828 goto out_free;
a22285a6 829 }
ba3c2b19 830
a22285a6
YZ
831 BUG_ON(num_refs == 0);
832 } else {
833 num_refs = 0;
834 extent_flags = 0;
835 ret = 0;
836 }
837
838 if (!trans)
839 goto out;
840
841 delayed_refs = &trans->transaction->delayed_refs;
842 spin_lock(&delayed_refs->lock);
f72ad18e 843 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
a22285a6
YZ
844 if (head) {
845 if (!mutex_trylock(&head->mutex)) {
d278850e 846 refcount_inc(&head->refs);
a22285a6
YZ
847 spin_unlock(&delayed_refs->lock);
848
b3b4aa74 849 btrfs_release_path(path);
a22285a6 850
8cc33e5c
DS
851 /*
852 * Mutex was contended, block until it's released and try
853 * again
854 */
a22285a6
YZ
855 mutex_lock(&head->mutex);
856 mutex_unlock(&head->mutex);
d278850e 857 btrfs_put_delayed_ref_head(head);
639eefc8 858 goto search_again;
a22285a6 859 }
d7df2c79 860 spin_lock(&head->lock);
a22285a6
YZ
861 if (head->extent_op && head->extent_op->update_flags)
862 extent_flags |= head->extent_op->flags_to_set;
863 else
864 BUG_ON(num_refs == 0);
865
d278850e 866 num_refs += head->ref_mod;
d7df2c79 867 spin_unlock(&head->lock);
a22285a6
YZ
868 mutex_unlock(&head->mutex);
869 }
870 spin_unlock(&delayed_refs->lock);
871out:
872 WARN_ON(num_refs == 0);
873 if (refs)
874 *refs = num_refs;
875 if (flags)
876 *flags = extent_flags;
877out_free:
878 btrfs_free_path(path);
879 return ret;
880}
881
/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  The implicit back ref is optimized
 * for pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key.  This information allows us to find the block by
 * b-tree searching.  The full back ref is for pointers in tree blocks not
 * referenced by their owner trees.  The location of the tree block is
 * recorded in the back refs.  The full back ref is actually generic and can
 * be used in all the cases where the implicit back ref is used.  The major
 * shortcoming of the full back ref is its overhead: every time a tree block
 * gets COWed, we have to update the back ref entries for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for the
 * pointers in it.  This means most tree related operations only involve
 * implicit back refs.  For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it.  So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back ref conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree.  In this case, full back refs are used for the
 * pointers in the block.  Remove these full back refs and add implicit
 * back refs for every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree.  In this case, implicit back refs are used for
 * the pointers in the block.  Add full back refs for every pointer in the
 * block and increase the lower level extents' reference counts.  The
 * original implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree.  Add implicit back refs for every pointer in
 * the new block and increase the lower level extents' reference counts.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs is used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed (by file truncation), we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key.  The key offset for the implicit back refs is
 * the objectid of the block's owner tree.  The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required.  This information is stored in
 * the tree block info structure.
 */
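/*
 * A concrete example of the key layouts described above, assuming a file
 * extent at bytenr 128M owned by subvolume 257, inode 300, file offset 64K:
 *
 *   implicit (non-shared) data back ref:
 *     (128M, BTRFS_EXTENT_DATA_REF_KEY, hash(257, 300, 64K))
 *   full (shared) data back ref, parent leaf at bytenr 30M:
 *     (128M, BTRFS_SHARED_DATA_REF_KEY, 30M)
 *
 * and for a tree block at bytenr 256M owned by tree 5 with its parent node
 * at bytenr 31M:
 *
 *   implicit: (256M, BTRFS_TREE_BLOCK_REF_KEY, 5)
 *   full:     (256M, BTRFS_SHARED_BLOCK_REF_KEY, 31M)
 *
 * The numbers are illustrative only; the helpers below
 * (hash_extent_data_ref(), extent_ref_type(), ...) implement these rules.
 */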
31840ae1 987
167ce953
LB
/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
993int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
994 struct btrfs_extent_inline_ref *iref,
995 enum btrfs_inline_ref_type is_data)
996{
997 int type = btrfs_extent_inline_ref_type(eb, iref);
64ecdb64 998 u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
167ce953
LB
999
1000 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1001 type == BTRFS_SHARED_BLOCK_REF_KEY ||
1002 type == BTRFS_SHARED_DATA_REF_KEY ||
1003 type == BTRFS_EXTENT_DATA_REF_KEY) {
1004 if (is_data == BTRFS_REF_TYPE_BLOCK) {
64ecdb64 1005 if (type == BTRFS_TREE_BLOCK_REF_KEY)
167ce953 1006 return type;
64ecdb64
LB
1007 if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1008 ASSERT(eb->fs_info);
1009 /*
1010 * Every shared one has parent tree
1011 * block, which must be aligned to
1012 * nodesize.
1013 */
1014 if (offset &&
1015 IS_ALIGNED(offset, eb->fs_info->nodesize))
1016 return type;
1017 }
167ce953 1018 } else if (is_data == BTRFS_REF_TYPE_DATA) {
64ecdb64 1019 if (type == BTRFS_EXTENT_DATA_REF_KEY)
167ce953 1020 return type;
64ecdb64
LB
1021 if (type == BTRFS_SHARED_DATA_REF_KEY) {
1022 ASSERT(eb->fs_info);
1023 /*
1024 * Every shared one has parent tree
1025 * block, which must be aligned to
1026 * nodesize.
1027 */
1028 if (offset &&
1029 IS_ALIGNED(offset, eb->fs_info->nodesize))
1030 return type;
1031 }
167ce953
LB
1032 } else {
1033 ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1034 return type;
1035 }
1036 }
1037
1038 btrfs_print_leaf((struct extent_buffer *)eb);
1039 btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1040 eb->start, type);
1041 WARN_ON(1);
1042
1043 return BTRFS_REF_TYPE_INVALID;
1044}
1045
5d4f98a2
YZ
1046static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1047{
1048 u32 high_crc = ~(u32)0;
1049 u32 low_crc = ~(u32)0;
1050 __le64 lenum;
1051
1052 lenum = cpu_to_le64(root_objectid);
65019df8 1053 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
5d4f98a2 1054 lenum = cpu_to_le64(owner);
65019df8 1055 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
5d4f98a2 1056 lenum = cpu_to_le64(offset);
65019df8 1057 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
5d4f98a2
YZ
1058
1059 return ((u64)high_crc << 31) ^ (u64)low_crc;
1060}
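/*
 * The helper above composes the key offset used for implicit data back refs:
 * crc32c over root_objectid feeds the high half, crc32c chained over owner
 * and offset feeds the low half, and the two halves are combined as
 * (high_crc << 31) ^ low_crc.  For example, two references that differ only
 * in the file offset end up with different low_crc values and therefore
 * (almost always) different key offsets; the rare collisions are handled by
 * the key.offset++ retry loop in insert_extent_data_ref() below.
 */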
1061
1062static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1063 struct btrfs_extent_data_ref *ref)
1064{
1065 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1066 btrfs_extent_data_ref_objectid(leaf, ref),
1067 btrfs_extent_data_ref_offset(leaf, ref));
1068}
1069
1070static int match_extent_data_ref(struct extent_buffer *leaf,
1071 struct btrfs_extent_data_ref *ref,
1072 u64 root_objectid, u64 owner, u64 offset)
1073{
1074 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1075 btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1076 btrfs_extent_data_ref_offset(leaf, ref) != offset)
1077 return 0;
1078 return 1;
1079}
1080
1081static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1082 struct btrfs_path *path,
1083 u64 bytenr, u64 parent,
1084 u64 root_objectid,
1085 u64 owner, u64 offset)
1086{
bd1d53ef 1087 struct btrfs_root *root = trans->fs_info->extent_root;
5d4f98a2
YZ
1088 struct btrfs_key key;
1089 struct btrfs_extent_data_ref *ref;
31840ae1 1090 struct extent_buffer *leaf;
5d4f98a2 1091 u32 nritems;
74493f7a 1092 int ret;
5d4f98a2
YZ
1093 int recow;
1094 int err = -ENOENT;
74493f7a 1095
31840ae1 1096 key.objectid = bytenr;
5d4f98a2
YZ
1097 if (parent) {
1098 key.type = BTRFS_SHARED_DATA_REF_KEY;
1099 key.offset = parent;
1100 } else {
1101 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1102 key.offset = hash_extent_data_ref(root_objectid,
1103 owner, offset);
1104 }
1105again:
1106 recow = 0;
1107 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1108 if (ret < 0) {
1109 err = ret;
1110 goto fail;
1111 }
31840ae1 1112
5d4f98a2
YZ
1113 if (parent) {
1114 if (!ret)
1115 return 0;
5d4f98a2 1116 goto fail;
31840ae1
ZY
1117 }
1118
1119 leaf = path->nodes[0];
5d4f98a2
YZ
1120 nritems = btrfs_header_nritems(leaf);
1121 while (1) {
1122 if (path->slots[0] >= nritems) {
1123 ret = btrfs_next_leaf(root, path);
1124 if (ret < 0)
1125 err = ret;
1126 if (ret)
1127 goto fail;
1128
1129 leaf = path->nodes[0];
1130 nritems = btrfs_header_nritems(leaf);
1131 recow = 1;
1132 }
1133
1134 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1135 if (key.objectid != bytenr ||
1136 key.type != BTRFS_EXTENT_DATA_REF_KEY)
1137 goto fail;
1138
1139 ref = btrfs_item_ptr(leaf, path->slots[0],
1140 struct btrfs_extent_data_ref);
1141
1142 if (match_extent_data_ref(leaf, ref, root_objectid,
1143 owner, offset)) {
1144 if (recow) {
b3b4aa74 1145 btrfs_release_path(path);
5d4f98a2
YZ
1146 goto again;
1147 }
1148 err = 0;
1149 break;
1150 }
1151 path->slots[0]++;
31840ae1 1152 }
5d4f98a2
YZ
1153fail:
1154 return err;
31840ae1
ZY
1155}
1156
5d4f98a2 1157static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1158 struct btrfs_path *path,
1159 u64 bytenr, u64 parent,
1160 u64 root_objectid, u64 owner,
1161 u64 offset, int refs_to_add)
31840ae1 1162{
62b895af 1163 struct btrfs_root *root = trans->fs_info->extent_root;
31840ae1
ZY
1164 struct btrfs_key key;
1165 struct extent_buffer *leaf;
5d4f98a2 1166 u32 size;
31840ae1
ZY
1167 u32 num_refs;
1168 int ret;
74493f7a 1169
74493f7a 1170 key.objectid = bytenr;
5d4f98a2
YZ
1171 if (parent) {
1172 key.type = BTRFS_SHARED_DATA_REF_KEY;
1173 key.offset = parent;
1174 size = sizeof(struct btrfs_shared_data_ref);
1175 } else {
1176 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1177 key.offset = hash_extent_data_ref(root_objectid,
1178 owner, offset);
1179 size = sizeof(struct btrfs_extent_data_ref);
1180 }
74493f7a 1181
5d4f98a2
YZ
1182 ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1183 if (ret && ret != -EEXIST)
1184 goto fail;
1185
1186 leaf = path->nodes[0];
1187 if (parent) {
1188 struct btrfs_shared_data_ref *ref;
31840ae1 1189 ref = btrfs_item_ptr(leaf, path->slots[0],
5d4f98a2
YZ
1190 struct btrfs_shared_data_ref);
1191 if (ret == 0) {
1192 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1193 } else {
1194 num_refs = btrfs_shared_data_ref_count(leaf, ref);
1195 num_refs += refs_to_add;
1196 btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
31840ae1 1197 }
5d4f98a2
YZ
1198 } else {
1199 struct btrfs_extent_data_ref *ref;
1200 while (ret == -EEXIST) {
1201 ref = btrfs_item_ptr(leaf, path->slots[0],
1202 struct btrfs_extent_data_ref);
1203 if (match_extent_data_ref(leaf, ref, root_objectid,
1204 owner, offset))
1205 break;
b3b4aa74 1206 btrfs_release_path(path);
5d4f98a2
YZ
1207 key.offset++;
1208 ret = btrfs_insert_empty_item(trans, root, path, &key,
1209 size);
1210 if (ret && ret != -EEXIST)
1211 goto fail;
31840ae1 1212
5d4f98a2
YZ
1213 leaf = path->nodes[0];
1214 }
1215 ref = btrfs_item_ptr(leaf, path->slots[0],
1216 struct btrfs_extent_data_ref);
1217 if (ret == 0) {
1218 btrfs_set_extent_data_ref_root(leaf, ref,
1219 root_objectid);
1220 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1221 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1222 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1223 } else {
1224 num_refs = btrfs_extent_data_ref_count(leaf, ref);
1225 num_refs += refs_to_add;
1226 btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
31840ae1 1227 }
31840ae1 1228 }
5d4f98a2
YZ
1229 btrfs_mark_buffer_dirty(leaf);
1230 ret = 0;
1231fail:
b3b4aa74 1232 btrfs_release_path(path);
7bb86316 1233 return ret;
74493f7a
CM
1234}
1235
5d4f98a2 1236static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
5d4f98a2 1237 struct btrfs_path *path,
fcebe456 1238 int refs_to_drop, int *last_ref)
31840ae1 1239{
5d4f98a2
YZ
1240 struct btrfs_key key;
1241 struct btrfs_extent_data_ref *ref1 = NULL;
1242 struct btrfs_shared_data_ref *ref2 = NULL;
31840ae1 1243 struct extent_buffer *leaf;
5d4f98a2 1244 u32 num_refs = 0;
31840ae1
ZY
1245 int ret = 0;
1246
1247 leaf = path->nodes[0];
5d4f98a2
YZ
1248 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1249
1250 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1251 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1252 struct btrfs_extent_data_ref);
1253 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1254 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1255 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1256 struct btrfs_shared_data_ref);
1257 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
6d8ff4e4 1258 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
ba3c2b19
NB
1259 btrfs_print_v0_err(trans->fs_info);
1260 btrfs_abort_transaction(trans, -EINVAL);
1261 return -EINVAL;
5d4f98a2
YZ
1262 } else {
1263 BUG();
1264 }
1265
56bec294
CM
1266 BUG_ON(num_refs < refs_to_drop);
1267 num_refs -= refs_to_drop;
5d4f98a2 1268
31840ae1 1269 if (num_refs == 0) {
e9f6290d 1270 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
fcebe456 1271 *last_ref = 1;
31840ae1 1272 } else {
5d4f98a2
YZ
1273 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1274 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1275 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1276 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
31840ae1
ZY
1277 btrfs_mark_buffer_dirty(leaf);
1278 }
31840ae1
ZY
1279 return ret;
1280}
1281
9ed0dea0 1282static noinline u32 extent_data_ref_count(struct btrfs_path *path,
5d4f98a2 1283 struct btrfs_extent_inline_ref *iref)
15916de8 1284{
5d4f98a2
YZ
1285 struct btrfs_key key;
1286 struct extent_buffer *leaf;
1287 struct btrfs_extent_data_ref *ref1;
1288 struct btrfs_shared_data_ref *ref2;
1289 u32 num_refs = 0;
3de28d57 1290 int type;
5d4f98a2
YZ
1291
1292 leaf = path->nodes[0];
1293 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
ba3c2b19
NB
1294
1295 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
5d4f98a2 1296 if (iref) {
3de28d57
LB
1297 /*
1298 * If type is invalid, we should have bailed out earlier than
1299 * this call.
1300 */
1301 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1302 ASSERT(type != BTRFS_REF_TYPE_INVALID);
1303 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
5d4f98a2
YZ
1304 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1305 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1306 } else {
1307 ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1308 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1309 }
1310 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1311 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1312 struct btrfs_extent_data_ref);
1313 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1314 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1315 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1316 struct btrfs_shared_data_ref);
1317 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
5d4f98a2
YZ
1318 } else {
1319 WARN_ON(1);
1320 }
1321 return num_refs;
1322}
15916de8 1323
5d4f98a2 1324static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1325 struct btrfs_path *path,
1326 u64 bytenr, u64 parent,
1327 u64 root_objectid)
1f3c79a2 1328{
b8582eea 1329 struct btrfs_root *root = trans->fs_info->extent_root;
5d4f98a2 1330 struct btrfs_key key;
1f3c79a2 1331 int ret;
1f3c79a2 1332
5d4f98a2
YZ
1333 key.objectid = bytenr;
1334 if (parent) {
1335 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1336 key.offset = parent;
1337 } else {
1338 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1339 key.offset = root_objectid;
1f3c79a2
LH
1340 }
1341
5d4f98a2
YZ
1342 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1343 if (ret > 0)
1344 ret = -ENOENT;
5d4f98a2 1345 return ret;
1f3c79a2
LH
1346}
1347
5d4f98a2 1348static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1349 struct btrfs_path *path,
1350 u64 bytenr, u64 parent,
1351 u64 root_objectid)
31840ae1 1352{
5d4f98a2 1353 struct btrfs_key key;
31840ae1 1354 int ret;
31840ae1 1355
5d4f98a2
YZ
1356 key.objectid = bytenr;
1357 if (parent) {
1358 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1359 key.offset = parent;
1360 } else {
1361 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1362 key.offset = root_objectid;
1363 }
1364
10728404 1365 ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
87bde3cd 1366 path, &key, 0);
b3b4aa74 1367 btrfs_release_path(path);
31840ae1
ZY
1368 return ret;
1369}
1370
5d4f98a2 1371static inline int extent_ref_type(u64 parent, u64 owner)
31840ae1 1372{
5d4f98a2
YZ
1373 int type;
1374 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1375 if (parent > 0)
1376 type = BTRFS_SHARED_BLOCK_REF_KEY;
1377 else
1378 type = BTRFS_TREE_BLOCK_REF_KEY;
1379 } else {
1380 if (parent > 0)
1381 type = BTRFS_SHARED_DATA_REF_KEY;
1382 else
1383 type = BTRFS_EXTENT_DATA_REF_KEY;
1384 }
1385 return type;
31840ae1 1386}
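/*
 * The mapping implemented above, for quick reference:
 *
 *   owner < BTRFS_FIRST_FREE_OBJECTID (tree block),   parent == 0:
 *       BTRFS_TREE_BLOCK_REF_KEY    (implicit ref, offset = root objectid)
 *   owner < BTRFS_FIRST_FREE_OBJECTID (tree block),   parent  > 0:
 *       BTRFS_SHARED_BLOCK_REF_KEY  (full ref, offset = parent bytenr)
 *   owner >= BTRFS_FIRST_FREE_OBJECTID (data extent), parent == 0:
 *       BTRFS_EXTENT_DATA_REF_KEY   (implicit ref, offset = hash of
 *                                    root/owner/offset)
 *   owner >= BTRFS_FIRST_FREE_OBJECTID (data extent), parent  > 0:
 *       BTRFS_SHARED_DATA_REF_KEY   (full ref, offset = parent bytenr)
 */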
56bec294 1387
2c47e605
YZ
1388static int find_next_key(struct btrfs_path *path, int level,
1389 struct btrfs_key *key)
56bec294 1390
02217ed2 1391{
2c47e605 1392 for (; level < BTRFS_MAX_LEVEL; level++) {
5d4f98a2
YZ
1393 if (!path->nodes[level])
1394 break;
5d4f98a2
YZ
1395 if (path->slots[level] + 1 >=
1396 btrfs_header_nritems(path->nodes[level]))
1397 continue;
1398 if (level == 0)
1399 btrfs_item_key_to_cpu(path->nodes[level], key,
1400 path->slots[level] + 1);
1401 else
1402 btrfs_node_key_to_cpu(path->nodes[level], key,
1403 path->slots[level] + 1);
1404 return 0;
1405 }
1406 return 1;
1407}
037e6390 1408
5d4f98a2
YZ
1409/*
1410 * look for inline back ref. if back ref is found, *ref_ret is set
1411 * to the address of inline back ref, and 0 is returned.
1412 *
1413 * if back ref isn't found, *ref_ret is set to the address where it
1414 * should be inserted, and -ENOENT is returned.
1415 *
1416 * if insert is true and there are too many inline back refs, the path
1417 * points to the extent item, and -EAGAIN is returned.
1418 *
1419 * NOTE: inline back refs are ordered in the same way that back ref
1420 * items in the tree are ordered.
1421 */
1422static noinline_for_stack
1423int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1424 struct btrfs_path *path,
1425 struct btrfs_extent_inline_ref **ref_ret,
1426 u64 bytenr, u64 num_bytes,
1427 u64 parent, u64 root_objectid,
1428 u64 owner, u64 offset, int insert)
1429{
867cc1fb 1430 struct btrfs_fs_info *fs_info = trans->fs_info;
87bde3cd 1431 struct btrfs_root *root = fs_info->extent_root;
5d4f98a2
YZ
1432 struct btrfs_key key;
1433 struct extent_buffer *leaf;
1434 struct btrfs_extent_item *ei;
1435 struct btrfs_extent_inline_ref *iref;
1436 u64 flags;
1437 u64 item_size;
1438 unsigned long ptr;
1439 unsigned long end;
1440 int extra_size;
1441 int type;
1442 int want;
1443 int ret;
1444 int err = 0;
0b246afa 1445 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3de28d57 1446 int needed;
26b8003f 1447
db94535d 1448 key.objectid = bytenr;
31840ae1 1449 key.type = BTRFS_EXTENT_ITEM_KEY;
56bec294 1450 key.offset = num_bytes;
31840ae1 1451
5d4f98a2
YZ
1452 want = extent_ref_type(parent, owner);
1453 if (insert) {
1454 extra_size = btrfs_extent_inline_ref_size(want);
85d4198e 1455 path->keep_locks = 1;
5d4f98a2
YZ
1456 } else
1457 extra_size = -1;
3173a18f
JB
1458
1459 /*
16d1c062
NB
1460 * Owner is our level, so we can just add one to get the level for the
1461 * block we are interested in.
3173a18f
JB
1462 */
1463 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1464 key.type = BTRFS_METADATA_ITEM_KEY;
1465 key.offset = owner;
1466 }
1467
1468again:
5d4f98a2 1469 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
b9473439 1470 if (ret < 0) {
5d4f98a2
YZ
1471 err = ret;
1472 goto out;
1473 }
3173a18f
JB
1474
1475 /*
1476 * We may be a newly converted file system which still has the old fat
1477 * extent entries for metadata, so try and see if we have one of those.
1478 */
1479 if (ret > 0 && skinny_metadata) {
1480 skinny_metadata = false;
1481 if (path->slots[0]) {
1482 path->slots[0]--;
1483 btrfs_item_key_to_cpu(path->nodes[0], &key,
1484 path->slots[0]);
1485 if (key.objectid == bytenr &&
1486 key.type == BTRFS_EXTENT_ITEM_KEY &&
1487 key.offset == num_bytes)
1488 ret = 0;
1489 }
1490 if (ret) {
9ce49a0b 1491 key.objectid = bytenr;
3173a18f
JB
1492 key.type = BTRFS_EXTENT_ITEM_KEY;
1493 key.offset = num_bytes;
1494 btrfs_release_path(path);
1495 goto again;
1496 }
1497 }
1498
79787eaa
JM
1499 if (ret && !insert) {
1500 err = -ENOENT;
1501 goto out;
fae7f21c 1502 } else if (WARN_ON(ret)) {
492104c8 1503 err = -EIO;
492104c8 1504 goto out;
79787eaa 1505 }
5d4f98a2
YZ
1506
1507 leaf = path->nodes[0];
1508 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
6d8ff4e4 1509 if (unlikely(item_size < sizeof(*ei))) {
ba3c2b19
NB
1510 err = -EINVAL;
1511 btrfs_print_v0_err(fs_info);
1512 btrfs_abort_transaction(trans, err);
1513 goto out;
1514 }
5d4f98a2 1515
5d4f98a2
YZ
1516 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1517 flags = btrfs_extent_flags(leaf, ei);
1518
1519 ptr = (unsigned long)(ei + 1);
1520 end = (unsigned long)ei + item_size;
1521
3173a18f 1522 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
5d4f98a2
YZ
1523 ptr += sizeof(struct btrfs_tree_block_info);
1524 BUG_ON(ptr > end);
5d4f98a2
YZ
1525 }
1526
3de28d57
LB
1527 if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1528 needed = BTRFS_REF_TYPE_DATA;
1529 else
1530 needed = BTRFS_REF_TYPE_BLOCK;
1531
5d4f98a2
YZ
1532 err = -ENOENT;
1533 while (1) {
1534 if (ptr >= end) {
1535 WARN_ON(ptr > end);
1536 break;
1537 }
1538 iref = (struct btrfs_extent_inline_ref *)ptr;
3de28d57
LB
1539 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1540 if (type == BTRFS_REF_TYPE_INVALID) {
af431dcb 1541 err = -EUCLEAN;
3de28d57
LB
1542 goto out;
1543 }
1544
5d4f98a2
YZ
1545 if (want < type)
1546 break;
1547 if (want > type) {
1548 ptr += btrfs_extent_inline_ref_size(type);
1549 continue;
1550 }
1551
1552 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1553 struct btrfs_extent_data_ref *dref;
1554 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1555 if (match_extent_data_ref(leaf, dref, root_objectid,
1556 owner, offset)) {
1557 err = 0;
1558 break;
1559 }
1560 if (hash_extent_data_ref_item(leaf, dref) <
1561 hash_extent_data_ref(root_objectid, owner, offset))
1562 break;
1563 } else {
1564 u64 ref_offset;
1565 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1566 if (parent > 0) {
1567 if (parent == ref_offset) {
1568 err = 0;
1569 break;
1570 }
1571 if (ref_offset < parent)
1572 break;
1573 } else {
1574 if (root_objectid == ref_offset) {
1575 err = 0;
1576 break;
1577 }
1578 if (ref_offset < root_objectid)
1579 break;
1580 }
1581 }
1582 ptr += btrfs_extent_inline_ref_size(type);
1583 }
1584 if (err == -ENOENT && insert) {
1585 if (item_size + extra_size >=
1586 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1587 err = -EAGAIN;
1588 goto out;
1589 }
		/*
		 * To add a new inline back ref, we have to make sure
		 * that there is no corresponding back ref item.
		 * For simplicity, we just do not add a new inline back
		 * ref if there is any kind of item for this block.
		 */
2c47e605
YZ
1596 if (find_next_key(path, 0, &key) == 0 &&
1597 key.objectid == bytenr &&
85d4198e 1598 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
5d4f98a2
YZ
1599 err = -EAGAIN;
1600 goto out;
1601 }
1602 }
1603 *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1604out:
85d4198e 1605 if (insert) {
5d4f98a2
YZ
1606 path->keep_locks = 0;
1607 btrfs_unlock_up_safe(path, 1);
1608 }
1609 return err;
1610}
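/*
 * In short, lookup_inline_extent_backref() leaves the caller in one of three
 * states: 0 with *ref_ret pointing at an existing inline ref (update it in
 * place), -ENOENT with *ref_ret pointing at the position where an inline ref
 * can be inserted into the extent item, or -EAGAIN when insert was requested
 * but the item is already at BTRFS_MAX_EXTENT_ITEM_SIZE, so that the caller
 * can fall back to a separate keyed back ref item instead.
 */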
1611
1612/*
1613 * helper to add new inline back ref
1614 */
1615static noinline_for_stack
87bde3cd 1616void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
143bede5
JM
1617 struct btrfs_path *path,
1618 struct btrfs_extent_inline_ref *iref,
1619 u64 parent, u64 root_objectid,
1620 u64 owner, u64 offset, int refs_to_add,
1621 struct btrfs_delayed_extent_op *extent_op)
5d4f98a2
YZ
1622{
1623 struct extent_buffer *leaf;
1624 struct btrfs_extent_item *ei;
1625 unsigned long ptr;
1626 unsigned long end;
1627 unsigned long item_offset;
1628 u64 refs;
1629 int size;
1630 int type;
5d4f98a2
YZ
1631
1632 leaf = path->nodes[0];
1633 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1634 item_offset = (unsigned long)iref - (unsigned long)ei;
1635
1636 type = extent_ref_type(parent, owner);
1637 size = btrfs_extent_inline_ref_size(type);
1638
c71dd880 1639 btrfs_extend_item(path, size);
5d4f98a2
YZ
1640
1641 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1642 refs = btrfs_extent_refs(leaf, ei);
1643 refs += refs_to_add;
1644 btrfs_set_extent_refs(leaf, ei, refs);
1645 if (extent_op)
1646 __run_delayed_extent_op(extent_op, leaf, ei);
1647
1648 ptr = (unsigned long)ei + item_offset;
1649 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1650 if (ptr < end - size)
1651 memmove_extent_buffer(leaf, ptr + size, ptr,
1652 end - size - ptr);
1653
1654 iref = (struct btrfs_extent_inline_ref *)ptr;
1655 btrfs_set_extent_inline_ref_type(leaf, iref, type);
1656 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1657 struct btrfs_extent_data_ref *dref;
1658 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1659 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1660 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1661 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1662 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1663 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1664 struct btrfs_shared_data_ref *sref;
1665 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1666 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1667 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1668 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1669 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1670 } else {
1671 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1672 }
1673 btrfs_mark_buffer_dirty(leaf);
5d4f98a2
YZ
1674}
1675
1676static int lookup_extent_backref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1677 struct btrfs_path *path,
1678 struct btrfs_extent_inline_ref **ref_ret,
1679 u64 bytenr, u64 num_bytes, u64 parent,
1680 u64 root_objectid, u64 owner, u64 offset)
1681{
1682 int ret;
1683
867cc1fb
NB
1684 ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1685 num_bytes, parent, root_objectid,
1686 owner, offset, 0);
5d4f98a2 1687 if (ret != -ENOENT)
54aa1f4d 1688 return ret;
5d4f98a2 1689
b3b4aa74 1690 btrfs_release_path(path);
5d4f98a2
YZ
1691 *ref_ret = NULL;
1692
1693 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
b8582eea
NB
1694 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1695 root_objectid);
5d4f98a2 1696 } else {
bd1d53ef
NB
1697 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1698 root_objectid, owner, offset);
b9473439 1699 }
5d4f98a2
YZ
1700 return ret;
1701}
31840ae1 1702
5d4f98a2
YZ
1703/*
1704 * helper to update/remove inline back ref
1705 */
1706static noinline_for_stack
61a18f1c 1707void update_inline_extent_backref(struct btrfs_path *path,
143bede5
JM
1708 struct btrfs_extent_inline_ref *iref,
1709 int refs_to_mod,
fcebe456
JB
1710 struct btrfs_delayed_extent_op *extent_op,
1711 int *last_ref)
5d4f98a2 1712{
61a18f1c 1713 struct extent_buffer *leaf = path->nodes[0];
5d4f98a2
YZ
1714 struct btrfs_extent_item *ei;
1715 struct btrfs_extent_data_ref *dref = NULL;
1716 struct btrfs_shared_data_ref *sref = NULL;
1717 unsigned long ptr;
1718 unsigned long end;
1719 u32 item_size;
1720 int size;
1721 int type;
5d4f98a2
YZ
1722 u64 refs;
1723
5d4f98a2
YZ
1724 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1725 refs = btrfs_extent_refs(leaf, ei);
1726 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1727 refs += refs_to_mod;
1728 btrfs_set_extent_refs(leaf, ei, refs);
1729 if (extent_op)
1730 __run_delayed_extent_op(extent_op, leaf, ei);
1731
3de28d57
LB
1732 /*
1733 * If type is invalid, we should have bailed out after
1734 * lookup_inline_extent_backref().
1735 */
1736 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1737 ASSERT(type != BTRFS_REF_TYPE_INVALID);
5d4f98a2
YZ
1738
1739 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1740 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1741 refs = btrfs_extent_data_ref_count(leaf, dref);
1742 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1743 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1744 refs = btrfs_shared_data_ref_count(leaf, sref);
1745 } else {
1746 refs = 1;
1747 BUG_ON(refs_to_mod != -1);
56bec294 1748 }
31840ae1 1749
5d4f98a2
YZ
1750 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1751 refs += refs_to_mod;
1752
1753 if (refs > 0) {
1754 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1755 btrfs_set_extent_data_ref_count(leaf, dref, refs);
1756 else
1757 btrfs_set_shared_data_ref_count(leaf, sref, refs);
1758 } else {
fcebe456 1759 *last_ref = 1;
5d4f98a2
YZ
1760 size = btrfs_extent_inline_ref_size(type);
1761 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1762 ptr = (unsigned long)iref;
1763 end = (unsigned long)ei + item_size;
1764 if (ptr + size < end)
1765 memmove_extent_buffer(leaf, ptr, ptr + size,
1766 end - ptr - size);
1767 item_size -= size;
78ac4f9e 1768 btrfs_truncate_item(path, item_size, 1);
5d4f98a2
YZ
1769 }
1770 btrfs_mark_buffer_dirty(leaf);
5d4f98a2
YZ
1771}
1772
1773static noinline_for_stack
1774int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1775 struct btrfs_path *path,
1776 u64 bytenr, u64 num_bytes, u64 parent,
1777 u64 root_objectid, u64 owner,
1778 u64 offset, int refs_to_add,
1779 struct btrfs_delayed_extent_op *extent_op)
1780{
1781 struct btrfs_extent_inline_ref *iref;
1782 int ret;
1783
867cc1fb
NB
1784 ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1785 num_bytes, parent, root_objectid,
1786 owner, offset, 1);
5d4f98a2
YZ
1787 if (ret == 0) {
1788 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
61a18f1c
NB
1789 update_inline_extent_backref(path, iref, refs_to_add,
1790 extent_op, NULL);
5d4f98a2 1791 } else if (ret == -ENOENT) {
a639cdeb 1792 setup_inline_extent_backref(trans->fs_info, path, iref, parent,
143bede5
JM
1793 root_objectid, owner, offset,
1794 refs_to_add, extent_op);
1795 ret = 0;
771ed689 1796 }
5d4f98a2
YZ
1797 return ret;
1798}
31840ae1 1799
5d4f98a2 1800static int insert_extent_backref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1801 struct btrfs_path *path,
1802 u64 bytenr, u64 parent, u64 root_objectid,
1803 u64 owner, u64 offset, int refs_to_add)
1804{
1805 int ret;
1806 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1807 BUG_ON(refs_to_add != 1);
10728404
NB
1808 ret = insert_tree_block_ref(trans, path, bytenr, parent,
1809 root_objectid);
5d4f98a2 1810 } else {
62b895af
NB
1811 ret = insert_extent_data_ref(trans, path, bytenr, parent,
1812 root_objectid, owner, offset,
1813 refs_to_add);
5d4f98a2
YZ
1814 }
1815 return ret;
1816}
56bec294 1817
5d4f98a2 1818static int remove_extent_backref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1819 struct btrfs_path *path,
1820 struct btrfs_extent_inline_ref *iref,
fcebe456 1821 int refs_to_drop, int is_data, int *last_ref)
5d4f98a2 1822{
143bede5 1823 int ret = 0;
b9473439 1824
5d4f98a2
YZ
1825 BUG_ON(!is_data && refs_to_drop != 1);
1826 if (iref) {
61a18f1c
NB
1827 update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
1828 last_ref);
5d4f98a2 1829 } else if (is_data) {
e9f6290d 1830 ret = remove_extent_data_ref(trans, path, refs_to_drop,
fcebe456 1831 last_ref);
5d4f98a2 1832 } else {
fcebe456 1833 *last_ref = 1;
87cc7a8a 1834 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
5d4f98a2
YZ
1835 }
1836 return ret;
1837}
1838
d04c6b88
JM
1839static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1840 u64 *discarded_bytes)
5d4f98a2 1841{
86557861
JM
1842 int j, ret = 0;
1843 u64 bytes_left, end;
4d89d377 1844 u64 aligned_start = ALIGN(start, 1 << 9);
d04c6b88 1845
4d89d377
JM
1846 if (WARN_ON(start != aligned_start)) {
1847 len -= aligned_start - start;
1848 len = round_down(len, 1 << 9);
1849 start = aligned_start;
1850 }
d04c6b88 1851
4d89d377 1852 *discarded_bytes = 0;
86557861
JM
1853
1854 if (!len)
1855 return 0;
1856
1857 end = start + len;
1858 bytes_left = len;
1859
1860 /* Skip any superblocks on this device. */
1861 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1862 u64 sb_start = btrfs_sb_offset(j);
1863 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1864 u64 size = sb_start - start;
1865
1866 if (!in_range(sb_start, start, bytes_left) &&
1867 !in_range(sb_end, start, bytes_left) &&
1868 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1869 continue;
1870
1871 /*
1872 * Superblock spans beginning of range. Adjust start and
1873 * try again.
1874 */
1875 if (sb_start <= start) {
1876 start += sb_end - start;
1877 if (start > end) {
1878 bytes_left = 0;
1879 break;
1880 }
1881 bytes_left = end - start;
1882 continue;
1883 }
1884
1885 if (size) {
1886 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1887 GFP_NOFS, 0);
1888 if (!ret)
1889 *discarded_bytes += size;
1890 else if (ret != -EOPNOTSUPP)
1891 return ret;
1892 }
1893
1894 start = sb_end;
1895 if (start > end) {
1896 bytes_left = 0;
1897 break;
1898 }
1899 bytes_left = end - start;
1900 }
1901
1902 if (bytes_left) {
1903 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
4d89d377
JM
1904 GFP_NOFS, 0);
1905 if (!ret)
86557861 1906 *discarded_bytes += bytes_left;
4d89d377 1907 }
d04c6b88 1908 return ret;
5d4f98a2 1909}
5d4f98a2 1910
2ff7e61e 1911int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1edb647b 1912 u64 num_bytes, u64 *actual_bytes)
5d4f98a2 1913{
5d4f98a2 1914 int ret;
5378e607 1915 u64 discarded_bytes = 0;
a1d3c478 1916 struct btrfs_bio *bbio = NULL;
5d4f98a2 1917
e244a0ae 1918
2999241d
FM
1919 /*
1920 * Avoid races with device replace and make sure our bbio has devices
1921 * associated to its stripes that don't go away while we are discarding.
1922 */
0b246afa 1923 btrfs_bio_counter_inc_blocked(fs_info);
5d4f98a2 1924 /* Tell the block device(s) that the sectors can be discarded */
0b246afa
JM
1925 ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
1926 &bbio, 0);
79787eaa 1927 /* Error condition is -ENOMEM */
5d4f98a2 1928 if (!ret) {
a1d3c478 1929 struct btrfs_bio_stripe *stripe = bbio->stripes;
5d4f98a2
YZ
1930 int i;
1931
5d4f98a2 1932
a1d3c478 1933 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
d04c6b88 1934 u64 bytes;
38b5f68e
AJ
1935 struct request_queue *req_q;
1936
627e0873
FM
1937 if (!stripe->dev->bdev) {
1938 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1939 continue;
1940 }
38b5f68e
AJ
1941 req_q = bdev_get_queue(stripe->dev->bdev);
1942 if (!blk_queue_discard(req_q))
d5e2003c
JB
1943 continue;
1944
5378e607
LD
1945 ret = btrfs_issue_discard(stripe->dev->bdev,
1946 stripe->physical,
d04c6b88
JM
1947 stripe->length,
1948 &bytes);
5378e607 1949 if (!ret)
d04c6b88 1950 discarded_bytes += bytes;
5378e607 1951 else if (ret != -EOPNOTSUPP)
79787eaa 1952 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
d5e2003c
JB
1953
1954 /*
1955 * Just in case we get back EOPNOTSUPP for some reason,
1956 * just ignore the return value so we don't screw up
1957 * people calling discard_extent.
1958 */
1959 ret = 0;
5d4f98a2 1960 }
6e9606d2 1961 btrfs_put_bbio(bbio);
5d4f98a2 1962 }
0b246afa 1963 btrfs_bio_counter_dec(fs_info);
5378e607
LD
1964
1965 if (actual_bytes)
1966 *actual_bytes = discarded_bytes;
1967
5d4f98a2 1968
53b381b3
DW
1969 if (ret == -EOPNOTSUPP)
1970 ret = 0;
5d4f98a2 1971 return ret;
5d4f98a2
YZ
1972}
1973
79787eaa 1974/* Can return -ENOMEM */
5d4f98a2 1975int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
82fa113f 1976 struct btrfs_ref *generic_ref)
5d4f98a2 1977{
82fa113f 1978 struct btrfs_fs_info *fs_info = trans->fs_info;
d7eae340 1979 int old_ref_mod, new_ref_mod;
5d4f98a2 1980 int ret;
66d7e7f0 1981
82fa113f
QW
1982 ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1983 generic_ref->action);
1984 BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1985 generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
5d4f98a2 1986
82fa113f
QW
1987 if (generic_ref->type == BTRFS_REF_METADATA)
1988 ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
ed4f255b 1989 NULL, &old_ref_mod, &new_ref_mod);
82fa113f
QW
1990 else
1991 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
d7eae340 1992 &old_ref_mod, &new_ref_mod);
d7eae340 1993
82fa113f 1994 btrfs_ref_tree_mod(fs_info, generic_ref);
8a5040f7 1995
ddf30cf0 1996 if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
78192442 1997 sub_pinned_bytes(fs_info, generic_ref);
d7eae340 1998
5d4f98a2
YZ
1999 return ret;
2000}
2001
bd3c685e
NB
2002/*
2003 * __btrfs_inc_extent_ref - insert backreference for a given extent
2004 *
2005 * @trans: Handle of transaction
2006 *
2007 * @node: The delayed ref node used to get the bytenr/length for
2008 * extent whose references are incremented.
2009 *
2010 * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
2011 * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
2012 * bytenr of the parent block. Since new extents are always
2013 * created with indirect references, this will only be the case
2014 * when relocating a shared extent. In that case, root_objectid
2015 * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
2016 * be 0
2017 *
2018 * @root_objectid: The id of the root where this modification has originated,
2019 * this can be either one of the well-known metadata trees or
2020 * the subvolume id which references this extent.
2021 *
2022 * @owner: For data extents it is the inode number of the owning file.
2023 * For metadata extents this parameter holds the level in the
2024 * tree of the extent.
2025 *
2026 * @offset: For metadata extents the offset is ignored and is currently
2027 * always passed as 0. For data extents it is the fileoffset
2028 * this extent belongs to.
2029 *
2030 * @refs_to_add: Number of references to add
2031 *
2032 * @extent_op: Pointer to a structure holding information necessary when
2033 * updating a tree block's flags
2034 *
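 * A rough illustration with hypothetical values (not taken from this file):
 * adding one indirect reference for a data extent owned by inode 257 in
 * subvolume 5 at file offset 0 would reach this helper as
 *
 *	__btrfs_inc_extent_ref(trans, node, 0, 5, 257, 0, 1, NULL);
 *
 * with parent == 0 because the reference is indirect rather than shared.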
2035 */
5d4f98a2 2036static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
c682f9b3 2037 struct btrfs_delayed_ref_node *node,
5d4f98a2
YZ
2038 u64 parent, u64 root_objectid,
2039 u64 owner, u64 offset, int refs_to_add,
2040 struct btrfs_delayed_extent_op *extent_op)
2041{
2042 struct btrfs_path *path;
2043 struct extent_buffer *leaf;
2044 struct btrfs_extent_item *item;
fcebe456 2045 struct btrfs_key key;
c682f9b3
QW
2046 u64 bytenr = node->bytenr;
2047 u64 num_bytes = node->num_bytes;
5d4f98a2
YZ
2048 u64 refs;
2049 int ret;
5d4f98a2
YZ
2050
2051 path = btrfs_alloc_path();
2052 if (!path)
2053 return -ENOMEM;
2054
e4058b54 2055 path->reada = READA_FORWARD;
5d4f98a2
YZ
2056 path->leave_spinning = 1;
2057 /* this will setup the path even if it fails to insert the back ref */
a639cdeb
NB
2058 ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
2059 parent, root_objectid, owner,
2060 offset, refs_to_add, extent_op);
0ed4792a 2061 if ((ret < 0 && ret != -EAGAIN) || !ret)
5d4f98a2 2062 goto out;
fcebe456
JB
2063
2064 /*
2065 * Ok we had -EAGAIN which means we didn't have space to insert an
2066 * inline extent ref, so just update the reference count and add a
2067 * normal backref.
2068 */
5d4f98a2 2069 leaf = path->nodes[0];
fcebe456 2070 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5d4f98a2
YZ
2071 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2072 refs = btrfs_extent_refs(leaf, item);
2073 btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2074 if (extent_op)
2075 __run_delayed_extent_op(extent_op, leaf, item);
56bec294 2076
5d4f98a2 2077 btrfs_mark_buffer_dirty(leaf);
b3b4aa74 2078 btrfs_release_path(path);
56bec294 2079
e4058b54 2080 path->reada = READA_FORWARD;
b9473439 2081 path->leave_spinning = 1;
56bec294 2082 /* now insert the actual backref */
37593410
NB
2083 ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
2084 owner, offset, refs_to_add);
79787eaa 2085 if (ret)
66642832 2086 btrfs_abort_transaction(trans, ret);
5d4f98a2 2087out:
56bec294 2088 btrfs_free_path(path);
30d133fc 2089 return ret;
56bec294
CM
2090}
2091
5d4f98a2 2092static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
2093 struct btrfs_delayed_ref_node *node,
2094 struct btrfs_delayed_extent_op *extent_op,
2095 int insert_reserved)
56bec294 2096{
5d4f98a2
YZ
2097 int ret = 0;
2098 struct btrfs_delayed_data_ref *ref;
2099 struct btrfs_key ins;
2100 u64 parent = 0;
2101 u64 ref_root = 0;
2102 u64 flags = 0;
2103
2104 ins.objectid = node->bytenr;
2105 ins.offset = node->num_bytes;
2106 ins.type = BTRFS_EXTENT_ITEM_KEY;
2107
2108 ref = btrfs_delayed_node_to_data_ref(node);
2bf98ef3 2109 trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
599c75ec 2110
5d4f98a2
YZ
2111 if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2112 parent = ref->parent;
fcebe456 2113 ref_root = ref->root;
5d4f98a2
YZ
2114
2115 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
3173a18f 2116 if (extent_op)
5d4f98a2 2117 flags |= extent_op->flags_to_set;
ef89b824
NB
2118 ret = alloc_reserved_file_extent(trans, parent, ref_root,
2119 flags, ref->objectid,
2120 ref->offset, &ins,
2121 node->ref_mod);
5d4f98a2 2122 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2590d0f1
NB
2123 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2124 ref->objectid, ref->offset,
2125 node->ref_mod, extent_op);
5d4f98a2 2126 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
e72cb923 2127 ret = __btrfs_free_extent(trans, node, parent,
5d4f98a2
YZ
2128 ref_root, ref->objectid,
2129 ref->offset, node->ref_mod,
c682f9b3 2130 extent_op);
5d4f98a2
YZ
2131 } else {
2132 BUG();
2133 }
2134 return ret;
2135}
2136
2137static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2138 struct extent_buffer *leaf,
2139 struct btrfs_extent_item *ei)
2140{
2141 u64 flags = btrfs_extent_flags(leaf, ei);
2142 if (extent_op->update_flags) {
2143 flags |= extent_op->flags_to_set;
2144 btrfs_set_extent_flags(leaf, ei, flags);
2145 }
2146
2147 if (extent_op->update_key) {
2148 struct btrfs_tree_block_info *bi;
2149 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2150 bi = (struct btrfs_tree_block_info *)(ei + 1);
2151 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2152 }
2153}
2154
2155static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
d278850e 2156 struct btrfs_delayed_ref_head *head,
5d4f98a2
YZ
2157 struct btrfs_delayed_extent_op *extent_op)
2158{
20b9a2d6 2159 struct btrfs_fs_info *fs_info = trans->fs_info;
5d4f98a2
YZ
2160 struct btrfs_key key;
2161 struct btrfs_path *path;
2162 struct btrfs_extent_item *ei;
2163 struct extent_buffer *leaf;
2164 u32 item_size;
56bec294 2165 int ret;
5d4f98a2 2166 int err = 0;
b1c79e09 2167 int metadata = !extent_op->is_data;
5d4f98a2 2168
79787eaa
JM
2169 if (trans->aborted)
2170 return 0;
2171
0b246afa 2172 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3173a18f
JB
2173 metadata = 0;
2174
5d4f98a2
YZ
2175 path = btrfs_alloc_path();
2176 if (!path)
2177 return -ENOMEM;
2178
d278850e 2179 key.objectid = head->bytenr;
5d4f98a2 2180
3173a18f 2181 if (metadata) {
3173a18f 2182 key.type = BTRFS_METADATA_ITEM_KEY;
b1c79e09 2183 key.offset = extent_op->level;
3173a18f
JB
2184 } else {
2185 key.type = BTRFS_EXTENT_ITEM_KEY;
d278850e 2186 key.offset = head->num_bytes;
3173a18f
JB
2187 }
2188
2189again:
e4058b54 2190 path->reada = READA_FORWARD;
5d4f98a2 2191 path->leave_spinning = 1;
0b246afa 2192 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
5d4f98a2
YZ
2193 if (ret < 0) {
2194 err = ret;
2195 goto out;
2196 }
2197 if (ret > 0) {
3173a18f 2198 if (metadata) {
55994887
FDBM
2199 if (path->slots[0] > 0) {
2200 path->slots[0]--;
2201 btrfs_item_key_to_cpu(path->nodes[0], &key,
2202 path->slots[0]);
d278850e 2203 if (key.objectid == head->bytenr &&
55994887 2204 key.type == BTRFS_EXTENT_ITEM_KEY &&
d278850e 2205 key.offset == head->num_bytes)
55994887
FDBM
2206 ret = 0;
2207 }
2208 if (ret > 0) {
2209 btrfs_release_path(path);
2210 metadata = 0;
3173a18f 2211
d278850e
JB
2212 key.objectid = head->bytenr;
2213 key.offset = head->num_bytes;
55994887
FDBM
2214 key.type = BTRFS_EXTENT_ITEM_KEY;
2215 goto again;
2216 }
2217 } else {
2218 err = -EIO;
2219 goto out;
3173a18f 2220 }
5d4f98a2
YZ
2221 }
2222
2223 leaf = path->nodes[0];
2224 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ba3c2b19 2225
6d8ff4e4 2226 if (unlikely(item_size < sizeof(*ei))) {
ba3c2b19
NB
2227 err = -EINVAL;
2228 btrfs_print_v0_err(fs_info);
2229 btrfs_abort_transaction(trans, err);
2230 goto out;
2231 }
2232
5d4f98a2
YZ
2233 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2234 __run_delayed_extent_op(extent_op, leaf, ei);
56bec294 2235
5d4f98a2
YZ
2236 btrfs_mark_buffer_dirty(leaf);
2237out:
2238 btrfs_free_path(path);
2239 return err;
56bec294
CM
2240}
2241
5d4f98a2 2242static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
2243 struct btrfs_delayed_ref_node *node,
2244 struct btrfs_delayed_extent_op *extent_op,
2245 int insert_reserved)
56bec294
CM
2246{
2247 int ret = 0;
5d4f98a2 2248 struct btrfs_delayed_tree_ref *ref;
5d4f98a2
YZ
2249 u64 parent = 0;
2250 u64 ref_root = 0;
56bec294 2251
5d4f98a2 2252 ref = btrfs_delayed_node_to_tree_ref(node);
f97806f2 2253 trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
599c75ec 2254
5d4f98a2
YZ
2255 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2256 parent = ref->parent;
fcebe456 2257 ref_root = ref->root;
5d4f98a2 2258
02794222 2259 if (node->ref_mod != 1) {
f97806f2 2260 btrfs_err(trans->fs_info,
02794222
LB
2261 "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2262 node->bytenr, node->ref_mod, node->action, ref_root,
2263 parent);
2264 return -EIO;
2265 }
5d4f98a2 2266 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
3173a18f 2267 BUG_ON(!extent_op || !extent_op->update_flags);
21ebfbe7 2268 ret = alloc_reserved_tree_block(trans, node, extent_op);
5d4f98a2 2269 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2590d0f1
NB
2270 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2271 ref->level, 0, 1, extent_op);
5d4f98a2 2272 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
e72cb923 2273 ret = __btrfs_free_extent(trans, node, parent, ref_root,
c682f9b3 2274 ref->level, 0, 1, extent_op);
5d4f98a2
YZ
2275 } else {
2276 BUG();
2277 }
56bec294
CM
2278 return ret;
2279}
2280
2281/* helper function to actually process a single delayed ref entry */
5d4f98a2 2282static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
2283 struct btrfs_delayed_ref_node *node,
2284 struct btrfs_delayed_extent_op *extent_op,
2285 int insert_reserved)
56bec294 2286{
79787eaa
JM
2287 int ret = 0;
2288
857cc2fc
JB
2289 if (trans->aborted) {
2290 if (insert_reserved)
5fac7f9e 2291 btrfs_pin_extent(trans->fs_info, node->bytenr,
857cc2fc 2292 node->num_bytes, 1);
79787eaa 2293 return 0;
857cc2fc 2294 }
79787eaa 2295
5d4f98a2
YZ
2296 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2297 node->type == BTRFS_SHARED_BLOCK_REF_KEY)
f97806f2 2298 ret = run_delayed_tree_ref(trans, node, extent_op,
5d4f98a2
YZ
2299 insert_reserved);
2300 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2301 node->type == BTRFS_SHARED_DATA_REF_KEY)
2bf98ef3 2302 ret = run_delayed_data_ref(trans, node, extent_op,
5d4f98a2
YZ
2303 insert_reserved);
2304 else
2305 BUG();
80ee54bf
JB
2306 if (ret && insert_reserved)
2307 btrfs_pin_extent(trans->fs_info, node->bytenr,
2308 node->num_bytes, 1);
5d4f98a2 2309 return ret;
56bec294
CM
2310}
2311
c6fc2454 2312static inline struct btrfs_delayed_ref_node *
56bec294
CM
2313select_delayed_ref(struct btrfs_delayed_ref_head *head)
2314{
cffc3374
FM
2315 struct btrfs_delayed_ref_node *ref;
2316
e3d03965 2317 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
c6fc2454 2318 return NULL;
d7df2c79 2319
cffc3374
FM
2320 /*
2321 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2322 * This is to prevent a ref count from going down to zero, which deletes
2323 * the extent item from the extent tree, when there still are references
2324 * to add, which would fail because they would not find the extent item.
2325 */
1d57ee94
WX
2326 if (!list_empty(&head->ref_add_list))
2327 return list_first_entry(&head->ref_add_list,
2328 struct btrfs_delayed_ref_node, add_list);
2329
e3d03965 2330 ref = rb_entry(rb_first_cached(&head->ref_tree),
0e0adbcf 2331 struct btrfs_delayed_ref_node, ref_node);
1d57ee94
WX
2332 ASSERT(list_empty(&ref->add_list));
2333 return ref;
56bec294
CM
2334}
2335
2eadaa22
JB
2336static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2337 struct btrfs_delayed_ref_head *head)
2338{
2339 spin_lock(&delayed_refs->lock);
2340 head->processing = 0;
2341 delayed_refs->num_heads_ready++;
2342 spin_unlock(&delayed_refs->lock);
2343 btrfs_delayed_ref_unlock(head);
2344}
2345
bedc6617
JB
2346static struct btrfs_delayed_extent_op *cleanup_extent_op(
2347 struct btrfs_delayed_ref_head *head)
b00e6250
JB
2348{
2349 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
b00e6250
JB
2350
2351 if (!extent_op)
bedc6617
JB
2352 return NULL;
2353
b00e6250 2354 if (head->must_insert_reserved) {
bedc6617 2355 head->extent_op = NULL;
b00e6250 2356 btrfs_free_delayed_extent_op(extent_op);
bedc6617 2357 return NULL;
b00e6250 2358 }
bedc6617
JB
2359 return extent_op;
2360}
2361
2362static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
2363 struct btrfs_delayed_ref_head *head)
2364{
2365 struct btrfs_delayed_extent_op *extent_op;
2366 int ret;
2367
2368 extent_op = cleanup_extent_op(head);
2369 if (!extent_op)
2370 return 0;
2371 head->extent_op = NULL;
b00e6250 2372 spin_unlock(&head->lock);
20b9a2d6 2373 ret = run_delayed_extent_op(trans, head, extent_op);
b00e6250
JB
2374 btrfs_free_delayed_extent_op(extent_op);
2375 return ret ? ret : 1;
2376}
2377
31890da0
JB
2378void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
2379 struct btrfs_delayed_ref_root *delayed_refs,
2380 struct btrfs_delayed_ref_head *head)
07c47775 2381{
ba2c4d4e 2382 int nr_items = 1; /* Dropping this ref head update. */
07c47775
JB
2383
2384 if (head->total_ref_mod < 0) {
2385 struct btrfs_space_info *space_info;
2386 u64 flags;
2387
2388 if (head->is_data)
2389 flags = BTRFS_BLOCK_GROUP_DATA;
2390 else if (head->is_system)
2391 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2392 else
2393 flags = BTRFS_BLOCK_GROUP_METADATA;
280c2908 2394 space_info = btrfs_find_space_info(fs_info, flags);
07c47775
JB
2395 ASSERT(space_info);
2396 percpu_counter_add_batch(&space_info->total_bytes_pinned,
2397 -head->num_bytes,
2398 BTRFS_TOTAL_BYTES_PINNED_BATCH);
2399
ba2c4d4e
JB
2400 /*
2401 * We had csum deletions accounted for in our delayed refs rsv,
2402 * we need to drop the csum leaves for this update from our
2403 * delayed_refs_rsv.
2404 */
07c47775
JB
2405 if (head->is_data) {
2406 spin_lock(&delayed_refs->lock);
2407 delayed_refs->pending_csums -= head->num_bytes;
2408 spin_unlock(&delayed_refs->lock);
ba2c4d4e
JB
2409 nr_items += btrfs_csum_bytes_to_leaves(fs_info,
2410 head->num_bytes);
07c47775
JB
2411 }
2412 }
2413
ba2c4d4e 2414 btrfs_delayed_refs_rsv_release(fs_info, nr_items);
07c47775
JB
2415}
2416
194ab0bc 2417static int cleanup_ref_head(struct btrfs_trans_handle *trans,
194ab0bc
JB
2418 struct btrfs_delayed_ref_head *head)
2419{
f9871edd
NB
2420
2421 struct btrfs_fs_info *fs_info = trans->fs_info;
194ab0bc
JB
2422 struct btrfs_delayed_ref_root *delayed_refs;
2423 int ret;
2424
2425 delayed_refs = &trans->transaction->delayed_refs;
2426
bedc6617 2427 ret = run_and_cleanup_extent_op(trans, head);
194ab0bc
JB
2428 if (ret < 0) {
2429 unselect_delayed_ref_head(delayed_refs, head);
2430 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2431 return ret;
2432 } else if (ret) {
2433 return ret;
2434 }
2435
2436 /*
2437 * Need to drop our head ref lock and re-acquire the delayed ref lock
2438 * and then re-check to make sure nobody got added.
2439 */
2440 spin_unlock(&head->lock);
2441 spin_lock(&delayed_refs->lock);
2442 spin_lock(&head->lock);
e3d03965 2443 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
194ab0bc
JB
2444 spin_unlock(&head->lock);
2445 spin_unlock(&delayed_refs->lock);
2446 return 1;
2447 }
d7baffda 2448 btrfs_delete_ref_head(delayed_refs, head);
c1103f7a 2449 spin_unlock(&head->lock);
1e7a1421 2450 spin_unlock(&delayed_refs->lock);
c1103f7a 2451
c1103f7a 2452 if (head->must_insert_reserved) {
d278850e
JB
2453 btrfs_pin_extent(fs_info, head->bytenr,
2454 head->num_bytes, 1);
c1103f7a 2455 if (head->is_data) {
d278850e
JB
2456 ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2457 head->num_bytes);
c1103f7a
JB
2458 }
2459 }
2460
31890da0 2461 btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
07c47775
JB
2462
2463 trace_run_delayed_ref_head(fs_info, head, 0);
c1103f7a 2464 btrfs_delayed_ref_unlock(head);
d278850e 2465 btrfs_put_delayed_ref_head(head);
194ab0bc
JB
2466 return 0;
2467}
2468
b1cdbcb5
NB
2469static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
2470 struct btrfs_trans_handle *trans)
2471{
2472 struct btrfs_delayed_ref_root *delayed_refs =
2473 &trans->transaction->delayed_refs;
2474 struct btrfs_delayed_ref_head *head = NULL;
2475 int ret;
2476
2477 spin_lock(&delayed_refs->lock);
5637c74b 2478 head = btrfs_select_ref_head(delayed_refs);
b1cdbcb5
NB
2479 if (!head) {
2480 spin_unlock(&delayed_refs->lock);
2481 return head;
2482 }
2483
2484 /*
2485 * Grab the lock that says we are going to process all the refs for
2486 * this head
2487 */
9e920a6f 2488 ret = btrfs_delayed_ref_lock(delayed_refs, head);
b1cdbcb5
NB
2489 spin_unlock(&delayed_refs->lock);
2490
2491 /*
2492 * We may have dropped the spin lock to get the head mutex lock, and
2493 * that might have given someone else time to free the head. If that's
2494 * true, it has been removed from our list and we can move on.
2495 */
2496 if (ret == -EAGAIN)
2497 head = ERR_PTR(-EAGAIN);
2498
2499 return head;
2500}
2501
e7261386
NB
2502static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
2503 struct btrfs_delayed_ref_head *locked_ref,
2504 unsigned long *run_refs)
2505{
2506 struct btrfs_fs_info *fs_info = trans->fs_info;
2507 struct btrfs_delayed_ref_root *delayed_refs;
2508 struct btrfs_delayed_extent_op *extent_op;
2509 struct btrfs_delayed_ref_node *ref;
2510 int must_insert_reserved = 0;
2511 int ret;
2512
2513 delayed_refs = &trans->transaction->delayed_refs;
2514
0110a4c4
NB
2515 lockdep_assert_held(&locked_ref->mutex);
2516 lockdep_assert_held(&locked_ref->lock);
2517
e7261386
NB
2518 while ((ref = select_delayed_ref(locked_ref))) {
2519 if (ref->seq &&
2520 btrfs_check_delayed_seq(fs_info, ref->seq)) {
2521 spin_unlock(&locked_ref->lock);
2522 unselect_delayed_ref_head(delayed_refs, locked_ref);
2523 return -EAGAIN;
2524 }
2525
2526 (*run_refs)++;
2527 ref->in_tree = 0;
2528 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
2529 RB_CLEAR_NODE(&ref->ref_node);
2530 if (!list_empty(&ref->add_list))
2531 list_del(&ref->add_list);
2532 /*
2533 * When we play the delayed ref, also correct the ref_mod on
2534 * head
2535 */
2536 switch (ref->action) {
2537 case BTRFS_ADD_DELAYED_REF:
2538 case BTRFS_ADD_DELAYED_EXTENT:
2539 locked_ref->ref_mod -= ref->ref_mod;
2540 break;
2541 case BTRFS_DROP_DELAYED_REF:
2542 locked_ref->ref_mod += ref->ref_mod;
2543 break;
2544 default:
2545 WARN_ON(1);
2546 }
2547 atomic_dec(&delayed_refs->num_entries);
2548
2549 /*
2550 * Record the must_insert_reserved flag before we drop the
2551 * spin lock.
2552 */
2553 must_insert_reserved = locked_ref->must_insert_reserved;
2554 locked_ref->must_insert_reserved = 0;
2555
2556 extent_op = locked_ref->extent_op;
2557 locked_ref->extent_op = NULL;
2558 spin_unlock(&locked_ref->lock);
2559
2560 ret = run_one_delayed_ref(trans, ref, extent_op,
2561 must_insert_reserved);
2562
2563 btrfs_free_delayed_extent_op(extent_op);
2564 if (ret) {
2565 unselect_delayed_ref_head(delayed_refs, locked_ref);
2566 btrfs_put_delayed_ref(ref);
2567 btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2568 ret);
2569 return ret;
2570 }
2571
2572 btrfs_put_delayed_ref(ref);
2573 cond_resched();
2574
2575 spin_lock(&locked_ref->lock);
2576 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2577 }
2578
2579 return 0;
2580}
2581
79787eaa
JM
2582/*
2583 * Returns 0 on success or if called with an already aborted transaction.
2584 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2585 */
d7df2c79 2586static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
d7df2c79 2587 unsigned long nr)
56bec294 2588{
0a1e458a 2589 struct btrfs_fs_info *fs_info = trans->fs_info;
56bec294 2590 struct btrfs_delayed_ref_root *delayed_refs;
56bec294 2591 struct btrfs_delayed_ref_head *locked_ref = NULL;
0a2b2a84 2592 ktime_t start = ktime_get();
56bec294 2593 int ret;
d7df2c79 2594 unsigned long count = 0;
0a2b2a84 2595 unsigned long actual_count = 0;
56bec294
CM
2596
2597 delayed_refs = &trans->transaction->delayed_refs;
0110a4c4 2598 do {
56bec294 2599 if (!locked_ref) {
b1cdbcb5 2600 locked_ref = btrfs_obtain_ref_head(trans);
0110a4c4
NB
2601 if (IS_ERR_OR_NULL(locked_ref)) {
2602 if (PTR_ERR(locked_ref) == -EAGAIN) {
2603 continue;
2604 } else {
2605 break;
2606 }
56bec294 2607 }
0110a4c4 2608 count++;
56bec294 2609 }
2c3cf7d5
FM
2610 /*
2611 * We need to try and merge add/drops of the same ref since we
2612 * can run into issues with relocate dropping the implicit ref
2613 * and then it being added back again before the drop can
2614 * finish. If we merged anything we need to re-loop so we can
2615 * get a good ref.
2616 * Or we can get node references of the same type that weren't
2617 * merged when created due to bumps in the tree mod seq, and
2618 * we need to merge them to prevent adding an inline extent
2619 * backref before dropping it (triggering a BUG_ON at
2620 * insert_inline_extent_backref()).
2621 */
d7df2c79 2622 spin_lock(&locked_ref->lock);
be97f133 2623 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
ae1e206b 2624
0110a4c4
NB
2625 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2626 &actual_count);
2627 if (ret < 0 && ret != -EAGAIN) {
2628 /*
2629 * Error, btrfs_run_delayed_refs_for_head already
2630 * unlocked everything so just bail out
2631 */
2632 return ret;
2633 } else if (!ret) {
2634 /*
2635 * Success, perform the usual cleanup of a processed
2636 * head
2637 */
f9871edd 2638 ret = cleanup_ref_head(trans, locked_ref);
194ab0bc 2639 if (ret > 0) {
b00e6250
JB
2640 /* We dropped our lock, we need to loop. */
2641 ret = 0;
d7df2c79 2642 continue;
194ab0bc
JB
2643 } else if (ret) {
2644 return ret;
5d4f98a2 2645 }
22cd2e7d 2646 }
1ce7a5ec 2647
b00e6250 2648 /*
0110a4c4
NB
2649 * Either success case or btrfs_run_delayed_refs_for_head
2650 * returned -EAGAIN, meaning we need to select another head
b00e6250 2651 */
b00e6250 2652
0110a4c4 2653 locked_ref = NULL;
c3e69d58 2654 cond_resched();
0110a4c4 2655 } while ((nr != -1 && count < nr) || locked_ref);
0a2b2a84
JB
2656
2657 /*
2658 * We don't want to include ref heads since we can have empty ref heads
2659 * and those will drastically skew our runtime down since we just do
2660 * accounting, no actual extent tree updates.
2661 */
2662 if (actual_count > 0) {
2663 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2664 u64 avg;
2665
2666 /*
2667 * We weigh the current average higher than our current runtime
2668 * to avoid large swings in the average.
2669 */
2670 spin_lock(&delayed_refs->lock);
2671 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
f8c269d7 2672 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
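 /*
  * Illustration with made-up numbers: a previous average of 80000ns and a
  * runtime of 40000ns give (3 * 80000 + 40000) >> 2 = 70000ns, i.e. a
  * moving average weighted 3/4 towards the old value.
  */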
0a2b2a84
JB
2673 spin_unlock(&delayed_refs->lock);
2674 }
d7df2c79 2675 return 0;
c3e69d58
CM
2676}
2677
709c0486
AJ
2678#ifdef SCRAMBLE_DELAYED_REFS
2679/*
2680 * Normally delayed refs get processed in ascending bytenr order. This
2681 * correlates in most cases to the order added. To expose dependencies on this
2682 * order, we start to process the tree in the middle instead of the beginning
2683 */
2684static u64 find_middle(struct rb_root *root)
2685{
2686 struct rb_node *n = root->rb_node;
2687 struct btrfs_delayed_ref_node *entry;
2688 int alt = 1;
2689 u64 middle;
2690 u64 first = 0, last = 0;
2691
2692 n = rb_first(root);
2693 if (n) {
2694 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2695 first = entry->bytenr;
2696 }
2697 n = rb_last(root);
2698 if (n) {
2699 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2700 last = entry->bytenr;
2701 }
2702 n = root->rb_node;
2703
2704 while (n) {
2705 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2706 WARN_ON(!entry->in_tree);
2707
2708 middle = entry->bytenr;
2709
2710 if (alt)
2711 n = n->rb_left;
2712 else
2713 n = n->rb_right;
2714
2715 alt = 1 - alt;
2716 }
2717 return middle;
2718}
2719#endif
2720
2ff7e61e 2721static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
1be41b78
JB
2722{
2723 u64 num_bytes;
2724
2725 num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2726 sizeof(struct btrfs_extent_inline_ref));
0b246afa 2727 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1be41b78
JB
2728 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2729
2730 /*
2731 * We don't ever fill up leaves all the way so multiply by 2 just to be
01327610 2732 * closer to what we're really going to want to use.
1be41b78 2733 */
0b246afa 2734 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
1be41b78
JB
2735}
2736
1262133b
JB
2737/*
2738 * Takes the number of bytes to be checksummed and figures out how many leaves it
2739 * would require to store the csums for that many bytes.
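 *
 * Worked example with assumed defaults (4KiB sectors, 4-byte crc32c csums,
 * roughly 16KiB of usable item space per leaf): checksumming 1GiB takes
 * 262144 csums, about 4064 csums fit per leaf, so this rounds up to 65 leaves.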
2740 */
2ff7e61e 2741u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
1262133b
JB
2742{
2743 u64 csum_size;
2744 u64 num_csums_per_leaf;
2745 u64 num_csums;
2746
0b246afa 2747 csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
1262133b 2748 num_csums_per_leaf = div64_u64(csum_size,
0b246afa
JM
2749 (u64)btrfs_super_csum_size(fs_info->super_copy));
2750 num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
1262133b
JB
2751 num_csums += num_csums_per_leaf - 1;
2752 num_csums = div64_u64(num_csums, num_csums_per_leaf);
2753 return num_csums;
2754}
2755
64403612 2756bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
1be41b78 2757{
64403612
JB
2758 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
2759 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2760 bool ret = false;
2761 u64 reserved;
1be41b78 2762
64403612
JB
2763 spin_lock(&global_rsv->lock);
2764 reserved = global_rsv->reserved;
2765 spin_unlock(&global_rsv->lock);
1be41b78
JB
2766
2767 /*
64403612
JB
2768 * Since the global reserve is not guaranteed to be available we don't
2769 * really want to rely on it to cover us, so if our size is more than the
2770 * delayed_refs_rsv and the global rsv then it's time to think about
2771 * bailing.
1be41b78 2772 */
64403612
JB
2773 spin_lock(&delayed_refs_rsv->lock);
2774 reserved += delayed_refs_rsv->reserved;
2775 if (delayed_refs_rsv->size >= reserved)
2776 ret = true;
2777 spin_unlock(&delayed_refs_rsv->lock);
1be41b78
JB
2778 return ret;
2779}
2780
7c861627 2781int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
0a2b2a84 2782{
0a2b2a84
JB
2783 u64 num_entries =
2784 atomic_read(&trans->transaction->delayed_refs.num_entries);
2785 u64 avg_runtime;
a79b7d4b 2786 u64 val;
0a2b2a84
JB
2787
2788 smp_mb();
7c861627 2789 avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
a79b7d4b 2790 val = num_entries * avg_runtime;
dc1a90c6 2791 if (val >= NSEC_PER_SEC)
0a2b2a84 2792 return 1;
a79b7d4b
CM
2793 if (val >= NSEC_PER_SEC / 2)
2794 return 2;
0a2b2a84 2795
64403612 2796 return btrfs_check_space_for_delayed_refs(trans->fs_info);
0a2b2a84
JB
2797}
2798
c3e69d58
CM
2799/*
2800 * this starts processing the delayed reference count updates and
2801 * extent insertions we have queued up so far. count can be
2802 * 0, which means to process everything in the tree at the start
2803 * of the run (but not newly added entries), or it can be some target
2804 * number you'd like to process.
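 *
 * For illustration (hypothetical call sites, not from this file):
 *
 *	btrfs_run_delayed_refs(trans, 0);
 *		processes roughly twice the number of currently queued entries
 *	btrfs_run_delayed_refs(trans, (unsigned long)-1);
 *		keeps going until every queued ref head has been run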
79787eaa
JM
2805 *
2806 * Returns 0 on success or if called with an aborted transaction
2807 * Returns <0 on error and aborts the transaction
c3e69d58
CM
2808 */
2809int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
c79a70b1 2810 unsigned long count)
c3e69d58 2811{
c79a70b1 2812 struct btrfs_fs_info *fs_info = trans->fs_info;
c3e69d58
CM
2813 struct rb_node *node;
2814 struct btrfs_delayed_ref_root *delayed_refs;
c46effa6 2815 struct btrfs_delayed_ref_head *head;
c3e69d58
CM
2816 int ret;
2817 int run_all = count == (unsigned long)-1;
c3e69d58 2818
79787eaa
JM
2819 /* We'll clean this up in btrfs_cleanup_transaction */
2820 if (trans->aborted)
2821 return 0;
2822
0b246afa 2823 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
511711af
CM
2824 return 0;
2825
c3e69d58 2826 delayed_refs = &trans->transaction->delayed_refs;
26455d33 2827 if (count == 0)
d7df2c79 2828 count = atomic_read(&delayed_refs->num_entries) * 2;
bb721703 2829
c3e69d58 2830again:
709c0486
AJ
2831#ifdef SCRAMBLE_DELAYED_REFS
2832 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2833#endif
0a1e458a 2834 ret = __btrfs_run_delayed_refs(trans, count);
d7df2c79 2835 if (ret < 0) {
66642832 2836 btrfs_abort_transaction(trans, ret);
d7df2c79 2837 return ret;
eb099670 2838 }
c3e69d58 2839
56bec294 2840 if (run_all) {
119e80df 2841 btrfs_create_pending_block_groups(trans);
ea658bad 2842
d7df2c79 2843 spin_lock(&delayed_refs->lock);
5c9d028b 2844 node = rb_first_cached(&delayed_refs->href_root);
d7df2c79
JB
2845 if (!node) {
2846 spin_unlock(&delayed_refs->lock);
56bec294 2847 goto out;
d7df2c79 2848 }
d278850e
JB
2849 head = rb_entry(node, struct btrfs_delayed_ref_head,
2850 href_node);
2851 refcount_inc(&head->refs);
2852 spin_unlock(&delayed_refs->lock);
e9d0b13b 2853
d278850e
JB
2854 /* Mutex was contended, block until it's released and retry. */
2855 mutex_lock(&head->mutex);
2856 mutex_unlock(&head->mutex);
56bec294 2857
d278850e 2858 btrfs_put_delayed_ref_head(head);
d7df2c79 2859 cond_resched();
56bec294 2860 goto again;
5f39d397 2861 }
54aa1f4d 2862out:
a28ec197
CM
2863 return 0;
2864}
2865
5d4f98a2 2866int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
5d4f98a2 2867 u64 bytenr, u64 num_bytes, u64 flags,
b1c79e09 2868 int level, int is_data)
5d4f98a2
YZ
2869{
2870 struct btrfs_delayed_extent_op *extent_op;
2871 int ret;
2872
78a6184a 2873 extent_op = btrfs_alloc_delayed_extent_op();
5d4f98a2
YZ
2874 if (!extent_op)
2875 return -ENOMEM;
2876
2877 extent_op->flags_to_set = flags;
35b3ad50
DS
2878 extent_op->update_flags = true;
2879 extent_op->update_key = false;
2880 extent_op->is_data = is_data ? true : false;
b1c79e09 2881 extent_op->level = level;
5d4f98a2 2882
c6e340bc 2883 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
5d4f98a2 2884 if (ret)
78a6184a 2885 btrfs_free_delayed_extent_op(extent_op);
5d4f98a2
YZ
2886 return ret;
2887}
2888
e4c3b2dc 2889static noinline int check_delayed_ref(struct btrfs_root *root,
5d4f98a2
YZ
2890 struct btrfs_path *path,
2891 u64 objectid, u64 offset, u64 bytenr)
2892{
2893 struct btrfs_delayed_ref_head *head;
2894 struct btrfs_delayed_ref_node *ref;
2895 struct btrfs_delayed_data_ref *data_ref;
2896 struct btrfs_delayed_ref_root *delayed_refs;
e4c3b2dc 2897 struct btrfs_transaction *cur_trans;
0e0adbcf 2898 struct rb_node *node;
5d4f98a2
YZ
2899 int ret = 0;
2900
998ac6d2 2901 spin_lock(&root->fs_info->trans_lock);
e4c3b2dc 2902 cur_trans = root->fs_info->running_transaction;
998ac6d2 2903 if (cur_trans)
2904 refcount_inc(&cur_trans->use_count);
2905 spin_unlock(&root->fs_info->trans_lock);
e4c3b2dc
LB
2906 if (!cur_trans)
2907 return 0;
2908
2909 delayed_refs = &cur_trans->delayed_refs;
5d4f98a2 2910 spin_lock(&delayed_refs->lock);
f72ad18e 2911 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
d7df2c79
JB
2912 if (!head) {
2913 spin_unlock(&delayed_refs->lock);
998ac6d2 2914 btrfs_put_transaction(cur_trans);
d7df2c79
JB
2915 return 0;
2916 }
5d4f98a2
YZ
2917
2918 if (!mutex_trylock(&head->mutex)) {
d278850e 2919 refcount_inc(&head->refs);
5d4f98a2
YZ
2920 spin_unlock(&delayed_refs->lock);
2921
b3b4aa74 2922 btrfs_release_path(path);
5d4f98a2 2923
8cc33e5c
DS
2924 /*
2925 * Mutex was contended, block until it's released and let
2926 * caller try again
2927 */
5d4f98a2
YZ
2928 mutex_lock(&head->mutex);
2929 mutex_unlock(&head->mutex);
d278850e 2930 btrfs_put_delayed_ref_head(head);
998ac6d2 2931 btrfs_put_transaction(cur_trans);
5d4f98a2
YZ
2932 return -EAGAIN;
2933 }
d7df2c79 2934 spin_unlock(&delayed_refs->lock);
5d4f98a2 2935
d7df2c79 2936 spin_lock(&head->lock);
0e0adbcf
JB
2937 /*
2938 * XXX: We should replace this with a proper search function in the
2939 * future.
2940 */
e3d03965
LB
2941 for (node = rb_first_cached(&head->ref_tree); node;
2942 node = rb_next(node)) {
0e0adbcf 2943 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
d7df2c79
JB
2944 /* If it's a shared ref we know a cross reference exists */
2945 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2946 ret = 1;
2947 break;
2948 }
5d4f98a2 2949
d7df2c79 2950 data_ref = btrfs_delayed_node_to_data_ref(ref);
5d4f98a2 2951
d7df2c79
JB
2952 /*
2953 * If our ref doesn't match the one we're currently looking at
2954 * then we have a cross reference.
2955 */
2956 if (data_ref->root != root->root_key.objectid ||
2957 data_ref->objectid != objectid ||
2958 data_ref->offset != offset) {
2959 ret = 1;
2960 break;
2961 }
5d4f98a2 2962 }
d7df2c79 2963 spin_unlock(&head->lock);
5d4f98a2 2964 mutex_unlock(&head->mutex);
998ac6d2 2965 btrfs_put_transaction(cur_trans);
5d4f98a2
YZ
2966 return ret;
2967}
2968
e4c3b2dc 2969static noinline int check_committed_ref(struct btrfs_root *root,
5d4f98a2
YZ
2970 struct btrfs_path *path,
2971 u64 objectid, u64 offset, u64 bytenr)
be20aa9d 2972{
0b246afa
JM
2973 struct btrfs_fs_info *fs_info = root->fs_info;
2974 struct btrfs_root *extent_root = fs_info->extent_root;
f321e491 2975 struct extent_buffer *leaf;
5d4f98a2
YZ
2976 struct btrfs_extent_data_ref *ref;
2977 struct btrfs_extent_inline_ref *iref;
2978 struct btrfs_extent_item *ei;
f321e491 2979 struct btrfs_key key;
5d4f98a2 2980 u32 item_size;
3de28d57 2981 int type;
be20aa9d 2982 int ret;
925baedd 2983
be20aa9d 2984 key.objectid = bytenr;
31840ae1 2985 key.offset = (u64)-1;
f321e491 2986 key.type = BTRFS_EXTENT_ITEM_KEY;
be20aa9d 2987
be20aa9d
CM
2988 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2989 if (ret < 0)
2990 goto out;
79787eaa 2991 BUG_ON(ret == 0); /* Corruption */
80ff3856
YZ
2992
2993 ret = -ENOENT;
2994 if (path->slots[0] == 0)
31840ae1 2995 goto out;
be20aa9d 2996
31840ae1 2997 path->slots[0]--;
f321e491 2998 leaf = path->nodes[0];
5d4f98a2 2999 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
be20aa9d 3000
5d4f98a2 3001 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
be20aa9d 3002 goto out;
f321e491 3003
5d4f98a2
YZ
3004 ret = 1;
3005 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
5d4f98a2 3006 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
bd09835d 3007
5d4f98a2
YZ
3008 if (item_size != sizeof(*ei) +
3009 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3010 goto out;
be20aa9d 3011
5d4f98a2
YZ
3012 if (btrfs_extent_generation(leaf, ei) <=
3013 btrfs_root_last_snapshot(&root->root_item))
3014 goto out;
3015
3016 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3de28d57
LB
3017
3018 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
3019 if (type != BTRFS_EXTENT_DATA_REF_KEY)
5d4f98a2
YZ
3020 goto out;
3021
3022 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3023 if (btrfs_extent_refs(leaf, ei) !=
3024 btrfs_extent_data_ref_count(leaf, ref) ||
3025 btrfs_extent_data_ref_root(leaf, ref) !=
3026 root->root_key.objectid ||
3027 btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3028 btrfs_extent_data_ref_offset(leaf, ref) != offset)
3029 goto out;
3030
3031 ret = 0;
3032out:
3033 return ret;
3034}
3035
e4c3b2dc
LB
3036int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3037 u64 bytenr)
5d4f98a2
YZ
3038{
3039 struct btrfs_path *path;
3040 int ret;
5d4f98a2
YZ
3041
3042 path = btrfs_alloc_path();
3043 if (!path)
9132c4ff 3044 return -ENOMEM;
5d4f98a2
YZ
3045
3046 do {
e4c3b2dc 3047 ret = check_committed_ref(root, path, objectid,
5d4f98a2
YZ
3048 offset, bytenr);
3049 if (ret && ret != -ENOENT)
f321e491 3050 goto out;
80ff3856 3051
380fd066
MT
3052 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
3053 } while (ret == -EAGAIN);
5d4f98a2 3054
be20aa9d 3055out:
80ff3856 3056 btrfs_free_path(path);
f0486c68
YZ
3057 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3058 WARN_ON(ret > 0);
f321e491 3059 return ret;
be20aa9d 3060}
c5739bba 3061
5d4f98a2 3062static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
b7a9f29f 3063 struct btrfs_root *root,
5d4f98a2 3064 struct extent_buffer *buf,
e339a6b0 3065 int full_backref, int inc)
31840ae1 3066{
0b246afa 3067 struct btrfs_fs_info *fs_info = root->fs_info;
31840ae1 3068 u64 bytenr;
5d4f98a2
YZ
3069 u64 num_bytes;
3070 u64 parent;
31840ae1 3071 u64 ref_root;
31840ae1 3072 u32 nritems;
31840ae1
ZY
3073 struct btrfs_key key;
3074 struct btrfs_file_extent_item *fi;
82fa113f
QW
3075 struct btrfs_ref generic_ref = { 0 };
3076 bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
31840ae1 3077 int i;
82fa113f 3078 int action;
31840ae1
ZY
3079 int level;
3080 int ret = 0;
fccb84c9 3081
0b246afa 3082 if (btrfs_is_testing(fs_info))
faa2dbf0 3083 return 0;
fccb84c9 3084
31840ae1 3085 ref_root = btrfs_header_owner(buf);
31840ae1
ZY
3086 nritems = btrfs_header_nritems(buf);
3087 level = btrfs_header_level(buf);
3088
27cdeb70 3089 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
5d4f98a2 3090 return 0;
31840ae1 3091
5d4f98a2
YZ
3092 if (full_backref)
3093 parent = buf->start;
3094 else
3095 parent = 0;
82fa113f
QW
3096 if (inc)
3097 action = BTRFS_ADD_DELAYED_REF;
3098 else
3099 action = BTRFS_DROP_DELAYED_REF;
5d4f98a2
YZ
3100
3101 for (i = 0; i < nritems; i++) {
31840ae1 3102 if (level == 0) {
5d4f98a2 3103 btrfs_item_key_to_cpu(buf, &key, i);
962a298f 3104 if (key.type != BTRFS_EXTENT_DATA_KEY)
31840ae1 3105 continue;
5d4f98a2 3106 fi = btrfs_item_ptr(buf, i,
31840ae1
ZY
3107 struct btrfs_file_extent_item);
3108 if (btrfs_file_extent_type(buf, fi) ==
3109 BTRFS_FILE_EXTENT_INLINE)
3110 continue;
3111 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3112 if (bytenr == 0)
3113 continue;
5d4f98a2
YZ
3114
3115 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3116 key.offset -= btrfs_file_extent_offset(buf, fi);
82fa113f
QW
3117 btrfs_init_generic_ref(&generic_ref, action, bytenr,
3118 num_bytes, parent);
3119 generic_ref.real_root = root->root_key.objectid;
3120 btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
3121 key.offset);
3122 generic_ref.skip_qgroup = for_reloc;
dd28b6a5 3123 if (inc)
82fa113f 3124 ret = btrfs_inc_extent_ref(trans, &generic_ref);
dd28b6a5 3125 else
ffd4bb2a 3126 ret = btrfs_free_extent(trans, &generic_ref);
31840ae1
ZY
3127 if (ret)
3128 goto fail;
3129 } else {
5d4f98a2 3130 bytenr = btrfs_node_blockptr(buf, i);
0b246afa 3131 num_bytes = fs_info->nodesize;
82fa113f
QW
3132 btrfs_init_generic_ref(&generic_ref, action, bytenr,
3133 num_bytes, parent);
3134 generic_ref.real_root = root->root_key.objectid;
3135 btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
3136 generic_ref.skip_qgroup = for_reloc;
dd28b6a5 3137 if (inc)
82fa113f 3138 ret = btrfs_inc_extent_ref(trans, &generic_ref);
dd28b6a5 3139 else
ffd4bb2a 3140 ret = btrfs_free_extent(trans, &generic_ref);
31840ae1
ZY
3141 if (ret)
3142 goto fail;
3143 }
3144 }
3145 return 0;
3146fail:
5d4f98a2
YZ
3147 return ret;
3148}
3149
3150int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
e339a6b0 3151 struct extent_buffer *buf, int full_backref)
5d4f98a2 3152{
e339a6b0 3153 return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
5d4f98a2
YZ
3154}
3155
3156int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
e339a6b0 3157 struct extent_buffer *buf, int full_backref)
5d4f98a2 3158{
e339a6b0 3159 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
31840ae1
ZY
3160}
3161
9078a3e1 3162static int write_one_cache_group(struct btrfs_trans_handle *trans,
9078a3e1
CM
3163 struct btrfs_path *path,
3164 struct btrfs_block_group_cache *cache)
3165{
39db232d 3166 struct btrfs_fs_info *fs_info = trans->fs_info;
9078a3e1 3167 int ret;
0b246afa 3168 struct btrfs_root *extent_root = fs_info->extent_root;
5f39d397
CM
3169 unsigned long bi;
3170 struct extent_buffer *leaf;
9078a3e1 3171
9078a3e1 3172 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
df95e7f0
JB
3173 if (ret) {
3174 if (ret > 0)
3175 ret = -ENOENT;
54aa1f4d 3176 goto fail;
df95e7f0 3177 }
5f39d397
CM
3178
3179 leaf = path->nodes[0];
3180 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3181 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3182 btrfs_mark_buffer_dirty(leaf);
54aa1f4d 3183fail:
24b89d08 3184 btrfs_release_path(path);
df95e7f0 3185 return ret;
9078a3e1
CM
3186
3187}
3188
f87b7eb8
DS
3189static struct btrfs_block_group_cache *next_block_group(
3190 struct btrfs_block_group_cache *cache)
4a8c9a62 3191{
f87b7eb8 3192 struct btrfs_fs_info *fs_info = cache->fs_info;
4a8c9a62 3193 struct rb_node *node;
292cbd51 3194
0b246afa 3195 spin_lock(&fs_info->block_group_cache_lock);
292cbd51
FM
3196
3197 /* If our block group was removed, we need a full search. */
3198 if (RB_EMPTY_NODE(&cache->cache_node)) {
3199 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3200
0b246afa 3201 spin_unlock(&fs_info->block_group_cache_lock);
292cbd51 3202 btrfs_put_block_group(cache);
0b246afa 3203 return btrfs_lookup_first_block_group(fs_info, next_bytenr);
292cbd51 3204 }
4a8c9a62
YZ
3205 node = rb_next(&cache->cache_node);
3206 btrfs_put_block_group(cache);
3207 if (node) {
3208 cache = rb_entry(node, struct btrfs_block_group_cache,
3209 cache_node);
11dfe35a 3210 btrfs_get_block_group(cache);
4a8c9a62
YZ
3211 } else
3212 cache = NULL;
0b246afa 3213 spin_unlock(&fs_info->block_group_cache_lock);
4a8c9a62
YZ
3214 return cache;
3215}
3216
0af3d00b
JB
3217static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3218 struct btrfs_trans_handle *trans,
3219 struct btrfs_path *path)
3220{
0b246afa
JM
3221 struct btrfs_fs_info *fs_info = block_group->fs_info;
3222 struct btrfs_root *root = fs_info->tree_root;
0af3d00b 3223 struct inode *inode = NULL;
364ecf36 3224 struct extent_changeset *data_reserved = NULL;
0af3d00b 3225 u64 alloc_hint = 0;
2b20982e 3226 int dcs = BTRFS_DC_ERROR;
f8c269d7 3227 u64 num_pages = 0;
0af3d00b
JB
3228 int retries = 0;
3229 int ret = 0;
3230
3231 /*
3232 * If this block group is smaller than 100 megs don't bother caching the
3233 * block group.
3234 */
ee22184b 3235 if (block_group->key.offset < (100 * SZ_1M)) {
0af3d00b
JB
3236 spin_lock(&block_group->lock);
3237 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3238 spin_unlock(&block_group->lock);
3239 return 0;
3240 }
3241
0c0ef4bc
JB
3242 if (trans->aborted)
3243 return 0;
0af3d00b 3244again:
7949f339 3245 inode = lookup_free_space_inode(block_group, path);
0af3d00b
JB
3246 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3247 ret = PTR_ERR(inode);
b3b4aa74 3248 btrfs_release_path(path);
0af3d00b
JB
3249 goto out;
3250 }
3251
3252 if (IS_ERR(inode)) {
3253 BUG_ON(retries);
3254 retries++;
3255
3256 if (block_group->ro)
3257 goto out_free;
3258
4ca75f1b 3259 ret = create_free_space_inode(trans, block_group, path);
0af3d00b
JB
3260 if (ret)
3261 goto out_free;
3262 goto again;
3263 }
3264
3265 /*
3266 * We want to set the generation to 0, that way if anything goes wrong
3267 * from here on out we know not to trust this cache when we load up next
3268 * time.
3269 */
3270 BTRFS_I(inode)->generation = 0;
3271 ret = btrfs_update_inode(trans, root, inode);
0c0ef4bc
JB
3272 if (ret) {
3273 /*
3274 * So theoretically we could recover from this, simply set the
3275 * super cache generation to 0 so we know to invalidate the
3276 * cache, but then we'd have to keep track of the block groups
3277 * that fail this way so we know we _have_ to reset this cache
3278 * before the next commit or risk reading stale cache. So to
3279 * limit our exposure to horrible edge cases let's just abort the
3280 * transaction, this only happens in really bad situations
3281 * anyway.
3282 */
66642832 3283 btrfs_abort_transaction(trans, ret);
0c0ef4bc
JB
3284 goto out_put;
3285 }
0af3d00b
JB
3286 WARN_ON(ret);
3287
8e138e0d
JB
3288 /* We've already setup this transaction, go ahead and exit */
3289 if (block_group->cache_generation == trans->transid &&
3290 i_size_read(inode)) {
3291 dcs = BTRFS_DC_SETUP;
3292 goto out_put;
3293 }
3294
0af3d00b 3295 if (i_size_read(inode) > 0) {
2ff7e61e 3296 ret = btrfs_check_trunc_cache_free_space(fs_info,
0b246afa 3297 &fs_info->global_block_rsv);
7b61cd92
MX
3298 if (ret)
3299 goto out_put;
3300
77ab86bf 3301 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
0af3d00b
JB
3302 if (ret)
3303 goto out_put;
3304 }
3305
3306 spin_lock(&block_group->lock);
cf7c1ef6 3307 if (block_group->cached != BTRFS_CACHE_FINISHED ||
0b246afa 3308 !btrfs_test_opt(fs_info, SPACE_CACHE)) {
cf7c1ef6
LB
3309 /*
3310 * don't bother trying to write stuff out _if_
3311 * a) we're not cached,
1a79c1f2
LB
3312 * b) we're mounted with the nospace_cache option,
3313 * c) we're using the v2 space_cache (FREE_SPACE_TREE).
cf7c1ef6 3314 */
2b20982e 3315 dcs = BTRFS_DC_WRITTEN;
0af3d00b
JB
3316 spin_unlock(&block_group->lock);
3317 goto out_put;
3318 }
3319 spin_unlock(&block_group->lock);
3320
2968b1f4
JB
3321 /*
3322 * We hit an ENOSPC when setting up the cache in this transaction, just
3323 * skip doing the setup, we've already cleared the cache so we're safe.
3324 */
3325 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3326 ret = -ENOSPC;
3327 goto out_put;
3328 }
3329
6fc823b1
JB
3330 /*
3331 * Try to preallocate enough space based on how big the block group is.
3332 * Keep in mind this has to include any pinned space which could end up
3333 * taking up quite a bit since it's not folded into the other space
3334 * cache.
3335 */
ee22184b 3336 num_pages = div_u64(block_group->key.offset, SZ_256M);
0af3d00b
JB
3337 if (!num_pages)
3338 num_pages = 1;
3339
0af3d00b 3340 num_pages *= 16;
09cbfeaf 3341 num_pages *= PAGE_SIZE;
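 /*
  * Illustrative arithmetic (assuming 4KiB pages): a 1GiB block group gives
  * div_u64(1GiB, 256MiB) = 4, then 4 * 16 pages * 4096 bytes = 256KiB
  * preallocated for the free space cache file.
  */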
0af3d00b 3342
364ecf36 3343 ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
0af3d00b
JB
3344 if (ret)
3345 goto out_put;
3346
3347 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3348 num_pages, num_pages,
3349 &alloc_hint);
2968b1f4
JB
3350 /*
3351 * Our cache requires contiguous chunks so that we don't modify a bunch
3352 * of metadata or split extents when writing the cache out, which means
3353 * we can hit ENOSPC if we are heavily fragmented in addition to just normal
3354 * out of space conditions. So if we hit this just skip setting up any
3355 * other block groups for this transaction, maybe we'll unpin enough
3356 * space the next time around.
3357 */
2b20982e
JB
3358 if (!ret)
3359 dcs = BTRFS_DC_SETUP;
2968b1f4
JB
3360 else if (ret == -ENOSPC)
3361 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
c09544e0 3362
0af3d00b
JB
3363out_put:
3364 iput(inode);
3365out_free:
b3b4aa74 3366 btrfs_release_path(path);
0af3d00b
JB
3367out:
3368 spin_lock(&block_group->lock);
e65cbb94 3369 if (!ret && dcs == BTRFS_DC_SETUP)
5b0e95bf 3370 block_group->cache_generation = trans->transid;
2b20982e 3371 block_group->disk_cache_state = dcs;
0af3d00b
JB
3372 spin_unlock(&block_group->lock);
3373
364ecf36 3374 extent_changeset_free(data_reserved);
0af3d00b
JB
3375 return ret;
3376}
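/*
 * Summary of the outcomes above (editorial note): dcs ends up as
 * BTRFS_DC_WRITTEN when the group isn't cached yet or the v1 space cache is
 * not in use, and as BTRFS_DC_SETUP when the cache inode was already set up
 * in this transaction or the preallocation succeeded; a preallocation ENOSPC
 * additionally sets BTRFS_TRANS_CACHE_ENOSPC so later groups skip the setup.
 */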
3377
bbebb3e0 3378int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
dcdf7f6d 3379{
bbebb3e0 3380 struct btrfs_fs_info *fs_info = trans->fs_info;
dcdf7f6d
JB
3381 struct btrfs_block_group_cache *cache, *tmp;
3382 struct btrfs_transaction *cur_trans = trans->transaction;
3383 struct btrfs_path *path;
3384
3385 if (list_empty(&cur_trans->dirty_bgs) ||
0b246afa 3386 !btrfs_test_opt(fs_info, SPACE_CACHE))
dcdf7f6d
JB
3387 return 0;
3388
3389 path = btrfs_alloc_path();
3390 if (!path)
3391 return -ENOMEM;
3392
3393 /* Could add new block groups, use _safe just in case */
3394 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3395 dirty_list) {
3396 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3397 cache_save_setup(cache, trans, path);
3398 }
3399
3400 btrfs_free_path(path);
3401 return 0;
3402}
3403
1bbc621e
CM
3404/*
3405 * transaction commit does final block group cache writeback during a
3406 * critical section where nothing is allowed to change the FS. This is
3407 * required in order for the cache to actually match the block group,
3408 * but can introduce a lot of latency into the commit.
3409 *
3410 * So, btrfs_start_dirty_block_groups is here to kick off block group
3411 * cache IO. There's a chance we'll have to redo some of it if the
3412 * block group changes again during the commit, but it greatly reduces
3413 * the commit latency by getting rid of the easy block groups while
3414 * we're still allowing others to join the commit.
3415 */
21217054 3416int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
9078a3e1 3417{
21217054 3418 struct btrfs_fs_info *fs_info = trans->fs_info;
4a8c9a62 3419 struct btrfs_block_group_cache *cache;
ce93ec54
JB
3420 struct btrfs_transaction *cur_trans = trans->transaction;
3421 int ret = 0;
c9dc4c65 3422 int should_put;
1bbc621e
CM
3423 struct btrfs_path *path = NULL;
3424 LIST_HEAD(dirty);
3425 struct list_head *io = &cur_trans->io_bgs;
c9dc4c65 3426 int num_started = 0;
1bbc621e
CM
3427 int loops = 0;
3428
3429 spin_lock(&cur_trans->dirty_bgs_lock);
b58d1a9e
FM
3430 if (list_empty(&cur_trans->dirty_bgs)) {
3431 spin_unlock(&cur_trans->dirty_bgs_lock);
3432 return 0;
1bbc621e 3433 }
b58d1a9e 3434 list_splice_init(&cur_trans->dirty_bgs, &dirty);
1bbc621e 3435 spin_unlock(&cur_trans->dirty_bgs_lock);
ce93ec54 3436
1bbc621e 3437again:
1bbc621e
CM
3438 /*
3439 * make sure all the block groups on our dirty list actually
3440 * exist
3441 */
6c686b35 3442 btrfs_create_pending_block_groups(trans);
1bbc621e
CM
3443
3444 if (!path) {
3445 path = btrfs_alloc_path();
3446 if (!path)
3447 return -ENOMEM;
3448 }
3449
b58d1a9e
FM
3450 /*
3451 * cache_write_mutex is here only to save us from balance or automatic
3452 * removal of empty block groups deleting this block group while we are
3453 * writing out the cache
3454 */
3455 mutex_lock(&trans->transaction->cache_write_mutex);
1bbc621e 3456 while (!list_empty(&dirty)) {
ba2c4d4e
JB
3457 bool drop_reserve = true;
3458
1bbc621e
CM
3459 cache = list_first_entry(&dirty,
3460 struct btrfs_block_group_cache,
3461 dirty_list);
1bbc621e
CM
3462 /*
3463 * this can happen if something re-dirties a block
3464 * group that is already under IO. Just wait for it to
3465 * finish and then do it all again
3466 */
3467 if (!list_empty(&cache->io_list)) {
3468 list_del_init(&cache->io_list);
afdb5718 3469 btrfs_wait_cache_io(trans, cache, path);
1bbc621e
CM
3470 btrfs_put_block_group(cache);
3471 }
3472
3473
3474 /*
3475 * btrfs_wait_cache_io uses the cache->dirty_list to decide
3476 * if it should update the cache_state. Don't delete
3477 * until after we wait.
3478 *
3479 * Since we're not running in the commit critical section
3480 * we need the dirty_bgs_lock to protect from update_block_group
3481 */
3482 spin_lock(&cur_trans->dirty_bgs_lock);
3483 list_del_init(&cache->dirty_list);
3484 spin_unlock(&cur_trans->dirty_bgs_lock);
3485
3486 should_put = 1;
3487
3488 cache_save_setup(cache, trans, path);
3489
3490 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3491 cache->io_ctl.inode = NULL;
fe041534 3492 ret = btrfs_write_out_cache(trans, cache, path);
1bbc621e
CM
3493 if (ret == 0 && cache->io_ctl.inode) {
3494 num_started++;
3495 should_put = 0;
3496
3497 /*
45ae2c18
NB
3498 * The cache_write_mutex is protecting the
3499 * io_list, also refer to the definition of
3500 * btrfs_transaction::io_bgs for more details
1bbc621e
CM
3501 */
3502 list_add_tail(&cache->io_list, io);
3503 } else {
3504 /*
3505 * if we failed to write the cache, the
3506 * generation will be bad and life goes on
3507 */
3508 ret = 0;
3509 }
3510 }
ff1f8250 3511 if (!ret) {
39db232d 3512 ret = write_one_cache_group(trans, path, cache);
ff1f8250
FM
3513 /*
3514 * Our block group might still be attached to the list
3515 * of new block groups in the transaction handle of some
3516 * other task (struct btrfs_trans_handle->new_bgs). This
3517 * means its block group item isn't yet in the extent
3518 * tree. If this happens ignore the error, as we will
3519 * try again later in the critical section of the
3520 * transaction commit.
3521 */
3522 if (ret == -ENOENT) {
3523 ret = 0;
3524 spin_lock(&cur_trans->dirty_bgs_lock);
3525 if (list_empty(&cache->dirty_list)) {
3526 list_add_tail(&cache->dirty_list,
3527 &cur_trans->dirty_bgs);
3528 btrfs_get_block_group(cache);
ba2c4d4e 3529 drop_reserve = false;
ff1f8250
FM
3530 }
3531 spin_unlock(&cur_trans->dirty_bgs_lock);
3532 } else if (ret) {
66642832 3533 btrfs_abort_transaction(trans, ret);
ff1f8250
FM
3534 }
3535 }
1bbc621e 3536
52042d8e 3537 /* if it's not on the io list, we need to put the block group */
1bbc621e
CM
3538 if (should_put)
3539 btrfs_put_block_group(cache);
ba2c4d4e
JB
3540 if (drop_reserve)
3541 btrfs_delayed_refs_rsv_release(fs_info, 1);
1bbc621e
CM
3542
3543 if (ret)
3544 break;
b58d1a9e
FM
3545
3546 /*
3547 * Avoid blocking other tasks for too long. It might even save
3548 * us from writing caches for block groups that are going to be
3549 * removed.
3550 */
3551 mutex_unlock(&trans->transaction->cache_write_mutex);
3552 mutex_lock(&trans->transaction->cache_write_mutex);
1bbc621e 3553 }
b58d1a9e 3554 mutex_unlock(&trans->transaction->cache_write_mutex);
1bbc621e
CM
3555
3556 /*
3557 * go through delayed refs for all the stuff we've just kicked off
3558 * and then loop back (just once)
3559 */
c79a70b1 3560 ret = btrfs_run_delayed_refs(trans, 0);
1bbc621e
CM
3561 if (!ret && loops == 0) {
3562 loops++;
3563 spin_lock(&cur_trans->dirty_bgs_lock);
3564 list_splice_init(&cur_trans->dirty_bgs, &dirty);
b58d1a9e
FM
3565 /*
3566 * dirty_bgs_lock protects us from concurrent block group
3567 * deletes too (not just cache_write_mutex).
3568 */
3569 if (!list_empty(&dirty)) {
3570 spin_unlock(&cur_trans->dirty_bgs_lock);
3571 goto again;
3572 }
1bbc621e 3573 spin_unlock(&cur_trans->dirty_bgs_lock);
c79a1751 3574 } else if (ret < 0) {
2ff7e61e 3575 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
1bbc621e
CM
3576 }
3577
3578 btrfs_free_path(path);
3579 return ret;
3580}
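/*
 * Editorial note: this early, best-effort pass pairs with
 * btrfs_write_dirty_block_groups() below, which runs inside the commit
 * critical section and rewrites anything that was re-dirtied after this
 * pass, so the on-disk cache still matches the final block group state.
 */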
3581
5742d15f 3582int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
1bbc621e 3583{
5742d15f 3584 struct btrfs_fs_info *fs_info = trans->fs_info;
1bbc621e
CM
3585 struct btrfs_block_group_cache *cache;
3586 struct btrfs_transaction *cur_trans = trans->transaction;
3587 int ret = 0;
3588 int should_put;
3589 struct btrfs_path *path;
3590 struct list_head *io = &cur_trans->io_bgs;
3591 int num_started = 0;
9078a3e1
CM
3592
3593 path = btrfs_alloc_path();
3594 if (!path)
3595 return -ENOMEM;
3596
ce93ec54 3597 /*
e44081ef
FM
3598 * Even though we are in the critical section of the transaction commit,
3599 * we can still have concurrent tasks adding elements to this
3600 * transaction's list of dirty block groups. These tasks correspond to
3601 * endio free space workers started when writeback finishes for a
3602 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3603 * allocate new block groups as a result of COWing nodes of the root
3604 * tree when updating the free space inode. The writeback for the space
3605 * caches is triggered by an earlier call to
3606 * btrfs_start_dirty_block_groups() and iterations of the following
3607 * loop.
3608 * Also we want to do the cache_save_setup first and then run the
ce93ec54
JB
3609 * delayed refs to make sure we have the best chance at doing this all
3610 * in one shot.
3611 */
e44081ef 3612 spin_lock(&cur_trans->dirty_bgs_lock);
ce93ec54
JB
3613 while (!list_empty(&cur_trans->dirty_bgs)) {
3614 cache = list_first_entry(&cur_trans->dirty_bgs,
3615 struct btrfs_block_group_cache,
3616 dirty_list);
c9dc4c65
CM
3617
3618 /*
3619 * this can happen if cache_save_setup re-dirties a block
3620 * group that is already under IO. Just wait for it to
3621 * finish and then do it all again
3622 */
3623 if (!list_empty(&cache->io_list)) {
e44081ef 3624 spin_unlock(&cur_trans->dirty_bgs_lock);
c9dc4c65 3625 list_del_init(&cache->io_list);
afdb5718 3626 btrfs_wait_cache_io(trans, cache, path);
c9dc4c65 3627 btrfs_put_block_group(cache);
e44081ef 3628 spin_lock(&cur_trans->dirty_bgs_lock);
c9dc4c65
CM
3629 }
3630
1bbc621e
CM
3631 /*
3632 * don't remove from the dirty list until after we've waited
3633 * on any pending IO
3634 */
ce93ec54 3635 list_del_init(&cache->dirty_list);
e44081ef 3636 spin_unlock(&cur_trans->dirty_bgs_lock);
c9dc4c65
CM
3637 should_put = 1;
3638
1bbc621e 3639 cache_save_setup(cache, trans, path);
c9dc4c65 3640
ce93ec54 3641 if (!ret)
c79a70b1 3642 ret = btrfs_run_delayed_refs(trans,
2ff7e61e 3643 (unsigned long) -1);
c9dc4c65
CM
3644
3645 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3646 cache->io_ctl.inode = NULL;
fe041534 3647 ret = btrfs_write_out_cache(trans, cache, path);
c9dc4c65
CM
3648 if (ret == 0 && cache->io_ctl.inode) {
3649 num_started++;
3650 should_put = 0;
1bbc621e 3651 list_add_tail(&cache->io_list, io);
c9dc4c65
CM
3652 } else {
3653 /*
3654 * if we failed to write the cache, the
3655 * generation will be bad and life goes on
3656 */
3657 ret = 0;
3658 }
3659 }
ff1f8250 3660 if (!ret) {
39db232d 3661 ret = write_one_cache_group(trans, path, cache);
2bc0bb5f
FM
3662 /*
3663 * One of the free space endio workers might have
3664 * created a new block group while updating a free space
3665 * cache's inode (at inode.c:btrfs_finish_ordered_io())
3666 * and hasn't released its transaction handle yet, in
3667 * which case the new block group is still attached to
3668 * its transaction handle and its creation has not
3669 * finished yet (no block group item in the extent tree
3670 * yet, etc). If this is the case, wait for all free
3671 * space endio workers to finish and retry. This is a
3672	 * very rare case, so there is no need for a more efficient and
3673 * complex approach.
3674 */
3675 if (ret == -ENOENT) {
3676 wait_event(cur_trans->writer_wait,
3677 atomic_read(&cur_trans->num_writers) == 1);
39db232d 3678 ret = write_one_cache_group(trans, path, cache);
2bc0bb5f 3679 }
ff1f8250 3680 if (ret)
66642832 3681 btrfs_abort_transaction(trans, ret);
ff1f8250 3682 }
c9dc4c65
CM
3683
3684	 /* if it's not on the io list, we need to put the block group */
3685 if (should_put)
3686 btrfs_put_block_group(cache);
ba2c4d4e 3687 btrfs_delayed_refs_rsv_release(fs_info, 1);
e44081ef 3688 spin_lock(&cur_trans->dirty_bgs_lock);
c9dc4c65 3689 }
e44081ef 3690 spin_unlock(&cur_trans->dirty_bgs_lock);
c9dc4c65 3691
45ae2c18
NB
3692 /*
3693 * Refer to the definition of io_bgs member for details why it's safe
3694 * to use it without any locking
3695 */
1bbc621e
CM
3696 while (!list_empty(io)) {
3697 cache = list_first_entry(io, struct btrfs_block_group_cache,
c9dc4c65
CM
3698 io_list);
3699 list_del_init(&cache->io_list);
afdb5718 3700 btrfs_wait_cache_io(trans, cache, path);
0cb59c99
JB
3701 btrfs_put_block_group(cache);
3702 }
3703
9078a3e1 3704 btrfs_free_path(path);
ce93ec54 3705 return ret;
9078a3e1
CM
3706}
3707
2ff7e61e 3708int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
d2fb3437
YZ
3709{
3710 struct btrfs_block_group_cache *block_group;
3711 int readonly = 0;
3712
0b246afa 3713 block_group = btrfs_lookup_block_group(fs_info, bytenr);
d2fb3437
YZ
3714 if (!block_group || block_group->ro)
3715 readonly = 1;
3716 if (block_group)
fa9c0d79 3717 btrfs_put_block_group(block_group);
d2fb3437
YZ
3718 return readonly;
3719}
3720
f78c436c
FM
3721bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3722{
3723 struct btrfs_block_group_cache *bg;
3724 bool ret = true;
3725
3726 bg = btrfs_lookup_block_group(fs_info, bytenr);
3727 if (!bg)
3728 return false;
3729
3730 spin_lock(&bg->lock);
3731 if (bg->ro)
3732 ret = false;
3733 else
3734 atomic_inc(&bg->nocow_writers);
3735 spin_unlock(&bg->lock);
3736
3737 /* no put on block group, done by btrfs_dec_nocow_writers */
3738 if (!ret)
3739 btrfs_put_block_group(bg);
3740
3741 return ret;
3742
3743}
3744
3745void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3746{
3747 struct btrfs_block_group_cache *bg;
3748
3749 bg = btrfs_lookup_block_group(fs_info, bytenr);
3750 ASSERT(bg);
3751 if (atomic_dec_and_test(&bg->nocow_writers))
4625956a 3752 wake_up_var(&bg->nocow_writers);
f78c436c
FM
3753 /*
3754 * Once for our lookup and once for the lookup done by a previous call
3755 * to btrfs_inc_nocow_writers()
3756 */
3757 btrfs_put_block_group(bg);
3758 btrfs_put_block_group(bg);
3759}
3760
f78c436c
FM
3761void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3762{
4625956a 3763 wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
f78c436c
FM
3764}
3765
8790d502
CM
3766static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3767{
899c81ea
ID
3768 u64 extra_flags = chunk_to_extended(flags) &
3769 BTRFS_EXTENDED_PROFILE_MASK;
a46d11a8 3770
de98ced9 3771 write_seqlock(&fs_info->profiles_lock);
a46d11a8
ID
3772 if (flags & BTRFS_BLOCK_GROUP_DATA)
3773 fs_info->avail_data_alloc_bits |= extra_flags;
3774 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3775 fs_info->avail_metadata_alloc_bits |= extra_flags;
3776 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3777 fs_info->avail_system_alloc_bits |= extra_flags;
de98ced9 3778 write_sequnlock(&fs_info->profiles_lock);
8790d502 3779}
593060d7 3780
fc67c450
ID
3781/*
3782 * returns target flags in extended format or 0 if restripe for this
3783 * chunk_type is not in progress
c6664b42 3784 *
dccdb07b 3785 * should be called with balance_lock held
fc67c450
ID
3786 */
3787static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3788{
3789 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3790 u64 target = 0;
3791
fc67c450
ID
3792 if (!bctl)
3793 return 0;
3794
3795 if (flags & BTRFS_BLOCK_GROUP_DATA &&
3796 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3797 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3798 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3799 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3800 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3801 } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3802 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3803 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3804 }
3805
3806 return target;
3807}
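/*
 * Illustration (editorial, hypothetical balance state): with a paused
 * balance that is converting data chunks to RAID1, a call with flags
 * containing BTRFS_BLOCK_GROUP_DATA returns BTRFS_BLOCK_GROUP_DATA plus the
 * RAID1 target bit in extended form; for chunk types with no convert filter
 * it returns 0.
 */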
3808
a46d11a8
ID
3809/*
3810 * @flags: available profiles in extended format (see ctree.h)
3811 *
e4d8ec0f
ID
3812 * Returns reduced profile in chunk format. If profile changing is in
3813 * progress (either running or paused) picks the target profile (if it's
3814 * already available), otherwise falls back to plain reducing.
a46d11a8 3815 */
2ff7e61e 3816static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
ec44a35c 3817{
0b246afa 3818 u64 num_devices = fs_info->fs_devices->rw_devices;
fc67c450 3819 u64 target;
9c170b26
ZL
3820 u64 raid_type;
3821 u64 allowed = 0;
a061fc8d 3822
fc67c450
ID
3823 /*
3824 * see if restripe for this chunk_type is in progress, if so
3825 * try to reduce to the target profile
3826 */
0b246afa
JM
3827 spin_lock(&fs_info->balance_lock);
3828 target = get_restripe_target(fs_info, flags);
fc67c450
ID
3829 if (target) {
3830 /* pick target profile only if it's already available */
3831 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
0b246afa 3832 spin_unlock(&fs_info->balance_lock);
fc67c450 3833 return extended_to_chunk(target);
e4d8ec0f
ID
3834 }
3835 }
0b246afa 3836 spin_unlock(&fs_info->balance_lock);
e4d8ec0f 3837
53b381b3 3838 /* First, mask out the RAID levels which aren't possible */
9c170b26
ZL
3839 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3840 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
41a6e891 3841 allowed |= btrfs_raid_array[raid_type].bg_flag;
9c170b26
ZL
3842 }
3843 allowed &= flags;
3844
3845 if (allowed & BTRFS_BLOCK_GROUP_RAID6)
3846 allowed = BTRFS_BLOCK_GROUP_RAID6;
3847 else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
3848 allowed = BTRFS_BLOCK_GROUP_RAID5;
3849 else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
3850 allowed = BTRFS_BLOCK_GROUP_RAID10;
3851 else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
3852 allowed = BTRFS_BLOCK_GROUP_RAID1;
3853 else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
3854 allowed = BTRFS_BLOCK_GROUP_RAID0;
3855
3856 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
3857
3858 return extended_to_chunk(flags | allowed);
ec44a35c
CM
3859}
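/*
 * Worked example (editorial, hypothetical values): with two rw devices the
 * allowed mask excludes RAID10 and RAID6 (devs_min not met), so extended
 * flags advertising RAID1 | RAID0 reduce to RAID1, the highest-priority
 * profile in the if/else chain above that is still possible.
 */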
3860
2ff7e61e 3861static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
6a63209f 3862{
de98ced9 3863 unsigned seq;
f8213bdc 3864 u64 flags;
de98ced9
MX
3865
3866 do {
f8213bdc 3867 flags = orig_flags;
0b246afa 3868 seq = read_seqbegin(&fs_info->profiles_lock);
de98ced9
MX
3869
3870 if (flags & BTRFS_BLOCK_GROUP_DATA)
0b246afa 3871 flags |= fs_info->avail_data_alloc_bits;
de98ced9 3872 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
0b246afa 3873 flags |= fs_info->avail_system_alloc_bits;
de98ced9 3874 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
0b246afa
JM
3875 flags |= fs_info->avail_metadata_alloc_bits;
3876 } while (read_seqretry(&fs_info->profiles_lock, seq));
6fef8df1 3877
2ff7e61e 3878 return btrfs_reduce_alloc_profile(fs_info, flags);
6a63209f
JB
3879}
3880
1b86826d 3881static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
9ed74f2d 3882{
0b246afa 3883 struct btrfs_fs_info *fs_info = root->fs_info;
b742bb82 3884 u64 flags;
53b381b3 3885 u64 ret;
9ed74f2d 3886
b742bb82
YZ
3887 if (data)
3888 flags = BTRFS_BLOCK_GROUP_DATA;
0b246afa 3889 else if (root == fs_info->chunk_root)
b742bb82 3890 flags = BTRFS_BLOCK_GROUP_SYSTEM;
9ed74f2d 3891 else
b742bb82 3892 flags = BTRFS_BLOCK_GROUP_METADATA;
9ed74f2d 3893
2ff7e61e 3894 ret = get_alloc_profile(fs_info, flags);
53b381b3 3895 return ret;
6a63209f 3896}
9ed74f2d 3897
1b86826d
JM
3898u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
3899{
3900 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
3901}
3902
3903u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
3904{
3905 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3906}
3907
3908u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
3909{
3910 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3911}
3912
04f4f916 3913int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
6a63209f 3914{
04f4f916 3915 struct btrfs_root *root = inode->root;
b4d7c3c9 3916 struct btrfs_fs_info *fs_info = root->fs_info;
1174cade 3917 struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
ab6e2410 3918 u64 used;
94b947b2 3919 int ret = 0;
c99f1b0c
ZL
3920 int need_commit = 2;
3921 int have_pinned_space;
6a63209f 3922
6a63209f 3923 /* make sure bytes are sectorsize aligned */
0b246afa 3924 bytes = ALIGN(bytes, fs_info->sectorsize);
6a63209f 3925
9dced186 3926 if (btrfs_is_free_space_inode(inode)) {
c99f1b0c 3927 need_commit = 0;
9dced186 3928 ASSERT(current->journal_info);
0af3d00b
JB
3929 }
3930
6a63209f
JB
3931again:
3932 /* make sure we have enough space to handle the data first */
3933 spin_lock(&data_sinfo->lock);
4136135b 3934 used = btrfs_space_info_used(data_sinfo, true);
ab6e2410
JB
3935
3936 if (used + bytes > data_sinfo->total_bytes) {
4e06bdd6 3937 struct btrfs_trans_handle *trans;
9ed74f2d 3938
6a63209f
JB
3939 /*
3940 * if we don't have enough free bytes in this space then we need
3941 * to alloc a new chunk.
3942 */
b9fd47cd 3943 if (!data_sinfo->full) {
6a63209f 3944 u64 alloc_target;
9ed74f2d 3945
0e4f8f88 3946 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
6a63209f 3947 spin_unlock(&data_sinfo->lock);
1174cade 3948
1b86826d 3949 alloc_target = btrfs_data_alloc_profile(fs_info);
9dced186
MX
3950 /*
3951 * It is ugly that we don't call nolock join
3952 * transaction for the free space inode case here.
3953 * But it is safe because we only do the data space
3954 * reservation for the free space cache in the
3955			 * transaction context; the common join transaction
3956			 * just increases the counter of the current transaction
3957			 * handle and doesn't try to acquire the trans_lock of
3958			 * the fs.
3959 */
7a7eaa40 3960 trans = btrfs_join_transaction(root);
a22285a6
YZ
3961 if (IS_ERR(trans))
3962 return PTR_ERR(trans);
9ed74f2d 3963
fc471cb0
JB
3964 ret = btrfs_chunk_alloc(trans, alloc_target,
3965 CHUNK_ALLOC_NO_FORCE);
3a45bb20 3966 btrfs_end_transaction(trans);
d52a5b5f
MX
3967 if (ret < 0) {
3968 if (ret != -ENOSPC)
3969 return ret;
c99f1b0c
ZL
3970 else {
3971 have_pinned_space = 1;
d52a5b5f 3972 goto commit_trans;
c99f1b0c 3973 }
d52a5b5f 3974 }
9ed74f2d 3975
6a63209f
JB
3976 goto again;
3977 }
f2bb8f5c
JB
3978
3979 /*
b150a4f1 3980 * If we don't have enough pinned space to deal with this
94b947b2
ZL
3981 * allocation, and no removed chunk in current transaction,
3982 * don't bother committing the transaction.
f2bb8f5c 3983 */
dec59fa3 3984 have_pinned_space = __percpu_counter_compare(
c99f1b0c 3985 &data_sinfo->total_bytes_pinned,
dec59fa3
EL
3986 used + bytes - data_sinfo->total_bytes,
3987 BTRFS_TOTAL_BYTES_PINNED_BATCH);
6a63209f 3988 spin_unlock(&data_sinfo->lock);
6a63209f 3989
4e06bdd6 3990 /* commit the current transaction and try again */
d52a5b5f 3991commit_trans:
92e2f7e3 3992 if (need_commit) {
c99f1b0c 3993 need_commit--;
b150a4f1 3994
e1746e83 3995 if (need_commit > 0) {
82b3e53b 3996 btrfs_start_delalloc_roots(fs_info, -1);
6374e57a 3997 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
0b246afa 3998 (u64)-1);
e1746e83 3999 }
9a4e7276 4000
7a7eaa40 4001 trans = btrfs_join_transaction(root);
a22285a6
YZ
4002 if (IS_ERR(trans))
4003 return PTR_ERR(trans);
c99f1b0c 4004 if (have_pinned_space >= 0 ||
3204d33c
JB
4005 test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4006 &trans->transaction->flags) ||
c99f1b0c 4007 need_commit > 0) {
3a45bb20 4008 ret = btrfs_commit_transaction(trans);
94b947b2
ZL
4009 if (ret)
4010 return ret;
d7c15171 4011 /*
c2d6cb16
FM
4012 * The cleaner kthread might still be doing iput
4013 * operations. Wait for it to finish so that
034f784d
JB
4014 * more space is released. We don't need to
4015 * explicitly run the delayed iputs here because
4016 * the commit_transaction would have woken up
4017 * the cleaner.
d7c15171 4018 */
034f784d
JB
4019 ret = btrfs_wait_on_delayed_iputs(fs_info);
4020 if (ret)
4021 return ret;
94b947b2
ZL
4022 goto again;
4023 } else {
3a45bb20 4024 btrfs_end_transaction(trans);
94b947b2 4025 }
4e06bdd6 4026 }
9ed74f2d 4027
0b246afa 4028 trace_btrfs_space_reservation(fs_info,
cab45e22
JM
4029 "space_info:enospc",
4030 data_sinfo->flags, bytes, 1);
6a63209f
JB
4031 return -ENOSPC;
4032 }
bb96c4e5 4033 btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
0b246afa 4034 trace_btrfs_space_reservation(fs_info, "space_info",
2bcc0328 4035 data_sinfo->flags, bytes, 1);
6a63209f 4036 spin_unlock(&data_sinfo->lock);
6a63209f 4037
4559b0a7 4038 return 0;
9ed74f2d 4039}
6a63209f 4040
364ecf36
QW
4041int btrfs_check_data_free_space(struct inode *inode,
4042 struct extent_changeset **reserved, u64 start, u64 len)
4ceff079 4043{
0b246afa 4044 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4ceff079
QW
4045 int ret;
4046
4047 /* align the range */
0b246afa
JM
4048 len = round_up(start + len, fs_info->sectorsize) -
4049 round_down(start, fs_info->sectorsize);
4050 start = round_down(start, fs_info->sectorsize);
4ceff079 4051
04f4f916 4052 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4ceff079
QW
4053 if (ret < 0)
4054 return ret;
4055
1e5ec2e7 4056 /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
364ecf36 4057 ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
7bc329c1 4058 if (ret < 0)
1e5ec2e7 4059 btrfs_free_reserved_data_space_noquota(inode, start, len);
364ecf36
QW
4060 else
4061 ret = 0;
4ceff079
QW
4062 return ret;
4063}
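/*
 * Illustrative pairing (editorial sketch, error handling trimmed): a caller
 * that reserved data space here but then failed gives it back with
 * btrfs_free_reserved_data_space():
 *
 *	struct extent_changeset *reserved = NULL;
 *
 *	ret = btrfs_check_data_free_space(inode, &reserved, start, len);
 *	if (!ret && some_later_step_failed)
 *		btrfs_free_reserved_data_space(inode, reserved, start, len);
 *	extent_changeset_free(reserved);
 */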
4064
4ceff079
QW
4065/*
4066 * Called if we need to clear a data reservation for this inode.
4067 * Normally in an error case.
4068 *
51773bec
QW
4069 * This one will *NOT* use the accurate qgroup reserved space API, just for the
4070 * case where we can't sleep and are sure it won't affect qgroup reserved space.
4071 * Like clear_bit_hook().
4ceff079 4072 */
51773bec
QW
4073void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4074 u64 len)
4ceff079 4075{
0b246afa 4076 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4ceff079
QW
4077 struct btrfs_space_info *data_sinfo;
4078
4079 /* Make sure the range is aligned to sectorsize */
0b246afa
JM
4080 len = round_up(start + len, fs_info->sectorsize) -
4081 round_down(start, fs_info->sectorsize);
4082 start = round_down(start, fs_info->sectorsize);
4ceff079 4083
0b246afa 4084 data_sinfo = fs_info->data_sinfo;
4ceff079 4085 spin_lock(&data_sinfo->lock);
bb96c4e5 4086 btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
0b246afa 4087 trace_btrfs_space_reservation(fs_info, "space_info",
4ceff079
QW
4088 data_sinfo->flags, len, 0);
4089 spin_unlock(&data_sinfo->lock);
4090}
4091
51773bec
QW
4092/*
4093 * Called if we need to clear a data reservation for this inode.
4094 * Normally in an error case.
4095 *
01327610 4096 * This one will handle the per-inode data rsv map for accurate reserved
51773bec
QW
4097 * space framework.
4098 */
bc42bda2
QW
4099void btrfs_free_reserved_data_space(struct inode *inode,
4100 struct extent_changeset *reserved, u64 start, u64 len)
51773bec 4101{
0c476a5d
JM
4102 struct btrfs_root *root = BTRFS_I(inode)->root;
4103
4104 /* Make sure the range is aligned to sectorsize */
da17066c
JM
4105 len = round_up(start + len, root->fs_info->sectorsize) -
4106 round_down(start, root->fs_info->sectorsize);
4107 start = round_down(start, root->fs_info->sectorsize);
0c476a5d 4108
51773bec 4109 btrfs_free_reserved_data_space_noquota(inode, start, len);
bc42bda2 4110 btrfs_qgroup_free_data(inode, reserved, start, len);
51773bec
QW
4111}
4112
97e728d4 4113static void force_metadata_allocation(struct btrfs_fs_info *info)
e3ccfa98 4114{
97e728d4
JB
4115 struct list_head *head = &info->space_info;
4116 struct btrfs_space_info *found;
e3ccfa98 4117
97e728d4
JB
4118 rcu_read_lock();
4119 list_for_each_entry_rcu(found, head, list) {
4120 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
0e4f8f88 4121 found->force_alloc = CHUNK_ALLOC_FORCE;
e3ccfa98 4122 }
97e728d4 4123 rcu_read_unlock();
e3ccfa98
JB
4124}
4125
2ff7e61e 4126static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
698d0082 4127 struct btrfs_space_info *sinfo, int force)
32c00aff 4128{
8d8aafee 4129 u64 bytes_used = btrfs_space_info_used(sinfo, false);
e5bc2458 4130 u64 thresh;
e3ccfa98 4131
0e4f8f88
CM
4132 if (force == CHUNK_ALLOC_FORCE)
4133 return 1;
4134
4135 /*
4136 * in limited mode, we want to have some free space up to
4137 * about 1% of the FS size.
4138 */
4139 if (force == CHUNK_ALLOC_LIMITED) {
0b246afa 4140 thresh = btrfs_super_total_bytes(fs_info->super_copy);
ee22184b 4141 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
0e4f8f88 4142
8d8aafee 4143 if (sinfo->total_bytes - bytes_used < thresh)
0e4f8f88
CM
4144 return 1;
4145 }
0e4f8f88 4146
8d8aafee 4147 if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
14ed0ca6 4148 return 0;
424499db 4149 return 1;
32c00aff
JB
4150}
4151
2ff7e61e 4152static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
15d1ff81
LB
4153{
4154 u64 num_dev;
4155
9fa02ac7
DS
4156 num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
4157 if (!num_dev)
0b246afa 4158 num_dev = fs_info->fs_devices->rw_devices;
15d1ff81 4159
39c2d7fa 4160 return num_dev;
15d1ff81
LB
4161}
4162
39c2d7fa
FM
4163/*
4164 * Reserve space in the system space info if needed so that we can allocate
4165 * or remove a chunk of the given @type; the reservation covers the device
4166 * item updates and the chunk item itself.
4167 */
451a2c13 4168void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
15d1ff81 4169{
451a2c13 4170 struct btrfs_fs_info *fs_info = trans->fs_info;
15d1ff81
LB
4171 struct btrfs_space_info *info;
4172 u64 left;
4173 u64 thresh;
4fbcdf66 4174 int ret = 0;
39c2d7fa 4175 u64 num_devs;
4fbcdf66
FM
4176
4177 /*
4178	 * Needed because we can end up allocating a system chunk and we need an
4179	 * atomic, race-free space reservation in the chunk block reserve.
4180 */
a32bf9a3 4181 lockdep_assert_held(&fs_info->chunk_mutex);
15d1ff81 4182
280c2908 4183 info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
15d1ff81 4184 spin_lock(&info->lock);
4136135b 4185 left = info->total_bytes - btrfs_space_info_used(info, true);
15d1ff81
LB
4186 spin_unlock(&info->lock);
4187
2ff7e61e 4188 num_devs = get_profile_num_devs(fs_info, type);
39c2d7fa
FM
4189
4190 /* num_devs device items to update and 1 chunk item to add or remove */
0b246afa
JM
4191 thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4192 btrfs_calc_trans_metadata_size(fs_info, 1);
39c2d7fa 4193
0b246afa
JM
4194 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4195 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4196 left, thresh, type);
5da6afeb 4197 btrfs_dump_space_info(fs_info, info, 0, 0);
15d1ff81
LB
4198 }
4199
4200 if (left < thresh) {
1b86826d 4201 u64 flags = btrfs_system_alloc_profile(fs_info);
15d1ff81 4202
4fbcdf66
FM
4203 /*
4204 * Ignore failure to create system chunk. We might end up not
4205 * needing it, as we might not need to COW all nodes/leafs from
4206 * the paths we visit in the chunk tree (they were already COWed
4207 * or created in the current transaction for example).
4208 */
c216b203 4209 ret = btrfs_alloc_chunk(trans, flags);
4fbcdf66
FM
4210 }
4211
4212 if (!ret) {
0b246afa
JM
4213 ret = btrfs_block_rsv_add(fs_info->chunk_root,
4214 &fs_info->chunk_block_rsv,
4fbcdf66
FM
4215 thresh, BTRFS_RESERVE_NO_FLUSH);
4216 if (!ret)
4217 trans->chunk_bytes_reserved += thresh;
15d1ff81
LB
4218 }
4219}
4220
28b737f6
LB
4221/*
4222 * If force is CHUNK_ALLOC_FORCE:
4223 * - return 1 if it successfully allocates a chunk,
4224 * - return errors including -ENOSPC otherwise.
4225 * If force is NOT CHUNK_ALLOC_FORCE:
4226 * - return 0 if it doesn't need to allocate a new chunk,
4227 * - return 1 if it successfully allocates a chunk,
4228 * - return errors including -ENOSPC otherwise.
4229 */
fc471cb0
JB
4230int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4231 enum btrfs_chunk_alloc_enum force)
9ed74f2d 4232{
01458828 4233 struct btrfs_fs_info *fs_info = trans->fs_info;
6324fbf3 4234 struct btrfs_space_info *space_info;
2556fbb0
NB
4235 bool wait_for_alloc = false;
4236 bool should_alloc = false;
9ed74f2d 4237 int ret = 0;
9ed74f2d 4238
c6b305a8
JB
4239 /* Don't re-enter if we're already allocating a chunk */
4240 if (trans->allocating_chunk)
4241 return -ENOSPC;
4242
280c2908 4243 space_info = btrfs_find_space_info(fs_info, flags);
dc2d3005 4244 ASSERT(space_info);
9ed74f2d 4245
2556fbb0
NB
4246 do {
4247 spin_lock(&space_info->lock);
4248 if (force < space_info->force_alloc)
4249 force = space_info->force_alloc;
4250 should_alloc = should_alloc_chunk(fs_info, space_info, force);
4251 if (space_info->full) {
4252 /* No more free physical space */
4253 if (should_alloc)
4254 ret = -ENOSPC;
4255 else
4256 ret = 0;
4257 spin_unlock(&space_info->lock);
4258 return ret;
4259 } else if (!should_alloc) {
4260 spin_unlock(&space_info->lock);
4261 return 0;
4262 } else if (space_info->chunk_alloc) {
4263 /*
4264 * Someone is already allocating, so we need to block
4265 * until this someone is finished and then loop to
4266 * recheck if we should continue with our allocation
4267 * attempt.
4268 */
4269 wait_for_alloc = true;
4270 spin_unlock(&space_info->lock);
4271 mutex_lock(&fs_info->chunk_mutex);
4272 mutex_unlock(&fs_info->chunk_mutex);
4273 } else {
4274 /* Proceed with allocation */
4275 space_info->chunk_alloc = 1;
4276 wait_for_alloc = false;
4277 spin_unlock(&space_info->lock);
4278 }
6d74119f 4279
1e1c50a9 4280 cond_resched();
2556fbb0 4281 } while (wait_for_alloc);
6d74119f 4282
2556fbb0 4283 mutex_lock(&fs_info->chunk_mutex);
c6b305a8
JB
4284 trans->allocating_chunk = true;
4285
67377734
JB
4286 /*
4287 * If we have mixed data/metadata chunks we want to make sure we keep
4288 * allocating mixed chunks instead of individual chunks.
4289 */
4290 if (btrfs_mixed_space_info(space_info))
4291 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4292
97e728d4
JB
4293 /*
4294 * if we're doing a data chunk, go ahead and make sure that
4295 * we keep a reasonable number of metadata chunks allocated in the
4296 * FS as well.
4297 */
9ed74f2d 4298 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
97e728d4
JB
4299 fs_info->data_chunk_allocations++;
4300 if (!(fs_info->data_chunk_allocations %
4301 fs_info->metadata_ratio))
4302 force_metadata_allocation(fs_info);
9ed74f2d
JB
4303 }
4304
15d1ff81
LB
4305 /*
4306 * Check if we have enough space in SYSTEM chunk because we may need
4307 * to update devices.
4308 */
451a2c13 4309 check_system_chunk(trans, flags);
15d1ff81 4310
c216b203 4311 ret = btrfs_alloc_chunk(trans, flags);
c6b305a8 4312 trans->allocating_chunk = false;
92b8e897 4313
9ed74f2d 4314 spin_lock(&space_info->lock);
57f1642e
NB
4315 if (ret < 0) {
4316 if (ret == -ENOSPC)
4317 space_info->full = 1;
4318 else
4319 goto out;
4320 } else {
424499db 4321 ret = 1;
21a94f7a 4322 space_info->max_extent_size = 0;
57f1642e 4323 }
6d74119f 4324
0e4f8f88 4325 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
a81cb9a2 4326out:
6d74119f 4327 space_info->chunk_alloc = 0;
9ed74f2d 4328 spin_unlock(&space_info->lock);
a25c75d5 4329 mutex_unlock(&fs_info->chunk_mutex);
00d80e34
FM
4330 /*
4331 * When we allocate a new chunk we reserve space in the chunk block
4332 * reserve to make sure we can COW nodes/leafs in the chunk tree or
4333 * add new nodes/leafs to it if we end up needing to do it when
4334 * inserting the chunk item and updating device items as part of the
4335 * second phase of chunk allocation, performed by
4336 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4337 * large number of new block groups to create in our transaction
4338 * handle's new_bgs list to avoid exhausting the chunk block reserve
4339 * in extreme cases - like having a single transaction create many new
4340 * block groups when starting to write out the free space caches of all
4341 * the block groups that were made dirty during the lifetime of the
4342 * transaction.
4343 */
5ce55557 4344 if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
6c686b35 4345 btrfs_create_pending_block_groups(trans);
5ce55557 4346
0f9dd46c 4347 return ret;
6324fbf3 4348}
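/*
 * Typical caller pattern (editorial sketch, mirroring
 * btrfs_alloc_data_chunk_ondemand() above):
 *
 *	trans = btrfs_join_transaction(root);
 *	if (IS_ERR(trans))
 *		return PTR_ERR(trans);
 *	ret = btrfs_chunk_alloc(trans, btrfs_data_alloc_profile(fs_info),
 *				CHUNK_ALLOC_NO_FORCE);
 *	btrfs_end_transaction(trans);
 */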
9ed74f2d 4349
79787eaa
JM
4350static struct btrfs_block_rsv *get_block_rsv(
4351 const struct btrfs_trans_handle *trans,
4352 const struct btrfs_root *root)
f0486c68 4353{
0b246afa 4354 struct btrfs_fs_info *fs_info = root->fs_info;
4c13d758
JB
4355 struct btrfs_block_rsv *block_rsv = NULL;
4356
e9cf439f 4357 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
0b246afa
JM
4358 (root == fs_info->csum_root && trans->adding_csums) ||
4359 (root == fs_info->uuid_root))
f7a81ea4
SB
4360 block_rsv = trans->block_rsv;
4361
4c13d758 4362 if (!block_rsv)
f0486c68
YZ
4363 block_rsv = root->block_rsv;
4364
4365 if (!block_rsv)
0b246afa 4366 block_rsv = &fs_info->empty_block_rsv;
f0486c68
YZ
4367
4368 return block_rsv;
4369}
4370
ba2c4d4e
JB
4371/**
4372 * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
4373 * @fs_info - the fs info for our fs.
4374 * @src - the source block rsv to transfer from.
4375 * @num_bytes - the number of bytes to transfer.
4376 *
4377 * This transfers up to the num_bytes amount from the src rsv to the
4378 * delayed_refs_rsv. Any extra bytes are returned to the space info.
4379 */
4380void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
4381 struct btrfs_block_rsv *src,
4382 u64 num_bytes)
4383{
4384 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
4385 u64 to_free = 0;
4386
4387 spin_lock(&src->lock);
4388 src->reserved -= num_bytes;
4389 src->size -= num_bytes;
4390 spin_unlock(&src->lock);
4391
4392 spin_lock(&delayed_refs_rsv->lock);
4393 if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
4394 u64 delta = delayed_refs_rsv->size -
4395 delayed_refs_rsv->reserved;
4396 if (num_bytes > delta) {
4397 to_free = num_bytes - delta;
4398 num_bytes = delta;
4399 }
4400 } else {
4401 to_free = num_bytes;
4402 num_bytes = 0;
4403 }
4404
4405 if (num_bytes)
4406 delayed_refs_rsv->reserved += num_bytes;
4407 if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
4408 delayed_refs_rsv->full = 1;
4409 spin_unlock(&delayed_refs_rsv->lock);
4410
4411 if (num_bytes)
4412 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
4413 0, num_bytes, 1);
4414 if (to_free)
d44b72aa
JB
4415 btrfs_space_info_add_old_bytes(fs_info,
4416 delayed_refs_rsv->space_info, to_free);
ba2c4d4e
JB
4417}
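/*
 * Numeric illustration (editorial): if 1MiB is migrated from @src but the
 * delayed refs rsv is only 256KiB short of its size, 256KiB is added to its
 * reserved bytes and the remaining 768KiB is returned to the space info via
 * btrfs_space_info_add_old_bytes().
 */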
4418
4419/**
4420 * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
4421 * @fs_info - the fs_info for our fs.
4422 * @flush - control how we can flush for this reservation.
4423 *
4424 * This will refill the delayed block_rsv with up to one item's worth of space and
4425 * will return -ENOSPC if we can't make the reservation.
4426 */
4427int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
4428 enum btrfs_reserve_flush_enum flush)
4429{
4430 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
4431 u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
4432 u64 num_bytes = 0;
4433 int ret = -ENOSPC;
4434
4435 spin_lock(&block_rsv->lock);
4436 if (block_rsv->reserved < block_rsv->size) {
4437 num_bytes = block_rsv->size - block_rsv->reserved;
4438 num_bytes = min(num_bytes, limit);
4439 }
4440 spin_unlock(&block_rsv->lock);
4441
4442 if (!num_bytes)
4443 return 0;
4444
0d9764f6
JB
4445 ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv,
4446 num_bytes, flush);
ba2c4d4e
JB
4447 if (ret)
4448 return ret;
0b50174a 4449 btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
ba2c4d4e
JB
4450 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
4451 0, num_bytes, 1);
4452 return 0;
4453}
4454
69fe2d75
JB
4455/**
4456 * btrfs_inode_rsv_release - release any excessive reservation.
4457 * @inode - the inode we need to release from.
43b18595
QW
4458 * @qgroup_free - free or convert qgroup meta.
4459 * Unlike normal operation, qgroup meta reservation needs to know if we are
4460 * freeing qgroup reservation or just converting it into per-trans. Normally
4461 * @qgroup_free is true for error handling, and false for normal release.
69fe2d75
JB
4462 *
4463 * This is the same as btrfs_block_rsv_release, except that it handles the
4464 * tracepoint for the reservation.
4465 */
43b18595 4466static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
69fe2d75
JB
4467{
4468 struct btrfs_fs_info *fs_info = inode->root->fs_info;
69fe2d75
JB
4469 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
4470 u64 released = 0;
ff6bc37e 4471 u64 qgroup_to_release = 0;
69fe2d75
JB
4472
4473 /*
4474 * Since we statically set the block_rsv->size we just want to say we
4475 * are releasing 0 bytes, and then we'll just get the reservation over
4476 * the size free'd.
4477 */
ba2c4d4e
JB
4478 released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
4479 &qgroup_to_release);
69fe2d75
JB
4480 if (released > 0)
4481 trace_btrfs_space_reservation(fs_info, "delalloc",
4482 btrfs_ino(inode), released, 0);
43b18595 4483 if (qgroup_free)
ff6bc37e 4484 btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
43b18595 4485 else
ff6bc37e
QW
4486 btrfs_qgroup_convert_reserved_meta(inode->root,
4487 qgroup_to_release);
69fe2d75
JB
4488}
4489
ba2c4d4e
JB
4490/**
4491 * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
4492 * @fs_info - the fs_info for our fs.
4493 * @nr - the number of items to drop.
4494 *
4495 * This drops the delayed ref head's count from the delayed refs rsv and frees
4496 * any excess reservation we had.
4497 */
4498void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
f0486c68 4499{
ba2c4d4e 4500 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
ba2c4d4e
JB
4501 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
4502 u64 released = 0;
0b246afa 4503
424a4780
JB
4504 released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes,
4505 NULL);
ba2c4d4e
JB
4506 if (released)
4507 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
4508 0, released, 0);
6a63209f
JB
4509}
4510
8929ecfa
YZ
4511static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4512{
4513 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4514 struct btrfs_space_info *sinfo = block_rsv->space_info;
4515 u64 num_bytes;
6a63209f 4516
ae2e4728
JB
4517 /*
4518 * The global block rsv is based on the size of the extent tree, the
4519 * checksum tree and the root tree. If the fs is empty we want to set
4520 * it to a minimal amount for safety.
4521 */
4522 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
4523 btrfs_root_used(&fs_info->csum_root->root_item) +
4524 btrfs_root_used(&fs_info->tree_root->root_item);
4525 num_bytes = max_t(u64, num_bytes, SZ_16M);
33b4d47f 4526
8929ecfa 4527 spin_lock(&sinfo->lock);
1f699d38 4528 spin_lock(&block_rsv->lock);
4e06bdd6 4529
ee22184b 4530 block_rsv->size = min_t(u64, num_bytes, SZ_512M);
4e06bdd6 4531
fb4b10e5 4532 if (block_rsv->reserved < block_rsv->size) {
4136135b 4533 num_bytes = btrfs_space_info_used(sinfo, true);
fb4b10e5
JB
4534 if (sinfo->total_bytes > num_bytes) {
4535 num_bytes = sinfo->total_bytes - num_bytes;
4536 num_bytes = min(num_bytes,
4537 block_rsv->size - block_rsv->reserved);
4538 block_rsv->reserved += num_bytes;
bb96c4e5
JB
4539 btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
4540 num_bytes);
fb4b10e5
JB
4541 trace_btrfs_space_reservation(fs_info, "space_info",
4542 sinfo->flags, num_bytes,
4543 1);
4544 }
4545 } else if (block_rsv->reserved > block_rsv->size) {
8929ecfa 4546 num_bytes = block_rsv->reserved - block_rsv->size;
bb96c4e5
JB
4547 btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
4548 -num_bytes);
8c2a3ca2 4549 trace_btrfs_space_reservation(fs_info, "space_info",
2bcc0328 4550 sinfo->flags, num_bytes, 0);
8929ecfa 4551 block_rsv->reserved = block_rsv->size;
8929ecfa 4552 }
182608c8 4553
fb4b10e5
JB
4554 if (block_rsv->reserved == block_rsv->size)
4555 block_rsv->full = 1;
4556 else
4557 block_rsv->full = 0;
4558
8929ecfa 4559 spin_unlock(&block_rsv->lock);
1f699d38 4560 spin_unlock(&sinfo->lock);
6a63209f
JB
4561}
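/*
 * Rough numbers (editorial): num_bytes is the combined usage of the extent
 * root, csum root and tree root, clamped to at least 16MiB, and
 * block_rsv->size is that value capped at 512MiB; a freshly created
 * filesystem therefore sizes the global rsv at 16MiB, while a heavily
 * populated one saturates at 512MiB.
 */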
4562
f0486c68 4563static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
6a63209f 4564{
f0486c68 4565 struct btrfs_space_info *space_info;
6a63209f 4566
280c2908 4567 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
f0486c68 4568 fs_info->chunk_block_rsv.space_info = space_info;
6a63209f 4569
280c2908 4570 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
8929ecfa 4571 fs_info->global_block_rsv.space_info = space_info;
f0486c68
YZ
4572 fs_info->trans_block_rsv.space_info = space_info;
4573 fs_info->empty_block_rsv.space_info = space_info;
6d668dda 4574 fs_info->delayed_block_rsv.space_info = space_info;
ba2c4d4e 4575 fs_info->delayed_refs_rsv.space_info = space_info;
f0486c68 4576
ba2c4d4e
JB
4577 fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
4578 fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
8929ecfa
YZ
4579 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4580 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3a6cad90
SB
4581 if (fs_info->quota_root)
4582 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
f0486c68 4583 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
8929ecfa 4584
8929ecfa 4585 update_global_block_rsv(fs_info);
6a63209f
JB
4586}
4587
8929ecfa 4588static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
6a63209f 4589{
424a4780 4590 btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1);
8929ecfa
YZ
4591 WARN_ON(fs_info->trans_block_rsv.size > 0);
4592 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4593 WARN_ON(fs_info->chunk_block_rsv.size > 0);
4594 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
6d668dda
JB
4595 WARN_ON(fs_info->delayed_block_rsv.size > 0);
4596 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
ba2c4d4e
JB
4597 WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
4598 WARN_ON(fs_info->delayed_refs_rsv.size > 0);
fcb80c2a
JB
4599}
4600
ba2c4d4e
JB
4601/*
4602 * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
4603 * @trans - the trans that may have generated delayed refs
4604 *
4605 * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
4606 * it'll calculate the additional size and add it to the delayed_refs_rsv.
4607 */
4608void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
4609{
4610 struct btrfs_fs_info *fs_info = trans->fs_info;
4611 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
4612 u64 num_bytes;
4613
4614 if (!trans->delayed_ref_updates)
4615 return;
4616
4617 num_bytes = btrfs_calc_trans_metadata_size(fs_info,
4618 trans->delayed_ref_updates);
4619 spin_lock(&delayed_rsv->lock);
4620 delayed_rsv->size += num_bytes;
4621 delayed_rsv->full = 0;
4622 spin_unlock(&delayed_rsv->lock);
4623 trans->delayed_ref_updates = 0;
4624}
6a63209f 4625
4fbcdf66
FM
4626/*
4627 * To be called after all the new block groups attached to the transaction
4628 * handle have been created (btrfs_create_pending_block_groups()).
4629 */
4630void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
4631{
64b63580 4632 struct btrfs_fs_info *fs_info = trans->fs_info;
4fbcdf66
FM
4633
4634 if (!trans->chunk_bytes_reserved)
4635 return;
4636
4637 WARN_ON_ONCE(!list_empty(&trans->new_bgs));
4638
424a4780
JB
4639 btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
4640 trans->chunk_bytes_reserved);
4fbcdf66
FM
4641 trans->chunk_bytes_reserved = 0;
4642}
4643
d5c12070
MX
4644/*
4645 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
4646 * root: the root of the parent directory
4647 * rsv: block reservation
4648 * items: the number of items that we need to reserve
a5b7f429 4649 * use_global_rsv: allow fallback to the global block reservation
d5c12070
MX
4650 *
4651 * This function is used to reserve the space for snapshot/subvolume
4652 * creation and deletion. Those operations differ from the
4653 * common file/directory operations: they change two fs/file trees
4654 * and the root tree, and the number of items that the qgroup reserves
4655 * differs from the free space reservation. So we cannot use
01327610 4656 * the space reservation mechanism in start_transaction().
d5c12070
MX
4657 */
4658int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
a5b7f429 4659 struct btrfs_block_rsv *rsv, int items,
ee3441b4 4660 bool use_global_rsv)
a22285a6 4661{
a5b7f429 4662 u64 qgroup_num_bytes = 0;
d5c12070
MX
4663 u64 num_bytes;
4664 int ret;
0b246afa
JM
4665 struct btrfs_fs_info *fs_info = root->fs_info;
4666 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
d5c12070 4667
0b246afa 4668 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
d5c12070 4669 /* One for parent inode, two for dir entries */
a5b7f429
LF
4670 qgroup_num_bytes = 3 * fs_info->nodesize;
4671 ret = btrfs_qgroup_reserve_meta_prealloc(root,
4672 qgroup_num_bytes, true);
d5c12070
MX
4673 if (ret)
4674 return ret;
d5c12070
MX
4675 }
4676
0b246afa 4677 num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
280c2908 4678 rsv->space_info = btrfs_find_space_info(fs_info,
d5c12070
MX
4679 BTRFS_BLOCK_GROUP_METADATA);
4680 ret = btrfs_block_rsv_add(root, rsv, num_bytes,
4681 BTRFS_RESERVE_FLUSH_ALL);
ee3441b4
JM
4682
4683 if (ret == -ENOSPC && use_global_rsv)
3a584174 4684 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
ee3441b4 4685
a5b7f429
LF
4686 if (ret && qgroup_num_bytes)
4687 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
d5c12070
MX
4688
4689 return ret;
4690}
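/*
 * Example (editorial, assuming 16KiB nodes): with quotas enabled this
 * prereserves qgroup_num_bytes = 3 * nodesize = 48KiB (one for the parent
 * inode, two for dir entries) before reserving num_bytes for @items items,
 * and falls back to the global rsv only when the normal flush-all
 * reservation returns -ENOSPC and use_global_rsv is set.
 */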
4691
2ff7e61e 4692void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
7775c818 4693 struct btrfs_block_rsv *rsv)
d5c12070 4694{
2ff7e61e 4695 btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
97e728d4
JB
4696}
4697
69fe2d75
JB
4698static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
4699 struct btrfs_inode *inode)
9e0baf60 4700{
69fe2d75
JB
4701 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
4702 u64 reserve_size = 0;
ff6bc37e 4703 u64 qgroup_rsv_size = 0;
69fe2d75
JB
4704 u64 csum_leaves;
4705 unsigned outstanding_extents;
9e0baf60 4706
69fe2d75
JB
4707 lockdep_assert_held(&inode->lock);
4708 outstanding_extents = inode->outstanding_extents;
4709 if (outstanding_extents)
4710 reserve_size = btrfs_calc_trans_metadata_size(fs_info,
4711 outstanding_extents + 1);
4712 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
4713 inode->csum_bytes);
4714 reserve_size += btrfs_calc_trans_metadata_size(fs_info,
4715 csum_leaves);
ff6bc37e
QW
4716 /*
4717 * For qgroup rsv, the calculation is very simple:
4718 * account one nodesize for each outstanding extent
4719 *
4720 * This is overestimating in most cases.
4721 */
139a5617 4722 qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
9e0baf60 4723
69fe2d75
JB
4724 spin_lock(&block_rsv->lock);
4725 block_rsv->size = reserve_size;
ff6bc37e 4726 block_rsv->qgroup_rsv_size = qgroup_rsv_size;
69fe2d75 4727 spin_unlock(&block_rsv->lock);
0ca1f7ce 4728}
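/*
 * Worked example (editorial): with 2 outstanding extents and csum bytes that
 * need a single csum leaf, reserve_size covers (2 + 1) + 1 = 4 metadata
 * items' worth of space, while qgroup_rsv_size is simply 2 * nodesize.
 */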
c146afad 4729
c8eaeac7
JB
4730static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
4731 u64 num_bytes, u64 *meta_reserve,
4732 u64 *qgroup_reserve)
4733{
4734 u64 nr_extents = count_max_extents(num_bytes);
4735 u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
4736
4737 /* We add one for the inode update at finish ordered time */
4738 *meta_reserve = btrfs_calc_trans_metadata_size(fs_info,
4739 nr_extents + csum_leaves + 1);
4740 *qgroup_reserve = nr_extents * fs_info->nodesize;
4741}
4742
9f3db423 4743int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
0ca1f7ce 4744{
c8eaeac7
JB
4745 struct btrfs_root *root = inode->root;
4746 struct btrfs_fs_info *fs_info = root->fs_info;
4747 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
4748 u64 meta_reserve, qgroup_reserve;
69fe2d75 4749 unsigned nr_extents;
08e007d2 4750 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
eb6b88d9 4751 int ret = 0;
c64c2bd8 4752 bool delalloc_lock = true;
6324fbf3 4753
c64c2bd8
JB
4754 /* If we are a free space inode we need to not flush since we will be in
4755 * the middle of a transaction commit. We also don't need the delalloc
4756 * mutex since we won't race with anybody. We need this mostly to make
4757 * lockdep shut its filthy mouth.
bac357dc
JB
4758 *
4759 * If we have a transaction open (can happen if we call truncate_block
4760 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
c64c2bd8
JB
4761 */
4762 if (btrfs_is_free_space_inode(inode)) {
08e007d2 4763 flush = BTRFS_RESERVE_NO_FLUSH;
c64c2bd8 4764 delalloc_lock = false;
da07d4ab
NB
4765 } else {
4766 if (current->journal_info)
4767 flush = BTRFS_RESERVE_FLUSH_LIMIT;
c09544e0 4768
da07d4ab
NB
4769 if (btrfs_transaction_in_commit(fs_info))
4770 schedule_timeout(1);
4771 }
ec44a35c 4772
c64c2bd8 4773 if (delalloc_lock)
9f3db423 4774 mutex_lock(&inode->delalloc_mutex);
c64c2bd8 4775
0b246afa 4776 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
69fe2d75 4777
c8eaeac7
JB
4778 /*
4779 * We always want to do it this way, every other way is wrong and ends
4780 * in tears. Pre-reserving the amount we are going to add will always
4781 * be the right way, because otherwise if we have enough parallelism we
4782 * could end up with thousands of inodes all holding little bits of
4783 * reservations they were able to make previously and the only way to
4784 * reclaim that space is to ENOSPC out the operations and clear
4785 * everything out and try again, which is bad. This way we just
4786 * over-reserve slightly, and clean up the mess when we are done.
4787 */
4788 calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
4789 &qgroup_reserve);
4790 ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
4791 if (ret)
4792 goto out_fail;
0d9764f6 4793 ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
c8eaeac7
JB
4794 if (ret)
4795 goto out_qgroup;
4796
4797 /*
4798 * Now we need to update our outstanding extents and csum bytes _first_
4799 * and then add the reservation to the block_rsv. This keeps us from
4800 * racing with an ordered completion or some such that would think it
4801 * needs to free the reservation we just made.
4802 */
9f3db423 4803 spin_lock(&inode->lock);
69fe2d75 4804 nr_extents = count_max_extents(num_bytes);
8b62f87b 4805 btrfs_mod_outstanding_extents(inode, nr_extents);
69fe2d75
JB
4806 inode->csum_bytes += num_bytes;
4807 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
9f3db423 4808 spin_unlock(&inode->lock);
57a45ced 4809
c8eaeac7 4810 /* Now we can safely add our space to our block rsv */
0b50174a 4811 btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false);
c8eaeac7
JB
4812 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4813 btrfs_ino(inode), meta_reserve, 1);
4814
4815 spin_lock(&block_rsv->lock);
4816 block_rsv->qgroup_rsv_reserved += qgroup_reserve;
4817 spin_unlock(&block_rsv->lock);
25179201 4818
c64c2bd8 4819 if (delalloc_lock)
9f3db423 4820 mutex_unlock(&inode->delalloc_mutex);
0ca1f7ce 4821 return 0;
c8eaeac7
JB
4822out_qgroup:
4823 btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
88e081bf 4824out_fail:
43b18595 4825 btrfs_inode_rsv_release(inode, true);
88e081bf 4826 if (delalloc_lock)
9f3db423 4827 mutex_unlock(&inode->delalloc_mutex);
88e081bf 4828 return ret;
0ca1f7ce
YZ
4829}
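/*
 * Usual pairing (editorial sketch, error handling omitted; see
 * btrfs_delalloc_release_extents() below):
 *
 *	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
 *	if (ret)
 *		return ret;
 *	...mark the range delalloc / create the ordered extents...
 *	btrfs_delalloc_release_extents(BTRFS_I(inode), len, false);
 */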
4830
7709cde3
JB
4831/**
4832 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
8b62f87b
JB
4833 * @inode: the inode to release the reservation for.
4834 * @num_bytes: the number of bytes we are releasing.
43b18595 4835 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
7709cde3
JB
4836 *
4837 * This will release the metadata reservation for an inode. This can be called
4838 * once we complete IO for a given set of bytes to release their metadata
8b62f87b 4839 * reservations, or on error for the same reason.
7709cde3 4840 */
43b18595
QW
4841void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
4842 bool qgroup_free)
0ca1f7ce 4843{
3ffbd68c 4844 struct btrfs_fs_info *fs_info = inode->root->fs_info;
0ca1f7ce 4845
0b246afa 4846 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
691fa059 4847 spin_lock(&inode->lock);
69fe2d75
JB
4848 inode->csum_bytes -= num_bytes;
4849 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
691fa059 4850 spin_unlock(&inode->lock);
0ca1f7ce 4851
0b246afa 4852 if (btrfs_is_testing(fs_info))
6a3891c5
JB
4853 return;
4854
43b18595 4855 btrfs_inode_rsv_release(inode, qgroup_free);
0ca1f7ce
YZ
4856}
4857
8b62f87b
JB
4858/**
4859 * btrfs_delalloc_release_extents - release our outstanding_extents
4860 * @inode: the inode to balance the reservation for.
 4861 * @num_bytes: the number of bytes we originally reserved
43b18595 4862 * @qgroup_free: whether to free the qgroup meta reservation or convert it to a per-trans reservation.
8b62f87b
JB
4863 *
4864 * When we reserve space we increase outstanding_extents for the extents we may
4865 * add. Once we've set the range as delalloc or created our ordered extents we
4866 * have outstanding_extents to track the real usage, so we use this to free our
4867 * temporarily tracked outstanding_extents. This _must_ be used in conjunction
4868 * with btrfs_delalloc_reserve_metadata.
4869 */
43b18595
QW
4870void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
4871 bool qgroup_free)
8b62f87b 4872{
3ffbd68c 4873 struct btrfs_fs_info *fs_info = inode->root->fs_info;
8b62f87b 4874 unsigned num_extents;
8b62f87b
JB
4875
4876 spin_lock(&inode->lock);
4877 num_extents = count_max_extents(num_bytes);
4878 btrfs_mod_outstanding_extents(inode, -num_extents);
69fe2d75 4879 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
8b62f87b
JB
4880 spin_unlock(&inode->lock);
4881
8b62f87b
JB
4882 if (btrfs_is_testing(fs_info))
4883 return;
4884
43b18595 4885 btrfs_inode_rsv_release(inode, qgroup_free);
8b62f87b
JB
4886}
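
/*
 * A minimal caller-side sketch of the pairing described above. The local
 * names and the qgroup_free flag value are illustrative, not lifted from a
 * real caller, and error handling is trimmed:
 *
 *	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
 *	if (ret)
 *		return ret;
 *	...set the range delalloc / create the ordered extents...
 *	btrfs_delalloc_release_extents(BTRFS_I(inode), len, false);
 *
 * The reserve call bumps outstanding_extents for the worst case; once the
 * real extents are tracked, the release call drops that temporary count
 * again and returns any excess reservation.
 */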
4887
1ada3a62 4888/**
7cf5b976 4889 * btrfs_delalloc_reserve_space - reserve data and metadata space for
1ada3a62
QW
4890 * delalloc
4891 * @inode: inode we're writing to
4892 * @start: start offset of the range we are writing to
4893 * @len: length of the range we are writing to
364ecf36
QW
4894 * @reserved: mandatory parameter, record actually reserved qgroup ranges of
4895 * current reservation.
1ada3a62 4896 *
1ada3a62
QW
4897 * This will do the following things
4898 *
4899 * o reserve space in data space info for num bytes
4900 * and reserve the corresponding qgroup space
4901 * (Done in check_data_free_space)
4902 *
4903 * o reserve space for metadata, based on the number of outstanding
4904 * extents and how many csums will be needed
4905 * also reserve metadata space using a per-root over-reserve method.
4906 * o add to the inodes->delalloc_bytes
4907 * o add it to the fs_info's delalloc inodes list.
4908 * (Above 3 all done in delalloc_reserve_metadata)
4909 *
4910 * Return 0 for success
4911 * Return <0 for error (-ENOSPC or -EDQUOT)
4912 */
364ecf36
QW
4913int btrfs_delalloc_reserve_space(struct inode *inode,
4914 struct extent_changeset **reserved, u64 start, u64 len)
1ada3a62
QW
4915{
4916 int ret;
4917
364ecf36 4918 ret = btrfs_check_data_free_space(inode, reserved, start, len);
1ada3a62
QW
4919 if (ret < 0)
4920 return ret;
9f3db423 4921 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
1ada3a62 4922 if (ret < 0)
bc42bda2 4923 btrfs_free_reserved_data_space(inode, *reserved, start, len);
1ada3a62
QW
4924 return ret;
4925}
4926
7709cde3 4927/**
7cf5b976 4928 * btrfs_delalloc_release_space - release data and metadata space for delalloc
1ada3a62
QW
4929 * @inode: inode we're releasing space for
4930 * @start: start position of the space already reserved
4931 * @len: the length of the space already reserved
8b62f87b 4932 * @qgroup_free: whether to free the qgroup reservation or convert it to a per-trans reservation
1ada3a62
QW
4933 *
4934 * This function will release the metadata space that was not used and will
4935 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4936 * list if there are no delalloc bytes left.
4937 * Also it will handle the qgroup reserved space.
4938 */
bc42bda2 4939void btrfs_delalloc_release_space(struct inode *inode,
8b62f87b 4940 struct extent_changeset *reserved,
43b18595 4941 u64 start, u64 len, bool qgroup_free)
1ada3a62 4942{
43b18595 4943 btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
bc42bda2 4944 btrfs_free_reserved_data_space(inode, reserved, start, len);
6324fbf3
CM
4945}
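
/*
 * A minimal sketch of how the two helpers above pair up around a buffered
 * write. do_the_write() is a stand-in for the actual copy/dirty step, the
 * qgroup_free choice is illustrative, and extent_changeset_free() is assumed
 * to be available to the caller for dropping the qgroup changeset:
 *
 *	struct extent_changeset *reserved = NULL;
 *	int ret;
 *
 *	ret = btrfs_delalloc_reserve_space(inode, &reserved, pos, count);
 *	if (ret)
 *		return ret;
 *	ret = do_the_write(inode, pos, count);
 *	if (ret)
 *		btrfs_delalloc_release_space(inode, reserved, pos, count, true);
 *	extent_changeset_free(reserved);
 */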
4946
ce93ec54 4947static int update_block_group(struct btrfs_trans_handle *trans,
6b279408 4948 u64 bytenr, u64 num_bytes, int alloc)
9078a3e1 4949{
6b279408 4950 struct btrfs_fs_info *info = trans->fs_info;
0af3d00b 4951 struct btrfs_block_group_cache *cache = NULL;
db94535d 4952 u64 total = num_bytes;
9078a3e1 4953 u64 old_val;
db94535d 4954 u64 byte_in_group;
0af3d00b 4955 int factor;
ba2c4d4e 4956 int ret = 0;
3e1ad54f 4957
5d4f98a2 4958 /* block accounting for super block */
eb73c1b7 4959 spin_lock(&info->delalloc_root_lock);
6c41761f 4960 old_val = btrfs_super_bytes_used(info->super_copy);
5d4f98a2
YZ
4961 if (alloc)
4962 old_val += num_bytes;
4963 else
4964 old_val -= num_bytes;
6c41761f 4965 btrfs_set_super_bytes_used(info->super_copy, old_val);
eb73c1b7 4966 spin_unlock(&info->delalloc_root_lock);
5d4f98a2 4967
d397712b 4968 while (total) {
db94535d 4969 cache = btrfs_lookup_block_group(info, bytenr);
ba2c4d4e
JB
4970 if (!cache) {
4971 ret = -ENOENT;
4972 break;
4973 }
46df06b8
DS
4974 factor = btrfs_bg_type_to_factor(cache->flags);
4975
9d66e233
JB
4976 /*
4977 * If this block group has free space cache written out, we
4978 * need to make sure to load it if we are removing space. This
4979 * is because we need the unpinning stage to actually add the
4980 * space back to the block group, otherwise we will leak space.
4981 */
4982 if (!alloc && cache->cached == BTRFS_CACHE_NO)
f6373bf3 4983 cache_block_group(cache, 1);
0af3d00b 4984
db94535d
CM
4985 byte_in_group = bytenr - cache->key.objectid;
4986 WARN_ON(byte_in_group > cache->key.offset);
9078a3e1 4987
25179201 4988 spin_lock(&cache->space_info->lock);
c286ac48 4989 spin_lock(&cache->lock);
0af3d00b 4990
6202df69 4991 if (btrfs_test_opt(info, SPACE_CACHE) &&
0af3d00b
JB
4992 cache->disk_cache_state < BTRFS_DC_CLEAR)
4993 cache->disk_cache_state = BTRFS_DC_CLEAR;
4994
9078a3e1 4995 old_val = btrfs_block_group_used(&cache->item);
db94535d 4996 num_bytes = min(total, cache->key.offset - byte_in_group);
cd1bc465 4997 if (alloc) {
db94535d 4998 old_val += num_bytes;
11833d66
YZ
4999 btrfs_set_block_group_used(&cache->item, old_val);
5000 cache->reserved -= num_bytes;
11833d66 5001 cache->space_info->bytes_reserved -= num_bytes;
b742bb82
YZ
5002 cache->space_info->bytes_used += num_bytes;
5003 cache->space_info->disk_used += num_bytes * factor;
c286ac48 5004 spin_unlock(&cache->lock);
25179201 5005 spin_unlock(&cache->space_info->lock);
cd1bc465 5006 } else {
db94535d 5007 old_val -= num_bytes;
ae0ab003
FM
5008 btrfs_set_block_group_used(&cache->item, old_val);
5009 cache->pinned += num_bytes;
bb96c4e5
JB
5010 btrfs_space_info_update_bytes_pinned(info,
5011 cache->space_info, num_bytes);
ae0ab003
FM
5012 cache->space_info->bytes_used -= num_bytes;
5013 cache->space_info->disk_used -= num_bytes * factor;
5014 spin_unlock(&cache->lock);
5015 spin_unlock(&cache->space_info->lock);
47ab2a6c 5016
0b246afa 5017 trace_btrfs_space_reservation(info, "pinned",
c51e7bb1
JB
5018 cache->space_info->flags,
5019 num_bytes, 1);
dec59fa3
EL
5020 percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
5021 num_bytes,
5022 BTRFS_TOTAL_BYTES_PINNED_BATCH);
ae0ab003
FM
5023 set_extent_dirty(info->pinned_extents,
5024 bytenr, bytenr + num_bytes - 1,
5025 GFP_NOFS | __GFP_NOFAIL);
cd1bc465 5026 }
1bbc621e
CM
5027
5028 spin_lock(&trans->transaction->dirty_bgs_lock);
5029 if (list_empty(&cache->dirty_list)) {
5030 list_add_tail(&cache->dirty_list,
5031 &trans->transaction->dirty_bgs);
ba2c4d4e 5032 trans->delayed_ref_updates++;
1bbc621e
CM
5033 btrfs_get_block_group(cache);
5034 }
5035 spin_unlock(&trans->transaction->dirty_bgs_lock);
5036
036a9348
FM
5037 /*
5038 * No longer have used bytes in this block group, queue it for
5039 * deletion. We do this after adding the block group to the
5040 * dirty list to avoid races between cleaner kthread and space
5041 * cache writeout.
5042 */
031f24da
QW
5043 if (!alloc && old_val == 0)
5044 btrfs_mark_bg_unused(cache);
036a9348 5045
fa9c0d79 5046 btrfs_put_block_group(cache);
db94535d
CM
5047 total -= num_bytes;
5048 bytenr += num_bytes;
9078a3e1 5049 }
ba2c4d4e
JB
5050
5051 /* Modified block groups are accounted for in the delayed_refs_rsv. */
5052 btrfs_update_delayed_refs_rsv(trans);
5053 return ret;
9078a3e1 5054}
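
/*
 * Note on the factor used above: btrfs_bg_type_to_factor() returns 2 for the
 * profiles that keep two copies of everything (DUP, RAID1, RAID10) and 1 for
 * everything else, so, for example, allocating 1MiB out of a RAID1 block
 * group moves bytes_used by 1MiB but disk_used by 2MiB. This is only a quick
 * summary; see btrfs_bg_type_to_factor() for the authoritative mapping.
 */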
6324fbf3 5055
2ff7e61e 5056static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
a061fc8d 5057{
0f9dd46c 5058 struct btrfs_block_group_cache *cache;
d2fb3437 5059 u64 bytenr;
0f9dd46c 5060
0b246afa
JM
5061 spin_lock(&fs_info->block_group_cache_lock);
5062 bytenr = fs_info->first_logical_byte;
5063 spin_unlock(&fs_info->block_group_cache_lock);
a1897fdd
LB
5064
5065 if (bytenr < (u64)-1)
5066 return bytenr;
5067
0b246afa 5068 cache = btrfs_lookup_first_block_group(fs_info, search_start);
0f9dd46c 5069 if (!cache)
a061fc8d 5070 return 0;
0f9dd46c 5071
d2fb3437 5072 bytenr = cache->key.objectid;
fa9c0d79 5073 btrfs_put_block_group(cache);
d2fb3437
YZ
5074
5075 return bytenr;
a061fc8d
CM
5076}
5077
fdf08605 5078static int pin_down_extent(struct btrfs_block_group_cache *cache,
f0486c68 5079 u64 bytenr, u64 num_bytes, int reserved)
324ae4df 5080{
fdf08605
DS
5081 struct btrfs_fs_info *fs_info = cache->fs_info;
5082
11833d66
YZ
5083 spin_lock(&cache->space_info->lock);
5084 spin_lock(&cache->lock);
5085 cache->pinned += num_bytes;
bb96c4e5
JB
5086 btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
5087 num_bytes);
11833d66
YZ
5088 if (reserved) {
5089 cache->reserved -= num_bytes;
5090 cache->space_info->bytes_reserved -= num_bytes;
5091 }
5092 spin_unlock(&cache->lock);
5093 spin_unlock(&cache->space_info->lock);
68b38550 5094
0b246afa 5095 trace_btrfs_space_reservation(fs_info, "pinned",
c51e7bb1 5096 cache->space_info->flags, num_bytes, 1);
dec59fa3
EL
5097 percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
5098 num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
0b246afa 5099 set_extent_dirty(fs_info->pinned_extents, bytenr,
f0486c68
YZ
5100 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
5101 return 0;
5102}
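
/*
 * Bytes pinned here are only recorded in the pinned_extents io tree and the
 * per-space-info counters; they are not returned to the free space cache
 * until the transaction commits and btrfs_finish_extent_commit() /
 * unpin_extent_range() below walk that tree and release them.
 */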
68b38550 5103
f0486c68
YZ
5104/*
5105 * this function must be called within transaction
5106 */
2ff7e61e 5107int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
f0486c68
YZ
5108 u64 bytenr, u64 num_bytes, int reserved)
5109{
5110 struct btrfs_block_group_cache *cache;
68b38550 5111
0b246afa 5112 cache = btrfs_lookup_block_group(fs_info, bytenr);
79787eaa 5113 BUG_ON(!cache); /* Logic error */
f0486c68 5114
fdf08605 5115 pin_down_extent(cache, bytenr, num_bytes, reserved);
f0486c68
YZ
5116
5117 btrfs_put_block_group(cache);
11833d66
YZ
5118 return 0;
5119}
5120
f0486c68 5121/*
e688b725
CM
5122 * this function must be called within transaction
5123 */
2ff7e61e 5124int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
e688b725
CM
5125 u64 bytenr, u64 num_bytes)
5126{
5127 struct btrfs_block_group_cache *cache;
b50c6e25 5128 int ret;
e688b725 5129
0b246afa 5130 cache = btrfs_lookup_block_group(fs_info, bytenr);
b50c6e25
JB
5131 if (!cache)
5132 return -EINVAL;
e688b725
CM
5133
5134 /*
5135 * pull in the free space cache (if any) so that our pin
5136 * removes the free space from the cache. We have load_only set
5137 * to one because the slow code to read in the free extents does check
5138 * the pinned extents.
5139 */
f6373bf3 5140 cache_block_group(cache, 1);
e688b725 5141
fdf08605 5142 pin_down_extent(cache, bytenr, num_bytes, 0);
e688b725
CM
5143
5144 /* remove us from the free space cache (if we're there at all) */
b50c6e25 5145 ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
e688b725 5146 btrfs_put_block_group(cache);
b50c6e25 5147 return ret;
e688b725
CM
5148}
5149
2ff7e61e
JM
5150static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
5151 u64 start, u64 num_bytes)
8c2a1a30
JB
5152{
5153 int ret;
5154 struct btrfs_block_group_cache *block_group;
5155 struct btrfs_caching_control *caching_ctl;
5156
0b246afa 5157 block_group = btrfs_lookup_block_group(fs_info, start);
8c2a1a30
JB
5158 if (!block_group)
5159 return -EINVAL;
5160
5161 cache_block_group(block_group, 0);
5162 caching_ctl = get_caching_control(block_group);
5163
5164 if (!caching_ctl) {
5165 /* Logic error */
5166 BUG_ON(!block_group_cache_done(block_group));
5167 ret = btrfs_remove_free_space(block_group, start, num_bytes);
5168 } else {
5169 mutex_lock(&caching_ctl->mutex);
5170
5171 if (start >= caching_ctl->progress) {
2ff7e61e 5172 ret = add_excluded_extent(fs_info, start, num_bytes);
8c2a1a30
JB
5173 } else if (start + num_bytes <= caching_ctl->progress) {
5174 ret = btrfs_remove_free_space(block_group,
5175 start, num_bytes);
5176 } else {
5177 num_bytes = caching_ctl->progress - start;
5178 ret = btrfs_remove_free_space(block_group,
5179 start, num_bytes);
5180 if (ret)
5181 goto out_lock;
5182
5183 num_bytes = (start + num_bytes) -
5184 caching_ctl->progress;
5185 start = caching_ctl->progress;
2ff7e61e 5186 ret = add_excluded_extent(fs_info, start, num_bytes);
8c2a1a30
JB
5187 }
5188out_lock:
5189 mutex_unlock(&caching_ctl->mutex);
5190 put_caching_control(caching_ctl);
5191 }
5192 btrfs_put_block_group(block_group);
5193 return ret;
5194}
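
/*
 * The caching_ctl->progress handling above, with illustrative offsets only:
 * if the caching thread has progressed to 8MiB and we exclude a logged
 * extent covering [6MiB, 10MiB), then [6MiB, 8MiB) is removed from the
 * already-built free space cache and [8MiB, 10MiB) is recorded as an
 * excluded extent so the still-running caching thread skips it.
 */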
5195
bcdc428c 5196int btrfs_exclude_logged_extents(struct extent_buffer *eb)
8c2a1a30 5197{
bcdc428c 5198 struct btrfs_fs_info *fs_info = eb->fs_info;
8c2a1a30
JB
5199 struct btrfs_file_extent_item *item;
5200 struct btrfs_key key;
5201 int found_type;
5202 int i;
b89311ef 5203 int ret = 0;
8c2a1a30 5204
2ff7e61e 5205 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
8c2a1a30
JB
5206 return 0;
5207
5208 for (i = 0; i < btrfs_header_nritems(eb); i++) {
5209 btrfs_item_key_to_cpu(eb, &key, i);
5210 if (key.type != BTRFS_EXTENT_DATA_KEY)
5211 continue;
5212 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
5213 found_type = btrfs_file_extent_type(eb, item);
5214 if (found_type == BTRFS_FILE_EXTENT_INLINE)
5215 continue;
5216 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
5217 continue;
5218 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
5219 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
b89311ef
GJ
5220 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
5221 if (ret)
5222 break;
8c2a1a30
JB
5223 }
5224
b89311ef 5225 return ret;
8c2a1a30
JB
5226}
5227
9cfa3e34
FM
5228static void
5229btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
5230{
5231 atomic_inc(&bg->reservations);
5232}
5233
5234void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
5235 const u64 start)
5236{
5237 struct btrfs_block_group_cache *bg;
5238
5239 bg = btrfs_lookup_block_group(fs_info, start);
5240 ASSERT(bg);
5241 if (atomic_dec_and_test(&bg->reservations))
4625956a 5242 wake_up_var(&bg->reservations);
9cfa3e34
FM
5243 btrfs_put_block_group(bg);
5244}
5245
9cfa3e34
FM
5246void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
5247{
5248 struct btrfs_space_info *space_info = bg->space_info;
5249
5250 ASSERT(bg->ro);
5251
5252 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
5253 return;
5254
5255 /*
5256 * Our block group is read only but before we set it to read only,
5257 * some task might have allocated an extent from it already, but it
5258 * has not yet created a respective ordered extent (and added it to a
5259 * root's list of ordered extents).
5260 * Therefore wait for any task currently allocating extents, since the
5261 * block group's reservations counter is incremented while a read lock
5262 * on the groups' semaphore is held and decremented after releasing
5263 * the read access on that semaphore and creating the ordered extent.
5264 */
5265 down_write(&space_info->groups_sem);
5266 up_write(&space_info->groups_sem);
5267
4625956a 5268 wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
9cfa3e34
FM
5269}
5270
fb25e914 5271/**
4824f1f4 5272 * btrfs_add_reserved_bytes - update the block_group and space info counters
fb25e914 5273 * @cache: The cache we are manipulating
18513091
WX
5274 * @ram_bytes: The number of bytes of file content; this will be the same as
5275 * @num_bytes except for the compress path.
fb25e914 5276 * @num_bytes: The number of bytes in question
e570fd27 5277 * @delalloc: The blocks are allocated for the delalloc write
fb25e914 5278 *
745699ef
XW
5279 * This is called by the allocator when it reserves space. If this is a
5280 * reservation and the block group has become read only, we cannot make the
5281 * reservation and return -EAGAIN; otherwise this function always succeeds.
f0486c68 5282 */
4824f1f4 5283static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
18513091 5284 u64 ram_bytes, u64 num_bytes, int delalloc)
11833d66 5285{
fb25e914 5286 struct btrfs_space_info *space_info = cache->space_info;
f0486c68 5287 int ret = 0;
79787eaa 5288
fb25e914
JB
5289 spin_lock(&space_info->lock);
5290 spin_lock(&cache->lock);
4824f1f4
WX
5291 if (cache->ro) {
5292 ret = -EAGAIN;
fb25e914 5293 } else {
4824f1f4
WX
5294 cache->reserved += num_bytes;
5295 space_info->bytes_reserved += num_bytes;
bb96c4e5
JB
5296 btrfs_space_info_update_bytes_may_use(cache->fs_info,
5297 space_info, -ram_bytes);
e570fd27 5298 if (delalloc)
4824f1f4 5299 cache->delalloc_bytes += num_bytes;
324ae4df 5300 }
fb25e914
JB
5301 spin_unlock(&cache->lock);
5302 spin_unlock(&space_info->lock);
f0486c68 5303 return ret;
324ae4df 5304}
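
/*
 * Example of the ram_bytes/num_bytes distinction documented above (numbers
 * are illustrative): a 128KiB file range that compresses down to a 16KiB
 * extent calls this with ram_bytes == 128KiB and num_bytes == 16KiB, so the
 * full 128KiB of delalloc bytes_may_use is released while only 16KiB moves
 * into the block group's reserved counter.
 */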
9078a3e1 5305
4824f1f4
WX
5306/**
5307 * btrfs_free_reserved_bytes - update the block_group and space info counters
5308 * @cache: The cache we are manipulating
5309 * @num_bytes: The number of bytes in question
5310 * @delalloc: The blocks are allocated for the delalloc write
5311 *
5312 * This is called by somebody who is freeing space that was never actually used
5313 * on disk. For example, if you reserve some space for a new leaf in transaction
5314 * A and before transaction A commits you free that leaf, you call this to
5315 * release the bytes that were reserved for it.
5316 */
5317
556f3ca8 5318static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
5319 u64 num_bytes, int delalloc)
4824f1f4
WX
5320{
5321 struct btrfs_space_info *space_info = cache->space_info;
4824f1f4
WX
5322
5323 spin_lock(&space_info->lock);
5324 spin_lock(&cache->lock);
5325 if (cache->ro)
5326 space_info->bytes_readonly += num_bytes;
5327 cache->reserved -= num_bytes;
5328 space_info->bytes_reserved -= num_bytes;
21a94f7a 5329 space_info->max_extent_size = 0;
4824f1f4
WX
5330
5331 if (delalloc)
5332 cache->delalloc_bytes -= num_bytes;
5333 spin_unlock(&cache->lock);
5334 spin_unlock(&space_info->lock);
4824f1f4 5335}
8b74c03e 5336void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
e8569813 5337{
11833d66
YZ
5338 struct btrfs_caching_control *next;
5339 struct btrfs_caching_control *caching_ctl;
5340 struct btrfs_block_group_cache *cache;
e8569813 5341
9e351cc8 5342 down_write(&fs_info->commit_root_sem);
25179201 5343
11833d66
YZ
5344 list_for_each_entry_safe(caching_ctl, next,
5345 &fs_info->caching_block_groups, list) {
5346 cache = caching_ctl->block_group;
5347 if (block_group_cache_done(cache)) {
5348 cache->last_byte_to_unpin = (u64)-1;
5349 list_del_init(&caching_ctl->list);
5350 put_caching_control(caching_ctl);
e8569813 5351 } else {
11833d66 5352 cache->last_byte_to_unpin = caching_ctl->progress;
e8569813 5353 }
e8569813 5354 }
11833d66
YZ
5355
5356 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5357 fs_info->pinned_extents = &fs_info->freed_extents[1];
5358 else
5359 fs_info->pinned_extents = &fs_info->freed_extents[0];
5360
9e351cc8 5361 up_write(&fs_info->commit_root_sem);
8929ecfa
YZ
5362
5363 update_global_block_rsv(fs_info);
e8569813
ZY
5364}
5365
c759c4e1
JB
5366/*
5367 * Returns the free cluster for the given space info and sets empty_cluster to
5368 * what it should be based on the mount options.
5369 */
5370static struct btrfs_free_cluster *
2ff7e61e
JM
5371fetch_cluster_info(struct btrfs_fs_info *fs_info,
5372 struct btrfs_space_info *space_info, u64 *empty_cluster)
c759c4e1
JB
5373{
5374 struct btrfs_free_cluster *ret = NULL;
c759c4e1
JB
5375
5376 *empty_cluster = 0;
5377 if (btrfs_mixed_space_info(space_info))
5378 return ret;
5379
c759c4e1 5380 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
0b246afa 5381 ret = &fs_info->meta_alloc_cluster;
583b7231
HK
5382 if (btrfs_test_opt(fs_info, SSD))
5383 *empty_cluster = SZ_2M;
5384 else
ee22184b 5385 *empty_cluster = SZ_64K;
583b7231
HK
5386 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
5387 btrfs_test_opt(fs_info, SSD_SPREAD)) {
5388 *empty_cluster = SZ_2M;
0b246afa 5389 ret = &fs_info->data_alloc_cluster;
c759c4e1
JB
5390 }
5391
5392 return ret;
5393}
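
/*
 * Quick reference for the selection above, derived from the branches in
 * fetch_cluster_info() (mount options in parentheses):
 *
 *	mixed block groups	-> no cluster
 *	metadata (ssd)		-> meta_alloc_cluster, empty_cluster = 2MiB
 *	metadata (no ssd)	-> meta_alloc_cluster, empty_cluster = 64KiB
 *	data (ssd_spread)	-> data_alloc_cluster, empty_cluster = 2MiB
 *	other data		-> no cluster
 */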
5394
2ff7e61e
JM
5395static int unpin_extent_range(struct btrfs_fs_info *fs_info,
5396 u64 start, u64 end,
678886bd 5397 const bool return_free_space)
ccd467d6 5398{
11833d66 5399 struct btrfs_block_group_cache *cache = NULL;
7b398f8e
JB
5400 struct btrfs_space_info *space_info;
5401 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
c759c4e1 5402 struct btrfs_free_cluster *cluster = NULL;
11833d66 5403 u64 len;
c759c4e1
JB
5404 u64 total_unpinned = 0;
5405 u64 empty_cluster = 0;
7b398f8e 5406 bool readonly;
ccd467d6 5407
11833d66 5408 while (start <= end) {
7b398f8e 5409 readonly = false;
11833d66
YZ
5410 if (!cache ||
5411 start >= cache->key.objectid + cache->key.offset) {
5412 if (cache)
5413 btrfs_put_block_group(cache);
c759c4e1 5414 total_unpinned = 0;
11833d66 5415 cache = btrfs_lookup_block_group(fs_info, start);
79787eaa 5416 BUG_ON(!cache); /* Logic error */
c759c4e1 5417
2ff7e61e 5418 cluster = fetch_cluster_info(fs_info,
c759c4e1
JB
5419 cache->space_info,
5420 &empty_cluster);
5421 empty_cluster <<= 1;
11833d66
YZ
5422 }
5423
5424 len = cache->key.objectid + cache->key.offset - start;
5425 len = min(len, end + 1 - start);
5426
5427 if (start < cache->last_byte_to_unpin) {
5428 len = min(len, cache->last_byte_to_unpin - start);
678886bd
FM
5429 if (return_free_space)
5430 btrfs_add_free_space(cache, start, len);
11833d66
YZ
5431 }
5432
f0486c68 5433 start += len;
c759c4e1 5434 total_unpinned += len;
7b398f8e 5435 space_info = cache->space_info;
f0486c68 5436
c759c4e1
JB
5437 /*
5438 * If this space cluster has been marked as fragmented and we've
5439 * unpinned enough in this block group to potentially allow a
5440 * cluster to be created inside of it go ahead and clear the
5441 * fragmented check.
5442 */
5443 if (cluster && cluster->fragmented &&
5444 total_unpinned > empty_cluster) {
5445 spin_lock(&cluster->lock);
5446 cluster->fragmented = 0;
5447 spin_unlock(&cluster->lock);
5448 }
5449
7b398f8e 5450 spin_lock(&space_info->lock);
11833d66
YZ
5451 spin_lock(&cache->lock);
5452 cache->pinned -= len;
bb96c4e5 5453 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
c51e7bb1
JB
5454
5455 trace_btrfs_space_reservation(fs_info, "pinned",
5456 space_info->flags, len, 0);
4f4db217 5457 space_info->max_extent_size = 0;
dec59fa3
EL
5458 percpu_counter_add_batch(&space_info->total_bytes_pinned,
5459 -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
7b398f8e
JB
5460 if (cache->ro) {
5461 space_info->bytes_readonly += len;
5462 readonly = true;
5463 }
11833d66 5464 spin_unlock(&cache->lock);
957780eb
JB
5465 if (!readonly && return_free_space &&
5466 global_rsv->space_info == space_info) {
5467 u64 to_add = len;
92ac58ec 5468
7b398f8e
JB
5469 spin_lock(&global_rsv->lock);
5470 if (!global_rsv->full) {
957780eb
JB
5471 to_add = min(len, global_rsv->size -
5472 global_rsv->reserved);
5473 global_rsv->reserved += to_add;
bb96c4e5
JB
5474 btrfs_space_info_update_bytes_may_use(fs_info,
5475 space_info, to_add);
7b398f8e
JB
5476 if (global_rsv->reserved >= global_rsv->size)
5477 global_rsv->full = 1;
957780eb
JB
5478 trace_btrfs_space_reservation(fs_info,
5479 "space_info",
5480 space_info->flags,
5481 to_add, 1);
5482 len -= to_add;
7b398f8e
JB
5483 }
5484 spin_unlock(&global_rsv->lock);
957780eb
JB
5485 /* Add to any tickets we may have */
5486 if (len)
d44b72aa
JB
5487 btrfs_space_info_add_new_bytes(fs_info,
5488 space_info, len);
7b398f8e
JB
5489 }
5490 spin_unlock(&space_info->lock);
ccd467d6 5491 }
11833d66
YZ
5492
5493 if (cache)
5494 btrfs_put_block_group(cache);
ccd467d6
CM
5495 return 0;
5496}
5497
5ead2dd0 5498int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
a28ec197 5499{
5ead2dd0 5500 struct btrfs_fs_info *fs_info = trans->fs_info;
e33e17ee
JM
5501 struct btrfs_block_group_cache *block_group, *tmp;
5502 struct list_head *deleted_bgs;
11833d66 5503 struct extent_io_tree *unpin;
1a5bc167
CM
5504 u64 start;
5505 u64 end;
a28ec197 5506 int ret;
a28ec197 5507
11833d66
YZ
5508 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5509 unpin = &fs_info->freed_extents[1];
5510 else
5511 unpin = &fs_info->freed_extents[0];
5512
e33e17ee 5513 while (!trans->aborted) {
0e6ec385
FM
5514 struct extent_state *cached_state = NULL;
5515
d4b450cd 5516 mutex_lock(&fs_info->unused_bg_unpin_mutex);
1a5bc167 5517 ret = find_first_extent_bit(unpin, 0, &start, &end,
0e6ec385 5518 EXTENT_DIRTY, &cached_state);
d4b450cd
FM
5519 if (ret) {
5520 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
a28ec197 5521 break;
d4b450cd 5522 }
1f3c79a2 5523
0b246afa 5524 if (btrfs_test_opt(fs_info, DISCARD))
2ff7e61e 5525 ret = btrfs_discard_extent(fs_info, start,
5378e607 5526 end + 1 - start, NULL);
1f3c79a2 5527
0e6ec385 5528 clear_extent_dirty(unpin, start, end, &cached_state);
2ff7e61e 5529 unpin_extent_range(fs_info, start, end, true);
d4b450cd 5530 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
0e6ec385 5531 free_extent_state(cached_state);
b9473439 5532 cond_resched();
a28ec197 5533 }
817d52f8 5534
e33e17ee
JM
5535 /*
5536 * Transaction is finished. We don't need the lock anymore. We
5537 * do need to clean up the block groups in case of a transaction
5538 * abort.
5539 */
5540 deleted_bgs = &trans->transaction->deleted_bgs;
5541 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
5542 u64 trimmed = 0;
5543
5544 ret = -EROFS;
5545 if (!trans->aborted)
2ff7e61e 5546 ret = btrfs_discard_extent(fs_info,
e33e17ee
JM
5547 block_group->key.objectid,
5548 block_group->key.offset,
5549 &trimmed);
5550
5551 list_del_init(&block_group->bg_list);
5552 btrfs_put_block_group_trimming(block_group);
5553 btrfs_put_block_group(block_group);
5554
5555 if (ret) {
5556 const char *errstr = btrfs_decode_error(ret);
5557 btrfs_warn(fs_info,
913e1535 5558 "discard failed while removing blockgroup: errno=%d %s",
e33e17ee
JM
5559 ret, errstr);
5560 }
5561 }
5562
e20d96d6
CM
5563 return 0;
5564}
5565
5d4f98a2 5566static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
e72cb923
NB
5567 struct btrfs_delayed_ref_node *node, u64 parent,
5568 u64 root_objectid, u64 owner_objectid,
5569 u64 owner_offset, int refs_to_drop,
5570 struct btrfs_delayed_extent_op *extent_op)
a28ec197 5571{
e72cb923 5572 struct btrfs_fs_info *info = trans->fs_info;
e2fa7227 5573 struct btrfs_key key;
5d4f98a2 5574 struct btrfs_path *path;
1261ec42 5575 struct btrfs_root *extent_root = info->extent_root;
5f39d397 5576 struct extent_buffer *leaf;
5d4f98a2
YZ
5577 struct btrfs_extent_item *ei;
5578 struct btrfs_extent_inline_ref *iref;
a28ec197 5579 int ret;
5d4f98a2 5580 int is_data;
952fccac
CM
5581 int extent_slot = 0;
5582 int found_extent = 0;
5583 int num_to_del = 1;
5d4f98a2
YZ
5584 u32 item_size;
5585 u64 refs;
c682f9b3
QW
5586 u64 bytenr = node->bytenr;
5587 u64 num_bytes = node->num_bytes;
fcebe456 5588 int last_ref = 0;
0b246afa 5589 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
037e6390 5590
5caf2a00 5591 path = btrfs_alloc_path();
54aa1f4d
CM
5592 if (!path)
5593 return -ENOMEM;
5f26f772 5594
e4058b54 5595 path->reada = READA_FORWARD;
b9473439 5596 path->leave_spinning = 1;
5d4f98a2
YZ
5597
5598 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
5599 BUG_ON(!is_data && refs_to_drop != 1);
5600
3173a18f 5601 if (is_data)
897ca819 5602 skinny_metadata = false;
3173a18f 5603
fbe4801b
NB
5604 ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
5605 parent, root_objectid, owner_objectid,
5d4f98a2 5606 owner_offset);
7bb86316 5607 if (ret == 0) {
952fccac 5608 extent_slot = path->slots[0];
5d4f98a2
YZ
5609 while (extent_slot >= 0) {
5610 btrfs_item_key_to_cpu(path->nodes[0], &key,
952fccac 5611 extent_slot);
5d4f98a2 5612 if (key.objectid != bytenr)
952fccac 5613 break;
5d4f98a2
YZ
5614 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5615 key.offset == num_bytes) {
952fccac
CM
5616 found_extent = 1;
5617 break;
5618 }
3173a18f
JB
5619 if (key.type == BTRFS_METADATA_ITEM_KEY &&
5620 key.offset == owner_objectid) {
5621 found_extent = 1;
5622 break;
5623 }
952fccac
CM
5624 if (path->slots[0] - extent_slot > 5)
5625 break;
5d4f98a2 5626 extent_slot--;
952fccac 5627 }
a79865c6 5628
31840ae1 5629 if (!found_extent) {
5d4f98a2 5630 BUG_ON(iref);
87cc7a8a 5631 ret = remove_extent_backref(trans, path, NULL,
87bde3cd 5632 refs_to_drop,
fcebe456 5633 is_data, &last_ref);
005d6427 5634 if (ret) {
66642832 5635 btrfs_abort_transaction(trans, ret);
005d6427
DS
5636 goto out;
5637 }
b3b4aa74 5638 btrfs_release_path(path);
b9473439 5639 path->leave_spinning = 1;
5d4f98a2
YZ
5640
5641 key.objectid = bytenr;
5642 key.type = BTRFS_EXTENT_ITEM_KEY;
5643 key.offset = num_bytes;
5644
3173a18f
JB
5645 if (!is_data && skinny_metadata) {
5646 key.type = BTRFS_METADATA_ITEM_KEY;
5647 key.offset = owner_objectid;
5648 }
5649
31840ae1
ZY
5650 ret = btrfs_search_slot(trans, extent_root,
5651 &key, path, -1, 1);
3173a18f
JB
5652 if (ret > 0 && skinny_metadata && path->slots[0]) {
5653 /*
5654 * Couldn't find our skinny metadata item,
5655 * see if we have ye olde extent item.
5656 */
5657 path->slots[0]--;
5658 btrfs_item_key_to_cpu(path->nodes[0], &key,
5659 path->slots[0]);
5660 if (key.objectid == bytenr &&
5661 key.type == BTRFS_EXTENT_ITEM_KEY &&
5662 key.offset == num_bytes)
5663 ret = 0;
5664 }
5665
5666 if (ret > 0 && skinny_metadata) {
5667 skinny_metadata = false;
9ce49a0b 5668 key.objectid = bytenr;
3173a18f
JB
5669 key.type = BTRFS_EXTENT_ITEM_KEY;
5670 key.offset = num_bytes;
5671 btrfs_release_path(path);
5672 ret = btrfs_search_slot(trans, extent_root,
5673 &key, path, -1, 1);
5674 }
5675
f3465ca4 5676 if (ret) {
5d163e0e
JM
5677 btrfs_err(info,
5678 "umm, got %d back from search, was looking for %llu",
5679 ret, bytenr);
b783e62d 5680 if (ret > 0)
a4f78750 5681 btrfs_print_leaf(path->nodes[0]);
f3465ca4 5682 }
005d6427 5683 if (ret < 0) {
66642832 5684 btrfs_abort_transaction(trans, ret);
005d6427
DS
5685 goto out;
5686 }
31840ae1
ZY
5687 extent_slot = path->slots[0];
5688 }
fae7f21c 5689 } else if (WARN_ON(ret == -ENOENT)) {
a4f78750 5690 btrfs_print_leaf(path->nodes[0]);
c2cf52eb
SK
5691 btrfs_err(info,
5692 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
c1c9ff7c
GU
5693 bytenr, parent, root_objectid, owner_objectid,
5694 owner_offset);
66642832 5695 btrfs_abort_transaction(trans, ret);
c4a050bb 5696 goto out;
79787eaa 5697 } else {
66642832 5698 btrfs_abort_transaction(trans, ret);
005d6427 5699 goto out;
7bb86316 5700 }
5f39d397
CM
5701
5702 leaf = path->nodes[0];
5d4f98a2 5703 item_size = btrfs_item_size_nr(leaf, extent_slot);
6d8ff4e4 5704 if (unlikely(item_size < sizeof(*ei))) {
ba3c2b19
NB
5705 ret = -EINVAL;
5706 btrfs_print_v0_err(info);
5707 btrfs_abort_transaction(trans, ret);
5708 goto out;
5709 }
952fccac 5710 ei = btrfs_item_ptr(leaf, extent_slot,
123abc88 5711 struct btrfs_extent_item);
3173a18f
JB
5712 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
5713 key.type == BTRFS_EXTENT_ITEM_KEY) {
5d4f98a2
YZ
5714 struct btrfs_tree_block_info *bi;
5715 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
5716 bi = (struct btrfs_tree_block_info *)(ei + 1);
5717 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
5718 }
56bec294 5719
5d4f98a2 5720 refs = btrfs_extent_refs(leaf, ei);
32b02538 5721 if (refs < refs_to_drop) {
5d163e0e
JM
5722 btrfs_err(info,
5723 "trying to drop %d refs but we only have %Lu for bytenr %Lu",
5724 refs_to_drop, refs, bytenr);
32b02538 5725 ret = -EINVAL;
66642832 5726 btrfs_abort_transaction(trans, ret);
32b02538
JB
5727 goto out;
5728 }
56bec294 5729 refs -= refs_to_drop;
5f39d397 5730
5d4f98a2
YZ
5731 if (refs > 0) {
5732 if (extent_op)
5733 __run_delayed_extent_op(extent_op, leaf, ei);
5734 /*
5735 * In the case of inline back ref, reference count will
5736 * be updated by remove_extent_backref
952fccac 5737 */
5d4f98a2
YZ
5738 if (iref) {
5739 BUG_ON(!found_extent);
5740 } else {
5741 btrfs_set_extent_refs(leaf, ei, refs);
5742 btrfs_mark_buffer_dirty(leaf);
5743 }
5744 if (found_extent) {
87cc7a8a
NB
5745 ret = remove_extent_backref(trans, path, iref,
5746 refs_to_drop, is_data,
5747 &last_ref);
005d6427 5748 if (ret) {
66642832 5749 btrfs_abort_transaction(trans, ret);
005d6427
DS
5750 goto out;
5751 }
952fccac 5752 }
5d4f98a2 5753 } else {
5d4f98a2
YZ
5754 if (found_extent) {
5755 BUG_ON(is_data && refs_to_drop !=
9ed0dea0 5756 extent_data_ref_count(path, iref));
5d4f98a2
YZ
5757 if (iref) {
5758 BUG_ON(path->slots[0] != extent_slot);
5759 } else {
5760 BUG_ON(path->slots[0] != extent_slot + 1);
5761 path->slots[0] = extent_slot;
5762 num_to_del = 2;
5763 }
78fae27e 5764 }
b9473439 5765
fcebe456 5766 last_ref = 1;
952fccac
CM
5767 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5768 num_to_del);
005d6427 5769 if (ret) {
66642832 5770 btrfs_abort_transaction(trans, ret);
005d6427
DS
5771 goto out;
5772 }
b3b4aa74 5773 btrfs_release_path(path);
21af804c 5774
5d4f98a2 5775 if (is_data) {
5b4aacef 5776 ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
005d6427 5777 if (ret) {
66642832 5778 btrfs_abort_transaction(trans, ret);
005d6427
DS
5779 goto out;
5780 }
459931ec
CM
5781 }
5782
e7355e50 5783 ret = add_to_free_space_tree(trans, bytenr, num_bytes);
1e144fb8 5784 if (ret) {
66642832 5785 btrfs_abort_transaction(trans, ret);
1e144fb8
OS
5786 goto out;
5787 }
5788
6b279408 5789 ret = update_block_group(trans, bytenr, num_bytes, 0);
005d6427 5790 if (ret) {
66642832 5791 btrfs_abort_transaction(trans, ret);
005d6427
DS
5792 goto out;
5793 }
a28ec197 5794 }
fcebe456
JB
5795 btrfs_release_path(path);
5796
79787eaa 5797out:
5caf2a00 5798 btrfs_free_path(path);
a28ec197
CM
5799 return ret;
5800}
5801
1887be66 5802/*
f0486c68 5803 * when we free a block, it is possible (and likely) that we free the last
1887be66
CM
5804 * delayed ref for that extent as well. This searches the delayed ref tree for
5805 * a given extent, and if there are no other delayed refs to be processed, it
5806 * removes it from the tree.
5807 */
5808static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2ff7e61e 5809 u64 bytenr)
1887be66
CM
5810{
5811 struct btrfs_delayed_ref_head *head;
5812 struct btrfs_delayed_ref_root *delayed_refs;
f0486c68 5813 int ret = 0;
1887be66
CM
5814
5815 delayed_refs = &trans->transaction->delayed_refs;
5816 spin_lock(&delayed_refs->lock);
f72ad18e 5817 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
1887be66 5818 if (!head)
cf93da7b 5819 goto out_delayed_unlock;
1887be66 5820
d7df2c79 5821 spin_lock(&head->lock);
e3d03965 5822 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1887be66
CM
5823 goto out;
5824
bedc6617
JB
5825 if (cleanup_extent_op(head) != NULL)
5826 goto out;
5d4f98a2 5827
1887be66
CM
5828 /*
5829 * waiting for the lock here would deadlock. If someone else has it
5830 * locked they are already in the process of dropping it anyway
5831 */
5832 if (!mutex_trylock(&head->mutex))
5833 goto out;
5834
d7baffda 5835 btrfs_delete_ref_head(delayed_refs, head);
d7df2c79 5836 head->processing = 0;
d7baffda 5837
d7df2c79 5838 spin_unlock(&head->lock);
1887be66
CM
5839 spin_unlock(&delayed_refs->lock);
5840
f0486c68
YZ
5841 BUG_ON(head->extent_op);
5842 if (head->must_insert_reserved)
5843 ret = 1;
5844
31890da0 5845 btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
f0486c68 5846 mutex_unlock(&head->mutex);
d278850e 5847 btrfs_put_delayed_ref_head(head);
f0486c68 5848 return ret;
1887be66 5849out:
d7df2c79 5850 spin_unlock(&head->lock);
cf93da7b
CM
5851
5852out_delayed_unlock:
1887be66
CM
5853 spin_unlock(&delayed_refs->lock);
5854 return 0;
5855}
5856
f0486c68
YZ
5857void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5858 struct btrfs_root *root,
5859 struct extent_buffer *buf,
5581a51a 5860 u64 parent, int last_ref)
f0486c68 5861{
0b246afa 5862 struct btrfs_fs_info *fs_info = root->fs_info;
ed4f255b 5863 struct btrfs_ref generic_ref = { 0 };
b150a4f1 5864 int pin = 1;
f0486c68
YZ
5865 int ret;
5866
ed4f255b
QW
5867 btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
5868 buf->start, buf->len, parent);
5869 btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
5870 root->root_key.objectid);
5871
f0486c68 5872 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
d7eae340
OS
5873 int old_ref_mod, new_ref_mod;
5874
8a5040f7 5875 btrfs_ref_tree_mod(fs_info, &generic_ref);
ed4f255b 5876 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
d7eae340 5877 &old_ref_mod, &new_ref_mod);
79787eaa 5878 BUG_ON(ret); /* -ENOMEM */
d7eae340 5879 pin = old_ref_mod >= 0 && new_ref_mod < 0;
f0486c68
YZ
5880 }
5881
0a16c7d7 5882 if (last_ref && btrfs_header_generation(buf) == trans->transid) {
6219872d
FM
5883 struct btrfs_block_group_cache *cache;
5884
f0486c68 5885 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
2ff7e61e 5886 ret = check_ref_cleanup(trans, buf->start);
f0486c68 5887 if (!ret)
37be25bc 5888 goto out;
f0486c68
YZ
5889 }
5890
4da8b76d 5891 pin = 0;
0b246afa 5892 cache = btrfs_lookup_block_group(fs_info, buf->start);
6219872d 5893
f0486c68 5894 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
fdf08605 5895 pin_down_extent(cache, buf->start, buf->len, 1);
6219872d 5896 btrfs_put_block_group(cache);
37be25bc 5897 goto out;
f0486c68
YZ
5898 }
5899
5900 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
5901
5902 btrfs_add_free_space(cache, buf->start, buf->len);
4824f1f4 5903 btrfs_free_reserved_bytes(cache, buf->len, 0);
6219872d 5904 btrfs_put_block_group(cache);
71ff6437 5905 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
f0486c68
YZ
5906 }
5907out:
b150a4f1 5908 if (pin)
78192442 5909 add_pinned_bytes(fs_info, &generic_ref);
b150a4f1 5910
0a16c7d7
OS
5911 if (last_ref) {
5912 /*
5913 * Deleting the buffer, clear the corrupt flag since it doesn't
5914 * matter anymore.
5915 */
5916 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
5917 }
f0486c68
YZ
5918}
5919
79787eaa 5920/* Can return -ENOMEM */
ffd4bb2a 5921int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
925baedd 5922{
ffd4bb2a 5923 struct btrfs_fs_info *fs_info = trans->fs_info;
d7eae340 5924 int old_ref_mod, new_ref_mod;
925baedd
CM
5925 int ret;
5926
f5ee5c9a 5927 if (btrfs_is_testing(fs_info))
faa2dbf0 5928 return 0;
fccb84c9 5929
56bec294
CM
5930 /*
5931 * tree log blocks never actually go into the extent allocation
5932 * tree, just update pinning info and exit early.
56bec294 5933 */
ffd4bb2a
QW
5934 if ((ref->type == BTRFS_REF_METADATA &&
5935 ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
5936 (ref->type == BTRFS_REF_DATA &&
5937 ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
b9473439 5938 /* unlocks the pinned mutex */
ffd4bb2a 5939 btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
d7eae340 5940 old_ref_mod = new_ref_mod = 0;
56bec294 5941 ret = 0;
ffd4bb2a
QW
5942 } else if (ref->type == BTRFS_REF_METADATA) {
5943 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
d7eae340 5944 &old_ref_mod, &new_ref_mod);
5d4f98a2 5945 } else {
ffd4bb2a 5946 ret = btrfs_add_delayed_data_ref(trans, ref, 0,
d7eae340 5947 &old_ref_mod, &new_ref_mod);
56bec294 5948 }
d7eae340 5949
ffd4bb2a
QW
5950 if (!((ref->type == BTRFS_REF_METADATA &&
5951 ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
5952 (ref->type == BTRFS_REF_DATA &&
5953 ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
5954 btrfs_ref_tree_mod(fs_info, ref);
8a5040f7 5955
ddf30cf0 5956 if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
78192442 5957 add_pinned_bytes(fs_info, ref);
d7eae340 5958
925baedd
CM
5959 return ret;
5960}
5961
817d52f8
JB
5962/*
5963 * when we wait for progress in the block group caching, it's because
5964 * our allocation attempt failed at least once. So, we must sleep
5965 * and let some progress happen before we try again.
5966 *
5967 * This function will sleep at least once waiting for new free space to
5968 * show up, and then it will check the block group free space numbers
5969 * for our min num_bytes. Another option is to have it go ahead
5970 * and look in the rbtree for a free extent of a given size, but this
5971 * is a good start.
36cce922
JB
5972 *
5973 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
5974 * any of the information in this block group.
817d52f8 5975 */
36cce922 5976static noinline void
817d52f8
JB
5977wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5978 u64 num_bytes)
5979{
11833d66 5980 struct btrfs_caching_control *caching_ctl;
817d52f8 5981
11833d66
YZ
5982 caching_ctl = get_caching_control(cache);
5983 if (!caching_ctl)
36cce922 5984 return;
817d52f8 5985
11833d66 5986 wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
34d52cb6 5987 (cache->free_space_ctl->free_space >= num_bytes));
11833d66
YZ
5988
5989 put_caching_control(caching_ctl);
11833d66
YZ
5990}
5991
5992static noinline int
5993wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5994{
5995 struct btrfs_caching_control *caching_ctl;
36cce922 5996 int ret = 0;
11833d66
YZ
5997
5998 caching_ctl = get_caching_control(cache);
5999 if (!caching_ctl)
36cce922 6000 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
11833d66
YZ
6001
6002 wait_event(caching_ctl->wait, block_group_cache_done(cache));
36cce922
JB
6003 if (cache->cached == BTRFS_CACHE_ERROR)
6004 ret = -EIO;
11833d66 6005 put_caching_control(caching_ctl);
36cce922 6006 return ret;
817d52f8
JB
6007}
6008
6009enum btrfs_loop_type {
f262fa8d
DS
6010 LOOP_CACHING_NOWAIT,
6011 LOOP_CACHING_WAIT,
6012 LOOP_ALLOC_CHUNK,
6013 LOOP_NO_EMPTY_SIZE,
817d52f8
JB
6014};
6015
e570fd27
MX
6016static inline void
6017btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
6018 int delalloc)
6019{
6020 if (delalloc)
6021 down_read(&cache->data_rwsem);
6022}
6023
6024static inline void
6025btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
6026 int delalloc)
6027{
6028 btrfs_get_block_group(cache);
6029 if (delalloc)
6030 down_read(&cache->data_rwsem);
6031}
6032
6033static struct btrfs_block_group_cache *
6034btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
6035 struct btrfs_free_cluster *cluster,
6036 int delalloc)
6037{
89771cc9 6038 struct btrfs_block_group_cache *used_bg = NULL;
6719afdc 6039
e570fd27 6040 spin_lock(&cluster->refill_lock);
6719afdc
GU
6041 while (1) {
6042 used_bg = cluster->block_group;
6043 if (!used_bg)
6044 return NULL;
6045
6046 if (used_bg == block_group)
e570fd27
MX
6047 return used_bg;
6048
6719afdc 6049 btrfs_get_block_group(used_bg);
e570fd27 6050
6719afdc
GU
6051 if (!delalloc)
6052 return used_bg;
e570fd27 6053
6719afdc
GU
6054 if (down_read_trylock(&used_bg->data_rwsem))
6055 return used_bg;
e570fd27 6056
6719afdc 6057 spin_unlock(&cluster->refill_lock);
e570fd27 6058
e321f8a8
LB
6059 /* We should only have one-level nested. */
6060 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
e570fd27 6061
6719afdc
GU
6062 spin_lock(&cluster->refill_lock);
6063 if (used_bg == cluster->block_group)
6064 return used_bg;
e570fd27 6065
6719afdc
GU
6066 up_read(&used_bg->data_rwsem);
6067 btrfs_put_block_group(used_bg);
6068 }
e570fd27
MX
6069}
6070
6071static inline void
6072btrfs_release_block_group(struct btrfs_block_group_cache *cache,
6073 int delalloc)
6074{
6075 if (delalloc)
6076 up_read(&cache->data_rwsem);
6077 btrfs_put_block_group(cache);
6078}
6079
b4bd745d
QW
6080/*
6081 * Structure used internally for find_free_extent() function. Wraps needed
6082 * parameters.
6083 */
6084struct find_free_extent_ctl {
6085 /* Basic allocation info */
6086 u64 ram_bytes;
6087 u64 num_bytes;
6088 u64 empty_size;
6089 u64 flags;
6090 int delalloc;
6091
6092 /* Where to start the search inside the bg */
6093 u64 search_start;
6094
6095 /* For clustered allocation */
6096 u64 empty_cluster;
6097
6098 bool have_caching_bg;
6099 bool orig_have_caching_bg;
6100
6101 /* RAID index, converted from flags */
6102 int index;
6103
e72d79d6
QW
6104 /*
6105 * Current loop number, check find_free_extent_update_loop() for details
6106 */
b4bd745d
QW
6107 int loop;
6108
6109 /*
6110 * Whether we're refilling a cluster, if true we need to re-search
6111 * current block group but don't try to refill the cluster again.
6112 */
6113 bool retry_clustered;
6114
6115 /*
6116 * Whether we're updating free space cache, if true we need to re-search
6117 * current block group but don't try updating free space cache again.
6118 */
6119 bool retry_unclustered;
6120
6121 /* If current block group is cached */
6122 int cached;
6123
6124 /* Max contiguous hole found */
6125 u64 max_extent_size;
6126
6127 /* Total free space from free space cache, not always contiguous */
6128 u64 total_free_space;
6129
6130 /* Found result */
6131 u64 found_offset;
6132};
6133
d06e3bb6
QW
6134
6135/*
6136 * Helper function for find_free_extent().
6137 *
6138 * Return -ENOENT to inform caller that we need to fall back to unclustered mode.
6139 * Return -EAGAIN to inform caller that we need to re-search this block group
6140 * Return >0 to inform caller that we found nothing
6141 * Return 0 means we have found a location and set ffe_ctl->found_offset.
6142 */
6143static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
6144 struct btrfs_free_cluster *last_ptr,
6145 struct find_free_extent_ctl *ffe_ctl,
6146 struct btrfs_block_group_cache **cluster_bg_ret)
6147{
d06e3bb6
QW
6148 struct btrfs_block_group_cache *cluster_bg;
6149 u64 aligned_cluster;
6150 u64 offset;
6151 int ret;
6152
6153 cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
6154 if (!cluster_bg)
6155 goto refill_cluster;
6156 if (cluster_bg != bg && (cluster_bg->ro ||
6157 !block_group_bits(cluster_bg, ffe_ctl->flags)))
6158 goto release_cluster;
6159
6160 offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
6161 ffe_ctl->num_bytes, cluster_bg->key.objectid,
6162 &ffe_ctl->max_extent_size);
6163 if (offset) {
6164 /* We have a block, we're done */
6165 spin_unlock(&last_ptr->refill_lock);
6166 trace_btrfs_reserve_extent_cluster(cluster_bg,
6167 ffe_ctl->search_start, ffe_ctl->num_bytes);
6168 *cluster_bg_ret = cluster_bg;
6169 ffe_ctl->found_offset = offset;
6170 return 0;
6171 }
6172 WARN_ON(last_ptr->block_group != cluster_bg);
6173
6174release_cluster:
6175 /*
6176 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
6177 * let's just skip it and let the allocator find whatever block it can
6178 * find. If we reach this point, we will have tried the cluster
6179 * allocator plenty of times and not have found anything, so we are
6180 * likely way too fragmented for the clustering stuff to find anything.
6181 *
6182 * However, if the cluster is taken from the current block group,
6183 * release the cluster first, so that we stand a better chance of
6184 * succeeding in the unclustered allocation.
6185 */
6186 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
6187 spin_unlock(&last_ptr->refill_lock);
6188 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
6189 return -ENOENT;
6190 }
6191
6192 /* This cluster didn't work out, free it and start over */
6193 btrfs_return_cluster_to_free_space(NULL, last_ptr);
6194
6195 if (cluster_bg != bg)
6196 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
6197
6198refill_cluster:
6199 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
6200 spin_unlock(&last_ptr->refill_lock);
6201 return -ENOENT;
6202 }
6203
6204 aligned_cluster = max_t(u64,
6205 ffe_ctl->empty_cluster + ffe_ctl->empty_size,
6206 bg->full_stripe_len);
2ceeae2e
DS
6207 ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
6208 ffe_ctl->num_bytes, aligned_cluster);
d06e3bb6
QW
6209 if (ret == 0) {
6210 /* Now pull our allocation out of this cluster */
6211 offset = btrfs_alloc_from_cluster(bg, last_ptr,
6212 ffe_ctl->num_bytes, ffe_ctl->search_start,
6213 &ffe_ctl->max_extent_size);
6214 if (offset) {
6215 /* We found one, proceed */
6216 spin_unlock(&last_ptr->refill_lock);
6217 trace_btrfs_reserve_extent_cluster(bg,
6218 ffe_ctl->search_start,
6219 ffe_ctl->num_bytes);
6220 ffe_ctl->found_offset = offset;
6221 return 0;
6222 }
6223 } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
6224 !ffe_ctl->retry_clustered) {
6225 spin_unlock(&last_ptr->refill_lock);
6226
6227 ffe_ctl->retry_clustered = true;
6228 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
6229 ffe_ctl->empty_cluster + ffe_ctl->empty_size);
6230 return -EAGAIN;
6231 }
6232 /*
6233 * At this point we either didn't find a cluster or we weren't able to
6234 * allocate a block from our cluster. Free the cluster we've been
6235 * trying to use, and go to the next block group.
6236 */
6237 btrfs_return_cluster_to_free_space(NULL, last_ptr);
6238 spin_unlock(&last_ptr->refill_lock);
6239 return 1;
6240}
6241
e1a41848
QW
6242/*
6243 * Return >0 to inform caller that we found nothing
6244 * Return 0 when we found a free extent and set ffe_ctl->found_offset
6245 * Return -EAGAIN to inform caller that we need to re-search this block group
6246 */
6247static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
6248 struct btrfs_free_cluster *last_ptr,
6249 struct find_free_extent_ctl *ffe_ctl)
6250{
6251 u64 offset;
6252
6253 /*
6254 * We are doing an unclustered allocation; set the fragmented flag so
6255 * we don't bother trying to set up a cluster again until we get more
6256 * space.
6257 */
6258 if (unlikely(last_ptr)) {
6259 spin_lock(&last_ptr->lock);
6260 last_ptr->fragmented = 1;
6261 spin_unlock(&last_ptr->lock);
6262 }
6263 if (ffe_ctl->cached) {
6264 struct btrfs_free_space_ctl *free_space_ctl;
6265
6266 free_space_ctl = bg->free_space_ctl;
6267 spin_lock(&free_space_ctl->tree_lock);
6268 if (free_space_ctl->free_space <
6269 ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
6270 ffe_ctl->empty_size) {
6271 ffe_ctl->total_free_space = max_t(u64,
6272 ffe_ctl->total_free_space,
6273 free_space_ctl->free_space);
6274 spin_unlock(&free_space_ctl->tree_lock);
6275 return 1;
6276 }
6277 spin_unlock(&free_space_ctl->tree_lock);
6278 }
6279
6280 offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
6281 ffe_ctl->num_bytes, ffe_ctl->empty_size,
6282 &ffe_ctl->max_extent_size);
6283
6284 /*
6285 * If we didn't find a chunk, and we haven't failed on this block group
6286 * before, and this block group is in the middle of caching and we are
6287 * ok with waiting, then go ahead and wait for progress to be made, and
6288 * set @retry_unclustered to true.
6289 *
6290 * If @retry_unclustered is true then we've already waited on this
6291 * block group once and should move on to the next block group.
6292 */
6293 if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
6294 ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
6295 wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
6296 ffe_ctl->empty_size);
6297 ffe_ctl->retry_unclustered = true;
6298 return -EAGAIN;
6299 } else if (!offset) {
6300 return 1;
6301 }
6302 ffe_ctl->found_offset = offset;
6303 return 0;
6304}
6305
e72d79d6
QW
6306/*
6307 * Return >0 means caller needs to re-search for free extent
6308 * Return 0 means we have the needed free extent.
6309 * Return <0 means we failed to locate any free extent.
6310 */
6311static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
6312 struct btrfs_free_cluster *last_ptr,
6313 struct btrfs_key *ins,
6314 struct find_free_extent_ctl *ffe_ctl,
6315 int full_search, bool use_cluster)
6316{
6317 struct btrfs_root *root = fs_info->extent_root;
6318 int ret;
6319
6320 if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
6321 ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
6322 ffe_ctl->orig_have_caching_bg = true;
6323
6324 if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
6325 ffe_ctl->have_caching_bg)
6326 return 1;
6327
6328 if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
6329 return 1;
6330
6331 if (ins->objectid) {
6332 if (!use_cluster && last_ptr) {
6333 spin_lock(&last_ptr->lock);
6334 last_ptr->window_start = ins->objectid;
6335 spin_unlock(&last_ptr->lock);
6336 }
6337 return 0;
6338 }
6339
6340 /*
6341 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
6342 * caching kthreads as we move along
6343 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
6344 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
6345 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
6346 * again
6347 */
6348 if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
6349 ffe_ctl->index = 0;
6350 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
6351 /*
6352 * We want to skip the LOOP_CACHING_WAIT step if we
6353 * don't have any uncached bgs and we've already done a
6354 * full search through.
6355 */
6356 if (ffe_ctl->orig_have_caching_bg || !full_search)
6357 ffe_ctl->loop = LOOP_CACHING_WAIT;
6358 else
6359 ffe_ctl->loop = LOOP_ALLOC_CHUNK;
6360 } else {
6361 ffe_ctl->loop++;
6362 }
6363
6364 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
6365 struct btrfs_trans_handle *trans;
6366 int exist = 0;
6367
6368 trans = current->journal_info;
6369 if (trans)
6370 exist = 1;
6371 else
6372 trans = btrfs_join_transaction(root);
6373
6374 if (IS_ERR(trans)) {
6375 ret = PTR_ERR(trans);
6376 return ret;
6377 }
6378
fc471cb0
JB
6379 ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
6380 CHUNK_ALLOC_FORCE);
e72d79d6
QW
6381
6382 /*
6383 * If we can't allocate a new chunk we've already looped
6384 * through at least once, move on to the NO_EMPTY_SIZE
6385 * case.
6386 */
6387 if (ret == -ENOSPC)
6388 ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
6389
6390 /* Do not bail out on ENOSPC since we can do more. */
6391 if (ret < 0 && ret != -ENOSPC)
6392 btrfs_abort_transaction(trans, ret);
6393 else
6394 ret = 0;
6395 if (!exist)
6396 btrfs_end_transaction(trans);
6397 if (ret)
6398 return ret;
6399 }
6400
6401 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
6402 /*
6403 * Don't loop again if we already have no empty_size and
6404 * no empty_cluster.
6405 */
6406 if (ffe_ctl->empty_size == 0 &&
6407 ffe_ctl->empty_cluster == 0)
6408 return -ENOSPC;
6409 ffe_ctl->empty_size = 0;
6410 ffe_ctl->empty_cluster = 0;
6411 }
6412 return 1;
6413 }
6414 return -ENOSPC;
6415}
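
/*
 * Putting the stages above together, a single find_free_extent() call
 * escalates roughly as follows while nothing suitable is found (a sketch of
 * the logic above, not an exhaustive trace):
 *
 *	LOOP_CACHING_NOWAIT	search only already-cached block groups
 *	LOOP_CACHING_WAIT	also wait for in-progress caching to make progress
 *	LOOP_ALLOC_CHUNK	force-allocate a new chunk and retry
 *	LOOP_NO_EMPTY_SIZE	retry with empty_size and empty_cluster forced to 0
 *	still nothing		-> -ENOSPC
 */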
6416
fec577fb
CM
6417/*
6418 * walks the btree of allocated extents and finds a hole of a given size.
6419 * The key ins is changed to record the hole:
a4820398 6420 * ins->objectid == start position
62e2749e 6421 * ins->flags = BTRFS_EXTENT_ITEM_KEY
a4820398 6422 * ins->offset == the size of the hole.
fec577fb 6423 * Any available blocks before search_start are skipped.
a4820398
MX
6424 *
6425 * If there is no suitable free space, we will record the max size of
6426 * the free space extent currently.
e72d79d6
QW
6427 *
6428 * The overall logic and call chain:
6429 *
6430 * find_free_extent()
6431 * |- Iterate through all block groups
6432 * | |- Get a valid block group
6433 * | |- Try to do clustered allocation in that block group
6434 * | |- Try to do unclustered allocation in that block group
6435 * | |- Check if the result is valid
6436 * | | |- If valid, then exit
6437 * | |- Jump to next block group
6438 * |
6439 * |- Push harder to find free extents
6440 * |- If not found, re-iterate all block groups
fec577fb 6441 */
87bde3cd 6442static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
18513091
WX
6443 u64 ram_bytes, u64 num_bytes, u64 empty_size,
6444 u64 hint_byte, struct btrfs_key *ins,
6445 u64 flags, int delalloc)
fec577fb 6446{
80eb234a 6447 int ret = 0;
fa9c0d79 6448 struct btrfs_free_cluster *last_ptr = NULL;
80eb234a 6449 struct btrfs_block_group_cache *block_group = NULL;
b4bd745d 6450 struct find_free_extent_ctl ffe_ctl = {0};
80eb234a 6451 struct btrfs_space_info *space_info;
67377734 6452 bool use_cluster = true;
a5e681d9 6453 bool full_search = false;
fec577fb 6454
0b246afa 6455 WARN_ON(num_bytes < fs_info->sectorsize);
b4bd745d
QW
6456
6457 ffe_ctl.ram_bytes = ram_bytes;
6458 ffe_ctl.num_bytes = num_bytes;
6459 ffe_ctl.empty_size = empty_size;
6460 ffe_ctl.flags = flags;
6461 ffe_ctl.search_start = 0;
6462 ffe_ctl.retry_clustered = false;
6463 ffe_ctl.retry_unclustered = false;
6464 ffe_ctl.delalloc = delalloc;
6465 ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
6466 ffe_ctl.have_caching_bg = false;
6467 ffe_ctl.orig_have_caching_bg = false;
6468 ffe_ctl.found_offset = 0;
6469
962a298f 6470 ins->type = BTRFS_EXTENT_ITEM_KEY;
80eb234a
JB
6471 ins->objectid = 0;
6472 ins->offset = 0;
b1a4d965 6473
71ff6437 6474 trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
3f7de037 6475
280c2908 6476 space_info = btrfs_find_space_info(fs_info, flags);
1b1d1f66 6477 if (!space_info) {
0b246afa 6478 btrfs_err(fs_info, "No space info for %llu", flags);
1b1d1f66
JB
6479 return -ENOSPC;
6480 }
2552d17e 6481
67377734 6482 /*
4f4db217
JB
6483 * If our free space is heavily fragmented we may not be able to make
6484 * big contiguous allocations, so instead of doing the expensive search
6485 * for free space, simply return ENOSPC with our max_extent_size so we
6486 * can go ahead and search for a more manageable chunk.
6487 *
6488 * If our max_extent_size is large enough for our allocation simply
6489 * disable clustering since we will likely not be able to find enough
6490 * space to create a cluster and induce latency trying.
67377734 6491 */
4f4db217
JB
6492 if (unlikely(space_info->max_extent_size)) {
6493 spin_lock(&space_info->lock);
6494 if (space_info->max_extent_size &&
6495 num_bytes > space_info->max_extent_size) {
6496 ins->offset = space_info->max_extent_size;
6497 spin_unlock(&space_info->lock);
6498 return -ENOSPC;
6499 } else if (space_info->max_extent_size) {
6500 use_cluster = false;
6501 }
6502 spin_unlock(&space_info->lock);
fa9c0d79 6503 }
0f9dd46c 6504
b4bd745d
QW
6505 last_ptr = fetch_cluster_info(fs_info, space_info,
6506 &ffe_ctl.empty_cluster);
239b14b3 6507 if (last_ptr) {
fa9c0d79
CM
6508 spin_lock(&last_ptr->lock);
6509 if (last_ptr->block_group)
6510 hint_byte = last_ptr->window_start;
c759c4e1
JB
6511 if (last_ptr->fragmented) {
6512 /*
6513 * We still set window_start so we can keep track of the
6514 * last place we found an allocation to try and save
6515 * some time.
6516 */
6517 hint_byte = last_ptr->window_start;
6518 use_cluster = false;
6519 }
fa9c0d79 6520 spin_unlock(&last_ptr->lock);
239b14b3 6521 }
fa9c0d79 6522
b4bd745d
QW
6523 ffe_ctl.search_start = max(ffe_ctl.search_start,
6524 first_logical_byte(fs_info, 0));
6525 ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
6526 if (ffe_ctl.search_start == hint_byte) {
6527 block_group = btrfs_lookup_block_group(fs_info,
6528 ffe_ctl.search_start);
817d52f8
JB
6529 /*
6530 * we don't want to use the block group if it doesn't match our
6531 * allocation bits, or if it's not cached.
ccf0e725
JB
6532 *
6533 * However, if we are re-searching with an ideal block group
6534 * picked out, then we don't care whether the block group is cached.
817d52f8 6535 */
b6919a58 6536 if (block_group && block_group_bits(block_group, flags) &&
285ff5af 6537 block_group->cached != BTRFS_CACHE_NO) {
2552d17e 6538 down_read(&space_info->groups_sem);
44fb5511
CM
6539 if (list_empty(&block_group->list) ||
6540 block_group->ro) {
6541 /*
6542 * someone is removing this block group,
6543 * we can't jump into the have_block_group
6544 * target because our list pointers are not
6545 * valid
6546 */
6547 btrfs_put_block_group(block_group);
6548 up_read(&space_info->groups_sem);
ccf0e725 6549 } else {
b4bd745d 6550 ffe_ctl.index = btrfs_bg_flags_to_raid_index(
3e72ee88 6551 block_group->flags);
e570fd27 6552 btrfs_lock_block_group(block_group, delalloc);
44fb5511 6553 goto have_block_group;
ccf0e725 6554 }
2552d17e 6555 } else if (block_group) {
fa9c0d79 6556 btrfs_put_block_group(block_group);
2552d17e 6557 }
42e70e7a 6558 }
2552d17e 6559search:
b4bd745d
QW
6560 ffe_ctl.have_caching_bg = false;
6561 if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
6562 ffe_ctl.index == 0)
a5e681d9 6563 full_search = true;
80eb234a 6564 down_read(&space_info->groups_sem);
b4bd745d
QW
6565 list_for_each_entry(block_group,
6566 &space_info->block_groups[ffe_ctl.index], list) {
14443937
JM
6567 /* If the block group is read-only, we can skip it entirely. */
6568 if (unlikely(block_group->ro))
6569 continue;
6570
e570fd27 6571 btrfs_grab_block_group(block_group, delalloc);
b4bd745d 6572 ffe_ctl.search_start = block_group->key.objectid;
42e70e7a 6573
83a50de9
CM
6574 /*
6575 * this can happen if we end up cycling through all the
6576 * raid types, but we want to make sure we only allocate
6577 * for the proper type.
6578 */
b6919a58 6579 if (!block_group_bits(block_group, flags)) {
bece2e82 6580 u64 extra = BTRFS_BLOCK_GROUP_DUP |
c7369b3f 6581 BTRFS_BLOCK_GROUP_RAID1_MASK |
a07e8a46 6582 BTRFS_BLOCK_GROUP_RAID56_MASK |
83a50de9
CM
6583 BTRFS_BLOCK_GROUP_RAID10;
6584
6585 /*
6586 * if they asked for extra copies and this block group
6587 * doesn't provide them, bail. This does allow us to
6588 * fill raid0 from raid1.
6589 */
b6919a58 6590 if ((flags & extra) && !(block_group->flags & extra))
83a50de9
CM
6591 goto loop;
6592 }
6593
2552d17e 6594have_block_group:
b4bd745d
QW
6595 ffe_ctl.cached = block_group_cache_done(block_group);
6596 if (unlikely(!ffe_ctl.cached)) {
6597 ffe_ctl.have_caching_bg = true;
f6373bf3 6598 ret = cache_block_group(block_group, 0);
1d4284bd
CM
6599 BUG_ON(ret < 0);
6600 ret = 0;
817d52f8
JB
6601 }
6602
36cce922
JB
6603 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
6604 goto loop;
0f9dd46c 6605
0a24325e 6606 /*
062c05c4
AO
6607 * Ok we want to try and use the cluster allocator, so
6608 * let's look there
0a24325e 6609 */
c759c4e1 6610 if (last_ptr && use_cluster) {
d06e3bb6 6611 struct btrfs_block_group_cache *cluster_bg = NULL;
fa9c0d79 6612
d06e3bb6
QW
6613 ret = find_free_extent_clustered(block_group, last_ptr,
6614 &ffe_ctl, &cluster_bg);
062c05c4 6615
fa9c0d79 6616 if (ret == 0) {
d06e3bb6
QW
6617 if (cluster_bg && cluster_bg != block_group) {
6618 btrfs_release_block_group(block_group,
6619 delalloc);
6620 block_group = cluster_bg;
fa9c0d79 6621 }
d06e3bb6
QW
6622 goto checks;
6623 } else if (ret == -EAGAIN) {
817d52f8 6624 goto have_block_group;
d06e3bb6
QW
6625 } else if (ret > 0) {
6626 goto loop;
fa9c0d79 6627 }
d06e3bb6 6628 /* ret == -ENOENT case falls through */
fa9c0d79
CM
6629 }
6630
e1a41848
QW
6631 ret = find_free_extent_unclustered(block_group, last_ptr,
6632 &ffe_ctl);
6633 if (ret == -EAGAIN)
817d52f8 6634 goto have_block_group;
e1a41848 6635 else if (ret > 0)
1cdda9b8 6636 goto loop;
e1a41848 6637 /* ret == 0 case falls through */
fa9c0d79 6638checks:
b4bd745d
QW
6639 ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
6640 fs_info->stripesize);
25179201 6641
2552d17e 6642 /* move on to the next group */
b4bd745d 6643 if (ffe_ctl.search_start + num_bytes >
215a63d1 6644 block_group->key.objectid + block_group->key.offset) {
b4bd745d
QW
6645 btrfs_add_free_space(block_group, ffe_ctl.found_offset,
6646 num_bytes);
2552d17e 6647 goto loop;
6226cb0a 6648 }
f5a31e16 6649
b4bd745d
QW
6650 if (ffe_ctl.found_offset < ffe_ctl.search_start)
6651 btrfs_add_free_space(block_group, ffe_ctl.found_offset,
6652 ffe_ctl.search_start - ffe_ctl.found_offset);
2552d17e 6653
18513091
WX
6654 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
6655 num_bytes, delalloc);
f0486c68 6656 if (ret == -EAGAIN) {
b4bd745d
QW
6657 btrfs_add_free_space(block_group, ffe_ctl.found_offset,
6658 num_bytes);
2552d17e 6659 goto loop;
0f9dd46c 6660 }
9cfa3e34 6661 btrfs_inc_block_group_reservations(block_group);
0b86a832 6662
f0486c68 6663 /* we are all good, let's return */
b4bd745d 6664 ins->objectid = ffe_ctl.search_start;
2552d17e 6665 ins->offset = num_bytes;
d2fb3437 6666
b4bd745d
QW
6667 trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
6668 num_bytes);
e570fd27 6669 btrfs_release_block_group(block_group, delalloc);
2552d17e
JB
6670 break;
6671loop:
b4bd745d
QW
6672 ffe_ctl.retry_clustered = false;
6673 ffe_ctl.retry_unclustered = false;
3e72ee88 6674 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
b4bd745d 6675 ffe_ctl.index);
e570fd27 6676 btrfs_release_block_group(block_group, delalloc);
14443937 6677 cond_resched();
2552d17e
JB
6678 }
6679 up_read(&space_info->groups_sem);
6680
e72d79d6
QW
6681 ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
6682 full_search, use_cluster);
6683 if (ret > 0)
b742bb82
YZ
6684 goto search;
6685
4f4db217 6686 if (ret == -ENOSPC) {
b4bd745d
QW
6687 /*
6688 * Use ffe_ctl->total_free_space as fallback if we can't find
6689 * any contiguous hole.
6690 */
6691 if (!ffe_ctl.max_extent_size)
6692 ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
4f4db217 6693 spin_lock(&space_info->lock);
b4bd745d 6694 space_info->max_extent_size = ffe_ctl.max_extent_size;
4f4db217 6695 spin_unlock(&space_info->lock);
b4bd745d 6696 ins->offset = ffe_ctl.max_extent_size;
4f4db217 6697 }
0f70abe2 6698 return ret;
fec577fb 6699}
ec44a35c 6700
6f47c706
NB
6701/*
6702 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
6703 * hole that is at least as big as @num_bytes.
6704 *
6705 * @root - The root that will contain this extent
6706 *
6707 * @ram_bytes - The amount of space in ram that @num_bytes take. This
6708 * is used for accounting purposes. This value differs
6709 * from @num_bytes only in the case of compressed extents.
6710 *
6711 * @num_bytes - Number of bytes to allocate on-disk.
6712 *
6713 * @min_alloc_size - Indicates the minimum amount of space that the
6714 * allocator should try to satisfy. In some cases
6715 * @num_bytes may be larger than what is strictly
6716 * required, and if the filesystem is fragmented the
6717 * larger allocation can fail. @min_alloc_size then gives
6718 * the allocator a chance to retry with a smaller size.
6719 *
6720 * @empty_size - A hint that you plan on doing more COW. This is the
6721 * size in bytes the allocator should try to find free
6722 * next to the block it returns. This is just a hint and
6723 * may be ignored by the allocator.
6724 *
6725 * @hint_byte - Hint to the allocator to start searching above the byte
6726 * address passed. It might be ignored.
6727 *
6728 * @ins - This key is modified to record the found hole. It will
6729 * have the following values:
6730 * ins->objectid == start position
6731 * ins->type == BTRFS_EXTENT_ITEM_KEY
6732 * ins->offset == the size of the hole.
6733 *
6734 * @is_data - Boolean flag indicating whether an extent is
6735 * allocated for data (true) or metadata (false)
6736 *
6737 * @delalloc - Boolean flag indicating whether this allocation is for
6738 * delalloc or not. If 'true' data_rwsem of block groups
6739 * is going to be acquired.
6740 *
6741 *
6742 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
6743 * case -ENOSPC is returned then @ins->offset will contain the size of the
6744 * largest available hole the allocator managed to find.
6745 */
18513091 6746int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
11833d66
YZ
6747 u64 num_bytes, u64 min_alloc_size,
6748 u64 empty_size, u64 hint_byte,
e570fd27 6749 struct btrfs_key *ins, int is_data, int delalloc)
fec577fb 6750{
ab8d0fc4 6751 struct btrfs_fs_info *fs_info = root->fs_info;
36af4e07 6752 bool final_tried = num_bytes == min_alloc_size;
b6919a58 6753 u64 flags;
fec577fb 6754 int ret;
925baedd 6755
1b86826d 6756 flags = get_alloc_profile_by_root(root, is_data);
98d20f67 6757again:
0b246afa 6758 WARN_ON(num_bytes < fs_info->sectorsize);
87bde3cd 6759 ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
18513091 6760 hint_byte, ins, flags, delalloc);
9cfa3e34 6761 if (!ret && !is_data) {
ab8d0fc4 6762 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
9cfa3e34 6763 } else if (ret == -ENOSPC) {
a4820398
MX
6764 if (!final_tried && ins->offset) {
6765 num_bytes = min(num_bytes >> 1, ins->offset);
da17066c 6766 num_bytes = round_down(num_bytes,
0b246afa 6767 fs_info->sectorsize);
9e622d6b 6768 num_bytes = max(num_bytes, min_alloc_size);
18513091 6769 ram_bytes = num_bytes;
9e622d6b
MX
6770 if (num_bytes == min_alloc_size)
6771 final_tried = true;
6772 goto again;
ab8d0fc4 6773 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
9e622d6b
MX
6774 struct btrfs_space_info *sinfo;
6775
280c2908 6776 sinfo = btrfs_find_space_info(fs_info, flags);
0b246afa 6777 btrfs_err(fs_info,
5d163e0e
JM
6778 "allocation failed flags %llu, wanted %llu",
6779 flags, num_bytes);
53804280 6780 if (sinfo)
5da6afeb
JB
6781 btrfs_dump_space_info(fs_info, sinfo,
6782 num_bytes, 1);
9e622d6b 6783 }
925baedd 6784 }
0f9dd46c
JB
6785
6786 return ret;
e6dcd2dc
CM
6787}
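
A hedged usage illustration of the entry point documented above. The sizes, hints, and flag values below are made up for the example (SZ_1M/SZ_256K come from <linux/sizes.h>); real callers such as the delalloc write path derive them from the range being written and from fs_info->sectorsize.

	/*
	 * Illustration only: ask for a 1 MiB data extent, but let the
	 * allocator halve the request on -ENOSPC down to 256 KiB before
	 * giving up for good.
	 */
	struct btrfs_key ins = { 0 };
	int ret;

	ret = btrfs_reserve_extent(root, SZ_1M, SZ_1M, SZ_256K,
				   0 /* empty_size */, 0 /* hint_byte */,
				   &ins, 1 /* is_data */, 1 /* delalloc */);
	if (ret == -ENOSPC) {
		/* ins.offset now holds the largest hole the allocator saw */
	}
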
6788
2ff7e61e 6789static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
e570fd27
MX
6790 u64 start, u64 len,
6791 int pin, int delalloc)
65b51a00 6792{
0f9dd46c 6793 struct btrfs_block_group_cache *cache;
1f3c79a2 6794 int ret = 0;
0f9dd46c 6795
0b246afa 6796 cache = btrfs_lookup_block_group(fs_info, start);
0f9dd46c 6797 if (!cache) {
0b246afa
JM
6798 btrfs_err(fs_info, "Unable to find block group for %llu",
6799 start);
0f9dd46c
JB
6800 return -ENOSPC;
6801 }
1f3c79a2 6802
e688b725 6803 if (pin)
fdf08605 6804 pin_down_extent(cache, start, len, 1);
e688b725 6805 else {
0b246afa 6806 if (btrfs_test_opt(fs_info, DISCARD))
2ff7e61e 6807 ret = btrfs_discard_extent(fs_info, start, len, NULL);
e688b725 6808 btrfs_add_free_space(cache, start, len);
4824f1f4 6809 btrfs_free_reserved_bytes(cache, len, delalloc);
71ff6437 6810 trace_btrfs_reserved_extent_free(fs_info, start, len);
e688b725 6811 }
31193213 6812
fa9c0d79 6813 btrfs_put_block_group(cache);
e6dcd2dc
CM
6814 return ret;
6815}
6816
2ff7e61e 6817int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
e570fd27 6818 u64 start, u64 len, int delalloc)
e688b725 6819{
2ff7e61e 6820 return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
e688b725
CM
6821}
6822
2ff7e61e 6823int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
e688b725
CM
6824 u64 start, u64 len)
6825{
2ff7e61e 6826 return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
e688b725
CM
6827}
6828
5d4f98a2 6829static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
6830 u64 parent, u64 root_objectid,
6831 u64 flags, u64 owner, u64 offset,
6832 struct btrfs_key *ins, int ref_mod)
e6dcd2dc 6833{
ef89b824 6834 struct btrfs_fs_info *fs_info = trans->fs_info;
e6dcd2dc 6835 int ret;
e6dcd2dc 6836 struct btrfs_extent_item *extent_item;
5d4f98a2 6837 struct btrfs_extent_inline_ref *iref;
e6dcd2dc 6838 struct btrfs_path *path;
5d4f98a2
YZ
6839 struct extent_buffer *leaf;
6840 int type;
6841 u32 size;
26b8003f 6842
5d4f98a2
YZ
6843 if (parent > 0)
6844 type = BTRFS_SHARED_DATA_REF_KEY;
6845 else
6846 type = BTRFS_EXTENT_DATA_REF_KEY;
58176a96 6847
5d4f98a2 6848 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
7bb86316
CM
6849
6850 path = btrfs_alloc_path();
db5b493a
TI
6851 if (!path)
6852 return -ENOMEM;
47e4bb98 6853
b9473439 6854 path->leave_spinning = 1;
5d4f98a2
YZ
6855 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6856 ins, size);
79787eaa
JM
6857 if (ret) {
6858 btrfs_free_path(path);
6859 return ret;
6860 }
0f9dd46c 6861
5d4f98a2
YZ
6862 leaf = path->nodes[0];
6863 extent_item = btrfs_item_ptr(leaf, path->slots[0],
47e4bb98 6864 struct btrfs_extent_item);
5d4f98a2
YZ
6865 btrfs_set_extent_refs(leaf, extent_item, ref_mod);
6866 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6867 btrfs_set_extent_flags(leaf, extent_item,
6868 flags | BTRFS_EXTENT_FLAG_DATA);
6869
6870 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6871 btrfs_set_extent_inline_ref_type(leaf, iref, type);
6872 if (parent > 0) {
6873 struct btrfs_shared_data_ref *ref;
6874 ref = (struct btrfs_shared_data_ref *)(iref + 1);
6875 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6876 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
6877 } else {
6878 struct btrfs_extent_data_ref *ref;
6879 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
6880 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
6881 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
6882 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
6883 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
6884 }
47e4bb98
CM
6885
6886 btrfs_mark_buffer_dirty(path->nodes[0]);
7bb86316 6887 btrfs_free_path(path);
f510cfec 6888
25a356d3 6889 ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
1e144fb8
OS
6890 if (ret)
6891 return ret;
6892
6b279408 6893 ret = update_block_group(trans, ins->objectid, ins->offset, 1);
79787eaa 6894 if (ret) { /* -ENOENT, logic error */
c2cf52eb 6895 btrfs_err(fs_info, "update block group failed for %llu %llu",
c1c9ff7c 6896 ins->objectid, ins->offset);
f5947066
CM
6897 BUG();
6898 }
71ff6437 6899 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
e6dcd2dc
CM
6900 return ret;
6901}
6902
5d4f98a2 6903static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4e6bd4e0 6904 struct btrfs_delayed_ref_node *node,
21ebfbe7 6905 struct btrfs_delayed_extent_op *extent_op)
e6dcd2dc 6906{
9dcdbe01 6907 struct btrfs_fs_info *fs_info = trans->fs_info;
e6dcd2dc 6908 int ret;
5d4f98a2 6909 struct btrfs_extent_item *extent_item;
4e6bd4e0 6910 struct btrfs_key extent_key;
5d4f98a2
YZ
6911 struct btrfs_tree_block_info *block_info;
6912 struct btrfs_extent_inline_ref *iref;
6913 struct btrfs_path *path;
6914 struct extent_buffer *leaf;
4e6bd4e0 6915 struct btrfs_delayed_tree_ref *ref;
3173a18f 6916 u32 size = sizeof(*extent_item) + sizeof(*iref);
4e6bd4e0 6917 u64 num_bytes;
21ebfbe7 6918 u64 flags = extent_op->flags_to_set;
0b246afa 6919 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3173a18f 6920
4e6bd4e0
NB
6921 ref = btrfs_delayed_node_to_tree_ref(node);
6922
4e6bd4e0
NB
6923 extent_key.objectid = node->bytenr;
6924 if (skinny_metadata) {
6925 extent_key.offset = ref->level;
6926 extent_key.type = BTRFS_METADATA_ITEM_KEY;
6927 num_bytes = fs_info->nodesize;
6928 } else {
6929 extent_key.offset = node->num_bytes;
6930 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
3173a18f 6931 size += sizeof(*block_info);
4e6bd4e0
NB
6932 num_bytes = node->num_bytes;
6933 }
1c2308f8 6934
5d4f98a2 6935 path = btrfs_alloc_path();
80ee54bf 6936 if (!path)
d8926bb3 6937 return -ENOMEM;
56bec294 6938
5d4f98a2
YZ
6939 path->leave_spinning = 1;
6940 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4e6bd4e0 6941 &extent_key, size);
79787eaa 6942 if (ret) {
dd825259 6943 btrfs_free_path(path);
79787eaa
JM
6944 return ret;
6945 }
5d4f98a2
YZ
6946
6947 leaf = path->nodes[0];
6948 extent_item = btrfs_item_ptr(leaf, path->slots[0],
6949 struct btrfs_extent_item);
6950 btrfs_set_extent_refs(leaf, extent_item, 1);
6951 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6952 btrfs_set_extent_flags(leaf, extent_item,
6953 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
5d4f98a2 6954
3173a18f
JB
6955 if (skinny_metadata) {
6956 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6957 } else {
6958 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
21ebfbe7 6959 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4e6bd4e0 6960 btrfs_set_tree_block_level(leaf, block_info, ref->level);
3173a18f
JB
6961 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
6962 }
5d4f98a2 6963
d4b20733 6964 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
5d4f98a2
YZ
6965 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
6966 btrfs_set_extent_inline_ref_type(leaf, iref,
6967 BTRFS_SHARED_BLOCK_REF_KEY);
d4b20733 6968 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
5d4f98a2
YZ
6969 } else {
6970 btrfs_set_extent_inline_ref_type(leaf, iref,
6971 BTRFS_TREE_BLOCK_REF_KEY);
4e6bd4e0 6972 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
5d4f98a2
YZ
6973 }
6974
6975 btrfs_mark_buffer_dirty(leaf);
6976 btrfs_free_path(path);
6977
4e6bd4e0
NB
6978 ret = remove_from_free_space_tree(trans, extent_key.objectid,
6979 num_bytes);
1e144fb8
OS
6980 if (ret)
6981 return ret;
6982
6b279408 6983 ret = update_block_group(trans, extent_key.objectid,
6202df69 6984 fs_info->nodesize, 1);
79787eaa 6985 if (ret) { /* -ENOENT, logic error */
c2cf52eb 6986 btrfs_err(fs_info, "update block group failed for %llu %llu",
4e6bd4e0 6987 extent_key.objectid, extent_key.offset);
5d4f98a2
YZ
6988 BUG();
6989 }
0be5dc67 6990
4e6bd4e0 6991 trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
0b246afa 6992 fs_info->nodesize);
5d4f98a2
YZ
6993 return ret;
6994}
6995
6996int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
84f7d8e6 6997 struct btrfs_root *root, u64 owner,
5846a3c2
QW
6998 u64 offset, u64 ram_bytes,
6999 struct btrfs_key *ins)
5d4f98a2 7000{
76675593 7001 struct btrfs_ref generic_ref = { 0 };
5d4f98a2
YZ
7002 int ret;
7003
84f7d8e6 7004 BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
5d4f98a2 7005
76675593
QW
7006 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
7007 ins->objectid, ins->offset, 0);
7008 btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
8a5040f7 7009 btrfs_ref_tree_mod(root->fs_info, &generic_ref);
76675593
QW
7010 ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
7011 ram_bytes, NULL, NULL);
e6dcd2dc
CM
7012 return ret;
7013}
e02119d5
CM
7014
7015/*
7016 * this is used by the tree logging recovery code. It records that
7017 * an extent has been allocated and makes sure to clear the free
7018 * space cache bits as well
7019 */
5d4f98a2 7020int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
7021 u64 root_objectid, u64 owner, u64 offset,
7022 struct btrfs_key *ins)
e02119d5 7023{
61da2abf 7024 struct btrfs_fs_info *fs_info = trans->fs_info;
e02119d5
CM
7025 int ret;
7026 struct btrfs_block_group_cache *block_group;
ed7a6948 7027 struct btrfs_space_info *space_info;
11833d66 7028
8c2a1a30
JB
7029 /*
7030 * Mixed block groups will exclude before processing the log so we only
01327610 7031 * need to do the exclude dance if this fs isn't mixed.
8c2a1a30 7032 */
0b246afa 7033 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
2ff7e61e
JM
7034 ret = __exclude_logged_extent(fs_info, ins->objectid,
7035 ins->offset);
b50c6e25 7036 if (ret)
8c2a1a30 7037 return ret;
11833d66
YZ
7038 }
7039
0b246afa 7040 block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8c2a1a30
JB
7041 if (!block_group)
7042 return -EINVAL;
7043
ed7a6948
WX
7044 space_info = block_group->space_info;
7045 spin_lock(&space_info->lock);
7046 spin_lock(&block_group->lock);
7047 space_info->bytes_reserved += ins->offset;
7048 block_group->reserved += ins->offset;
7049 spin_unlock(&block_group->lock);
7050 spin_unlock(&space_info->lock);
7051
ef89b824
NB
7052 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
7053 offset, ins, 1);
b50c6e25 7054 btrfs_put_block_group(block_group);
e02119d5
CM
7055 return ret;
7056}
7057
48a3b636
ES
7058static struct extent_buffer *
7059btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
bc877d28 7060 u64 bytenr, int level, u64 owner)
65b51a00 7061{
0b246afa 7062 struct btrfs_fs_info *fs_info = root->fs_info;
65b51a00
CM
7063 struct extent_buffer *buf;
7064
2ff7e61e 7065 buf = btrfs_find_create_tree_block(fs_info, bytenr);
c871b0f2
LB
7066 if (IS_ERR(buf))
7067 return buf;
7068
b72c3aba
QW
7069 /*
7070 * Extra safety check in case the extent tree is corrupted and extent
7071 * allocator chooses to use a tree block which is already used and
7072 * locked.
7073 */
7074 if (buf->lock_owner == current->pid) {
7075 btrfs_err_rl(fs_info,
7076"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
7077 buf->start, btrfs_header_owner(buf), current->pid);
7078 free_extent_buffer(buf);
7079 return ERR_PTR(-EUCLEAN);
7080 }
7081
85d4e461 7082 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
65b51a00 7083 btrfs_tree_lock(buf);
6a884d7d 7084 btrfs_clean_tree_block(buf);
3083ee2e 7085 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
b4ce94de 7086
8bead258 7087 btrfs_set_lock_blocking_write(buf);
4db8c528 7088 set_extent_buffer_uptodate(buf);
b4ce94de 7089
bc877d28
NB
7090 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
7091 btrfs_set_header_level(buf, level);
7092 btrfs_set_header_bytenr(buf, buf->start);
7093 btrfs_set_header_generation(buf, trans->transid);
7094 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
7095 btrfs_set_header_owner(buf, owner);
de37aa51 7096 write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
bc877d28 7097 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
d0c803c4 7098 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
656f30db 7099 buf->log_index = root->log_transid % 2;
8cef4e16
YZ
7100 /*
7101 * we allow two log transactions at a time, use different
52042d8e 7102 * EXTENT bits to differentiate dirty pages.
8cef4e16 7103 */
656f30db 7104 if (buf->log_index == 0)
8cef4e16
YZ
7105 set_extent_dirty(&root->dirty_log_pages, buf->start,
7106 buf->start + buf->len - 1, GFP_NOFS);
7107 else
7108 set_extent_new(&root->dirty_log_pages, buf->start,
3744dbeb 7109 buf->start + buf->len - 1);
d0c803c4 7110 } else {
656f30db 7111 buf->log_index = -1;
d0c803c4 7112 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
65b51a00 7113 buf->start + buf->len - 1, GFP_NOFS);
d0c803c4 7114 }
64c12921 7115 trans->dirty = true;
b4ce94de 7116 /* this returns a buffer locked for blocking */
65b51a00
CM
7117 return buf;
7118}
7119
f0486c68
YZ
7120static struct btrfs_block_rsv *
7121use_block_rsv(struct btrfs_trans_handle *trans,
7122 struct btrfs_root *root, u32 blocksize)
7123{
0b246afa 7124 struct btrfs_fs_info *fs_info = root->fs_info;
f0486c68 7125 struct btrfs_block_rsv *block_rsv;
0b246afa 7126 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
f0486c68 7127 int ret;
d88033db 7128 bool global_updated = false;
f0486c68
YZ
7129
7130 block_rsv = get_block_rsv(trans, root);
7131
b586b323
MX
7132 if (unlikely(block_rsv->size == 0))
7133 goto try_reserve;
d88033db 7134again:
c2a67a76 7135 ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
f0486c68
YZ
7136 if (!ret)
7137 return block_rsv;
7138
b586b323
MX
7139 if (block_rsv->failfast)
7140 return ERR_PTR(ret);
7141
d88033db
MX
7142 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
7143 global_updated = true;
0b246afa 7144 update_global_block_rsv(fs_info);
d88033db
MX
7145 goto again;
7146 }
7147
ba2c4d4e
JB
7148 /*
7149 * The global reserve still exists to save us from ourselves, so don't
7150 * warn_on if we are short on our delayed refs reserve.
7151 */
7152 if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
7153 btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
b586b323
MX
7154 static DEFINE_RATELIMIT_STATE(_rs,
7155 DEFAULT_RATELIMIT_INTERVAL * 10,
7156 /*DEFAULT_RATELIMIT_BURST*/ 1);
7157 if (__ratelimit(&_rs))
7158 WARN(1, KERN_DEBUG
efe120a0 7159 "BTRFS: block rsv returned %d\n", ret);
b586b323
MX
7160 }
7161try_reserve:
0d9764f6
JB
7162 ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
7163 BTRFS_RESERVE_NO_FLUSH);
b586b323
MX
7164 if (!ret)
7165 return block_rsv;
7166 /*
7167 * If we couldn't reserve metadata bytes try and use some from
5881cfc9
MX
7168 * the global reserve, provided this reservation's space info matches
7169 * the global reserve's.
b586b323 7170 */
5881cfc9
MX
7171 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
7172 block_rsv->space_info == global_rsv->space_info) {
c2a67a76 7173 ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize);
b586b323
MX
7174 if (!ret)
7175 return global_rsv;
7176 }
7177 return ERR_PTR(ret);
f0486c68
YZ
7178}
7179
8c2a3ca2
JB
7180static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
7181 struct btrfs_block_rsv *block_rsv, u32 blocksize)
f0486c68 7182{
0b50174a 7183 btrfs_block_rsv_add_bytes(block_rsv, blocksize, false);
424a4780 7184 btrfs_block_rsv_release(fs_info, block_rsv, 0);
f0486c68
YZ
7185}
7186
fec577fb 7187/*
f0486c68 7188 * finds a free extent and does all the dirty work required for allocation;
67b7859e 7189 * returns the tree buffer or an ERR_PTR on error.
fec577fb 7190 */
4d75f8a9 7191struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
310712b2
OS
7192 struct btrfs_root *root,
7193 u64 parent, u64 root_objectid,
7194 const struct btrfs_disk_key *key,
7195 int level, u64 hint,
7196 u64 empty_size)
fec577fb 7197{
0b246afa 7198 struct btrfs_fs_info *fs_info = root->fs_info;
e2fa7227 7199 struct btrfs_key ins;
f0486c68 7200 struct btrfs_block_rsv *block_rsv;
5f39d397 7201 struct extent_buffer *buf;
67b7859e 7202 struct btrfs_delayed_extent_op *extent_op;
ed4f255b 7203 struct btrfs_ref generic_ref = { 0 };
f0486c68
YZ
7204 u64 flags = 0;
7205 int ret;
0b246afa
JM
7206 u32 blocksize = fs_info->nodesize;
7207 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
fec577fb 7208
05653ef3 7209#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
0b246afa 7210 if (btrfs_is_testing(fs_info)) {
faa2dbf0 7211 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
bc877d28 7212 level, root_objectid);
faa2dbf0
JB
7213 if (!IS_ERR(buf))
7214 root->alloc_bytenr += blocksize;
7215 return buf;
7216 }
05653ef3 7217#endif
fccb84c9 7218
f0486c68
YZ
7219 block_rsv = use_block_rsv(trans, root, blocksize);
7220 if (IS_ERR(block_rsv))
7221 return ERR_CAST(block_rsv);
7222
18513091 7223 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
e570fd27 7224 empty_size, hint, &ins, 0, 0);
67b7859e
OS
7225 if (ret)
7226 goto out_unuse;
55c69072 7227
bc877d28
NB
7228 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
7229 root_objectid);
67b7859e
OS
7230 if (IS_ERR(buf)) {
7231 ret = PTR_ERR(buf);
7232 goto out_free_reserved;
7233 }
f0486c68
YZ
7234
7235 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
7236 if (parent == 0)
7237 parent = ins.objectid;
7238 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
7239 } else
7240 BUG_ON(parent > 0);
7241
7242 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
78a6184a 7243 extent_op = btrfs_alloc_delayed_extent_op();
67b7859e
OS
7244 if (!extent_op) {
7245 ret = -ENOMEM;
7246 goto out_free_buf;
7247 }
f0486c68
YZ
7248 if (key)
7249 memcpy(&extent_op->key, key, sizeof(extent_op->key));
7250 else
7251 memset(&extent_op->key, 0, sizeof(extent_op->key));
7252 extent_op->flags_to_set = flags;
35b3ad50
DS
7253 extent_op->update_key = skinny_metadata ? false : true;
7254 extent_op->update_flags = true;
7255 extent_op->is_data = false;
b1c79e09 7256 extent_op->level = level;
f0486c68 7257
ed4f255b
QW
7258 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
7259 ins.objectid, ins.offset, parent);
7260 generic_ref.real_root = root->root_key.objectid;
7261 btrfs_init_tree_ref(&generic_ref, level, root_objectid);
8a5040f7 7262 btrfs_ref_tree_mod(fs_info, &generic_ref);
ed4f255b 7263 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
7be07912 7264 extent_op, NULL, NULL);
67b7859e
OS
7265 if (ret)
7266 goto out_free_delayed;
f0486c68 7267 }
fec577fb 7268 return buf;
67b7859e
OS
7269
7270out_free_delayed:
7271 btrfs_free_delayed_extent_op(extent_op);
7272out_free_buf:
7273 free_extent_buffer(buf);
7274out_free_reserved:
2ff7e61e 7275 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
67b7859e 7276out_unuse:
0b246afa 7277 unuse_block_rsv(fs_info, block_rsv, blocksize);
67b7859e 7278 return ERR_PTR(ret);
fec577fb 7279}
a28ec197 7280
2c47e605
YZ
7281struct walk_control {
7282 u64 refs[BTRFS_MAX_LEVEL];
7283 u64 flags[BTRFS_MAX_LEVEL];
7284 struct btrfs_key update_progress;
aea6f028
JB
7285 struct btrfs_key drop_progress;
7286 int drop_level;
2c47e605
YZ
7287 int stage;
7288 int level;
7289 int shared_level;
7290 int update_ref;
7291 int keep_locks;
1c4850e2
YZ
7292 int reada_slot;
7293 int reada_count;
78c52d9e 7294 int restarted;
2c47e605
YZ
7295};
7296
7297#define DROP_REFERENCE 1
7298#define UPDATE_BACKREF 2
7299
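
For orientation, here is a minimal sketch of how the drop path further down seeds this structure before walking. It mirrors the initialization done in btrfs_drop_snapshot below (see that function for the authoritative version) and is not additional kernel code.

	struct walk_control *wc;

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc)
		return -ENOMEM;
	wc->level = btrfs_header_level(root->node);
	wc->shared_level = -1;			/* no shared level found yet */
	wc->stage = DROP_REFERENCE;		/* start by dropping references */
	wc->update_ref = update_ref;		/* may switch stage to UPDATE_BACKREF */
	wc->keep_locks = 0;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
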
1c4850e2
YZ
7300static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7301 struct btrfs_root *root,
7302 struct walk_control *wc,
7303 struct btrfs_path *path)
6407bf6d 7304{
0b246afa 7305 struct btrfs_fs_info *fs_info = root->fs_info;
1c4850e2
YZ
7306 u64 bytenr;
7307 u64 generation;
7308 u64 refs;
94fcca9f 7309 u64 flags;
5d4f98a2 7310 u32 nritems;
1c4850e2
YZ
7311 struct btrfs_key key;
7312 struct extent_buffer *eb;
6407bf6d 7313 int ret;
1c4850e2
YZ
7314 int slot;
7315 int nread = 0;
6407bf6d 7316
1c4850e2
YZ
7317 if (path->slots[wc->level] < wc->reada_slot) {
7318 wc->reada_count = wc->reada_count * 2 / 3;
7319 wc->reada_count = max(wc->reada_count, 2);
7320 } else {
7321 wc->reada_count = wc->reada_count * 3 / 2;
7322 wc->reada_count = min_t(int, wc->reada_count,
0b246afa 7323 BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1c4850e2 7324 }
7bb86316 7325
1c4850e2
YZ
7326 eb = path->nodes[wc->level];
7327 nritems = btrfs_header_nritems(eb);
bd56b302 7328
1c4850e2
YZ
7329 for (slot = path->slots[wc->level]; slot < nritems; slot++) {
7330 if (nread >= wc->reada_count)
7331 break;
bd56b302 7332
2dd3e67b 7333 cond_resched();
1c4850e2
YZ
7334 bytenr = btrfs_node_blockptr(eb, slot);
7335 generation = btrfs_node_ptr_generation(eb, slot);
2dd3e67b 7336
1c4850e2
YZ
7337 if (slot == path->slots[wc->level])
7338 goto reada;
5d4f98a2 7339
1c4850e2
YZ
7340 if (wc->stage == UPDATE_BACKREF &&
7341 generation <= root->root_key.offset)
bd56b302
CM
7342 continue;
7343
94fcca9f 7344 /* We don't lock the tree block, it's OK to be racy here */
2ff7e61e 7345 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
3173a18f
JB
7346 wc->level - 1, 1, &refs,
7347 &flags);
79787eaa
JM
7348 /* We don't care about errors in readahead. */
7349 if (ret < 0)
7350 continue;
94fcca9f
YZ
7351 BUG_ON(refs == 0);
7352
1c4850e2 7353 if (wc->stage == DROP_REFERENCE) {
1c4850e2
YZ
7354 if (refs == 1)
7355 goto reada;
bd56b302 7356
94fcca9f
YZ
7357 if (wc->level == 1 &&
7358 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7359 continue;
1c4850e2
YZ
7360 if (!wc->update_ref ||
7361 generation <= root->root_key.offset)
7362 continue;
7363 btrfs_node_key_to_cpu(eb, &key, slot);
7364 ret = btrfs_comp_cpu_keys(&key,
7365 &wc->update_progress);
7366 if (ret < 0)
7367 continue;
94fcca9f
YZ
7368 } else {
7369 if (wc->level == 1 &&
7370 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7371 continue;
6407bf6d 7372 }
1c4850e2 7373reada:
2ff7e61e 7374 readahead_tree_block(fs_info, bytenr);
1c4850e2 7375 nread++;
20524f02 7376 }
1c4850e2 7377 wc->reada_slot = slot;
20524f02 7378}
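
The window adjustment at the top of this function can be pulled out into a standalone sketch. The helper name is invented for illustration, and the cap argument stands in for BTRFS_NODEPTRS_PER_BLOCK(fs_info).

/*
 * Shrink the readahead window by a third when the walk re-enters
 * readahead before it has advanced past the previous window, otherwise
 * grow it by half, always staying within [2, cap].
 */
static int adjust_reada_count(int count, int still_inside_window, int cap)
{
	if (still_inside_window) {
		count = count * 2 / 3;
		return count < 2 ? 2 : count;
	}
	count = count * 3 / 2;
	return count > cap ? cap : count;
}
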
2c47e605 7379
f82d02d9 7380/*
2c016dc2 7381 * helper to process tree block while walking down the tree.
2c47e605 7382 *
2c47e605
YZ
7383 * when wc->stage == UPDATE_BACKREF, this function updates
7384 * back refs for pointers in the block.
7385 *
7386 * NOTE: return value 1 means we should stop walking down.
f82d02d9 7387 */
2c47e605 7388static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5d4f98a2 7389 struct btrfs_root *root,
2c47e605 7390 struct btrfs_path *path,
94fcca9f 7391 struct walk_control *wc, int lookup_info)
f82d02d9 7392{
2ff7e61e 7393 struct btrfs_fs_info *fs_info = root->fs_info;
2c47e605
YZ
7394 int level = wc->level;
7395 struct extent_buffer *eb = path->nodes[level];
2c47e605 7396 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
f82d02d9
YZ
7397 int ret;
7398
2c47e605
YZ
7399 if (wc->stage == UPDATE_BACKREF &&
7400 btrfs_header_owner(eb) != root->root_key.objectid)
7401 return 1;
f82d02d9 7402
2c47e605
YZ
7403 /*
7404 * when reference count of tree block is 1, it won't increase
7405 * again. once full backref flag is set, we never clear it.
7406 */
94fcca9f
YZ
7407 if (lookup_info &&
7408 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
7409 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
2c47e605 7410 BUG_ON(!path->locks[level]);
2ff7e61e 7411 ret = btrfs_lookup_extent_info(trans, fs_info,
3173a18f 7412 eb->start, level, 1,
2c47e605
YZ
7413 &wc->refs[level],
7414 &wc->flags[level]);
79787eaa
JM
7415 BUG_ON(ret == -ENOMEM);
7416 if (ret)
7417 return ret;
2c47e605
YZ
7418 BUG_ON(wc->refs[level] == 0);
7419 }
5d4f98a2 7420
2c47e605
YZ
7421 if (wc->stage == DROP_REFERENCE) {
7422 if (wc->refs[level] > 1)
7423 return 1;
f82d02d9 7424
2c47e605 7425 if (path->locks[level] && !wc->keep_locks) {
bd681513 7426 btrfs_tree_unlock_rw(eb, path->locks[level]);
2c47e605
YZ
7427 path->locks[level] = 0;
7428 }
7429 return 0;
7430 }
f82d02d9 7431
2c47e605
YZ
7432 /* wc->stage == UPDATE_BACKREF */
7433 if (!(wc->flags[level] & flag)) {
7434 BUG_ON(!path->locks[level]);
e339a6b0 7435 ret = btrfs_inc_ref(trans, root, eb, 1);
79787eaa 7436 BUG_ON(ret); /* -ENOMEM */
e339a6b0 7437 ret = btrfs_dec_ref(trans, root, eb, 0);
79787eaa 7438 BUG_ON(ret); /* -ENOMEM */
f5c8daa5 7439 ret = btrfs_set_disk_extent_flags(trans, eb->start,
b1c79e09
JB
7440 eb->len, flag,
7441 btrfs_header_level(eb), 0);
79787eaa 7442 BUG_ON(ret); /* -ENOMEM */
2c47e605
YZ
7443 wc->flags[level] |= flag;
7444 }
7445
7446 /*
7447 * the block is shared by multiple trees, so it's not good to
7448 * keep the tree lock
7449 */
7450 if (path->locks[level] && level > 0) {
bd681513 7451 btrfs_tree_unlock_rw(eb, path->locks[level]);
2c47e605
YZ
7452 path->locks[level] = 0;
7453 }
7454 return 0;
7455}
7456
78c52d9e
JB
7457/*
7458 * This is used to verify a ref exists for this root to deal with a bug where we
7459 * would have a drop_progress key that hadn't been updated properly.
7460 */
7461static int check_ref_exists(struct btrfs_trans_handle *trans,
7462 struct btrfs_root *root, u64 bytenr, u64 parent,
7463 int level)
7464{
7465 struct btrfs_path *path;
7466 struct btrfs_extent_inline_ref *iref;
7467 int ret;
7468
7469 path = btrfs_alloc_path();
7470 if (!path)
7471 return -ENOMEM;
7472
7473 ret = lookup_extent_backref(trans, path, &iref, bytenr,
7474 root->fs_info->nodesize, parent,
7475 root->root_key.objectid, level, 0);
7476 btrfs_free_path(path);
7477 if (ret == -ENOENT)
7478 return 0;
7479 if (ret < 0)
7480 return ret;
7481 return 1;
7482}
7483
1c4850e2 7484/*
2c016dc2 7485 * helper to process tree block pointer.
1c4850e2
YZ
7486 *
7487 * when wc->stage == DROP_REFERENCE, this function checks the
7488 * reference count of the block pointed to. if the block
7489 * is shared and we need to update back refs for the subtree
7490 * rooted at the block, this function changes wc->stage to
7491 * UPDATE_BACKREF. if the block is shared and there is no
7492 * need to update backrefs, this function drops the reference
7493 * to the block.
7494 *
7495 * NOTE: return value 1 means we should stop walking down.
7496 */
7497static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7498 struct btrfs_root *root,
7499 struct btrfs_path *path,
94fcca9f 7500 struct walk_control *wc, int *lookup_info)
1c4850e2 7501{
0b246afa 7502 struct btrfs_fs_info *fs_info = root->fs_info;
1c4850e2
YZ
7503 u64 bytenr;
7504 u64 generation;
7505 u64 parent;
1c4850e2 7506 struct btrfs_key key;
581c1760 7507 struct btrfs_key first_key;
ffd4bb2a 7508 struct btrfs_ref ref = { 0 };
1c4850e2
YZ
7509 struct extent_buffer *next;
7510 int level = wc->level;
7511 int reada = 0;
7512 int ret = 0;
1152651a 7513 bool need_account = false;
1c4850e2
YZ
7514
7515 generation = btrfs_node_ptr_generation(path->nodes[level],
7516 path->slots[level]);
7517 /*
7518 * if the lower level block was created before the snapshot
7519 * was created, we know there is no need to update back refs
7520 * for the subtree
7521 */
7522 if (wc->stage == UPDATE_BACKREF &&
94fcca9f
YZ
7523 generation <= root->root_key.offset) {
7524 *lookup_info = 1;
1c4850e2 7525 return 1;
94fcca9f 7526 }
1c4850e2
YZ
7527
7528 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
581c1760
QW
7529 btrfs_node_key_to_cpu(path->nodes[level], &first_key,
7530 path->slots[level]);
1c4850e2 7531
0b246afa 7532 next = find_extent_buffer(fs_info, bytenr);
1c4850e2 7533 if (!next) {
2ff7e61e 7534 next = btrfs_find_create_tree_block(fs_info, bytenr);
c871b0f2
LB
7535 if (IS_ERR(next))
7536 return PTR_ERR(next);
7537
b2aaaa3b
JB
7538 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
7539 level - 1);
1c4850e2
YZ
7540 reada = 1;
7541 }
7542 btrfs_tree_lock(next);
8bead258 7543 btrfs_set_lock_blocking_write(next);
1c4850e2 7544
2ff7e61e 7545 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
94fcca9f
YZ
7546 &wc->refs[level - 1],
7547 &wc->flags[level - 1]);
4867268c
JB
7548 if (ret < 0)
7549 goto out_unlock;
79787eaa 7550
c2cf52eb 7551 if (unlikely(wc->refs[level - 1] == 0)) {
0b246afa 7552 btrfs_err(fs_info, "Missing references.");
4867268c
JB
7553 ret = -EIO;
7554 goto out_unlock;
c2cf52eb 7555 }
94fcca9f 7556 *lookup_info = 0;
1c4850e2 7557
94fcca9f 7558 if (wc->stage == DROP_REFERENCE) {
1c4850e2 7559 if (wc->refs[level - 1] > 1) {
1152651a 7560 need_account = true;
94fcca9f
YZ
7561 if (level == 1 &&
7562 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7563 goto skip;
7564
1c4850e2
YZ
7565 if (!wc->update_ref ||
7566 generation <= root->root_key.offset)
7567 goto skip;
7568
7569 btrfs_node_key_to_cpu(path->nodes[level], &key,
7570 path->slots[level]);
7571 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
7572 if (ret < 0)
7573 goto skip;
7574
7575 wc->stage = UPDATE_BACKREF;
7576 wc->shared_level = level - 1;
7577 }
94fcca9f
YZ
7578 } else {
7579 if (level == 1 &&
7580 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7581 goto skip;
1c4850e2
YZ
7582 }
7583
b9fab919 7584 if (!btrfs_buffer_uptodate(next, generation, 0)) {
1c4850e2
YZ
7585 btrfs_tree_unlock(next);
7586 free_extent_buffer(next);
7587 next = NULL;
94fcca9f 7588 *lookup_info = 1;
1c4850e2
YZ
7589 }
7590
7591 if (!next) {
7592 if (reada && level == 1)
7593 reada_walk_down(trans, root, wc, path);
581c1760
QW
7594 next = read_tree_block(fs_info, bytenr, generation, level - 1,
7595 &first_key);
64c043de
LB
7596 if (IS_ERR(next)) {
7597 return PTR_ERR(next);
7598 } else if (!extent_buffer_uptodate(next)) {
416bc658 7599 free_extent_buffer(next);
97d9a8a4 7600 return -EIO;
416bc658 7601 }
1c4850e2 7602 btrfs_tree_lock(next);
8bead258 7603 btrfs_set_lock_blocking_write(next);
1c4850e2
YZ
7604 }
7605
7606 level--;
4867268c
JB
7607 ASSERT(level == btrfs_header_level(next));
7608 if (level != btrfs_header_level(next)) {
7609 btrfs_err(root->fs_info, "mismatched level");
7610 ret = -EIO;
7611 goto out_unlock;
7612 }
1c4850e2
YZ
7613 path->nodes[level] = next;
7614 path->slots[level] = 0;
bd681513 7615 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
1c4850e2
YZ
7616 wc->level = level;
7617 if (wc->level == 1)
7618 wc->reada_slot = 0;
7619 return 0;
7620skip:
7621 wc->refs[level - 1] = 0;
7622 wc->flags[level - 1] = 0;
94fcca9f
YZ
7623 if (wc->stage == DROP_REFERENCE) {
7624 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
7625 parent = path->nodes[level]->start;
7626 } else {
4867268c 7627 ASSERT(root->root_key.objectid ==
94fcca9f 7628 btrfs_header_owner(path->nodes[level]));
4867268c
JB
7629 if (root->root_key.objectid !=
7630 btrfs_header_owner(path->nodes[level])) {
7631 btrfs_err(root->fs_info,
7632 "mismatched block owner");
7633 ret = -EIO;
7634 goto out_unlock;
7635 }
94fcca9f
YZ
7636 parent = 0;
7637 }
1c4850e2 7638
78c52d9e
JB
7639 /*
7640 * If we had a drop_progress we need to verify the refs are set
7641 * as expected. If we find our ref then we know that from here
7642 * on out everything should be correct, and we can clear the
7643 * ->restarted flag.
7644 */
7645 if (wc->restarted) {
7646 ret = check_ref_exists(trans, root, bytenr, parent,
7647 level - 1);
7648 if (ret < 0)
7649 goto out_unlock;
7650 if (ret == 0)
7651 goto no_delete;
7652 ret = 0;
7653 wc->restarted = 0;
7654 }
7655
2cd86d30
QW
7656 /*
7657 * Reloc tree doesn't contribute to qgroup numbers, and we have
7658 * already accounted them at merge time (replace_path),
7659 * thus we can skip the expensive subtree trace here.
7660 */
7661 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
7662 need_account) {
deb40627 7663 ret = btrfs_qgroup_trace_subtree(trans, next,
33d1f05c 7664 generation, level - 1);
1152651a 7665 if (ret) {
0b246afa 7666 btrfs_err_rl(fs_info,
5d163e0e
JM
7667 "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
7668 ret);
1152651a
MF
7669 }
7670 }
aea6f028
JB
7671
7672 /*
7673 * We need to update the next key in our walk control so we can
7674 * update the drop_progress key accordingly. We don't care if
7675 * find_next_key doesn't find a key because that means we're at
7676 * the end and are going to clean up now.
7677 */
7678 wc->drop_level = level;
7679 find_next_key(path, level, &wc->drop_progress);
7680
ffd4bb2a
QW
7681 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
7682 fs_info->nodesize, parent);
7683 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
7684 ret = btrfs_free_extent(trans, &ref);
4867268c
JB
7685 if (ret)
7686 goto out_unlock;
1c4850e2 7687 }
78c52d9e 7688no_delete:
4867268c
JB
7689 *lookup_info = 1;
7690 ret = 1;
7691
7692out_unlock:
1c4850e2
YZ
7693 btrfs_tree_unlock(next);
7694 free_extent_buffer(next);
4867268c
JB
7695
7696 return ret;
1c4850e2
YZ
7697}
7698
2c47e605 7699/*
2c016dc2 7700 * helper to process tree block while walking up the tree.
2c47e605
YZ
7701 *
7702 * when wc->stage == DROP_REFERENCE, this function drops
7703 * reference count on the block.
7704 *
7705 * when wc->stage == UPDATE_BACKREF, this function changes
7706 * wc->stage back to DROP_REFERENCE if we changed wc->stage
7707 * to UPDATE_BACKREF previously while processing the block.
7708 *
7709 * NOTE: return value 1 means we should stop walking up.
7710 */
7711static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
7712 struct btrfs_root *root,
7713 struct btrfs_path *path,
7714 struct walk_control *wc)
7715{
0b246afa 7716 struct btrfs_fs_info *fs_info = root->fs_info;
f0486c68 7717 int ret;
2c47e605
YZ
7718 int level = wc->level;
7719 struct extent_buffer *eb = path->nodes[level];
7720 u64 parent = 0;
7721
7722 if (wc->stage == UPDATE_BACKREF) {
7723 BUG_ON(wc->shared_level < level);
7724 if (level < wc->shared_level)
7725 goto out;
7726
2c47e605
YZ
7727 ret = find_next_key(path, level + 1, &wc->update_progress);
7728 if (ret > 0)
7729 wc->update_ref = 0;
7730
7731 wc->stage = DROP_REFERENCE;
7732 wc->shared_level = -1;
7733 path->slots[level] = 0;
7734
7735 /*
7736 * check reference count again if the block isn't locked.
7737 * we should start walking down the tree again if reference
7738 * count is one.
7739 */
7740 if (!path->locks[level]) {
7741 BUG_ON(level == 0);
7742 btrfs_tree_lock(eb);
8bead258 7743 btrfs_set_lock_blocking_write(eb);
bd681513 7744 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605 7745
2ff7e61e 7746 ret = btrfs_lookup_extent_info(trans, fs_info,
3173a18f 7747 eb->start, level, 1,
2c47e605
YZ
7748 &wc->refs[level],
7749 &wc->flags[level]);
79787eaa
JM
7750 if (ret < 0) {
7751 btrfs_tree_unlock_rw(eb, path->locks[level]);
3268a246 7752 path->locks[level] = 0;
79787eaa
JM
7753 return ret;
7754 }
2c47e605
YZ
7755 BUG_ON(wc->refs[level] == 0);
7756 if (wc->refs[level] == 1) {
bd681513 7757 btrfs_tree_unlock_rw(eb, path->locks[level]);
3268a246 7758 path->locks[level] = 0;
2c47e605
YZ
7759 return 1;
7760 }
f82d02d9 7761 }
2c47e605 7762 }
f82d02d9 7763
2c47e605
YZ
7764 /* wc->stage == DROP_REFERENCE */
7765 BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5d4f98a2 7766
2c47e605
YZ
7767 if (wc->refs[level] == 1) {
7768 if (level == 0) {
7769 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
e339a6b0 7770 ret = btrfs_dec_ref(trans, root, eb, 1);
2c47e605 7771 else
e339a6b0 7772 ret = btrfs_dec_ref(trans, root, eb, 0);
79787eaa 7773 BUG_ON(ret); /* -ENOMEM */
c4140cbf
QW
7774 if (is_fstree(root->root_key.objectid)) {
7775 ret = btrfs_qgroup_trace_leaf_items(trans, eb);
7776 if (ret) {
7777 btrfs_err_rl(fs_info,
7778 "error %d accounting leaf items, quota is out of sync, rescan required",
5d163e0e 7779 ret);
c4140cbf 7780 }
1152651a 7781 }
2c47e605 7782 }
6a884d7d 7783 /* make block locked assertion in btrfs_clean_tree_block happy */
2c47e605
YZ
7784 if (!path->locks[level] &&
7785 btrfs_header_generation(eb) == trans->transid) {
7786 btrfs_tree_lock(eb);
8bead258 7787 btrfs_set_lock_blocking_write(eb);
bd681513 7788 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605 7789 }
6a884d7d 7790 btrfs_clean_tree_block(eb);
2c47e605
YZ
7791 }
7792
7793 if (eb == root->node) {
7794 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7795 parent = eb->start;
65c6e82b
QW
7796 else if (root->root_key.objectid != btrfs_header_owner(eb))
7797 goto owner_mismatch;
2c47e605
YZ
7798 } else {
7799 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7800 parent = path->nodes[level + 1]->start;
65c6e82b
QW
7801 else if (root->root_key.objectid !=
7802 btrfs_header_owner(path->nodes[level + 1]))
7803 goto owner_mismatch;
f82d02d9 7804 }
f82d02d9 7805
5581a51a 7806 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
2c47e605
YZ
7807out:
7808 wc->refs[level] = 0;
7809 wc->flags[level] = 0;
f0486c68 7810 return 0;
65c6e82b
QW
7811
7812owner_mismatch:
7813 btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
7814 btrfs_header_owner(eb), root->root_key.objectid);
7815 return -EUCLEAN;
2c47e605
YZ
7816}
7817
7818static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
7819 struct btrfs_root *root,
7820 struct btrfs_path *path,
7821 struct walk_control *wc)
7822{
2c47e605 7823 int level = wc->level;
94fcca9f 7824 int lookup_info = 1;
2c47e605
YZ
7825 int ret;
7826
7827 while (level >= 0) {
94fcca9f 7828 ret = walk_down_proc(trans, root, path, wc, lookup_info);
2c47e605
YZ
7829 if (ret > 0)
7830 break;
7831
7832 if (level == 0)
7833 break;
7834
7a7965f8
YZ
7835 if (path->slots[level] >=
7836 btrfs_header_nritems(path->nodes[level]))
7837 break;
7838
94fcca9f 7839 ret = do_walk_down(trans, root, path, wc, &lookup_info);
1c4850e2
YZ
7840 if (ret > 0) {
7841 path->slots[level]++;
7842 continue;
90d2c51d
MX
7843 } else if (ret < 0)
7844 return ret;
1c4850e2 7845 level = wc->level;
f82d02d9 7846 }
f82d02d9
YZ
7847 return 0;
7848}
7849
d397712b 7850static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
98ed5174 7851 struct btrfs_root *root,
f82d02d9 7852 struct btrfs_path *path,
2c47e605 7853 struct walk_control *wc, int max_level)
20524f02 7854{
2c47e605 7855 int level = wc->level;
20524f02 7856 int ret;
9f3a7427 7857
2c47e605
YZ
7858 path->slots[level] = btrfs_header_nritems(path->nodes[level]);
7859 while (level < max_level && path->nodes[level]) {
7860 wc->level = level;
7861 if (path->slots[level] + 1 <
7862 btrfs_header_nritems(path->nodes[level])) {
7863 path->slots[level]++;
20524f02
CM
7864 return 0;
7865 } else {
2c47e605
YZ
7866 ret = walk_up_proc(trans, root, path, wc);
7867 if (ret > 0)
7868 return 0;
65c6e82b
QW
7869 if (ret < 0)
7870 return ret;
bd56b302 7871
2c47e605 7872 if (path->locks[level]) {
bd681513
CM
7873 btrfs_tree_unlock_rw(path->nodes[level],
7874 path->locks[level]);
2c47e605 7875 path->locks[level] = 0;
f82d02d9 7876 }
2c47e605
YZ
7877 free_extent_buffer(path->nodes[level]);
7878 path->nodes[level] = NULL;
7879 level++;
20524f02
CM
7880 }
7881 }
7882 return 1;
7883}
7884
9aca1d51 7885/*
2c47e605
YZ
7886 * drop a subvolume tree.
7887 *
7888 * this function traverses the tree freeing any blocks that are only
7889 * referenced by the tree.
7890 *
7891 * when a shared tree block is found, this function decreases its
7892 * reference count by one. if update_ref is true, this function
7893 * also makes sure backrefs for the shared block and all lower level
7894 * blocks are properly updated.
9d1a2a3a
DS
7895 *
7896 * If called with for_reloc == 0, may exit early with -EAGAIN
9aca1d51 7897 */
2c536799 7898int btrfs_drop_snapshot(struct btrfs_root *root,
66d7e7f0
AJ
7899 struct btrfs_block_rsv *block_rsv, int update_ref,
7900 int for_reloc)
20524f02 7901{
ab8d0fc4 7902 struct btrfs_fs_info *fs_info = root->fs_info;
5caf2a00 7903 struct btrfs_path *path;
2c47e605 7904 struct btrfs_trans_handle *trans;
ab8d0fc4 7905 struct btrfs_root *tree_root = fs_info->tree_root;
9f3a7427 7906 struct btrfs_root_item *root_item = &root->root_item;
2c47e605
YZ
7907 struct walk_control *wc;
7908 struct btrfs_key key;
7909 int err = 0;
7910 int ret;
7911 int level;
d29a9f62 7912 bool root_dropped = false;
20524f02 7913
4fd786e6 7914 btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
1152651a 7915
5caf2a00 7916 path = btrfs_alloc_path();
cb1b69f4
TI
7917 if (!path) {
7918 err = -ENOMEM;
7919 goto out;
7920 }
20524f02 7921
2c47e605 7922 wc = kzalloc(sizeof(*wc), GFP_NOFS);
38a1a919
MF
7923 if (!wc) {
7924 btrfs_free_path(path);
cb1b69f4
TI
7925 err = -ENOMEM;
7926 goto out;
38a1a919 7927 }
2c47e605 7928
a22285a6 7929 trans = btrfs_start_transaction(tree_root, 0);
79787eaa
JM
7930 if (IS_ERR(trans)) {
7931 err = PTR_ERR(trans);
7932 goto out_free;
7933 }
98d5dc13 7934
0568e82d
JB
7935 err = btrfs_run_delayed_items(trans);
7936 if (err)
7937 goto out_end_trans;
7938
3fd0a558
YZ
7939 if (block_rsv)
7940 trans->block_rsv = block_rsv;
2c47e605 7941
83354f07
JB
7942 /*
7943 * This will help us catch people modifying the fs tree while we're
7944 * dropping it. It is unsafe to mess with the fs tree while it's being
7945 * dropped as we unlock the root node and parent nodes as we walk down
7946 * the tree, assuming nothing will change. If something does change
7947 * then we'll have stale information and drop references to blocks we've
7948 * already dropped.
7949 */
7950 set_bit(BTRFS_ROOT_DELETING, &root->state);
9f3a7427 7951 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2c47e605 7952 level = btrfs_header_level(root->node);
5d4f98a2 7953 path->nodes[level] = btrfs_lock_root_node(root);
8bead258 7954 btrfs_set_lock_blocking_write(path->nodes[level]);
9f3a7427 7955 path->slots[level] = 0;
bd681513 7956 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605
YZ
7957 memset(&wc->update_progress, 0,
7958 sizeof(wc->update_progress));
9f3a7427 7959 } else {
9f3a7427 7960 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2c47e605
YZ
7961 memcpy(&wc->update_progress, &key,
7962 sizeof(wc->update_progress));
7963
6702ed49 7964 level = root_item->drop_level;
2c47e605 7965 BUG_ON(level == 0);
6702ed49 7966 path->lowest_level = level;
2c47e605
YZ
7967 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7968 path->lowest_level = 0;
7969 if (ret < 0) {
7970 err = ret;
79787eaa 7971 goto out_end_trans;
9f3a7427 7972 }
1c4850e2 7973 WARN_ON(ret > 0);
2c47e605 7974
7d9eb12c
CM
7975 /*
7976 * unlock our path, this is safe because only this
7977 * function is allowed to delete this snapshot
7978 */
5d4f98a2 7979 btrfs_unlock_up_safe(path, 0);
2c47e605
YZ
7980
7981 level = btrfs_header_level(root->node);
7982 while (1) {
7983 btrfs_tree_lock(path->nodes[level]);
8bead258 7984 btrfs_set_lock_blocking_write(path->nodes[level]);
fec386ac 7985 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605 7986
2ff7e61e 7987 ret = btrfs_lookup_extent_info(trans, fs_info,
2c47e605 7988 path->nodes[level]->start,
3173a18f 7989 level, 1, &wc->refs[level],
2c47e605 7990 &wc->flags[level]);
79787eaa
JM
7991 if (ret < 0) {
7992 err = ret;
7993 goto out_end_trans;
7994 }
2c47e605
YZ
7995 BUG_ON(wc->refs[level] == 0);
7996
7997 if (level == root_item->drop_level)
7998 break;
7999
8000 btrfs_tree_unlock(path->nodes[level]);
fec386ac 8001 path->locks[level] = 0;
2c47e605
YZ
8002 WARN_ON(wc->refs[level] != 1);
8003 level--;
8004 }
9f3a7427 8005 }
2c47e605 8006
78c52d9e 8007 wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
2c47e605
YZ
8008 wc->level = level;
8009 wc->shared_level = -1;
8010 wc->stage = DROP_REFERENCE;
8011 wc->update_ref = update_ref;
8012 wc->keep_locks = 0;
0b246afa 8013 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
2c47e605 8014
d397712b 8015 while (1) {
9d1a2a3a 8016
2c47e605
YZ
8017 ret = walk_down_tree(trans, root, path, wc);
8018 if (ret < 0) {
8019 err = ret;
20524f02 8020 break;
2c47e605 8021 }
9aca1d51 8022
2c47e605
YZ
8023 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
8024 if (ret < 0) {
8025 err = ret;
20524f02 8026 break;
2c47e605
YZ
8027 }
8028
8029 if (ret > 0) {
8030 BUG_ON(wc->stage != DROP_REFERENCE);
e7a84565
CM
8031 break;
8032 }
2c47e605
YZ
8033
8034 if (wc->stage == DROP_REFERENCE) {
aea6f028
JB
8035 wc->drop_level = wc->level;
8036 btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
8037 &wc->drop_progress,
8038 path->slots[wc->drop_level]);
8039 }
8040 btrfs_cpu_key_to_disk(&root_item->drop_progress,
8041 &wc->drop_progress);
8042 root_item->drop_level = wc->drop_level;
2c47e605
YZ
8043
8044 BUG_ON(wc->level == 0);
3a45bb20 8045 if (btrfs_should_end_transaction(trans) ||
2ff7e61e 8046 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
2c47e605
YZ
8047 ret = btrfs_update_root(trans, tree_root,
8048 &root->root_key,
8049 root_item);
79787eaa 8050 if (ret) {
66642832 8051 btrfs_abort_transaction(trans, ret);
79787eaa
JM
8052 err = ret;
8053 goto out_end_trans;
8054 }
2c47e605 8055
3a45bb20 8056 btrfs_end_transaction_throttle(trans);
2ff7e61e 8057 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
ab8d0fc4
JM
8058 btrfs_debug(fs_info,
8059 "drop snapshot early exit");
3c8f2422
JB
8060 err = -EAGAIN;
8061 goto out_free;
8062 }
8063
a22285a6 8064 trans = btrfs_start_transaction(tree_root, 0);
79787eaa
JM
8065 if (IS_ERR(trans)) {
8066 err = PTR_ERR(trans);
8067 goto out_free;
8068 }
3fd0a558
YZ
8069 if (block_rsv)
8070 trans->block_rsv = block_rsv;
c3e69d58 8071 }
20524f02 8072 }
b3b4aa74 8073 btrfs_release_path(path);
79787eaa
JM
8074 if (err)
8075 goto out_end_trans;
2c47e605 8076
ab9ce7d4 8077 ret = btrfs_del_root(trans, &root->root_key);
79787eaa 8078 if (ret) {
66642832 8079 btrfs_abort_transaction(trans, ret);
e19182c0 8080 err = ret;
79787eaa
JM
8081 goto out_end_trans;
8082 }
2c47e605 8083
76dda93c 8084 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
cb517eab
MX
8085 ret = btrfs_find_root(tree_root, &root->root_key, path,
8086 NULL, NULL);
79787eaa 8087 if (ret < 0) {
66642832 8088 btrfs_abort_transaction(trans, ret);
79787eaa
JM
8089 err = ret;
8090 goto out_end_trans;
8091 } else if (ret > 0) {
84cd948c
JB
8092 /* if we fail to delete the orphan item this time
8093 * around, it'll get picked up the next time.
8094 *
8095 * The most common failure here is just -ENOENT.
8096 */
8097 btrfs_del_orphan_item(trans, tree_root,
8098 root->root_key.objectid);
76dda93c
YZ
8099 }
8100 }
8101
27cdeb70 8102 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
2b9dbef2 8103 btrfs_add_dropped_root(trans, root);
76dda93c
YZ
8104 } else {
8105 free_extent_buffer(root->node);
8106 free_extent_buffer(root->commit_root);
b0feb9d9 8107 btrfs_put_fs_root(root);
76dda93c 8108 }
d29a9f62 8109 root_dropped = true;
79787eaa 8110out_end_trans:
3a45bb20 8111 btrfs_end_transaction_throttle(trans);
79787eaa 8112out_free:
2c47e605 8113 kfree(wc);
5caf2a00 8114 btrfs_free_path(path);
cb1b69f4 8115out:
d29a9f62
JB
8116 /*
8117	 * So if we need to stop dropping the snapshot for whatever reason, we
8118	 * need to make sure to add it back to the dead root list so that we
8119	 * keep trying to do the work later. This also cleans up the root if we
8120	 * don't have it in the radix (like when we recover after a power fail
8121	 * or unmount), so we don't leak memory.
8122 */
897ca819 8123 if (!for_reloc && !root_dropped)
d29a9f62 8124 btrfs_add_dead_root(root);
90515e7f 8125 if (err && err != -EAGAIN)
ab8d0fc4 8126 btrfs_handle_fs_error(fs_info, err, NULL);
2c536799 8127 return err;
20524f02 8128}
9078a3e1 8129
2c47e605
YZ
8130/*
8131 * drop subtree rooted at tree block 'node'.
8132 *
8133 * NOTE: this function will unlock and release tree block 'node'
66d7e7f0 8134 * only used by relocation code
2c47e605 8135 */
f82d02d9
YZ
8136int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
8137 struct btrfs_root *root,
8138 struct extent_buffer *node,
8139 struct extent_buffer *parent)
8140{
0b246afa 8141 struct btrfs_fs_info *fs_info = root->fs_info;
f82d02d9 8142 struct btrfs_path *path;
2c47e605 8143 struct walk_control *wc;
f82d02d9
YZ
8144 int level;
8145 int parent_level;
8146 int ret = 0;
8147 int wret;
8148
2c47e605
YZ
8149 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
8150
f82d02d9 8151 path = btrfs_alloc_path();
db5b493a
TI
8152 if (!path)
8153 return -ENOMEM;
f82d02d9 8154
2c47e605 8155 wc = kzalloc(sizeof(*wc), GFP_NOFS);
db5b493a
TI
8156 if (!wc) {
8157 btrfs_free_path(path);
8158 return -ENOMEM;
8159 }
2c47e605 8160
b9447ef8 8161 btrfs_assert_tree_locked(parent);
f82d02d9
YZ
8162 parent_level = btrfs_header_level(parent);
8163 extent_buffer_get(parent);
8164 path->nodes[parent_level] = parent;
8165 path->slots[parent_level] = btrfs_header_nritems(parent);
8166
b9447ef8 8167 btrfs_assert_tree_locked(node);
f82d02d9 8168 level = btrfs_header_level(node);
f82d02d9
YZ
8169 path->nodes[level] = node;
8170 path->slots[level] = 0;
bd681513 8171 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605
YZ
8172
8173 wc->refs[parent_level] = 1;
8174 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8175 wc->level = level;
8176 wc->shared_level = -1;
8177 wc->stage = DROP_REFERENCE;
8178 wc->update_ref = 0;
8179 wc->keep_locks = 1;
0b246afa 8180 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
f82d02d9
YZ
8181
8182 while (1) {
2c47e605
YZ
8183 wret = walk_down_tree(trans, root, path, wc);
8184 if (wret < 0) {
f82d02d9 8185 ret = wret;
f82d02d9 8186 break;
2c47e605 8187 }
f82d02d9 8188
2c47e605 8189 wret = walk_up_tree(trans, root, path, wc, parent_level);
f82d02d9
YZ
8190 if (wret < 0)
8191 ret = wret;
8192 if (wret != 0)
8193 break;
8194 }
8195
2c47e605 8196 kfree(wc);
f82d02d9
YZ
8197 btrfs_free_path(path);
8198 return ret;
8199}
8200
6202df69 8201static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
ec44a35c
CM
8202{
8203 u64 num_devices;
fc67c450 8204 u64 stripped;
e4d8ec0f 8205
fc67c450
ID
8206 /*
8207	 * if restriping is in progress for this chunk type, pick the target
8208	 * profile and return; otherwise do the usual balance
8209 */
6202df69 8210 stripped = get_restripe_target(fs_info, flags);
fc67c450
ID
8211 if (stripped)
8212 return extended_to_chunk(stripped);
e4d8ec0f 8213
6202df69 8214 num_devices = fs_info->fs_devices->rw_devices;
cd02dca5 8215
a07e8a46 8216 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
c7369b3f 8217 BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
fc67c450 8218
ec44a35c
CM
8219 if (num_devices == 1) {
8220 stripped |= BTRFS_BLOCK_GROUP_DUP;
8221 stripped = flags & ~stripped;
8222
8223 /* turn raid0 into single device chunks */
8224 if (flags & BTRFS_BLOCK_GROUP_RAID0)
8225 return stripped;
8226
8227 /* turn mirroring into duplication */
c7369b3f 8228 if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
ec44a35c
CM
8229 BTRFS_BLOCK_GROUP_RAID10))
8230 return stripped | BTRFS_BLOCK_GROUP_DUP;
ec44a35c
CM
8231 } else {
8232 /* they already had raid on here, just return */
ec44a35c
CM
8233 if (flags & stripped)
8234 return flags;
8235
8236 stripped |= BTRFS_BLOCK_GROUP_DUP;
8237 stripped = flags & ~stripped;
8238
8239 /* switch duplicated blocks with raid1 */
8240 if (flags & BTRFS_BLOCK_GROUP_DUP)
8241 return stripped | BTRFS_BLOCK_GROUP_RAID1;
8242
e3176ca2 8243 /* this is drive concat, leave it alone */
ec44a35c 8244 }
e3176ca2 8245
ec44a35c
CM
8246 return flags;
8247}
8248
868f401a 8249static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
0ef3e66b 8250{
f0486c68
YZ
8251 struct btrfs_space_info *sinfo = cache->space_info;
8252 u64 num_bytes;
3ece54e5 8253 u64 sinfo_used;
199c36ea 8254 u64 min_allocable_bytes;
f0486c68 8255 int ret = -ENOSPC;
0ef3e66b 8256
199c36ea
MX
8257 /*
8258	 * We need some metadata space and system metadata space for
8259	 * allocating chunks in some corner cases before we can force
8260	 * the block group to be read-only.
8261 */
8262 if ((sinfo->flags &
8263 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
8264 !force)
ee22184b 8265 min_allocable_bytes = SZ_1M;
199c36ea
MX
8266 else
8267 min_allocable_bytes = 0;
8268
f0486c68
YZ
8269 spin_lock(&sinfo->lock);
8270 spin_lock(&cache->lock);
61cfea9b
W
8271
8272 if (cache->ro) {
868f401a 8273 cache->ro++;
61cfea9b
W
8274 ret = 0;
8275 goto out;
8276 }
8277
f0486c68
YZ
8278 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8279 cache->bytes_super - btrfs_block_group_used(&cache->item);
3ece54e5 8280 sinfo_used = btrfs_space_info_used(sinfo, true);
f0486c68 8281
3ece54e5
QW
8282 if (sinfo_used + num_bytes + min_allocable_bytes <=
8283 sinfo->total_bytes) {
f0486c68 8284 sinfo->bytes_readonly += num_bytes;
868f401a 8285 cache->ro++;
633c0aad 8286 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
f0486c68
YZ
8287 ret = 0;
8288 }
61cfea9b 8289out:
f0486c68
YZ
8290 spin_unlock(&cache->lock);
8291 spin_unlock(&sinfo->lock);
3ece54e5
QW
8292 if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
8293 btrfs_info(cache->fs_info,
8294 "unable to make block group %llu ro",
8295 cache->key.objectid);
8296 btrfs_info(cache->fs_info,
8297 "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
8298 sinfo_used, num_bytes, min_allocable_bytes);
5da6afeb 8299 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
3ece54e5 8300 }
f0486c68
YZ
8301 return ret;
8302}
7d9eb12c 8303
c83488af 8304int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
c286ac48 8305
f0486c68 8306{
c83488af 8307 struct btrfs_fs_info *fs_info = cache->fs_info;
f0486c68
YZ
8308 struct btrfs_trans_handle *trans;
8309 u64 alloc_flags;
8310 int ret;
7d9eb12c 8311
1bbc621e 8312again:
5e00f193 8313 trans = btrfs_join_transaction(fs_info->extent_root);
79787eaa
JM
8314 if (IS_ERR(trans))
8315 return PTR_ERR(trans);
5d4f98a2 8316
1bbc621e
CM
8317 /*
8318 * we're not allowed to set block groups readonly after the dirty
8319 * block groups cache has started writing. If it already started,
8320 * back off and let this transaction commit
8321 */
0b246afa 8322 mutex_lock(&fs_info->ro_block_group_mutex);
3204d33c 8323 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
1bbc621e
CM
8324 u64 transid = trans->transid;
8325
0b246afa 8326 mutex_unlock(&fs_info->ro_block_group_mutex);
3a45bb20 8327 btrfs_end_transaction(trans);
1bbc621e 8328
2ff7e61e 8329 ret = btrfs_wait_for_commit(fs_info, transid);
1bbc621e
CM
8330 if (ret)
8331 return ret;
8332 goto again;
8333 }
8334
153c35b6
CM
8335 /*
8336 * if we are changing raid levels, try to allocate a corresponding
8337 * block group with the new raid level.
8338 */
0b246afa 8339 alloc_flags = update_block_group_flags(fs_info, cache->flags);
153c35b6 8340 if (alloc_flags != cache->flags) {
fc471cb0 8341 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
153c35b6
CM
8342 /*
8343 * ENOSPC is allowed here, we may have enough space
8344 * already allocated at the new raid level to
8345 * carry on
8346 */
8347 if (ret == -ENOSPC)
8348 ret = 0;
8349 if (ret < 0)
8350 goto out;
8351 }
1bbc621e 8352
868f401a 8353 ret = inc_block_group_ro(cache, 0);
f0486c68
YZ
8354 if (!ret)
8355 goto out;
2ff7e61e 8356 alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
fc471cb0 8357 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
f0486c68
YZ
8358 if (ret < 0)
8359 goto out;
868f401a 8360 ret = inc_block_group_ro(cache, 0);
f0486c68 8361out:
2f081088 8362 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
0b246afa 8363 alloc_flags = update_block_group_flags(fs_info, cache->flags);
34441361 8364 mutex_lock(&fs_info->chunk_mutex);
451a2c13 8365 check_system_chunk(trans, alloc_flags);
34441361 8366 mutex_unlock(&fs_info->chunk_mutex);
2f081088 8367 }
0b246afa 8368 mutex_unlock(&fs_info->ro_block_group_mutex);
2f081088 8369
3a45bb20 8370 btrfs_end_transaction(trans);
f0486c68
YZ
8371 return ret;
8372}
5d4f98a2 8373
43a7e99d 8374int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
c87f08ca 8375{
43a7e99d 8376 u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
2ff7e61e 8377
fc471cb0 8378 return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
c87f08ca
CM
8379}
8380
6d07bcec
MX
8381/*
8382 * helper to account the unused space of all the readonly block group in the
633c0aad 8383 * space_info. takes mirrors into account.
6d07bcec 8384 */
633c0aad 8385u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
6d07bcec
MX
8386{
8387 struct btrfs_block_group_cache *block_group;
8388 u64 free_bytes = 0;
8389 int factor;
8390
01327610 8391 /* It's df, we don't care if it's racy */
633c0aad
JB
8392 if (list_empty(&sinfo->ro_bgs))
8393 return 0;
8394
8395 spin_lock(&sinfo->lock);
8396 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
6d07bcec
MX
8397 spin_lock(&block_group->lock);
8398
8399 if (!block_group->ro) {
8400 spin_unlock(&block_group->lock);
8401 continue;
8402 }
8403
46df06b8 8404 factor = btrfs_bg_type_to_factor(block_group->flags);
6d07bcec
MX
8405 free_bytes += (block_group->key.offset -
8406 btrfs_block_group_used(&block_group->item)) *
8407 factor;
8408
8409 spin_unlock(&block_group->lock);
8410 }
6d07bcec
MX
8411 spin_unlock(&sinfo->lock);
8412
8413 return free_bytes;
8414}
8415
2ff7e61e 8416void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
5d4f98a2 8417{
f0486c68
YZ
8418 struct btrfs_space_info *sinfo = cache->space_info;
8419 u64 num_bytes;
8420
8421 BUG_ON(!cache->ro);
8422
8423 spin_lock(&sinfo->lock);
8424 spin_lock(&cache->lock);
868f401a
Z
8425 if (!--cache->ro) {
8426 num_bytes = cache->key.offset - cache->reserved -
8427 cache->pinned - cache->bytes_super -
8428 btrfs_block_group_used(&cache->item);
8429 sinfo->bytes_readonly -= num_bytes;
8430 list_del_init(&cache->ro_list);
8431 }
f0486c68
YZ
8432 spin_unlock(&cache->lock);
8433 spin_unlock(&sinfo->lock);
5d4f98a2
YZ
8434}
8435
ba1bf481 8436/*
52042d8e 8437 * Checks to see if it's even possible to relocate this block group.
ba1bf481
JB
8438 *
8439 * @return - -1 if it's not a good idea to relocate this block group, 0 if its
8440 * ok to go ahead and try.
8441 */
6bccf3ab 8442int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
1a40e23b 8443{
ba1bf481
JB
8444 struct btrfs_block_group_cache *block_group;
8445 struct btrfs_space_info *space_info;
0b246afa 8446 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
ba1bf481 8447 struct btrfs_device *device;
cdcb725c 8448 u64 min_free;
6719db6a
JB
8449 u64 dev_min = 1;
8450 u64 dev_nr = 0;
4a5e98f5 8451 u64 target;
0305bc27 8452 int debug;
cdcb725c 8453 int index;
ba1bf481
JB
8454 int full = 0;
8455 int ret = 0;
1a40e23b 8456
0b246afa 8457 debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
0305bc27 8458
0b246afa 8459 block_group = btrfs_lookup_block_group(fs_info, bytenr);
1a40e23b 8460
ba1bf481 8461 /* odd, couldn't find the block group, leave it alone */
0305bc27
QW
8462 if (!block_group) {
8463 if (debug)
0b246afa 8464 btrfs_warn(fs_info,
0305bc27
QW
8465 "can't find block group for bytenr %llu",
8466 bytenr);
ba1bf481 8467 return -1;
0305bc27 8468 }
1a40e23b 8469
cdcb725c 8470 min_free = btrfs_block_group_used(&block_group->item);
8471
ba1bf481 8472 /* no bytes used, we're good */
cdcb725c 8473 if (!min_free)
1a40e23b
ZY
8474 goto out;
8475
ba1bf481
JB
8476 space_info = block_group->space_info;
8477 spin_lock(&space_info->lock);
17d217fe 8478
ba1bf481 8479 full = space_info->full;
17d217fe 8480
ba1bf481
JB
8481 /*
8482 * if this is the last block group we have in this space, we can't
7ce618db
CM
8483 * relocate it unless we're able to allocate a new chunk below.
8484 *
8485 * Otherwise, we need to make sure we have room in the space to handle
8486	 * all of the extents from this block group. If we can, we're good.
ba1bf481 8487 */
7ce618db 8488 if ((space_info->total_bytes != block_group->key.offset) &&
4136135b
LB
8489 (btrfs_space_info_used(space_info, false) + min_free <
8490 space_info->total_bytes)) {
ba1bf481
JB
8491 spin_unlock(&space_info->lock);
8492 goto out;
17d217fe 8493 }
ba1bf481 8494 spin_unlock(&space_info->lock);
ea8c2819 8495
ba1bf481
JB
8496 /*
8497 * ok we don't have enough space, but maybe we have free space on our
8498 * devices to allocate new chunks for relocation, so loop through our
4a5e98f5
ID
8499	 * alloc devices and guess if we have enough space. If this block
8500 * group is going to be restriped, run checks against the target
8501 * profile instead of the current one.
ba1bf481
JB
8502 */
8503 ret = -1;
ea8c2819 8504
cdcb725c 8505 /*
8506 * index:
8507 * 0: raid10
8508 * 1: raid1
8509 * 2: dup
8510 * 3: raid0
8511 * 4: single
8512 */
0b246afa 8513 target = get_restripe_target(fs_info, block_group->flags);
4a5e98f5 8514 if (target) {
3e72ee88 8515 index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
4a5e98f5
ID
8516 } else {
8517 /*
8518 * this is just a balance, so if we were marked as full
8519 * we know there is no space for a new chunk
8520 */
0305bc27
QW
8521 if (full) {
8522 if (debug)
0b246afa
JM
8523 btrfs_warn(fs_info,
8524 "no space to alloc new chunk for block group %llu",
8525 block_group->key.objectid);
4a5e98f5 8526 goto out;
0305bc27 8527 }
4a5e98f5 8528
3e72ee88 8529 index = btrfs_bg_flags_to_raid_index(block_group->flags);
4a5e98f5
ID
8530 }
8531
e6ec716f 8532 if (index == BTRFS_RAID_RAID10) {
cdcb725c 8533 dev_min = 4;
6719db6a
JB
8534 /* Divide by 2 */
8535 min_free >>= 1;
e6ec716f 8536 } else if (index == BTRFS_RAID_RAID1) {
cdcb725c 8537 dev_min = 2;
e6ec716f 8538 } else if (index == BTRFS_RAID_DUP) {
6719db6a
JB
8539 /* Multiply by 2 */
8540 min_free <<= 1;
e6ec716f 8541 } else if (index == BTRFS_RAID_RAID0) {
cdcb725c 8542 dev_min = fs_devices->rw_devices;
47c5713f 8543 min_free = div64_u64(min_free, dev_min);
cdcb725c 8544 }
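	/*
	 * Example of the scaling above (assumed numbers): a RAID10 block
	 * group with 2GiB used needs dev_min = 4 rw devices, each with
	 * min_free = 1GiB of free space, because each device only has to
	 * hold half of the relocated data.
	 */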
8545
0b246afa 8546 mutex_lock(&fs_info->chunk_mutex);
ba1bf481 8547 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7bfc837d 8548 u64 dev_offset;
56bec294 8549
ba1bf481
JB
8550 /*
8551 * check to make sure we can actually find a chunk with enough
8552 * space to fit our block group in.
8553 */
63a212ab 8554 if (device->total_bytes > device->bytes_used + min_free &&
401e29c1 8555 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
60dfdf25 8556 ret = find_free_dev_extent(device, min_free,
7bfc837d 8557 &dev_offset, NULL);
ba1bf481 8558 if (!ret)
cdcb725c 8559 dev_nr++;
8560
8561 if (dev_nr >= dev_min)
73e48b27 8562 break;
cdcb725c 8563
ba1bf481 8564 ret = -1;
725c8463 8565 }
edbd8d4e 8566 }
0305bc27 8567 if (debug && ret == -1)
0b246afa
JM
8568 btrfs_warn(fs_info,
8569 "no space to allocate a new chunk for block group %llu",
8570 block_group->key.objectid);
8571 mutex_unlock(&fs_info->chunk_mutex);
edbd8d4e 8572out:
ba1bf481 8573 btrfs_put_block_group(block_group);
edbd8d4e
CM
8574 return ret;
8575}
8576
6bccf3ab
JM
8577static int find_first_block_group(struct btrfs_fs_info *fs_info,
8578 struct btrfs_path *path,
8579 struct btrfs_key *key)
0b86a832 8580{
6bccf3ab 8581 struct btrfs_root *root = fs_info->extent_root;
925baedd 8582 int ret = 0;
0b86a832
CM
8583 struct btrfs_key found_key;
8584 struct extent_buffer *leaf;
514c7dca
QW
8585 struct btrfs_block_group_item bg;
8586 u64 flags;
0b86a832 8587 int slot;
edbd8d4e 8588
0b86a832
CM
8589 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
8590 if (ret < 0)
925baedd
CM
8591 goto out;
8592
d397712b 8593 while (1) {
0b86a832 8594 slot = path->slots[0];
edbd8d4e 8595 leaf = path->nodes[0];
0b86a832
CM
8596 if (slot >= btrfs_header_nritems(leaf)) {
8597 ret = btrfs_next_leaf(root, path);
8598 if (ret == 0)
8599 continue;
8600 if (ret < 0)
925baedd 8601 goto out;
0b86a832 8602 break;
edbd8d4e 8603 }
0b86a832 8604 btrfs_item_key_to_cpu(leaf, &found_key, slot);
edbd8d4e 8605
0b86a832 8606 if (found_key.objectid >= key->objectid &&
925baedd 8607 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
6fb37b75
LB
8608 struct extent_map_tree *em_tree;
8609 struct extent_map *em;
8610
c8bf1b67 8611 em_tree = &root->fs_info->mapping_tree;
6fb37b75
LB
8612 read_lock(&em_tree->lock);
8613 em = lookup_extent_mapping(em_tree, found_key.objectid,
8614 found_key.offset);
8615 read_unlock(&em_tree->lock);
8616 if (!em) {
0b246afa 8617 btrfs_err(fs_info,
6fb37b75
LB
8618 "logical %llu len %llu found bg but no related chunk",
8619 found_key.objectid, found_key.offset);
8620 ret = -ENOENT;
514c7dca
QW
8621 } else if (em->start != found_key.objectid ||
8622 em->len != found_key.offset) {
8623 btrfs_err(fs_info,
8624 "block group %llu len %llu mismatch with chunk %llu len %llu",
8625 found_key.objectid, found_key.offset,
8626 em->start, em->len);
8627 ret = -EUCLEAN;
6fb37b75 8628 } else {
514c7dca
QW
8629 read_extent_buffer(leaf, &bg,
8630 btrfs_item_ptr_offset(leaf, slot),
8631 sizeof(bg));
8632 flags = btrfs_block_group_flags(&bg) &
8633 BTRFS_BLOCK_GROUP_TYPE_MASK;
8634
8635 if (flags != (em->map_lookup->type &
8636 BTRFS_BLOCK_GROUP_TYPE_MASK)) {
8637 btrfs_err(fs_info,
8638"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
8639 found_key.objectid,
8640 found_key.offset, flags,
8641 (BTRFS_BLOCK_GROUP_TYPE_MASK &
8642 em->map_lookup->type));
8643 ret = -EUCLEAN;
8644 } else {
8645 ret = 0;
8646 }
6fb37b75 8647 }
187ee58c 8648 free_extent_map(em);
925baedd
CM
8649 goto out;
8650 }
0b86a832 8651 path->slots[0]++;
edbd8d4e 8652 }
925baedd 8653out:
0b86a832 8654 return ret;
edbd8d4e
CM
8655}
8656
0af3d00b
JB
8657void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
8658{
8659 struct btrfs_block_group_cache *block_group;
8660 u64 last = 0;
8661
8662 while (1) {
8663 struct inode *inode;
8664
8665 block_group = btrfs_lookup_first_block_group(info, last);
8666 while (block_group) {
3aa7c7a3 8667 wait_block_group_cache_done(block_group);
0af3d00b
JB
8668 spin_lock(&block_group->lock);
8669 if (block_group->iref)
8670 break;
8671 spin_unlock(&block_group->lock);
f87b7eb8 8672 block_group = next_block_group(block_group);
0af3d00b
JB
8673 }
8674 if (!block_group) {
8675 if (last == 0)
8676 break;
8677 last = 0;
8678 continue;
8679 }
8680
8681 inode = block_group->inode;
8682 block_group->iref = 0;
8683 block_group->inode = NULL;
8684 spin_unlock(&block_group->lock);
f3bca802 8685 ASSERT(block_group->io_ctl.inode == NULL);
0af3d00b
JB
8686 iput(inode);
8687 last = block_group->key.objectid + block_group->key.offset;
8688 btrfs_put_block_group(block_group);
8689 }
8690}
8691
5cdd7db6
FM
8692/*
8693 * Must be called only after stopping all workers, since we could have block
8694 * group caching kthreads running, and therefore they could race with us if we
8695 * freed the block groups before stopping them.
8696 */
1a40e23b
ZY
8697int btrfs_free_block_groups(struct btrfs_fs_info *info)
8698{
8699 struct btrfs_block_group_cache *block_group;
4184ea7f 8700 struct btrfs_space_info *space_info;
11833d66 8701 struct btrfs_caching_control *caching_ctl;
1a40e23b
ZY
8702 struct rb_node *n;
8703
9e351cc8 8704 down_write(&info->commit_root_sem);
11833d66
YZ
8705 while (!list_empty(&info->caching_block_groups)) {
8706 caching_ctl = list_entry(info->caching_block_groups.next,
8707 struct btrfs_caching_control, list);
8708 list_del(&caching_ctl->list);
8709 put_caching_control(caching_ctl);
8710 }
9e351cc8 8711 up_write(&info->commit_root_sem);
11833d66 8712
47ab2a6c
JB
8713 spin_lock(&info->unused_bgs_lock);
8714 while (!list_empty(&info->unused_bgs)) {
8715 block_group = list_first_entry(&info->unused_bgs,
8716 struct btrfs_block_group_cache,
8717 bg_list);
8718 list_del_init(&block_group->bg_list);
8719 btrfs_put_block_group(block_group);
8720 }
8721 spin_unlock(&info->unused_bgs_lock);
8722
1a40e23b
ZY
8723 spin_lock(&info->block_group_cache_lock);
8724 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
8725 block_group = rb_entry(n, struct btrfs_block_group_cache,
8726 cache_node);
1a40e23b
ZY
8727 rb_erase(&block_group->cache_node,
8728 &info->block_group_cache_tree);
01eacb27 8729 RB_CLEAR_NODE(&block_group->cache_node);
d899e052
YZ
8730 spin_unlock(&info->block_group_cache_lock);
8731
80eb234a 8732 down_write(&block_group->space_info->groups_sem);
1a40e23b 8733 list_del(&block_group->list);
80eb234a 8734 up_write(&block_group->space_info->groups_sem);
d2fb3437 8735
3c14874a
JB
8736 /*
8737 * We haven't cached this block group, which means we could
8738 * possibly have excluded extents on this block group.
8739 */
36cce922
JB
8740 if (block_group->cached == BTRFS_CACHE_NO ||
8741 block_group->cached == BTRFS_CACHE_ERROR)
9e715da8 8742 free_excluded_extents(block_group);
3c14874a 8743
817d52f8 8744 btrfs_remove_free_space_cache(block_group);
5cdd7db6 8745 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
f3bca802
LB
8746 ASSERT(list_empty(&block_group->dirty_list));
8747 ASSERT(list_empty(&block_group->io_list));
8748 ASSERT(list_empty(&block_group->bg_list));
8749 ASSERT(atomic_read(&block_group->count) == 1);
11dfe35a 8750 btrfs_put_block_group(block_group);
d899e052
YZ
8751
8752 spin_lock(&info->block_group_cache_lock);
1a40e23b
ZY
8753 }
8754 spin_unlock(&info->block_group_cache_lock);
4184ea7f
CM
8755
8756 /* now that all the block groups are freed, go through and
8757 * free all the space_info structs. This is only called during
8758 * the final stages of unmount, and so we know nobody is
8759 * using them. We call synchronize_rcu() once before we start,
8760 * just to be on the safe side.
8761 */
8762 synchronize_rcu();
8763
8929ecfa
YZ
8764 release_global_block_rsv(info);
8765
67871254 8766 while (!list_empty(&info->space_info)) {
6ab0a202
JM
8767 int i;
8768
4184ea7f
CM
8769 space_info = list_entry(info->space_info.next,
8770 struct btrfs_space_info,
8771 list);
d555b6c3
JB
8772
8773 /*
8774 * Do not hide this behind enospc_debug, this is actually
8775 * important and indicates a real bug if this happens.
8776 */
8777 if (WARN_ON(space_info->bytes_pinned > 0 ||
b069e0c3 8778 space_info->bytes_reserved > 0 ||
d555b6c3 8779 space_info->bytes_may_use > 0))
5da6afeb 8780 btrfs_dump_space_info(info, space_info, 0, 0);
4184ea7f 8781 list_del(&space_info->list);
6ab0a202
JM
8782 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
8783 struct kobject *kobj;
c1895442
JM
8784 kobj = space_info->block_group_kobjs[i];
8785 space_info->block_group_kobjs[i] = NULL;
8786 if (kobj) {
6ab0a202
JM
8787 kobject_del(kobj);
8788 kobject_put(kobj);
8789 }
8790 }
8791 kobject_del(&space_info->kobj);
8792 kobject_put(&space_info->kobj);
4184ea7f 8793 }
1a40e23b
ZY
8794 return 0;
8795}
8796
75cb379d
JM
8797/* link_block_group will queue up kobjects to add when we're reclaim-safe */
8798void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
8799{
8800 struct btrfs_space_info *space_info;
8801 struct raid_kobject *rkobj;
8802 LIST_HEAD(list);
75cb379d
JM
8803 int ret = 0;
8804
8805 spin_lock(&fs_info->pending_raid_kobjs_lock);
8806 list_splice_init(&fs_info->pending_raid_kobjs, &list);
8807 spin_unlock(&fs_info->pending_raid_kobjs_lock);
8808
8809 list_for_each_entry(rkobj, &list, list) {
280c2908 8810 space_info = btrfs_find_space_info(fs_info, rkobj->flags);
75cb379d
JM
8811
8812 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
158da513 8813 "%s", btrfs_bg_type_to_raid_name(rkobj->flags));
75cb379d
JM
8814 if (ret) {
8815 kobject_put(&rkobj->kobj);
8816 break;
8817 }
8818 }
8819 if (ret)
8820 btrfs_warn(fs_info,
8821 "failed to add kobject for block cache, ignoring");
8822}
8823
c434d21c 8824static void link_block_group(struct btrfs_block_group_cache *cache)
b742bb82 8825{
c434d21c 8826 struct btrfs_space_info *space_info = cache->space_info;
75cb379d 8827 struct btrfs_fs_info *fs_info = cache->fs_info;
3e72ee88 8828 int index = btrfs_bg_flags_to_raid_index(cache->flags);
ed55b6ac 8829 bool first = false;
b742bb82
YZ
8830
8831 down_write(&space_info->groups_sem);
ed55b6ac
JM
8832 if (list_empty(&space_info->block_groups[index]))
8833 first = true;
8834 list_add_tail(&cache->list, &space_info->block_groups[index]);
8835 up_write(&space_info->groups_sem);
8836
8837 if (first) {
75cb379d
JM
8838 struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
8839 if (!rkobj) {
8840 btrfs_warn(cache->fs_info,
8841 "couldn't alloc memory for raid level kobject");
8842 return;
6ab0a202 8843 }
75cb379d
JM
8844 rkobj->flags = cache->flags;
8845 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
8846
8847 spin_lock(&fs_info->pending_raid_kobjs_lock);
8848 list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
8849 spin_unlock(&fs_info->pending_raid_kobjs_lock);
c1895442 8850 space_info->block_group_kobjs[index] = &rkobj->kobj;
6ab0a202 8851 }
b742bb82
YZ
8852}
8853
920e4a58 8854static struct btrfs_block_group_cache *
2ff7e61e
JM
8855btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
8856 u64 start, u64 size)
920e4a58
MX
8857{
8858 struct btrfs_block_group_cache *cache;
8859
8860 cache = kzalloc(sizeof(*cache), GFP_NOFS);
8861 if (!cache)
8862 return NULL;
8863
8864 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8865 GFP_NOFS);
8866 if (!cache->free_space_ctl) {
8867 kfree(cache);
8868 return NULL;
8869 }
8870
8871 cache->key.objectid = start;
8872 cache->key.offset = size;
8873 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8874
0b246afa 8875 cache->fs_info = fs_info;
e4ff5fb5 8876 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
1e144fb8
OS
8877 set_free_space_tree_thresholds(cache);
8878
920e4a58
MX
8879 atomic_set(&cache->count, 1);
8880 spin_lock_init(&cache->lock);
e570fd27 8881 init_rwsem(&cache->data_rwsem);
920e4a58
MX
8882 INIT_LIST_HEAD(&cache->list);
8883 INIT_LIST_HEAD(&cache->cluster_list);
47ab2a6c 8884 INIT_LIST_HEAD(&cache->bg_list);
633c0aad 8885 INIT_LIST_HEAD(&cache->ro_list);
ce93ec54 8886 INIT_LIST_HEAD(&cache->dirty_list);
c9dc4c65 8887 INIT_LIST_HEAD(&cache->io_list);
920e4a58 8888 btrfs_init_free_space_ctl(cache);
04216820 8889 atomic_set(&cache->trimming, 0);
a5ed9182 8890 mutex_init(&cache->free_space_lock);
0966a7b1 8891 btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
920e4a58
MX
8892
8893 return cache;
8894}
8895
7ef49515
QW
8896
8897/*
8898 * Iterate all chunks and verify that each of them has the corresponding block
8899 * group
8900 */
8901static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
8902{
c8bf1b67 8903 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7ef49515
QW
8904 struct extent_map *em;
8905 struct btrfs_block_group_cache *bg;
8906 u64 start = 0;
8907 int ret = 0;
8908
8909 while (1) {
c8bf1b67 8910 read_lock(&map_tree->lock);
7ef49515
QW
8911 /*
8912 * lookup_extent_mapping will return the first extent map
8913 * intersecting the range, so setting @len to 1 is enough to
8914 * get the first chunk.
8915 */
c8bf1b67
DS
8916 em = lookup_extent_mapping(map_tree, start, 1);
8917 read_unlock(&map_tree->lock);
7ef49515
QW
8918 if (!em)
8919 break;
8920
8921 bg = btrfs_lookup_block_group(fs_info, em->start);
8922 if (!bg) {
8923 btrfs_err(fs_info,
8924 "chunk start=%llu len=%llu doesn't have corresponding block group",
8925 em->start, em->len);
8926 ret = -EUCLEAN;
8927 free_extent_map(em);
8928 break;
8929 }
8930 if (bg->key.objectid != em->start ||
8931 bg->key.offset != em->len ||
8932 (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
8933 (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
8934 btrfs_err(fs_info,
8935"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
8936 em->start, em->len,
8937 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
8938 bg->key.objectid, bg->key.offset,
8939 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
8940 ret = -EUCLEAN;
8941 free_extent_map(em);
8942 btrfs_put_block_group(bg);
8943 break;
8944 }
8945 start = em->start + em->len;
8946 free_extent_map(em);
8947 btrfs_put_block_group(bg);
8948 }
8949 return ret;
8950}
8951
5b4aacef 8952int btrfs_read_block_groups(struct btrfs_fs_info *info)
9078a3e1
CM
8953{
8954 struct btrfs_path *path;
8955 int ret;
9078a3e1 8956 struct btrfs_block_group_cache *cache;
6324fbf3 8957 struct btrfs_space_info *space_info;
9078a3e1
CM
8958 struct btrfs_key key;
8959 struct btrfs_key found_key;
5f39d397 8960 struct extent_buffer *leaf;
0af3d00b
JB
8961 int need_clear = 0;
8962 u64 cache_gen;
49303381
LB
8963 u64 feature;
8964 int mixed;
8965
8966 feature = btrfs_super_incompat_flags(info->super_copy);
8967 mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
96b5179d 8968
9078a3e1 8969 key.objectid = 0;
0b86a832 8970 key.offset = 0;
962a298f 8971 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9078a3e1
CM
8972 path = btrfs_alloc_path();
8973 if (!path)
8974 return -ENOMEM;
e4058b54 8975 path->reada = READA_FORWARD;
9078a3e1 8976
0b246afa
JM
8977 cache_gen = btrfs_super_cache_generation(info->super_copy);
8978 if (btrfs_test_opt(info, SPACE_CACHE) &&
8979 btrfs_super_generation(info->super_copy) != cache_gen)
0af3d00b 8980 need_clear = 1;
0b246afa 8981 if (btrfs_test_opt(info, CLEAR_CACHE))
88c2ba3b 8982 need_clear = 1;
0af3d00b 8983
d397712b 8984 while (1) {
6bccf3ab 8985 ret = find_first_block_group(info, path, &key);
b742bb82
YZ
8986 if (ret > 0)
8987 break;
0b86a832
CM
8988 if (ret != 0)
8989 goto error;
920e4a58 8990
5f39d397
CM
8991 leaf = path->nodes[0];
8992 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
920e4a58 8993
2ff7e61e 8994 cache = btrfs_create_block_group_cache(info, found_key.objectid,
920e4a58 8995 found_key.offset);
9078a3e1 8996 if (!cache) {
0b86a832 8997 ret = -ENOMEM;
f0486c68 8998 goto error;
9078a3e1 8999 }
96303081 9000
cf7c1ef6
LB
9001 if (need_clear) {
9002 /*
9003 * When we mount with old space cache, we need to
9004 * set BTRFS_DC_CLEAR and set dirty flag.
9005 *
9006 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
9007 * truncate the old free space cache inode and
9008 * setup a new one.
9009 * b) Setting 'dirty flag' makes sure that we flush
9010 * the new space cache info onto disk.
9011 */
0b246afa 9012 if (btrfs_test_opt(info, SPACE_CACHE))
ce93ec54 9013 cache->disk_cache_state = BTRFS_DC_CLEAR;
cf7c1ef6 9014 }
0af3d00b 9015
5f39d397
CM
9016 read_extent_buffer(leaf, &cache->item,
9017 btrfs_item_ptr_offset(leaf, path->slots[0]),
9018 sizeof(cache->item));
920e4a58 9019 cache->flags = btrfs_block_group_flags(&cache->item);
49303381
LB
9020 if (!mixed &&
9021 ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
9022 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
9023 btrfs_err(info,
9024"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
9025 cache->key.objectid);
9026 ret = -EINVAL;
9027 goto error;
9028 }
0b86a832 9029
9078a3e1 9030 key.objectid = found_key.objectid + found_key.offset;
b3b4aa74 9031 btrfs_release_path(path);
34d52cb6 9032
3c14874a
JB
9033 /*
9034 * We need to exclude the super stripes now so that the space
9035 * info has super bytes accounted for, otherwise we'll think
9036 * we have more space than we actually do.
9037 */
3c4da657 9038 ret = exclude_super_stripes(cache);
835d974f
JB
9039 if (ret) {
9040 /*
9041 * We may have excluded something, so call this just in
9042 * case.
9043 */
9e715da8 9044 free_excluded_extents(cache);
920e4a58 9045 btrfs_put_block_group(cache);
835d974f
JB
9046 goto error;
9047 }
3c14874a 9048
817d52f8
JB
9049 /*
9050	 * Check for two cases: either we are full, and therefore
9051 * don't need to bother with the caching work since we won't
9052 * find any space, or we are empty, and we can just add all
52042d8e 9053 * the space in and be done with it. This saves us _a_lot_ of
817d52f8
JB
9054 * time, particularly in the full case.
9055 */
9056 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
11833d66 9057 cache->last_byte_to_unpin = (u64)-1;
817d52f8 9058 cache->cached = BTRFS_CACHE_FINISHED;
9e715da8 9059 free_excluded_extents(cache);
817d52f8 9060 } else if (btrfs_block_group_used(&cache->item) == 0) {
11833d66 9061 cache->last_byte_to_unpin = (u64)-1;
817d52f8 9062 cache->cached = BTRFS_CACHE_FINISHED;
4457c1c7 9063 add_new_free_space(cache, found_key.objectid,
817d52f8
JB
9064 found_key.objectid +
9065 found_key.offset);
9e715da8 9066 free_excluded_extents(cache);
817d52f8 9067 }
96b5179d 9068
0b246afa 9069 ret = btrfs_add_block_group_cache(info, cache);
8c579fe7
JB
9070 if (ret) {
9071 btrfs_remove_free_space_cache(cache);
9072 btrfs_put_block_group(cache);
9073 goto error;
9074 }
9075
0b246afa 9076 trace_btrfs_add_block_group(info, cache, 0);
280c2908
JB
9077 btrfs_update_space_info(info, cache->flags, found_key.offset,
9078 btrfs_block_group_used(&cache->item),
9079 cache->bytes_super, &space_info);
8c579fe7 9080
6324fbf3 9081 cache->space_info = space_info;
1b2da372 9082
c434d21c 9083 link_block_group(cache);
0f9dd46c 9084
0b246afa 9085 set_avail_alloc_bits(info, cache->flags);
2ff7e61e 9086 if (btrfs_chunk_readonly(info, cache->key.objectid)) {
868f401a 9087 inc_block_group_ro(cache, 1);
47ab2a6c 9088 } else if (btrfs_block_group_used(&cache->item) == 0) {
031f24da
QW
9089 ASSERT(list_empty(&cache->bg_list));
9090 btrfs_mark_bg_unused(cache);
47ab2a6c 9091 }
9078a3e1 9092 }
b742bb82 9093
0b246afa 9094 list_for_each_entry_rcu(space_info, &info->space_info, list) {
2ff7e61e 9095 if (!(get_alloc_profile(info, space_info->flags) &
b742bb82 9096 (BTRFS_BLOCK_GROUP_RAID10 |
c7369b3f 9097 BTRFS_BLOCK_GROUP_RAID1_MASK |
a07e8a46 9098 BTRFS_BLOCK_GROUP_RAID56_MASK |
b742bb82
YZ
9099 BTRFS_BLOCK_GROUP_DUP)))
9100 continue;
9101 /*
9102 * avoid allocating from un-mirrored block group if there are
9103 * mirrored block groups.
9104 */
1095cc0d 9105 list_for_each_entry(cache,
9106 &space_info->block_groups[BTRFS_RAID_RAID0],
9107 list)
868f401a 9108 inc_block_group_ro(cache, 1);
1095cc0d 9109 list_for_each_entry(cache,
9110 &space_info->block_groups[BTRFS_RAID_SINGLE],
9111 list)
868f401a 9112 inc_block_group_ro(cache, 1);
9078a3e1 9113 }
f0486c68 9114
75cb379d 9115 btrfs_add_raid_kobjects(info);
f0486c68 9116 init_global_block_rsv(info);
7ef49515 9117 ret = check_chunk_block_group_mappings(info);
0b86a832 9118error:
9078a3e1 9119 btrfs_free_path(path);
0b86a832 9120 return ret;
9078a3e1 9121}
6324fbf3 9122
6c686b35 9123void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
ea658bad 9124{
6c686b35 9125 struct btrfs_fs_info *fs_info = trans->fs_info;
545e3366 9126 struct btrfs_block_group_cache *block_group;
0b246afa 9127 struct btrfs_root *extent_root = fs_info->extent_root;
ea658bad
JB
9128 struct btrfs_block_group_item item;
9129 struct btrfs_key key;
9130 int ret = 0;
9131
5ce55557
FM
9132 if (!trans->can_flush_pending_bgs)
9133 return;
9134
545e3366
JB
9135 while (!list_empty(&trans->new_bgs)) {
9136 block_group = list_first_entry(&trans->new_bgs,
9137 struct btrfs_block_group_cache,
9138 bg_list);
ea658bad 9139 if (ret)
c92f6be3 9140 goto next;
ea658bad
JB
9141
9142 spin_lock(&block_group->lock);
9143 memcpy(&item, &block_group->item, sizeof(item));
9144 memcpy(&key, &block_group->key, sizeof(key));
9145 spin_unlock(&block_group->lock);
9146
9147 ret = btrfs_insert_item(trans, extent_root, &key, &item,
9148 sizeof(item));
9149 if (ret)
66642832 9150 btrfs_abort_transaction(trans, ret);
97aff912 9151 ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
6df9a95e 9152 if (ret)
66642832 9153 btrfs_abort_transaction(trans, ret);
e4e0711c 9154 add_block_group_free_space(trans, block_group);
1e144fb8 9155 /* already aborted the transaction if it failed. */
c92f6be3 9156next:
ba2c4d4e 9157 btrfs_delayed_refs_rsv_release(fs_info, 1);
c92f6be3 9158 list_del_init(&block_group->bg_list);
ea658bad 9159 }
5ce55557 9160 btrfs_trans_release_chunk_metadata(trans);
ea658bad
JB
9161}
9162
e7e02096 9163int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
0174484d 9164 u64 type, u64 chunk_offset, u64 size)
6324fbf3 9165{
e7e02096 9166 struct btrfs_fs_info *fs_info = trans->fs_info;
6324fbf3 9167 struct btrfs_block_group_cache *cache;
0b246afa 9168 int ret;
6324fbf3 9169
90787766 9170 btrfs_set_log_full_commit(trans);
e02119d5 9171
2ff7e61e 9172 cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
0f9dd46c
JB
9173 if (!cache)
9174 return -ENOMEM;
34d52cb6 9175
6324fbf3 9176 btrfs_set_block_group_used(&cache->item, bytes_used);
0174484d
NB
9177 btrfs_set_block_group_chunk_objectid(&cache->item,
9178 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
6324fbf3
CM
9179 btrfs_set_block_group_flags(&cache->item, type);
9180
920e4a58 9181 cache->flags = type;
11833d66 9182 cache->last_byte_to_unpin = (u64)-1;
817d52f8 9183 cache->cached = BTRFS_CACHE_FINISHED;
1e144fb8 9184 cache->needs_free_space = 1;
3c4da657 9185 ret = exclude_super_stripes(cache);
835d974f
JB
9186 if (ret) {
9187 /*
9188 * We may have excluded something, so call this just in
9189 * case.
9190 */
9e715da8 9191 free_excluded_extents(cache);
920e4a58 9192 btrfs_put_block_group(cache);
835d974f
JB
9193 return ret;
9194 }
96303081 9195
4457c1c7 9196 add_new_free_space(cache, chunk_offset, chunk_offset + size);
817d52f8 9197
9e715da8 9198 free_excluded_extents(cache);
11833d66 9199
d0bd4560 9200#ifdef CONFIG_BTRFS_DEBUG
2ff7e61e 9201 if (btrfs_should_fragment_free_space(cache)) {
d0bd4560
JB
9202 u64 new_bytes_used = size - bytes_used;
9203
9204 bytes_used += new_bytes_used >> 1;
2ff7e61e 9205 fragment_free_space(cache);
d0bd4560
JB
9206 }
9207#endif
2e6e5183 9208 /*
2be12ef7
NB
9209 * Ensure the corresponding space_info object is created and
9210 * assigned to our block group. We want our bg to be added to the rbtree
9211 * with its ->space_info set.
2e6e5183 9212 */
280c2908 9213 cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
dc2d3005 9214 ASSERT(cache->space_info);
2e6e5183 9215
0b246afa 9216 ret = btrfs_add_block_group_cache(fs_info, cache);
8c579fe7
JB
9217 if (ret) {
9218 btrfs_remove_free_space_cache(cache);
9219 btrfs_put_block_group(cache);
9220 return ret;
9221 }
9222
2e6e5183
FM
9223 /*
9224 * Now that our block group has its ->space_info set and is inserted in
9225 * the rbtree, update the space info's counters.
9226 */
0b246afa 9227 trace_btrfs_add_block_group(fs_info, cache, 1);
280c2908 9228 btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
e40edf2d 9229 cache->bytes_super, &cache->space_info);
0b246afa 9230 update_global_block_rsv(fs_info);
1b2da372 9231
c434d21c 9232 link_block_group(cache);
6324fbf3 9233
47ab2a6c 9234 list_add_tail(&cache->bg_list, &trans->new_bgs);
ba2c4d4e
JB
9235 trans->delayed_ref_updates++;
9236 btrfs_update_delayed_refs_rsv(trans);
6324fbf3 9237
0b246afa 9238 set_avail_alloc_bits(fs_info, type);
6324fbf3
CM
9239 return 0;
9240}
1a40e23b 9241
10ea00f5
ID
9242static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
9243{
899c81ea
ID
9244 u64 extra_flags = chunk_to_extended(flags) &
9245 BTRFS_EXTENDED_PROFILE_MASK;
10ea00f5 9246
de98ced9 9247 write_seqlock(&fs_info->profiles_lock);
10ea00f5
ID
9248 if (flags & BTRFS_BLOCK_GROUP_DATA)
9249 fs_info->avail_data_alloc_bits &= ~extra_flags;
9250 if (flags & BTRFS_BLOCK_GROUP_METADATA)
9251 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
9252 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
9253 fs_info->avail_system_alloc_bits &= ~extra_flags;
de98ced9 9254 write_sequnlock(&fs_info->profiles_lock);
10ea00f5
ID
9255}
9256
6d58a55a
DS
9257/*
9258 * Clear incompat bits for the following feature(s):
9259 *
9260 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
9261 * in the whole filesystem
9262 */
9263static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
9264{
9265 if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
9266 struct list_head *head = &fs_info->space_info;
9267 struct btrfs_space_info *sinfo;
9268
9269 list_for_each_entry_rcu(sinfo, head, list) {
9270 bool found = false;
9271
9272 down_read(&sinfo->groups_sem);
9273 if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
9274 found = true;
9275 if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
9276 found = true;
9277 up_read(&sinfo->groups_sem);
9278
9279 if (found)
9280 return;
9281 }
9282 btrfs_clear_fs_incompat(fs_info, RAID56);
9283 }
9284}
9285
1a40e23b 9286int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5a98ec01 9287 u64 group_start, struct extent_map *em)
1a40e23b 9288{
5a98ec01 9289 struct btrfs_fs_info *fs_info = trans->fs_info;
6bccf3ab 9290 struct btrfs_root *root = fs_info->extent_root;
1a40e23b
ZY
9291 struct btrfs_path *path;
9292 struct btrfs_block_group_cache *block_group;
44fb5511 9293 struct btrfs_free_cluster *cluster;
0b246afa 9294 struct btrfs_root *tree_root = fs_info->tree_root;
1a40e23b 9295 struct btrfs_key key;
0af3d00b 9296 struct inode *inode;
c1895442 9297 struct kobject *kobj = NULL;
1a40e23b 9298 int ret;
10ea00f5 9299 int index;
89a55897 9300 int factor;
4f69cb98 9301 struct btrfs_caching_control *caching_ctl = NULL;
04216820 9302 bool remove_em;
ba2c4d4e 9303 bool remove_rsv = false;
1a40e23b 9304
6bccf3ab 9305 block_group = btrfs_lookup_block_group(fs_info, group_start);
1a40e23b 9306 BUG_ON(!block_group);
c146afad 9307 BUG_ON(!block_group->ro);
1a40e23b 9308
4ed0a7a3 9309 trace_btrfs_remove_block_group(block_group);
9f7c43c9 9310 /*
9311 * Free the reserved super bytes from this block group before
9312 * remove it.
9313 */
9e715da8 9314 free_excluded_extents(block_group);
fd708b81
JB
9315 btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
9316 block_group->key.offset);
9f7c43c9 9317
1a40e23b 9318 memcpy(&key, &block_group->key, sizeof(key));
3e72ee88 9319 index = btrfs_bg_flags_to_raid_index(block_group->flags);
46df06b8 9320 factor = btrfs_bg_type_to_factor(block_group->flags);
1a40e23b 9321
44fb5511 9322 /* make sure this block group isn't part of an allocation cluster */
0b246afa 9323 cluster = &fs_info->data_alloc_cluster;
44fb5511
CM
9324 spin_lock(&cluster->refill_lock);
9325 btrfs_return_cluster_to_free_space(block_group, cluster);
9326 spin_unlock(&cluster->refill_lock);
9327
9328 /*
9329 * make sure this block group isn't part of a metadata
9330 * allocation cluster
9331 */
0b246afa 9332 cluster = &fs_info->meta_alloc_cluster;
44fb5511
CM
9333 spin_lock(&cluster->refill_lock);
9334 btrfs_return_cluster_to_free_space(block_group, cluster);
9335 spin_unlock(&cluster->refill_lock);
9336
1a40e23b 9337 path = btrfs_alloc_path();
d8926bb3
MF
9338 if (!path) {
9339 ret = -ENOMEM;
9340 goto out;
9341 }
1a40e23b 9342
1bbc621e
CM
9343 /*
9344 * get the inode first so any iput calls done for the io_list
9345 * aren't the final iput (no unlinks allowed now)
9346 */
7949f339 9347 inode = lookup_free_space_inode(block_group, path);
1bbc621e
CM
9348
9349 mutex_lock(&trans->transaction->cache_write_mutex);
9350 /*
52042d8e 9351 * Make sure our free space cache IO is done before removing the
1bbc621e
CM
9352 * free space inode
9353 */
9354 spin_lock(&trans->transaction->dirty_bgs_lock);
9355 if (!list_empty(&block_group->io_list)) {
9356 list_del_init(&block_group->io_list);
9357
9358 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
9359
9360 spin_unlock(&trans->transaction->dirty_bgs_lock);
afdb5718 9361 btrfs_wait_cache_io(trans, block_group, path);
1bbc621e
CM
9362 btrfs_put_block_group(block_group);
9363 spin_lock(&trans->transaction->dirty_bgs_lock);
9364 }
9365
9366 if (!list_empty(&block_group->dirty_list)) {
9367 list_del_init(&block_group->dirty_list);
ba2c4d4e 9368 remove_rsv = true;
1bbc621e
CM
9369 btrfs_put_block_group(block_group);
9370 }
9371 spin_unlock(&trans->transaction->dirty_bgs_lock);
9372 mutex_unlock(&trans->transaction->cache_write_mutex);
9373
0af3d00b 9374 if (!IS_ERR(inode)) {
73f2e545 9375 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
79787eaa
JM
9376 if (ret) {
9377 btrfs_add_delayed_iput(inode);
9378 goto out;
9379 }
0af3d00b
JB
9380 clear_nlink(inode);
9381 /* One for the block groups ref */
9382 spin_lock(&block_group->lock);
9383 if (block_group->iref) {
9384 block_group->iref = 0;
9385 block_group->inode = NULL;
9386 spin_unlock(&block_group->lock);
9387 iput(inode);
9388 } else {
9389 spin_unlock(&block_group->lock);
9390 }
9391 /* One for our lookup ref */
455757c3 9392 btrfs_add_delayed_iput(inode);
0af3d00b
JB
9393 }
9394
9395 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
9396 key.offset = block_group->key.objectid;
9397 key.type = 0;
9398
9399 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
9400 if (ret < 0)
9401 goto out;
9402 if (ret > 0)
b3b4aa74 9403 btrfs_release_path(path);
0af3d00b
JB
9404 if (ret == 0) {
9405 ret = btrfs_del_item(trans, tree_root, path);
9406 if (ret)
9407 goto out;
b3b4aa74 9408 btrfs_release_path(path);
0af3d00b
JB
9409 }
9410
0b246afa 9411 spin_lock(&fs_info->block_group_cache_lock);
1a40e23b 9412 rb_erase(&block_group->cache_node,
0b246afa 9413 &fs_info->block_group_cache_tree);
292cbd51 9414 RB_CLEAR_NODE(&block_group->cache_node);
a1897fdd 9415
0b246afa
JM
9416 if (fs_info->first_logical_byte == block_group->key.objectid)
9417 fs_info->first_logical_byte = (u64)-1;
9418 spin_unlock(&fs_info->block_group_cache_lock);
817d52f8 9419
80eb234a 9420 down_write(&block_group->space_info->groups_sem);
44fb5511
CM
9421 /*
9422 * we must use list_del_init so people can check to see if they
9423 * are still on the list after taking the semaphore
9424 */
9425 list_del_init(&block_group->list);
6ab0a202 9426 if (list_empty(&block_group->space_info->block_groups[index])) {
c1895442
JM
9427 kobj = block_group->space_info->block_group_kobjs[index];
9428 block_group->space_info->block_group_kobjs[index] = NULL;
0b246afa 9429 clear_avail_alloc_bits(fs_info, block_group->flags);
6ab0a202 9430 }
80eb234a 9431 up_write(&block_group->space_info->groups_sem);
6d58a55a 9432 clear_incompat_bg_bits(fs_info, block_group->flags);
c1895442
JM
9433 if (kobj) {
9434 kobject_del(kobj);
9435 kobject_put(kobj);
9436 }
1a40e23b 9437
4f69cb98
FM
9438 if (block_group->has_caching_ctl)
9439 caching_ctl = get_caching_control(block_group);
817d52f8 9440 if (block_group->cached == BTRFS_CACHE_STARTED)
11833d66 9441 wait_block_group_cache_done(block_group);
4f69cb98 9442 if (block_group->has_caching_ctl) {
0b246afa 9443 down_write(&fs_info->commit_root_sem);
4f69cb98
FM
9444 if (!caching_ctl) {
9445 struct btrfs_caching_control *ctl;
9446
9447 list_for_each_entry(ctl,
0b246afa 9448 &fs_info->caching_block_groups, list)
4f69cb98
FM
9449 if (ctl->block_group == block_group) {
9450 caching_ctl = ctl;
1e4f4714 9451 refcount_inc(&caching_ctl->count);
4f69cb98
FM
9452 break;
9453 }
9454 }
9455 if (caching_ctl)
9456 list_del_init(&caching_ctl->list);
0b246afa 9457 up_write(&fs_info->commit_root_sem);
4f69cb98
FM
9458 if (caching_ctl) {
9459 /* Once for the caching bgs list and once for us. */
9460 put_caching_control(caching_ctl);
9461 put_caching_control(caching_ctl);
9462 }
9463 }
817d52f8 9464
ce93ec54 9465 spin_lock(&trans->transaction->dirty_bgs_lock);
9a0ec83d
NB
9466 WARN_ON(!list_empty(&block_group->dirty_list));
9467 WARN_ON(!list_empty(&block_group->io_list));
ce93ec54 9468 spin_unlock(&trans->transaction->dirty_bgs_lock);
9a0ec83d 9469
817d52f8
JB
9470 btrfs_remove_free_space_cache(block_group);
9471
c146afad 9472 spin_lock(&block_group->space_info->lock);
75c68e9f 9473 list_del_init(&block_group->ro_list);
18d018ad 9474
0b246afa 9475 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
18d018ad
ZL
9476 WARN_ON(block_group->space_info->total_bytes
9477 < block_group->key.offset);
9478 WARN_ON(block_group->space_info->bytes_readonly
9479 < block_group->key.offset);
9480 WARN_ON(block_group->space_info->disk_total
9481 < block_group->key.offset * factor);
9482 }
c146afad
YZ
9483 block_group->space_info->total_bytes -= block_group->key.offset;
9484 block_group->space_info->bytes_readonly -= block_group->key.offset;
89a55897 9485 block_group->space_info->disk_total -= block_group->key.offset * factor;
18d018ad 9486
c146afad 9487 spin_unlock(&block_group->space_info->lock);
283bb197 9488
0af3d00b
JB
9489 memcpy(&key, &block_group->key, sizeof(key));
9490
34441361 9491 mutex_lock(&fs_info->chunk_mutex);
04216820
FM
9492 spin_lock(&block_group->lock);
9493 block_group->removed = 1;
9494 /*
9495 * At this point trimming can't start on this block group, because we
9496 * removed the block group from the tree fs_info->block_group_cache_tree
9497	 * so no one can find it anymore, and even if someone already got this
9498 * block group before we removed it from the rbtree, they have already
9499 * incremented block_group->trimming - if they didn't, they won't find
9500 * any free space entries because we already removed them all when we
9501 * called btrfs_remove_free_space_cache().
9502 *
9503 * And we must not remove the extent map from the fs_info->mapping_tree
9504 * to prevent the same logical address range and physical device space
9505 * ranges from being reused for a new block group. This is because our
9506 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
9507 * completely transactionless, so while it is trimming a range the
9508 * currently running transaction might finish and a new one start,
9509 * allowing for new block groups to be created that can reuse the same
9510 * physical device locations unless we take this special care.
e33e17ee
JM
9511 *
9512 * There may also be an implicit trim operation if the file system
9513 * is mounted with -odiscard. The same protections must remain
9514 * in place until the extents have been discarded completely when
9515 * the transaction commit has completed.
04216820
FM
9516 */
9517 remove_em = (atomic_read(&block_group->trimming) == 0);
04216820 9518 spin_unlock(&block_group->lock);
04216820 9519
34441361 9520 mutex_unlock(&fs_info->chunk_mutex);
8dbcd10f 9521
f3f72779 9522 ret = remove_block_group_free_space(trans, block_group);
1e144fb8
OS
9523 if (ret)
9524 goto out;
9525
fa9c0d79
CM
9526 btrfs_put_block_group(block_group);
9527 btrfs_put_block_group(block_group);
1a40e23b
ZY
9528
9529 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
9530 if (ret > 0)
9531 ret = -EIO;
9532 if (ret < 0)
9533 goto out;
9534
9535 ret = btrfs_del_item(trans, root, path);
8eaf40c0
FM
9536 if (ret)
9537 goto out;
9538
9539 if (remove_em) {
9540 struct extent_map_tree *em_tree;
9541
c8bf1b67 9542 em_tree = &fs_info->mapping_tree;
8eaf40c0
FM
9543 write_lock(&em_tree->lock);
9544 remove_extent_mapping(em_tree, em);
9545 write_unlock(&em_tree->lock);
9546 /* once for the tree */
9547 free_extent_map(em);
9548 }
1a40e23b 9549out:
ba2c4d4e
JB
9550 if (remove_rsv)
9551 btrfs_delayed_refs_rsv_release(fs_info, 1);
1a40e23b
ZY
9552 btrfs_free_path(path);
9553 return ret;
9554}
acce952b 9555
8eab77ff 9556struct btrfs_trans_handle *
7fd01182
FM
9557btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
9558 const u64 chunk_offset)
8eab77ff 9559{
c8bf1b67 9560 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7fd01182
FM
9561 struct extent_map *em;
9562 struct map_lookup *map;
9563 unsigned int num_items;
9564
9565 read_lock(&em_tree->lock);
9566 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
9567 read_unlock(&em_tree->lock);
9568 ASSERT(em && em->start == chunk_offset);
9569
8eab77ff 9570 /*
7fd01182
FM
9571 * We need to reserve 3 + N units from the metadata space info in order
9572 * to remove a block group (done at btrfs_remove_chunk() and at
9573 * btrfs_remove_block_group()), which are used for:
9574 *
8eab77ff
FM
9575 * 1 unit for adding the free space inode's orphan (located in the tree
9576 * of tree roots).
7fd01182
FM
9577 * 1 unit for deleting the block group item (located in the extent
9578 * tree).
9579 * 1 unit for deleting the free space item (located in tree of tree
9580 * roots).
9581 * N units for deleting N device extent items corresponding to each
9582 * stripe (located in the device tree).
9583 *
9584 * In order to remove a block group we also need to reserve units in the
9585 * system space info in order to update the chunk tree (update one or
9586 * more device items and remove one chunk item), but this is done at
9587 * btrfs_remove_chunk() through a call to check_system_chunk().
8eab77ff 9588 */
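	/*
	 * Quick check with assumed numbers: a chunk striped over two devices
	 * (e.g. RAID1) therefore reserves num_items = 3 + 2 = 5 metadata
	 * units below.
	 */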
95617d69 9589 map = em->map_lookup;
7fd01182
FM
9590 num_items = 3 + map->num_stripes;
9591 free_extent_map(em);
9592
8eab77ff 9593 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
7fd01182 9594 num_items, 1);
8eab77ff
FM
9595}
9596
47ab2a6c
JB
9597/*
9598 * Process the unused_bgs list and remove any that don't have any allocated
9599 * space inside of them.
9600 */
9601void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
9602{
9603 struct btrfs_block_group_cache *block_group;
9604 struct btrfs_space_info *space_info;
47ab2a6c
JB
9605 struct btrfs_trans_handle *trans;
9606 int ret = 0;
9607
afcdd129 9608 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
47ab2a6c
JB
9609 return;
9610
9611 spin_lock(&fs_info->unused_bgs_lock);
9612 while (!list_empty(&fs_info->unused_bgs)) {
9613 u64 start, end;
e33e17ee 9614 int trimming;
47ab2a6c
JB
9615
9616 block_group = list_first_entry(&fs_info->unused_bgs,
9617 struct btrfs_block_group_cache,
9618 bg_list);
47ab2a6c 9619 list_del_init(&block_group->bg_list);
aefbe9a6
ZL
9620
9621 space_info = block_group->space_info;
9622
47ab2a6c
JB
9623 if (ret || btrfs_mixed_space_info(space_info)) {
9624 btrfs_put_block_group(block_group);
9625 continue;
9626 }
9627 spin_unlock(&fs_info->unused_bgs_lock);
9628
d5f2e33b 9629 mutex_lock(&fs_info->delete_unused_bgs_mutex);
67c5e7d4 9630
47ab2a6c
JB
9631 /* Don't want to race with allocators so take the groups_sem */
9632 down_write(&space_info->groups_sem);
9633 spin_lock(&block_group->lock);
43794446 9634 if (block_group->reserved || block_group->pinned ||
47ab2a6c 9635 btrfs_block_group_used(&block_group->item) ||
19c4d2f9 9636 block_group->ro ||
aefbe9a6 9637 list_is_singular(&block_group->list)) {
47ab2a6c
JB
9638 /*
9639 * We want to bail if we made new allocations or have
9640 * outstanding allocations in this block group. We do
9641 * the ro check in case balance is currently acting on
9642 * this block group.
9643 */
4ed0a7a3 9644 trace_btrfs_skip_unused_block_group(block_group);
9645 spin_unlock(&block_group->lock);
9646 up_write(&space_info->groups_sem);
9647 goto next;
9648 }
9649 spin_unlock(&block_group->lock);
9650
9651 /* We don't want to force the issue, only flip if it's ok. */
868f401a 9652 ret = inc_block_group_ro(block_group, 0);
9653 up_write(&space_info->groups_sem);
9654 if (ret < 0) {
9655 ret = 0;
9656 goto next;
9657 }
9658
9659 /*
 9660 * We want to do this before we do anything else so we can recover
 9661 * properly if we fail to join the transaction.
9662 */
9663 trans = btrfs_start_trans_remove_block_group(fs_info,
9664 block_group->key.objectid);
47ab2a6c 9665 if (IS_ERR(trans)) {
2ff7e61e 9666 btrfs_dec_block_group_ro(block_group);
9667 ret = PTR_ERR(trans);
9668 goto next;
9669 }
9670
9671 /*
9672 * We could have pending pinned extents for this block group,
9673 * just delete them, we don't care about them anymore.
9674 */
9675 start = block_group->key.objectid;
9676 end = start + block_group->key.offset - 1;
9677 /*
9678 * Hold the unused_bg_unpin_mutex lock to avoid racing with
9679 * btrfs_finish_extent_commit(). If we are at transaction N,
9680 * another task might be running finish_extent_commit() for the
9681 * previous transaction N - 1, and have seen a range belonging
9682 * to the block group in freed_extents[] before we were able to
9683 * clear the whole block group range from freed_extents[]. This
9684 * means that task can lookup for the block group after we
9685 * unpinned it from freed_extents[] and removed it, leading to
9686 * a BUG_ON() at btrfs_unpin_extent_range().
9687 */
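		/*
		 * Hence both freed_extents trees are cleared below while
		 * holding unused_bg_unpin_mutex, before btrfs_remove_chunk()
		 * is called further down.
		 */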
9688 mutex_lock(&fs_info->unused_bg_unpin_mutex);
758eb51e 9689 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
91166212 9690 EXTENT_DIRTY);
758eb51e 9691 if (ret) {
d4b450cd 9692 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2ff7e61e 9693 btrfs_dec_block_group_ro(block_group);
9694 goto end_trans;
9695 }
9696 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
91166212 9697 EXTENT_DIRTY);
758eb51e 9698 if (ret) {
d4b450cd 9699 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2ff7e61e 9700 btrfs_dec_block_group_ro(block_group);
9701 goto end_trans;
9702 }
d4b450cd 9703 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
9704
9705 /* Reset pinned so btrfs_put_block_group doesn't complain */
9706 spin_lock(&space_info->lock);
9707 spin_lock(&block_group->lock);
9708
9709 btrfs_space_info_update_bytes_pinned(fs_info, space_info,
9710 -block_group->pinned);
c30666d4 9711 space_info->bytes_readonly += block_group->pinned;
9712 percpu_counter_add_batch(&space_info->total_bytes_pinned,
9713 -block_group->pinned,
9714 BTRFS_TOTAL_BYTES_PINNED_BATCH);
9715 block_group->pinned = 0;
9716
9717 spin_unlock(&block_group->lock);
9718 spin_unlock(&space_info->lock);
9719
e33e17ee 9720 /* DISCARD can flip during remount */
0b246afa 9721 trimming = btrfs_test_opt(fs_info, DISCARD);
9722
9723 /* Implicit trim during transaction commit. */
9724 if (trimming)
9725 btrfs_get_block_group_trimming(block_group);
9726
9727 /*
 9728 * btrfs_remove_chunk() will abort the transaction if things go
 9729 * horribly wrong.
9730 */
97aff912 9731 ret = btrfs_remove_chunk(trans, block_group->key.objectid);
9732
9733 if (ret) {
9734 if (trimming)
9735 btrfs_put_block_group_trimming(block_group);
9736 goto end_trans;
9737 }
9738
9739 /*
9740 * If we're not mounted with -odiscard, we can just forget
9741 * about this block group. Otherwise we'll need to wait
9742 * until transaction commit to do the actual discard.
9743 */
9744 if (trimming) {
9745 spin_lock(&fs_info->unused_bgs_lock);
9746 /*
9747 * A concurrent scrub might have added us to the list
9748 * fs_info->unused_bgs, so use a list_move operation
9749 * to add the block group to the deleted_bgs list.
9750 */
9751 list_move(&block_group->bg_list,
9752 &trans->transaction->deleted_bgs);
348a0013 9753 spin_unlock(&fs_info->unused_bgs_lock);
9754 btrfs_get_block_group(block_group);
9755 }
758eb51e 9756end_trans:
3a45bb20 9757 btrfs_end_transaction(trans);
47ab2a6c 9758next:
d5f2e33b 9759 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
9760 btrfs_put_block_group(block_group);
9761 spin_lock(&fs_info->unused_bgs_lock);
9762 }
9763 spin_unlock(&fs_info->unused_bgs_lock);
9764}
9765
9766int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
9767 u64 start, u64 end)
acce952b 9768{
2ff7e61e 9769 return unpin_extent_range(fs_info, start, end, false);
acce952b 9770}
9771
9772/*
9773 * It used to be that old block groups would be left around forever.
9774 * Iterating over them would be enough to trim unused space. Since we
9775 * now automatically remove them, we also need to iterate over unallocated
9776 * space.
9777 *
9778 * We don't want a transaction for this since the discard may take a
9779 * substantial amount of time. We don't require that a transaction be
9780 * running, but we do need to take a running transaction into account
9781 * to ensure that we're not discarding chunks that were released or
9782 * allocated in the current transaction.
9783 *
9784 * Holding the chunks lock will prevent other threads from allocating
9785 * or releasing chunks, but it won't prevent a running transaction
9786 * from committing and releasing the memory that the pending chunks
9787 * list head uses. For that, we need to take a reference to the
9788 * transaction and hold the commit root sem. We only need to hold
9789 * it while performing the free space search since we have already
9790 * held back allocations.
499f377f 9791 */
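/*
 * In outline, each iteration of the loop below takes chunk_mutex, uses
 * find_first_clear_extent_bit() to locate the next range on the device that
 * is neither allocated nor already trimmed, issues a discard for it and then
 * marks it CHUNK_TRIMMED so repeated trim runs do not discard it again.
 */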
8103d10b 9792static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
499f377f 9793{
8103d10b 9794 u64 start = SZ_1M, len = 0, end = 0;
9795 int ret;
9796
9797 *trimmed = 0;
9798
9799 /* Discard not supported = nothing to do. */
9800 if (!blk_queue_discard(bdev_get_queue(device->bdev)))
9801 return 0;
9802
52042d8e 9803 /* Not writable = nothing to do. */
ebbede42 9804 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
9805 return 0;
9806
9807 /* No free space = nothing to do. */
9808 if (device->total_bytes <= device->bytes_used)
9809 return 0;
9810
9811 ret = 0;
9812
9813 while (1) {
fb456252 9814 struct btrfs_fs_info *fs_info = device->fs_info;
9815 u64 bytes;
9816
9817 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
9818 if (ret)
fee7acc3 9819 break;
499f377f 9820
9821 find_first_clear_extent_bit(&device->alloc_state, start,
9822 &start, &end,
9823 CHUNK_TRIMMED | CHUNK_ALLOCATED);
9824
9825 /* Ensure we skip the reserved area in the first 1M */
9826 start = max_t(u64, start, SZ_1M);
9827
9828 /*
 9829 * If find_first_clear_extent_bit() finds a range that spans the
 9830 * end of the device, it will set end to -1; in that case it is up
 9831 * to the caller to trim the value to the size of the device.
9832 */
9833 end = min(end, device->total_bytes - 1);
53460a45 9834
929be17a 9835 len = end - start + 1;
499f377f 9836
9837 /* We didn't find any extents */
9838 if (!len) {
499f377f 9839 mutex_unlock(&fs_info->chunk_mutex);
929be17a 9840 ret = 0;
9841 break;
9842 }
9843
9844 ret = btrfs_issue_discard(device->bdev, start, len,
9845 &bytes);
9846 if (!ret)
9847 set_extent_bits(&device->alloc_state, start,
9848 start + bytes - 1,
9849 CHUNK_TRIMMED);
9850 mutex_unlock(&fs_info->chunk_mutex);
9851
9852 if (ret)
9853 break;
9854
9855 start += len;
9856 *trimmed += bytes;
9857
9858 if (fatal_signal_pending(current)) {
9859 ret = -ERESTARTSYS;
9860 break;
9861 }
9862
9863 cond_resched();
9864 }
9865
9866 return ret;
9867}
9868
9869/*
9870 * Trim the whole filesystem by:
9871 * 1) trimming the free space in each block group
9872 * 2) trimming the unallocated space on each device
9873 *
9874 * This will also continue trimming even if a block group or device encounters
9875 * an error. The return value will be the last error, or 0 if nothing bad
9876 * happens.
9877 */
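/*
 * A minimal caller sketch (assumed, not taken from this file) would fill an
 * fstrim_range and pass it in, roughly:
 *
 *	struct fstrim_range range = {
 *		.start = 0,
 *		.len = U64_MAX,
 *		.minlen = SZ_1M,
 *	};
 *	ret = btrfs_trim_fs(fs_info, &range);
 *	// on return, range.len holds the total number of bytes trimmed
 */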
2ff7e61e 9878int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
f7039b1d 9879{
f7039b1d 9880 struct btrfs_block_group_cache *cache = NULL;
9881 struct btrfs_device *device;
9882 struct list_head *devices;
9883 u64 group_trimmed;
9884 u64 start;
9885 u64 end;
9886 u64 trimmed = 0;
9887 u64 bg_failed = 0;
9888 u64 dev_failed = 0;
9889 int bg_ret = 0;
9890 int dev_ret = 0;
9891 int ret = 0;
9892
6ba9fc8e 9893 cache = btrfs_lookup_first_block_group(fs_info, range->start);
f87b7eb8 9894 for (; cache; cache = next_block_group(cache)) {
9895 if (cache->key.objectid >= (range->start + range->len)) {
9896 btrfs_put_block_group(cache);
9897 break;
9898 }
9899
9900 start = max(range->start, cache->key.objectid);
9901 end = min(range->start + range->len,
9902 cache->key.objectid + cache->key.offset);
9903
9904 if (end - start >= range->minlen) {
9905 if (!block_group_cache_done(cache)) {
f6373bf3 9906 ret = cache_block_group(cache, 0);
1be41b78 9907 if (ret) {
9908 bg_failed++;
9909 bg_ret = ret;
9910 continue;
9911 }
9912 ret = wait_block_group_cache_done(cache);
9913 if (ret) {
9914 bg_failed++;
9915 bg_ret = ret;
9916 continue;
1be41b78 9917 }
9918 }
9919 ret = btrfs_trim_block_group(cache,
9920 &group_trimmed,
9921 start,
9922 end,
9923 range->minlen);
9924
9925 trimmed += group_trimmed;
9926 if (ret) {
9927 bg_failed++;
9928 bg_ret = ret;
9929 continue;
9930 }
9931 }
9932 }
9933
9934 if (bg_failed)
9935 btrfs_warn(fs_info,
9936 "failed to trim %llu block group(s), last error %d",
9937 bg_failed, bg_ret);
0b246afa 9938 mutex_lock(&fs_info->fs_devices->device_list_mutex);
9939 devices = &fs_info->fs_devices->devices;
9940 list_for_each_entry(device, devices, dev_list) {
8103d10b 9941 ret = btrfs_trim_free_extents(device, &group_trimmed);
9942 if (ret) {
9943 dev_failed++;
9944 dev_ret = ret;
499f377f 9945 break;
93bba24d 9946 }
9947
9948 trimmed += group_trimmed;
9949 }
0b246afa 9950 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
499f377f 9951
9952 if (dev_failed)
9953 btrfs_warn(fs_info,
9954 "failed to trim %llu device(s), last error %d",
9955 dev_failed, dev_ret);
f7039b1d 9956 range->len = trimmed;
9957 if (bg_ret)
9958 return bg_ret;
9959 return dev_ret;
f7039b1d 9960}
9961
 9962/*
 9963 * btrfs_{start,end}_write_no_snapshotting() are similar to
 9964 * mnt_{want,drop}_write(). They are used to prevent some tasks from writing
 9965 * data into the page cache through nocow before the subvolume is snapshotted
 9966 * (such data is only flushed to disk after the snapshot is created), and to
 9967 * prevent operations during snapshotting that would make the snapshot
 9968 * inconsistent (writes followed by expanding truncates, for example).
 9969 */
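/*
 * A minimal usage sketch (assumed, not from a specific caller): writers pair
 * the two helpers around their nocow work, for example:
 *
 *	if (btrfs_start_write_no_snapshotting(root)) {
 *		// safe to write through nocow here
 *		btrfs_end_write_no_snapshotting(root);
 *	} else {
 *		// a snapshot is pending or in progress; fall back to cow
 *	}
 */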
ea14b57f 9970void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
9971{
9972 percpu_counter_dec(&root->subv_writers->counter);
093258e6 9973 cond_wake_up(&root->subv_writers->wait);
9974}
9975
ea14b57f 9976int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
8257b2dc 9977{
ea14b57f 9978 if (atomic_read(&root->will_be_snapshotted))
9979 return 0;
9980
9981 percpu_counter_inc(&root->subv_writers->counter);
9982 /*
9983 * Make sure counter is updated before we check for snapshot creation.
9984 */
9985 smp_mb();
9986 if (atomic_read(&root->will_be_snapshotted)) {
9987 btrfs_end_write_no_snapshotting(root);
9988 return 0;
9989 }
9990 return 1;
9991}
0bc19f90 9992
9993void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
9994{
9995 while (true) {
9996 int ret;
9997
ea14b57f 9998 ret = btrfs_start_write_no_snapshotting(root);
9999 if (ret)
10000 break;
10001 wait_var_event(&root->will_be_snapshotted,
10002 !atomic_read(&root->will_be_snapshotted));
10003 }
10004}
10005
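/*
 * Queue a block group on fs_info->unused_bgs, unless it is already queued,
 * so that btrfs_delete_unused_bgs() can later try to remove it. The extra
 * reference taken here is dropped when that worker processes or skips the
 * group.
 */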
10006void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
10007{
10008 struct btrfs_fs_info *fs_info = bg->fs_info;
10009
10010 spin_lock(&fs_info->unused_bgs_lock);
10011 if (list_empty(&bg->bg_list)) {
10012 btrfs_get_block_group(bg);
10013 trace_btrfs_add_unused_block_group(bg);
10014 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
10015 }
10016 spin_unlock(&fs_info->unused_bgs_lock);
10017}