/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include "hash.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * control flags for do_chunk_alloc's force field
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try and allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one
 *
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};

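/*
 * Illustrative sketch (not part of the original file): how a caller might
 * pick one of the force levels above when asking do_chunk_alloc() for a new
 * chunk.  The helper below is hypothetical; only the CHUNK_ALLOC_* values
 * and the do_chunk_alloc() prototype further down are taken from this file.
 *
 *	static int example_alloc_data_chunk(struct btrfs_trans_handle *trans,
 *					    struct btrfs_fs_info *fs_info,
 *					    bool clustering, bool must_have)
 *	{
 *		int force = CHUNK_ALLOC_NO_FORCE;	// default: only if really needed
 *
 *		if (clustering)
 *			force = CHUNK_ALLOC_LIMITED;	// keep a small pool for clustering
 *		if (must_have)
 *			force = CHUNK_ALLOC_FORCE;	// caller insists on a new chunk
 *
 *		return do_chunk_alloc(trans, fs_info, BTRFS_BLOCK_GROUP_DATA, force);
 *	}
 */
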
static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info, u64 bytenr,
			      u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *fs_info,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_fs_info *fs_info,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_fs_info *fs_info,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
				    u64 ram_bytes, u64 num_bytes, int delalloc);
static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
				     u64 num_bytes, int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes);
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush,
				    bool system_chunk);
static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes);
static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED ||
		cache->cached == BTRFS_CACHE_ERROR;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count)) {
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);

		/*
		 * If not empty, someone is still holding mutex of
		 * full_stripe_lock, which can only be released by caller.
		 * And it will definitely cause use-after-free when caller
		 * tries to release full stripe lock.
		 *
		 * No better way to resolve, but only to warn.
		 */
		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
		kfree(cache->free_space_ctl);
		kfree(cache);
	}
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);

	if (info->first_logical_byte > block_group->key.objectid)
		info->first_logical_byte = block_group->key.objectid;

	spin_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
			      int contains)
{
	struct btrfs_block_group_cache *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group_cache,
				 cache_node);
		end = cache->key.objectid + cache->key.offset - 1;
		start = cache->key.objectid;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->key.objectid))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret) {
		btrfs_get_block_group(ret);
		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
			info->first_logical_byte = ret->key.objectid;
	}
	spin_unlock(&info->block_group_cache_lock);

	return ret;
}

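/*
 * Illustrative sketch (not part of the original file): the two lookup
 * helpers defined later in this file simply wrap the search above with
 * contains = 0 or contains = 1.  For a block group covering, say,
 * [1M, 1M + 256M):
 *
 *	cache = btrfs_lookup_block_group(fs_info, SZ_1M + SZ_8M);
 *		// contains = 1: returns the group containing that byte
 *	cache = btrfs_lookup_first_block_group(fs_info, 0);
 *		// contains = 0: returns the first group at or after byte 0
 *
 * Both take a reference via btrfs_get_block_group(), so the caller must
 * drop it with btrfs_put_block_group() when done.
 */
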
static int add_excluded_extent(struct btrfs_fs_info *fs_info,
			       u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE);
	set_extent_bits(&fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE);
	return 0;
}

static void free_excluded_extents(struct btrfs_fs_info *fs_info,
				  struct btrfs_block_group_cache *cache)
{
	u64 start, end;

	start = cache->key.objectid;
	end = start + cache->key.offset - 1;

	clear_extent_bits(&fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE);
	clear_extent_bits(&fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE);
}

static int exclude_super_stripes(struct btrfs_fs_info *fs_info,
				 struct btrfs_block_group_cache *cache)
{
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
		cache->bytes_super += stripe_len;
		ret = add_excluded_extent(fs_info, cache->key.objectid,
					  stripe_len);
		if (ret)
			return ret;
	}

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(fs_info, cache->key.objectid,
				       bytenr, 0, &logical, &nr, &stripe_len);
		if (ret)
			return ret;

		while (nr--) {
			u64 start, len;

			if (logical[nr] > cache->key.objectid +
			    cache->key.offset)
				continue;

			if (logical[nr] + stripe_len <= cache->key.objectid)
				continue;

			start = logical[nr];
			if (start < cache->key.objectid) {
				start = cache->key.objectid;
				len = (logical[nr] + stripe_len) - start;
			} else {
				len = min_t(u64, stripe_len,
					    cache->key.objectid +
					    cache->key.offset - start);
			}

			cache->bytes_super += len;
			ret = add_excluded_extent(fs_info, start, len);
			if (ret) {
				kfree(logical);
				return ret;
			}
		}

		kfree(logical);
	}
	return 0;
}

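/*
 * Illustrative sketch (not part of the original file): a worked example of
 * the exclusion above.  The primary superblock lives at
 * BTRFS_SUPER_INFO_OFFSET (64K), so a block group that starts below that
 * offset first excludes everything from its start up to 64K:
 *
 *	cache->key.objectid = 0;			// block group at [0, 1G)
 *	stripe_len = BTRFS_SUPER_INFO_OFFSET - 0;	// 64K
 *	cache->bytes_super += stripe_len;
 *	add_excluded_extent(fs_info, 0, stripe_len);	// never allocate [0, 64K)
 *
 * The loop then repeats this for every superblock mirror location returned
 * by btrfs_rmap_block(), clamping each excluded stripe to the block group's
 * own [objectid, objectid + offset) range.
 */
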
static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	refcount_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
	if (refcount_dec_and_test(&ctl->count))
		kfree(ctl);
}

#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group_cache *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 start = block_group->key.objectid;
	u64 len = block_group->key.offset;
	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
		fs_info->nodesize : fs_info->sectorsize;
	u64 step = chunk << 1;

	while (len > chunk) {
		btrfs_remove_free_space(block_group, start, chunk);
		start += step;
		if (len < step)
			len = 0;
		else
			len -= step;
	}
}
#endif

/*
 * this is only called by cache_block_group, since we could have freed extents
 * we need to check the pinned_extents for any extents that can't be used yet
 * since their free space will be released as soon as the transaction commits.
 */
u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
		       struct btrfs_fs_info *info, u64 start, u64 end)
{
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(info->pinned_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE,
					    NULL);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space(block_group, start,
						   size);
			BUG_ON(ret); /* -ENOMEM or logic error */
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space(block_group, start, size);
		BUG_ON(ret); /* -ENOMEM or logic error */
	}

	return total_added;
}

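/*
 * Illustrative sketch (not part of the original file): a worked example of
 * the carving done by add_new_free_space() above.  Suppose the block group
 * spans [0, 100) and the pinned_extents tree holds a pinned range [40, 59].
 * The loop then reports only the unpinned pieces as free space:
 *
 *	add_new_free_space(block_group, info, 0, 100);
 *		// first iteration:  btrfs_add_free_space(bg,  0, 40)
 *		// start moves to 60 (extent_end + 1)
 *		// after the loop:   btrfs_add_free_space(bg, 60, 40)
 *
 * The pinned bytes become free only once the transaction that pinned them
 * commits, which is why they are skipped here.
 */
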
73fa48b6 405static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
e37c9e69 406{
0b246afa
JM
407 struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
408 struct btrfs_fs_info *fs_info = block_group->fs_info;
409 struct btrfs_root *extent_root = fs_info->extent_root;
e37c9e69 410 struct btrfs_path *path;
5f39d397 411 struct extent_buffer *leaf;
11833d66 412 struct btrfs_key key;
817d52f8 413 u64 total_found = 0;
11833d66
YZ
414 u64 last = 0;
415 u32 nritems;
73fa48b6 416 int ret;
d0bd4560 417 bool wakeup = true;
f510cfec 418
e37c9e69
CM
419 path = btrfs_alloc_path();
420 if (!path)
73fa48b6 421 return -ENOMEM;
7d7d6068 422
817d52f8 423 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
11833d66 424
d0bd4560
JB
425#ifdef CONFIG_BTRFS_DEBUG
426 /*
427 * If we're fragmenting we don't want to make anybody think we can
428 * allocate from this block group until we've had a chance to fragment
429 * the free space.
430 */
2ff7e61e 431 if (btrfs_should_fragment_free_space(block_group))
d0bd4560
JB
432 wakeup = false;
433#endif
5cd57b2c 434 /*
817d52f8
JB
435 * We don't want to deadlock with somebody trying to allocate a new
436 * extent for the extent root while also trying to search the extent
437 * root to add free space. So we skip locking and search the commit
438 * root, since its read-only
5cd57b2c
CM
439 */
440 path->skip_locking = 1;
817d52f8 441 path->search_commit_root = 1;
e4058b54 442 path->reada = READA_FORWARD;
817d52f8 443
e4404d6e 444 key.objectid = last;
e37c9e69 445 key.offset = 0;
11833d66 446 key.type = BTRFS_EXTENT_ITEM_KEY;
013f1b12 447
52ee28d2 448next:
11833d66 449 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
e37c9e69 450 if (ret < 0)
73fa48b6 451 goto out;
a512bbf8 452
11833d66
YZ
453 leaf = path->nodes[0];
454 nritems = btrfs_header_nritems(leaf);
455
d397712b 456 while (1) {
7841cb28 457 if (btrfs_fs_closing(fs_info) > 1) {
f25784b3 458 last = (u64)-1;
817d52f8 459 break;
f25784b3 460 }
817d52f8 461
11833d66
YZ
462 if (path->slots[0] < nritems) {
463 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
464 } else {
465 ret = find_next_key(path, 0, &key);
466 if (ret)
e37c9e69 467 break;
817d52f8 468
c9ea7b24 469 if (need_resched() ||
9e351cc8 470 rwsem_is_contended(&fs_info->commit_root_sem)) {
d0bd4560
JB
471 if (wakeup)
472 caching_ctl->progress = last;
ff5714cc 473 btrfs_release_path(path);
9e351cc8 474 up_read(&fs_info->commit_root_sem);
589d8ade 475 mutex_unlock(&caching_ctl->mutex);
11833d66 476 cond_resched();
73fa48b6
OS
477 mutex_lock(&caching_ctl->mutex);
478 down_read(&fs_info->commit_root_sem);
479 goto next;
589d8ade 480 }
0a3896d0
JB
481
482 ret = btrfs_next_leaf(extent_root, path);
483 if (ret < 0)
73fa48b6 484 goto out;
0a3896d0
JB
485 if (ret)
486 break;
589d8ade
JB
487 leaf = path->nodes[0];
488 nritems = btrfs_header_nritems(leaf);
489 continue;
11833d66 490 }
817d52f8 491
52ee28d2
LB
492 if (key.objectid < last) {
493 key.objectid = last;
494 key.offset = 0;
495 key.type = BTRFS_EXTENT_ITEM_KEY;
496
d0bd4560
JB
497 if (wakeup)
498 caching_ctl->progress = last;
52ee28d2
LB
499 btrfs_release_path(path);
500 goto next;
501 }
502
11833d66
YZ
503 if (key.objectid < block_group->key.objectid) {
504 path->slots[0]++;
817d52f8 505 continue;
e37c9e69 506 }
0f9dd46c 507
e37c9e69 508 if (key.objectid >= block_group->key.objectid +
0f9dd46c 509 block_group->key.offset)
e37c9e69 510 break;
7d7d6068 511
3173a18f
JB
512 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
513 key.type == BTRFS_METADATA_ITEM_KEY) {
817d52f8
JB
514 total_found += add_new_free_space(block_group,
515 fs_info, last,
516 key.objectid);
3173a18f
JB
517 if (key.type == BTRFS_METADATA_ITEM_KEY)
518 last = key.objectid +
da17066c 519 fs_info->nodesize;
3173a18f
JB
520 else
521 last = key.objectid + key.offset;
817d52f8 522
73fa48b6 523 if (total_found > CACHING_CTL_WAKE_UP) {
11833d66 524 total_found = 0;
d0bd4560
JB
525 if (wakeup)
526 wake_up(&caching_ctl->wait);
11833d66 527 }
817d52f8 528 }
e37c9e69
CM
529 path->slots[0]++;
530 }
817d52f8 531 ret = 0;
e37c9e69 532
817d52f8
JB
533 total_found += add_new_free_space(block_group, fs_info, last,
534 block_group->key.objectid +
535 block_group->key.offset);
11833d66 536 caching_ctl->progress = (u64)-1;
817d52f8 537
73fa48b6
OS
538out:
539 btrfs_free_path(path);
540 return ret;
541}
542
543static noinline void caching_thread(struct btrfs_work *work)
544{
545 struct btrfs_block_group_cache *block_group;
546 struct btrfs_fs_info *fs_info;
547 struct btrfs_caching_control *caching_ctl;
b4570aa9 548 struct btrfs_root *extent_root;
73fa48b6
OS
549 int ret;
550
551 caching_ctl = container_of(work, struct btrfs_caching_control, work);
552 block_group = caching_ctl->block_group;
553 fs_info = block_group->fs_info;
b4570aa9 554 extent_root = fs_info->extent_root;
73fa48b6
OS
555
556 mutex_lock(&caching_ctl->mutex);
557 down_read(&fs_info->commit_root_sem);
558
1e144fb8
OS
559 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
560 ret = load_free_space_tree(caching_ctl);
561 else
562 ret = load_extent_tree_free(caching_ctl);
73fa48b6 563
817d52f8 564 spin_lock(&block_group->lock);
11833d66 565 block_group->caching_ctl = NULL;
73fa48b6 566 block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
817d52f8 567 spin_unlock(&block_group->lock);
0f9dd46c 568
d0bd4560 569#ifdef CONFIG_BTRFS_DEBUG
2ff7e61e 570 if (btrfs_should_fragment_free_space(block_group)) {
d0bd4560
JB
571 u64 bytes_used;
572
573 spin_lock(&block_group->space_info->lock);
574 spin_lock(&block_group->lock);
575 bytes_used = block_group->key.offset -
576 btrfs_block_group_used(&block_group->item);
577 block_group->space_info->bytes_used += bytes_used >> 1;
578 spin_unlock(&block_group->lock);
579 spin_unlock(&block_group->space_info->lock);
2ff7e61e 580 fragment_free_space(block_group);
d0bd4560
JB
581 }
582#endif
583
584 caching_ctl->progress = (u64)-1;
11833d66 585
9e351cc8 586 up_read(&fs_info->commit_root_sem);
2ff7e61e 587 free_excluded_extents(fs_info, block_group);
11833d66 588 mutex_unlock(&caching_ctl->mutex);
73fa48b6 589
11833d66
YZ
590 wake_up(&caching_ctl->wait);
591
592 put_caching_control(caching_ctl);
11dfe35a 593 btrfs_put_block_group(block_group);
817d52f8
JB
594}
595
9d66e233 596static int cache_block_group(struct btrfs_block_group_cache *cache,
9d66e233 597 int load_cache_only)
817d52f8 598{
291c7d2f 599 DEFINE_WAIT(wait);
11833d66
YZ
600 struct btrfs_fs_info *fs_info = cache->fs_info;
601 struct btrfs_caching_control *caching_ctl;
817d52f8
JB
602 int ret = 0;
603
291c7d2f 604 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
79787eaa
JM
605 if (!caching_ctl)
606 return -ENOMEM;
291c7d2f
JB
607
608 INIT_LIST_HEAD(&caching_ctl->list);
609 mutex_init(&caching_ctl->mutex);
610 init_waitqueue_head(&caching_ctl->wait);
611 caching_ctl->block_group = cache;
612 caching_ctl->progress = cache->key.objectid;
1e4f4714 613 refcount_set(&caching_ctl->count, 1);
9e0af237
LB
614 btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
615 caching_thread, NULL, NULL);
291c7d2f
JB
616
617 spin_lock(&cache->lock);
618 /*
619 * This should be a rare occasion, but this could happen I think in the
620 * case where one thread starts to load the space cache info, and then
621 * some other thread starts a transaction commit which tries to do an
622 * allocation while the other thread is still loading the space cache
623 * info. The previous loop should have kept us from choosing this block
624 * group, but if we've moved to the state where we will wait on caching
625 * block groups we need to first check if we're doing a fast load here,
626 * so we can wait for it to finish, otherwise we could end up allocating
627 * from a block group who's cache gets evicted for one reason or
628 * another.
629 */
630 while (cache->cached == BTRFS_CACHE_FAST) {
631 struct btrfs_caching_control *ctl;
632
633 ctl = cache->caching_ctl;
1e4f4714 634 refcount_inc(&ctl->count);
291c7d2f
JB
635 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
636 spin_unlock(&cache->lock);
637
638 schedule();
639
640 finish_wait(&ctl->wait, &wait);
641 put_caching_control(ctl);
642 spin_lock(&cache->lock);
643 }
644
645 if (cache->cached != BTRFS_CACHE_NO) {
646 spin_unlock(&cache->lock);
647 kfree(caching_ctl);
11833d66 648 return 0;
291c7d2f
JB
649 }
650 WARN_ON(cache->caching_ctl);
651 cache->caching_ctl = caching_ctl;
652 cache->cached = BTRFS_CACHE_FAST;
653 spin_unlock(&cache->lock);
11833d66 654
d53ba474 655 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
cb83b7b8 656 mutex_lock(&caching_ctl->mutex);
9d66e233
JB
657 ret = load_free_space_cache(fs_info, cache);
658
659 spin_lock(&cache->lock);
660 if (ret == 1) {
291c7d2f 661 cache->caching_ctl = NULL;
9d66e233
JB
662 cache->cached = BTRFS_CACHE_FINISHED;
663 cache->last_byte_to_unpin = (u64)-1;
cb83b7b8 664 caching_ctl->progress = (u64)-1;
9d66e233 665 } else {
291c7d2f
JB
666 if (load_cache_only) {
667 cache->caching_ctl = NULL;
668 cache->cached = BTRFS_CACHE_NO;
669 } else {
670 cache->cached = BTRFS_CACHE_STARTED;
4f69cb98 671 cache->has_caching_ctl = 1;
291c7d2f 672 }
9d66e233
JB
673 }
674 spin_unlock(&cache->lock);
d0bd4560
JB
675#ifdef CONFIG_BTRFS_DEBUG
676 if (ret == 1 &&
2ff7e61e 677 btrfs_should_fragment_free_space(cache)) {
d0bd4560
JB
678 u64 bytes_used;
679
680 spin_lock(&cache->space_info->lock);
681 spin_lock(&cache->lock);
682 bytes_used = cache->key.offset -
683 btrfs_block_group_used(&cache->item);
684 cache->space_info->bytes_used += bytes_used >> 1;
685 spin_unlock(&cache->lock);
686 spin_unlock(&cache->space_info->lock);
2ff7e61e 687 fragment_free_space(cache);
d0bd4560
JB
688 }
689#endif
cb83b7b8
JB
690 mutex_unlock(&caching_ctl->mutex);
691
291c7d2f 692 wake_up(&caching_ctl->wait);
3c14874a 693 if (ret == 1) {
291c7d2f 694 put_caching_control(caching_ctl);
2ff7e61e 695 free_excluded_extents(fs_info, cache);
9d66e233 696 return 0;
3c14874a 697 }
291c7d2f
JB
698 } else {
699 /*
1e144fb8
OS
700 * We're either using the free space tree or no caching at all.
701 * Set cached to the appropriate value and wakeup any waiters.
291c7d2f
JB
702 */
703 spin_lock(&cache->lock);
704 if (load_cache_only) {
705 cache->caching_ctl = NULL;
706 cache->cached = BTRFS_CACHE_NO;
707 } else {
708 cache->cached = BTRFS_CACHE_STARTED;
4f69cb98 709 cache->has_caching_ctl = 1;
291c7d2f
JB
710 }
711 spin_unlock(&cache->lock);
712 wake_up(&caching_ctl->wait);
9d66e233
JB
713 }
714
291c7d2f
JB
715 if (load_cache_only) {
716 put_caching_control(caching_ctl);
11833d66 717 return 0;
817d52f8 718 }
817d52f8 719
9e351cc8 720 down_write(&fs_info->commit_root_sem);
1e4f4714 721 refcount_inc(&caching_ctl->count);
11833d66 722 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
9e351cc8 723 up_write(&fs_info->commit_root_sem);
11833d66 724
11dfe35a 725 btrfs_get_block_group(cache);
11833d66 726
e66f0bb1 727 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
817d52f8 728
ef8bbdfe 729 return ret;
e37c9e69
CM
730}
731
0f9dd46c
JB
732/*
733 * return the block group that starts at or after bytenr
734 */
d397712b
CM
735static struct btrfs_block_group_cache *
736btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
0ef3e66b 737{
e2c89907 738 return block_group_cache_tree_search(info, bytenr, 0);
0ef3e66b
CM
739}
740
0f9dd46c 741/*
9f55684c 742 * return the block group that contains the given bytenr
0f9dd46c 743 */
d397712b
CM
744struct btrfs_block_group_cache *btrfs_lookup_block_group(
745 struct btrfs_fs_info *info,
746 u64 bytenr)
be744175 747{
e2c89907 748 return block_group_cache_tree_search(info, bytenr, 1);
be744175 749}
0b86a832 750
0f9dd46c
JB
751static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
752 u64 flags)
6324fbf3 753{
0f9dd46c 754 struct list_head *head = &info->space_info;
0f9dd46c 755 struct btrfs_space_info *found;
4184ea7f 756
52ba6929 757 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
b742bb82 758
4184ea7f
CM
759 rcu_read_lock();
760 list_for_each_entry_rcu(found, head, list) {
67377734 761 if (found->flags & flags) {
4184ea7f 762 rcu_read_unlock();
0f9dd46c 763 return found;
4184ea7f 764 }
0f9dd46c 765 }
4184ea7f 766 rcu_read_unlock();
0f9dd46c 767 return NULL;
6324fbf3
CM
768}
769
4184ea7f
CM
770/*
771 * after adding space to the filesystem, we need to clear the full flags
772 * on all the space infos.
773 */
774void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
775{
776 struct list_head *head = &info->space_info;
777 struct btrfs_space_info *found;
778
779 rcu_read_lock();
780 list_for_each_entry_rcu(found, head, list)
781 found->full = 0;
782 rcu_read_unlock();
783}
784
1a4ed8fd 785/* simple helper to search for an existing data extent at a given offset */
2ff7e61e 786int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
e02119d5
CM
787{
788 int ret;
789 struct btrfs_key key;
31840ae1 790 struct btrfs_path *path;
e02119d5 791
31840ae1 792 path = btrfs_alloc_path();
d8926bb3
MF
793 if (!path)
794 return -ENOMEM;
795
e02119d5
CM
796 key.objectid = start;
797 key.offset = len;
3173a18f 798 key.type = BTRFS_EXTENT_ITEM_KEY;
0b246afa 799 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
31840ae1 800 btrfs_free_path(path);
7bb86316
CM
801 return ret;
802}
803
/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags would be if all of
 * the delayed refs are not processed.
 */
813int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
2ff7e61e 814 struct btrfs_fs_info *fs_info, u64 bytenr,
3173a18f 815 u64 offset, int metadata, u64 *refs, u64 *flags)
a22285a6
YZ
816{
817 struct btrfs_delayed_ref_head *head;
818 struct btrfs_delayed_ref_root *delayed_refs;
819 struct btrfs_path *path;
820 struct btrfs_extent_item *ei;
821 struct extent_buffer *leaf;
822 struct btrfs_key key;
823 u32 item_size;
824 u64 num_refs;
825 u64 extent_flags;
826 int ret;
827
3173a18f
JB
828 /*
829 * If we don't have skinny metadata, don't bother doing anything
830 * different
831 */
0b246afa
JM
832 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
833 offset = fs_info->nodesize;
3173a18f
JB
834 metadata = 0;
835 }
836
a22285a6
YZ
837 path = btrfs_alloc_path();
838 if (!path)
839 return -ENOMEM;
840
a22285a6
YZ
841 if (!trans) {
842 path->skip_locking = 1;
843 path->search_commit_root = 1;
844 }
639eefc8
FDBM
845
846search_again:
847 key.objectid = bytenr;
848 key.offset = offset;
849 if (metadata)
850 key.type = BTRFS_METADATA_ITEM_KEY;
851 else
852 key.type = BTRFS_EXTENT_ITEM_KEY;
853
0b246afa 854 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
a22285a6
YZ
855 if (ret < 0)
856 goto out_free;
857
3173a18f 858 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
74be9510
FDBM
859 if (path->slots[0]) {
860 path->slots[0]--;
861 btrfs_item_key_to_cpu(path->nodes[0], &key,
862 path->slots[0]);
863 if (key.objectid == bytenr &&
864 key.type == BTRFS_EXTENT_ITEM_KEY &&
0b246afa 865 key.offset == fs_info->nodesize)
74be9510
FDBM
866 ret = 0;
867 }
3173a18f
JB
868 }
869
a22285a6
YZ
870 if (ret == 0) {
871 leaf = path->nodes[0];
872 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
873 if (item_size >= sizeof(*ei)) {
874 ei = btrfs_item_ptr(leaf, path->slots[0],
875 struct btrfs_extent_item);
876 num_refs = btrfs_extent_refs(leaf, ei);
877 extent_flags = btrfs_extent_flags(leaf, ei);
878 } else {
879#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
880 struct btrfs_extent_item_v0 *ei0;
881 BUG_ON(item_size != sizeof(*ei0));
882 ei0 = btrfs_item_ptr(leaf, path->slots[0],
883 struct btrfs_extent_item_v0);
884 num_refs = btrfs_extent_refs_v0(leaf, ei0);
885 /* FIXME: this isn't correct for data */
886 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
887#else
888 BUG();
889#endif
890 }
891 BUG_ON(num_refs == 0);
892 } else {
893 num_refs = 0;
894 extent_flags = 0;
895 ret = 0;
896 }
897
898 if (!trans)
899 goto out;
900
901 delayed_refs = &trans->transaction->delayed_refs;
902 spin_lock(&delayed_refs->lock);
f72ad18e 903 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
a22285a6
YZ
904 if (head) {
905 if (!mutex_trylock(&head->mutex)) {
6df8cdf5 906 refcount_inc(&head->node.refs);
a22285a6
YZ
907 spin_unlock(&delayed_refs->lock);
908
b3b4aa74 909 btrfs_release_path(path);
a22285a6 910
8cc33e5c
DS
911 /*
912 * Mutex was contended, block until it's released and try
913 * again
914 */
a22285a6
YZ
915 mutex_lock(&head->mutex);
916 mutex_unlock(&head->mutex);
917 btrfs_put_delayed_ref(&head->node);
639eefc8 918 goto search_again;
a22285a6 919 }
d7df2c79 920 spin_lock(&head->lock);
a22285a6
YZ
921 if (head->extent_op && head->extent_op->update_flags)
922 extent_flags |= head->extent_op->flags_to_set;
923 else
924 BUG_ON(num_refs == 0);
925
926 num_refs += head->node.ref_mod;
d7df2c79 927 spin_unlock(&head->lock);
a22285a6
YZ
928 mutex_unlock(&head->mutex);
929 }
930 spin_unlock(&delayed_refs->lock);
931out:
932 WARN_ON(num_refs == 0);
933 if (refs)
934 *refs = num_refs;
935 if (flags)
936 *flags = extent_flags;
937out_free:
938 btrfs_free_path(path);
939 return ret;
940}
941
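/*
 * Illustrative sketch (not part of the original file): a typical call to
 * btrfs_lookup_extent_info() above for a tree block.  With skinny metadata
 * the offset carries the block's level, and passing a non-NULL trans folds
 * the pending delayed-ref modifications into the result, so refs/flags show
 * what the extent tree will contain once the delayed refs are run.
 *
 *	u64 refs = 0;
 *	u64 flags = 0;
 *	int ret;
 *
 *	ret = btrfs_lookup_extent_info(trans, fs_info, eb->start,
 *				       btrfs_header_level(eb), 1,
 *				       &refs, &flags);
 *	if (!ret && (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
 *		;	// the block carries full back refs
 */
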
/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs is optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back refs is for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is recorded
 * in the back refs. Actually the full back refs is generic, and can be
 * used in all cases the implicit back refs is used. The major shortcoming
 * of the full back refs is its overhead. Every time a tree block gets
 * COWed, we have to update back refs entry for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs is used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs is used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf
 *
 * When a file extent is allocated, the implicit back refs is used.
 * the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of key. The key offset for the implicit back refs is
 * objectid of block's owner tree. The key offset for the full back refs
 * is the first byte of parent block.
 *
 * When implicit back refs is used, information about the lowest key and
 * level of the tree block are required. This information is stored in
 * the tree block info structure.
 */

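/*
 * Illustrative sketch (not part of the original file): how the key for an
 * implicit (non-shared) data back ref is composed, following the rules
 * described above and the helpers defined below.
 *
 *	struct btrfs_key key;
 *
 *	key.objectid = bytenr;			// first byte of the extent
 *	key.type = BTRFS_EXTENT_DATA_REF_KEY;	// implicit data back ref
 *	key.offset = hash_extent_data_ref(root_objectid, owner, offset);
 *						// hash of (root, inode, file offset)
 *
 * For a full (shared) data back ref the type is BTRFS_SHARED_DATA_REF_KEY
 * and the offset is simply the bytenr of the parent tree block.
 */
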
5d4f98a2
YZ
1048#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1049static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
87bde3cd 1050 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
1051 struct btrfs_path *path,
1052 u64 owner, u32 extra_size)
7bb86316 1053{
87bde3cd 1054 struct btrfs_root *root = fs_info->extent_root;
5d4f98a2
YZ
1055 struct btrfs_extent_item *item;
1056 struct btrfs_extent_item_v0 *ei0;
1057 struct btrfs_extent_ref_v0 *ref0;
1058 struct btrfs_tree_block_info *bi;
1059 struct extent_buffer *leaf;
7bb86316 1060 struct btrfs_key key;
5d4f98a2
YZ
1061 struct btrfs_key found_key;
1062 u32 new_size = sizeof(*item);
1063 u64 refs;
1064 int ret;
1065
1066 leaf = path->nodes[0];
1067 BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
1068
1069 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1070 ei0 = btrfs_item_ptr(leaf, path->slots[0],
1071 struct btrfs_extent_item_v0);
1072 refs = btrfs_extent_refs_v0(leaf, ei0);
1073
1074 if (owner == (u64)-1) {
1075 while (1) {
1076 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1077 ret = btrfs_next_leaf(root, path);
1078 if (ret < 0)
1079 return ret;
79787eaa 1080 BUG_ON(ret > 0); /* Corruption */
5d4f98a2
YZ
1081 leaf = path->nodes[0];
1082 }
1083 btrfs_item_key_to_cpu(leaf, &found_key,
1084 path->slots[0]);
1085 BUG_ON(key.objectid != found_key.objectid);
1086 if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
1087 path->slots[0]++;
1088 continue;
1089 }
1090 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1091 struct btrfs_extent_ref_v0);
1092 owner = btrfs_ref_objectid_v0(leaf, ref0);
1093 break;
1094 }
1095 }
b3b4aa74 1096 btrfs_release_path(path);
5d4f98a2
YZ
1097
1098 if (owner < BTRFS_FIRST_FREE_OBJECTID)
1099 new_size += sizeof(*bi);
1100
1101 new_size -= sizeof(*ei0);
1102 ret = btrfs_search_slot(trans, root, &key, path,
1103 new_size + extra_size, 1);
1104 if (ret < 0)
1105 return ret;
79787eaa 1106 BUG_ON(ret); /* Corruption */
5d4f98a2 1107
87bde3cd 1108 btrfs_extend_item(fs_info, path, new_size);
5d4f98a2
YZ
1109
1110 leaf = path->nodes[0];
1111 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1112 btrfs_set_extent_refs(leaf, item, refs);
1113 /* FIXME: get real generation */
1114 btrfs_set_extent_generation(leaf, item, 0);
1115 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1116 btrfs_set_extent_flags(leaf, item,
1117 BTRFS_EXTENT_FLAG_TREE_BLOCK |
1118 BTRFS_BLOCK_FLAG_FULL_BACKREF);
1119 bi = (struct btrfs_tree_block_info *)(item + 1);
1120 /* FIXME: get first key of the block */
b159fa28 1121 memzero_extent_buffer(leaf, (unsigned long)bi, sizeof(*bi));
5d4f98a2
YZ
1122 btrfs_set_tree_block_level(leaf, bi, (int)owner);
1123 } else {
1124 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1125 }
1126 btrfs_mark_buffer_dirty(leaf);
1127 return 0;
1128}
1129#endif
1130
1131static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1132{
1133 u32 high_crc = ~(u32)0;
1134 u32 low_crc = ~(u32)0;
1135 __le64 lenum;
1136
1137 lenum = cpu_to_le64(root_objectid);
14a958e6 1138 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
5d4f98a2 1139 lenum = cpu_to_le64(owner);
14a958e6 1140 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
5d4f98a2 1141 lenum = cpu_to_le64(offset);
14a958e6 1142 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
5d4f98a2
YZ
1143
1144 return ((u64)high_crc << 31) ^ (u64)low_crc;
1145}
1146
1147static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1148 struct btrfs_extent_data_ref *ref)
1149{
1150 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1151 btrfs_extent_data_ref_objectid(leaf, ref),
1152 btrfs_extent_data_ref_offset(leaf, ref));
1153}
1154
1155static int match_extent_data_ref(struct extent_buffer *leaf,
1156 struct btrfs_extent_data_ref *ref,
1157 u64 root_objectid, u64 owner, u64 offset)
1158{
1159 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1160 btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1161 btrfs_extent_data_ref_offset(leaf, ref) != offset)
1162 return 0;
1163 return 1;
1164}
1165
1166static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
87bde3cd 1167 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
1168 struct btrfs_path *path,
1169 u64 bytenr, u64 parent,
1170 u64 root_objectid,
1171 u64 owner, u64 offset)
1172{
87bde3cd 1173 struct btrfs_root *root = fs_info->extent_root;
5d4f98a2
YZ
1174 struct btrfs_key key;
1175 struct btrfs_extent_data_ref *ref;
31840ae1 1176 struct extent_buffer *leaf;
5d4f98a2 1177 u32 nritems;
74493f7a 1178 int ret;
5d4f98a2
YZ
1179 int recow;
1180 int err = -ENOENT;
74493f7a 1181
31840ae1 1182 key.objectid = bytenr;
5d4f98a2
YZ
1183 if (parent) {
1184 key.type = BTRFS_SHARED_DATA_REF_KEY;
1185 key.offset = parent;
1186 } else {
1187 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1188 key.offset = hash_extent_data_ref(root_objectid,
1189 owner, offset);
1190 }
1191again:
1192 recow = 0;
1193 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1194 if (ret < 0) {
1195 err = ret;
1196 goto fail;
1197 }
31840ae1 1198
5d4f98a2
YZ
1199 if (parent) {
1200 if (!ret)
1201 return 0;
1202#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1203 key.type = BTRFS_EXTENT_REF_V0_KEY;
b3b4aa74 1204 btrfs_release_path(path);
5d4f98a2
YZ
1205 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1206 if (ret < 0) {
1207 err = ret;
1208 goto fail;
1209 }
1210 if (!ret)
1211 return 0;
1212#endif
1213 goto fail;
31840ae1
ZY
1214 }
1215
1216 leaf = path->nodes[0];
5d4f98a2
YZ
1217 nritems = btrfs_header_nritems(leaf);
1218 while (1) {
1219 if (path->slots[0] >= nritems) {
1220 ret = btrfs_next_leaf(root, path);
1221 if (ret < 0)
1222 err = ret;
1223 if (ret)
1224 goto fail;
1225
1226 leaf = path->nodes[0];
1227 nritems = btrfs_header_nritems(leaf);
1228 recow = 1;
1229 }
1230
1231 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1232 if (key.objectid != bytenr ||
1233 key.type != BTRFS_EXTENT_DATA_REF_KEY)
1234 goto fail;
1235
1236 ref = btrfs_item_ptr(leaf, path->slots[0],
1237 struct btrfs_extent_data_ref);
1238
1239 if (match_extent_data_ref(leaf, ref, root_objectid,
1240 owner, offset)) {
1241 if (recow) {
b3b4aa74 1242 btrfs_release_path(path);
5d4f98a2
YZ
1243 goto again;
1244 }
1245 err = 0;
1246 break;
1247 }
1248 path->slots[0]++;
31840ae1 1249 }
5d4f98a2
YZ
1250fail:
1251 return err;
31840ae1
ZY
1252}
1253
5d4f98a2 1254static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
87bde3cd 1255 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
1256 struct btrfs_path *path,
1257 u64 bytenr, u64 parent,
1258 u64 root_objectid, u64 owner,
1259 u64 offset, int refs_to_add)
31840ae1 1260{
87bde3cd 1261 struct btrfs_root *root = fs_info->extent_root;
31840ae1
ZY
1262 struct btrfs_key key;
1263 struct extent_buffer *leaf;
5d4f98a2 1264 u32 size;
31840ae1
ZY
1265 u32 num_refs;
1266 int ret;
74493f7a 1267
74493f7a 1268 key.objectid = bytenr;
5d4f98a2
YZ
1269 if (parent) {
1270 key.type = BTRFS_SHARED_DATA_REF_KEY;
1271 key.offset = parent;
1272 size = sizeof(struct btrfs_shared_data_ref);
1273 } else {
1274 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1275 key.offset = hash_extent_data_ref(root_objectid,
1276 owner, offset);
1277 size = sizeof(struct btrfs_extent_data_ref);
1278 }
74493f7a 1279
5d4f98a2
YZ
1280 ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1281 if (ret && ret != -EEXIST)
1282 goto fail;
1283
1284 leaf = path->nodes[0];
1285 if (parent) {
1286 struct btrfs_shared_data_ref *ref;
31840ae1 1287 ref = btrfs_item_ptr(leaf, path->slots[0],
5d4f98a2
YZ
1288 struct btrfs_shared_data_ref);
1289 if (ret == 0) {
1290 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1291 } else {
1292 num_refs = btrfs_shared_data_ref_count(leaf, ref);
1293 num_refs += refs_to_add;
1294 btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
31840ae1 1295 }
5d4f98a2
YZ
1296 } else {
1297 struct btrfs_extent_data_ref *ref;
1298 while (ret == -EEXIST) {
1299 ref = btrfs_item_ptr(leaf, path->slots[0],
1300 struct btrfs_extent_data_ref);
1301 if (match_extent_data_ref(leaf, ref, root_objectid,
1302 owner, offset))
1303 break;
b3b4aa74 1304 btrfs_release_path(path);
5d4f98a2
YZ
1305 key.offset++;
1306 ret = btrfs_insert_empty_item(trans, root, path, &key,
1307 size);
1308 if (ret && ret != -EEXIST)
1309 goto fail;
31840ae1 1310
5d4f98a2
YZ
1311 leaf = path->nodes[0];
1312 }
1313 ref = btrfs_item_ptr(leaf, path->slots[0],
1314 struct btrfs_extent_data_ref);
1315 if (ret == 0) {
1316 btrfs_set_extent_data_ref_root(leaf, ref,
1317 root_objectid);
1318 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1319 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1320 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1321 } else {
1322 num_refs = btrfs_extent_data_ref_count(leaf, ref);
1323 num_refs += refs_to_add;
1324 btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
31840ae1 1325 }
31840ae1 1326 }
5d4f98a2
YZ
1327 btrfs_mark_buffer_dirty(leaf);
1328 ret = 0;
1329fail:
b3b4aa74 1330 btrfs_release_path(path);
7bb86316 1331 return ret;
74493f7a
CM
1332}
1333
5d4f98a2 1334static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
87bde3cd 1335 struct btrfs_fs_info *fs_info,
5d4f98a2 1336 struct btrfs_path *path,
fcebe456 1337 int refs_to_drop, int *last_ref)
31840ae1 1338{
5d4f98a2
YZ
1339 struct btrfs_key key;
1340 struct btrfs_extent_data_ref *ref1 = NULL;
1341 struct btrfs_shared_data_ref *ref2 = NULL;
31840ae1 1342 struct extent_buffer *leaf;
5d4f98a2 1343 u32 num_refs = 0;
31840ae1
ZY
1344 int ret = 0;
1345
1346 leaf = path->nodes[0];
5d4f98a2
YZ
1347 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1348
1349 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1350 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1351 struct btrfs_extent_data_ref);
1352 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1353 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1354 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1355 struct btrfs_shared_data_ref);
1356 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1357#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1358 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1359 struct btrfs_extent_ref_v0 *ref0;
1360 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1361 struct btrfs_extent_ref_v0);
1362 num_refs = btrfs_ref_count_v0(leaf, ref0);
1363#endif
1364 } else {
1365 BUG();
1366 }
1367
56bec294
CM
1368 BUG_ON(num_refs < refs_to_drop);
1369 num_refs -= refs_to_drop;
5d4f98a2 1370
31840ae1 1371 if (num_refs == 0) {
87bde3cd 1372 ret = btrfs_del_item(trans, fs_info->extent_root, path);
fcebe456 1373 *last_ref = 1;
31840ae1 1374 } else {
5d4f98a2
YZ
1375 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1376 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1377 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1378 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1379#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1380 else {
1381 struct btrfs_extent_ref_v0 *ref0;
1382 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1383 struct btrfs_extent_ref_v0);
1384 btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1385 }
1386#endif
31840ae1
ZY
1387 btrfs_mark_buffer_dirty(leaf);
1388 }
31840ae1
ZY
1389 return ret;
1390}
1391
9ed0dea0 1392static noinline u32 extent_data_ref_count(struct btrfs_path *path,
5d4f98a2 1393 struct btrfs_extent_inline_ref *iref)
15916de8 1394{
5d4f98a2
YZ
1395 struct btrfs_key key;
1396 struct extent_buffer *leaf;
1397 struct btrfs_extent_data_ref *ref1;
1398 struct btrfs_shared_data_ref *ref2;
1399 u32 num_refs = 0;
1400
1401 leaf = path->nodes[0];
1402 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1403 if (iref) {
1404 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1405 BTRFS_EXTENT_DATA_REF_KEY) {
1406 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1407 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1408 } else {
1409 ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1410 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1411 }
1412 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1413 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1414 struct btrfs_extent_data_ref);
1415 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1416 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1417 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1418 struct btrfs_shared_data_ref);
1419 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1420#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1421 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1422 struct btrfs_extent_ref_v0 *ref0;
1423 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1424 struct btrfs_extent_ref_v0);
1425 num_refs = btrfs_ref_count_v0(leaf, ref0);
4b4e25f2 1426#endif
5d4f98a2
YZ
1427 } else {
1428 WARN_ON(1);
1429 }
1430 return num_refs;
1431}
15916de8 1432
5d4f98a2 1433static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
87bde3cd 1434 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
1435 struct btrfs_path *path,
1436 u64 bytenr, u64 parent,
1437 u64 root_objectid)
1f3c79a2 1438{
87bde3cd 1439 struct btrfs_root *root = fs_info->extent_root;
5d4f98a2 1440 struct btrfs_key key;
1f3c79a2 1441 int ret;
1f3c79a2 1442
5d4f98a2
YZ
1443 key.objectid = bytenr;
1444 if (parent) {
1445 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1446 key.offset = parent;
1447 } else {
1448 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1449 key.offset = root_objectid;
1f3c79a2
LH
1450 }
1451
5d4f98a2
YZ
1452 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1453 if (ret > 0)
1454 ret = -ENOENT;
1455#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1456 if (ret == -ENOENT && parent) {
b3b4aa74 1457 btrfs_release_path(path);
5d4f98a2
YZ
1458 key.type = BTRFS_EXTENT_REF_V0_KEY;
1459 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1460 if (ret > 0)
1461 ret = -ENOENT;
1462 }
1f3c79a2 1463#endif
5d4f98a2 1464 return ret;
1f3c79a2
LH
1465}
1466
5d4f98a2 1467static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
87bde3cd 1468 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
1469 struct btrfs_path *path,
1470 u64 bytenr, u64 parent,
1471 u64 root_objectid)
31840ae1 1472{
5d4f98a2 1473 struct btrfs_key key;
31840ae1 1474 int ret;
31840ae1 1475
5d4f98a2
YZ
1476 key.objectid = bytenr;
1477 if (parent) {
1478 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1479 key.offset = parent;
1480 } else {
1481 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1482 key.offset = root_objectid;
1483 }
1484
87bde3cd
JM
1485 ret = btrfs_insert_empty_item(trans, fs_info->extent_root,
1486 path, &key, 0);
b3b4aa74 1487 btrfs_release_path(path);
31840ae1
ZY
1488 return ret;
1489}
1490
5d4f98a2 1491static inline int extent_ref_type(u64 parent, u64 owner)
31840ae1 1492{
5d4f98a2
YZ
1493 int type;
1494 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1495 if (parent > 0)
1496 type = BTRFS_SHARED_BLOCK_REF_KEY;
1497 else
1498 type = BTRFS_TREE_BLOCK_REF_KEY;
1499 } else {
1500 if (parent > 0)
1501 type = BTRFS_SHARED_DATA_REF_KEY;
1502 else
1503 type = BTRFS_EXTENT_DATA_REF_KEY;
1504 }
1505 return type;
31840ae1 1506}
56bec294 1507
2c47e605
YZ
1508static int find_next_key(struct btrfs_path *path, int level,
1509 struct btrfs_key *key)
56bec294 1510
02217ed2 1511{
2c47e605 1512 for (; level < BTRFS_MAX_LEVEL; level++) {
5d4f98a2
YZ
1513 if (!path->nodes[level])
1514 break;
5d4f98a2
YZ
1515 if (path->slots[level] + 1 >=
1516 btrfs_header_nritems(path->nodes[level]))
1517 continue;
1518 if (level == 0)
1519 btrfs_item_key_to_cpu(path->nodes[level], key,
1520 path->slots[level] + 1);
1521 else
1522 btrfs_node_key_to_cpu(path->nodes[level], key,
1523 path->slots[level] + 1);
1524 return 0;
1525 }
1526 return 1;
1527}
037e6390 1528
5d4f98a2
YZ
/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 * items in the tree are ordered.
 */
1542static noinline_for_stack
1543int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
87bde3cd 1544 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
1545 struct btrfs_path *path,
1546 struct btrfs_extent_inline_ref **ref_ret,
1547 u64 bytenr, u64 num_bytes,
1548 u64 parent, u64 root_objectid,
1549 u64 owner, u64 offset, int insert)
1550{
87bde3cd 1551 struct btrfs_root *root = fs_info->extent_root;
5d4f98a2
YZ
1552 struct btrfs_key key;
1553 struct extent_buffer *leaf;
1554 struct btrfs_extent_item *ei;
1555 struct btrfs_extent_inline_ref *iref;
1556 u64 flags;
1557 u64 item_size;
1558 unsigned long ptr;
1559 unsigned long end;
1560 int extra_size;
1561 int type;
1562 int want;
1563 int ret;
1564 int err = 0;
0b246afa 1565 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
26b8003f 1566
db94535d 1567 key.objectid = bytenr;
31840ae1 1568 key.type = BTRFS_EXTENT_ITEM_KEY;
56bec294 1569 key.offset = num_bytes;
31840ae1 1570
5d4f98a2
YZ
1571 want = extent_ref_type(parent, owner);
1572 if (insert) {
1573 extra_size = btrfs_extent_inline_ref_size(want);
85d4198e 1574 path->keep_locks = 1;
5d4f98a2
YZ
1575 } else
1576 extra_size = -1;
3173a18f
JB
1577
1578 /*
1579 * Owner is our parent level, so we can just add one to get the level
1580 * for the block we are interested in.
1581 */
1582 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1583 key.type = BTRFS_METADATA_ITEM_KEY;
1584 key.offset = owner;
1585 }
1586
1587again:
5d4f98a2 1588 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
b9473439 1589 if (ret < 0) {
5d4f98a2
YZ
1590 err = ret;
1591 goto out;
1592 }
3173a18f
JB
1593
1594 /*
1595 * We may be a newly converted file system which still has the old fat
1596 * extent entries for metadata, so try and see if we have one of those.
1597 */
1598 if (ret > 0 && skinny_metadata) {
1599 skinny_metadata = false;
1600 if (path->slots[0]) {
1601 path->slots[0]--;
1602 btrfs_item_key_to_cpu(path->nodes[0], &key,
1603 path->slots[0]);
1604 if (key.objectid == bytenr &&
1605 key.type == BTRFS_EXTENT_ITEM_KEY &&
1606 key.offset == num_bytes)
1607 ret = 0;
1608 }
1609 if (ret) {
9ce49a0b 1610 key.objectid = bytenr;
3173a18f
JB
1611 key.type = BTRFS_EXTENT_ITEM_KEY;
1612 key.offset = num_bytes;
1613 btrfs_release_path(path);
1614 goto again;
1615 }
1616 }
1617
79787eaa
JM
1618 if (ret && !insert) {
1619 err = -ENOENT;
1620 goto out;
fae7f21c 1621 } else if (WARN_ON(ret)) {
492104c8 1622 err = -EIO;
492104c8 1623 goto out;
79787eaa 1624 }
5d4f98a2
YZ
1625
1626 leaf = path->nodes[0];
1627 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1628#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1629 if (item_size < sizeof(*ei)) {
1630 if (!insert) {
1631 err = -ENOENT;
1632 goto out;
1633 }
87bde3cd 1634 ret = convert_extent_item_v0(trans, fs_info, path, owner,
5d4f98a2
YZ
1635 extra_size);
1636 if (ret < 0) {
1637 err = ret;
1638 goto out;
1639 }
1640 leaf = path->nodes[0];
1641 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1642 }
1643#endif
1644 BUG_ON(item_size < sizeof(*ei));
1645
5d4f98a2
YZ
1646 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1647 flags = btrfs_extent_flags(leaf, ei);
1648
1649 ptr = (unsigned long)(ei + 1);
1650 end = (unsigned long)ei + item_size;
1651
3173a18f 1652 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
5d4f98a2
YZ
1653 ptr += sizeof(struct btrfs_tree_block_info);
1654 BUG_ON(ptr > end);
5d4f98a2
YZ
1655 }
1656
1657 err = -ENOENT;
1658 while (1) {
1659 if (ptr >= end) {
1660 WARN_ON(ptr > end);
1661 break;
1662 }
1663 iref = (struct btrfs_extent_inline_ref *)ptr;
1664 type = btrfs_extent_inline_ref_type(leaf, iref);
1665 if (want < type)
1666 break;
1667 if (want > type) {
1668 ptr += btrfs_extent_inline_ref_size(type);
1669 continue;
1670 }
1671
1672 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1673 struct btrfs_extent_data_ref *dref;
1674 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1675 if (match_extent_data_ref(leaf, dref, root_objectid,
1676 owner, offset)) {
1677 err = 0;
1678 break;
1679 }
1680 if (hash_extent_data_ref_item(leaf, dref) <
1681 hash_extent_data_ref(root_objectid, owner, offset))
1682 break;
1683 } else {
1684 u64 ref_offset;
1685 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1686 if (parent > 0) {
1687 if (parent == ref_offset) {
1688 err = 0;
1689 break;
1690 }
1691 if (ref_offset < parent)
1692 break;
1693 } else {
1694 if (root_objectid == ref_offset) {
1695 err = 0;
1696 break;
1697 }
1698 if (ref_offset < root_objectid)
1699 break;
1700 }
1701 }
1702 ptr += btrfs_extent_inline_ref_size(type);
1703 }
1704 if (err == -ENOENT && insert) {
1705 if (item_size + extra_size >=
1706 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1707 err = -EAGAIN;
1708 goto out;
1709 }
1710 /*
1711 * To add a new inline back ref, we have to make sure
1712 * there is no corresponding back ref item.
1713 * For simplicity, we just do not add a new inline back
1714 * ref if there is any kind of item for this block.
1715 */
2c47e605
YZ
1716 if (find_next_key(path, 0, &key) == 0 &&
1717 key.objectid == bytenr &&
85d4198e 1718 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
5d4f98a2
YZ
1719 err = -EAGAIN;
1720 goto out;
1721 }
1722 }
1723 *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1724out:
85d4198e 1725 if (insert) {
5d4f98a2
YZ
1726 path->keep_locks = 0;
1727 btrfs_unlock_up_safe(path, 1);
1728 }
1729 return err;
1730}
1731
1732/*
1733 * helper to add new inline back ref
1734 */
1735static noinline_for_stack
87bde3cd 1736void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
143bede5
JM
1737 struct btrfs_path *path,
1738 struct btrfs_extent_inline_ref *iref,
1739 u64 parent, u64 root_objectid,
1740 u64 owner, u64 offset, int refs_to_add,
1741 struct btrfs_delayed_extent_op *extent_op)
5d4f98a2
YZ
1742{
1743 struct extent_buffer *leaf;
1744 struct btrfs_extent_item *ei;
1745 unsigned long ptr;
1746 unsigned long end;
1747 unsigned long item_offset;
1748 u64 refs;
1749 int size;
1750 int type;
5d4f98a2
YZ
1751
1752 leaf = path->nodes[0];
1753 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1754 item_offset = (unsigned long)iref - (unsigned long)ei;
1755
1756 type = extent_ref_type(parent, owner);
1757 size = btrfs_extent_inline_ref_size(type);
1758
87bde3cd 1759 btrfs_extend_item(fs_info, path, size);
5d4f98a2
YZ
1760
1761 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1762 refs = btrfs_extent_refs(leaf, ei);
1763 refs += refs_to_add;
1764 btrfs_set_extent_refs(leaf, ei, refs);
1765 if (extent_op)
1766 __run_delayed_extent_op(extent_op, leaf, ei);
1767
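/*
 * The item was just extended at its end, so shift everything that
 * sorts after our new ref towards the end to open a gap at 'ptr'.
 */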
1768 ptr = (unsigned long)ei + item_offset;
1769 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1770 if (ptr < end - size)
1771 memmove_extent_buffer(leaf, ptr + size, ptr,
1772 end - size - ptr);
1773
1774 iref = (struct btrfs_extent_inline_ref *)ptr;
1775 btrfs_set_extent_inline_ref_type(leaf, iref, type);
1776 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1777 struct btrfs_extent_data_ref *dref;
1778 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1779 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1780 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1781 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1782 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1783 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1784 struct btrfs_shared_data_ref *sref;
1785 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1786 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1787 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1788 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1789 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1790 } else {
1791 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1792 }
1793 btrfs_mark_buffer_dirty(leaf);
5d4f98a2
YZ
1794}
1795
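/*
 * Find the back reference for an extent: first look for an inline ref
 * inside the extent item, and if there is none fall back to searching
 * for a separate keyed ref item (tree block ref or extent data ref).
 */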
1796static int lookup_extent_backref(struct btrfs_trans_handle *trans,
87bde3cd 1797 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
1798 struct btrfs_path *path,
1799 struct btrfs_extent_inline_ref **ref_ret,
1800 u64 bytenr, u64 num_bytes, u64 parent,
1801 u64 root_objectid, u64 owner, u64 offset)
1802{
1803 int ret;
1804
87bde3cd 1805 ret = lookup_inline_extent_backref(trans, fs_info, path, ref_ret,
5d4f98a2
YZ
1806 bytenr, num_bytes, parent,
1807 root_objectid, owner, offset, 0);
1808 if (ret != -ENOENT)
54aa1f4d 1809 return ret;
5d4f98a2 1810
b3b4aa74 1811 btrfs_release_path(path);
5d4f98a2
YZ
1812 *ref_ret = NULL;
1813
1814 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
87bde3cd
JM
1815 ret = lookup_tree_block_ref(trans, fs_info, path, bytenr,
1816 parent, root_objectid);
5d4f98a2 1817 } else {
87bde3cd
JM
1818 ret = lookup_extent_data_ref(trans, fs_info, path, bytenr,
1819 parent, root_objectid, owner,
1820 offset);
b9473439 1821 }
5d4f98a2
YZ
1822 return ret;
1823}
31840ae1 1824
5d4f98a2
YZ
1825/*
1826 * helper to update/remove inline back ref
1827 */
1828static noinline_for_stack
87bde3cd 1829void update_inline_extent_backref(struct btrfs_fs_info *fs_info,
143bede5
JM
1830 struct btrfs_path *path,
1831 struct btrfs_extent_inline_ref *iref,
1832 int refs_to_mod,
fcebe456
JB
1833 struct btrfs_delayed_extent_op *extent_op,
1834 int *last_ref)
5d4f98a2
YZ
1835{
1836 struct extent_buffer *leaf;
1837 struct btrfs_extent_item *ei;
1838 struct btrfs_extent_data_ref *dref = NULL;
1839 struct btrfs_shared_data_ref *sref = NULL;
1840 unsigned long ptr;
1841 unsigned long end;
1842 u32 item_size;
1843 int size;
1844 int type;
5d4f98a2
YZ
1845 u64 refs;
1846
1847 leaf = path->nodes[0];
1848 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1849 refs = btrfs_extent_refs(leaf, ei);
1850 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1851 refs += refs_to_mod;
1852 btrfs_set_extent_refs(leaf, ei, refs);
1853 if (extent_op)
1854 __run_delayed_extent_op(extent_op, leaf, ei);
1855
1856 type = btrfs_extent_inline_ref_type(leaf, iref);
1857
1858 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1859 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1860 refs = btrfs_extent_data_ref_count(leaf, dref);
1861 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1862 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1863 refs = btrfs_shared_data_ref_count(leaf, sref);
1864 } else {
1865 refs = 1;
1866 BUG_ON(refs_to_mod != -1);
56bec294 1867 }
31840ae1 1868
5d4f98a2
YZ
1869 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1870 refs += refs_to_mod;
1871
1872 if (refs > 0) {
1873 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1874 btrfs_set_extent_data_ref_count(leaf, dref, refs);
1875 else
1876 btrfs_set_shared_data_ref_count(leaf, sref, refs);
1877 } else {
fcebe456 1878 *last_ref = 1;
5d4f98a2
YZ
1879 size = btrfs_extent_inline_ref_size(type);
1880 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1881 ptr = (unsigned long)iref;
1882 end = (unsigned long)ei + item_size;
1883 if (ptr + size < end)
1884 memmove_extent_buffer(leaf, ptr, ptr + size,
1885 end - ptr - size);
1886 item_size -= size;
87bde3cd 1887 btrfs_truncate_item(fs_info, path, item_size, 1);
5d4f98a2
YZ
1888 }
1889 btrfs_mark_buffer_dirty(leaf);
5d4f98a2
YZ
1890}
1891
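/*
 * Add refs_to_add to an existing inline back ref, or create a new
 * inline ref if none exists yet. Any other lookup result (including
 * -EAGAIN when the extent item has no room left) is passed back to the
 * caller so it can fall back to a keyed back ref.
 */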
1892static noinline_for_stack
1893int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
87bde3cd 1894 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
1895 struct btrfs_path *path,
1896 u64 bytenr, u64 num_bytes, u64 parent,
1897 u64 root_objectid, u64 owner,
1898 u64 offset, int refs_to_add,
1899 struct btrfs_delayed_extent_op *extent_op)
1900{
1901 struct btrfs_extent_inline_ref *iref;
1902 int ret;
1903
87bde3cd 1904 ret = lookup_inline_extent_backref(trans, fs_info, path, &iref,
5d4f98a2
YZ
1905 bytenr, num_bytes, parent,
1906 root_objectid, owner, offset, 1);
1907 if (ret == 0) {
1908 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
87bde3cd 1909 update_inline_extent_backref(fs_info, path, iref,
fcebe456 1910 refs_to_add, extent_op, NULL);
5d4f98a2 1911 } else if (ret == -ENOENT) {
87bde3cd 1912 setup_inline_extent_backref(fs_info, path, iref, parent,
143bede5
JM
1913 root_objectid, owner, offset,
1914 refs_to_add, extent_op);
1915 ret = 0;
771ed689 1916 }
5d4f98a2
YZ
1917 return ret;
1918}
31840ae1 1919
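/*
 * Insert a keyed (non-inline) back ref item: a tree block ref for
 * metadata or an extent data ref for file data.
 */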
5d4f98a2 1920static int insert_extent_backref(struct btrfs_trans_handle *trans,
87bde3cd 1921 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
1922 struct btrfs_path *path,
1923 u64 bytenr, u64 parent, u64 root_objectid,
1924 u64 owner, u64 offset, int refs_to_add)
1925{
1926 int ret;
1927 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1928 BUG_ON(refs_to_add != 1);
87bde3cd 1929 ret = insert_tree_block_ref(trans, fs_info, path, bytenr,
5d4f98a2
YZ
1930 parent, root_objectid);
1931 } else {
87bde3cd 1932 ret = insert_extent_data_ref(trans, fs_info, path, bytenr,
5d4f98a2
YZ
1933 parent, root_objectid,
1934 owner, offset, refs_to_add);
1935 }
1936 return ret;
1937}
56bec294 1938
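/*
 * Drop refs_to_drop references: adjust the inline ref if we have one,
 * otherwise update the keyed data ref item, or delete the keyed ref
 * item outright for metadata. *last_ref is set once the final
 * reference behind that backref goes away.
 */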
5d4f98a2 1939static int remove_extent_backref(struct btrfs_trans_handle *trans,
87bde3cd 1940 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
1941 struct btrfs_path *path,
1942 struct btrfs_extent_inline_ref *iref,
fcebe456 1943 int refs_to_drop, int is_data, int *last_ref)
5d4f98a2 1944{
143bede5 1945 int ret = 0;
b9473439 1946
5d4f98a2
YZ
1947 BUG_ON(!is_data && refs_to_drop != 1);
1948 if (iref) {
87bde3cd 1949 update_inline_extent_backref(fs_info, path, iref,
fcebe456 1950 -refs_to_drop, NULL, last_ref);
5d4f98a2 1951 } else if (is_data) {
87bde3cd 1952 ret = remove_extent_data_ref(trans, fs_info, path, refs_to_drop,
fcebe456 1953 last_ref);
5d4f98a2 1954 } else {
fcebe456 1955 *last_ref = 1;
87bde3cd 1956 ret = btrfs_del_item(trans, fs_info->extent_root, path);
5d4f98a2
YZ
1957 }
1958 return ret;
1959}
1960
86557861 1961#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
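/*
 * Issue a discard for [start, start + len), skipping over any
 * superblock mirrors that live inside the range. The number of bytes
 * actually discarded is returned via *discarded_bytes.
 */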
d04c6b88
JM
1962static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1963 u64 *discarded_bytes)
5d4f98a2 1964{
86557861
JM
1965 int j, ret = 0;
1966 u64 bytes_left, end;
4d89d377 1967 u64 aligned_start = ALIGN(start, 1 << 9);
d04c6b88 1968
4d89d377
JM
1969 if (WARN_ON(start != aligned_start)) {
1970 len -= aligned_start - start;
1971 len = round_down(len, 1 << 9);
1972 start = aligned_start;
1973 }
d04c6b88 1974
4d89d377 1975 *discarded_bytes = 0;
86557861
JM
1976
1977 if (!len)
1978 return 0;
1979
1980 end = start + len;
1981 bytes_left = len;
1982
1983 /* Skip any superblocks on this device. */
1984 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1985 u64 sb_start = btrfs_sb_offset(j);
1986 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1987 u64 size = sb_start - start;
1988
1989 if (!in_range(sb_start, start, bytes_left) &&
1990 !in_range(sb_end, start, bytes_left) &&
1991 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1992 continue;
1993
1994 /*
1995 * Superblock spans beginning of range. Adjust start and
1996 * try again.
1997 */
1998 if (sb_start <= start) {
1999 start += sb_end - start;
2000 if (start > end) {
2001 bytes_left = 0;
2002 break;
2003 }
2004 bytes_left = end - start;
2005 continue;
2006 }
2007
2008 if (size) {
2009 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
2010 GFP_NOFS, 0);
2011 if (!ret)
2012 *discarded_bytes += size;
2013 else if (ret != -EOPNOTSUPP)
2014 return ret;
2015 }
2016
2017 start = sb_end;
2018 if (start > end) {
2019 bytes_left = 0;
2020 break;
2021 }
2022 bytes_left = end - start;
2023 }
2024
2025 if (bytes_left) {
2026 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
4d89d377
JM
2027 GFP_NOFS, 0);
2028 if (!ret)
86557861 2029 *discarded_bytes += bytes_left;
4d89d377 2030 }
d04c6b88 2031 return ret;
5d4f98a2 2032}
5d4f98a2 2033
2ff7e61e 2034int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1edb647b 2035 u64 num_bytes, u64 *actual_bytes)
5d4f98a2 2036{
5d4f98a2 2037 int ret;
5378e607 2038 u64 discarded_bytes = 0;
a1d3c478 2039 struct btrfs_bio *bbio = NULL;
5d4f98a2 2040
e244a0ae 2041
2999241d
FM
2042 /*
2043 * Avoid races with device replace and make sure our bbio has devices
2044 * associated to its stripes that don't go away while we are discarding.
2045 */
0b246afa 2046 btrfs_bio_counter_inc_blocked(fs_info);
5d4f98a2 2047 /* Tell the block device(s) that the sectors can be discarded */
0b246afa
JM
2048 ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
2049 &bbio, 0);
79787eaa 2050 /* Error condition is -ENOMEM */
5d4f98a2 2051 if (!ret) {
a1d3c478 2052 struct btrfs_bio_stripe *stripe = bbio->stripes;
5d4f98a2
YZ
2053 int i;
2054
5d4f98a2 2055
a1d3c478 2056 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
d04c6b88 2057 u64 bytes;
d5e2003c
JB
2058 if (!stripe->dev->can_discard)
2059 continue;
2060
5378e607
LD
2061 ret = btrfs_issue_discard(stripe->dev->bdev,
2062 stripe->physical,
d04c6b88
JM
2063 stripe->length,
2064 &bytes);
5378e607 2065 if (!ret)
d04c6b88 2066 discarded_bytes += bytes;
5378e607 2067 else if (ret != -EOPNOTSUPP)
79787eaa 2068 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
d5e2003c
JB
2069
2070 /*
2071 * Just in case we get back EOPNOTSUPP for some reason,
2072 * just ignore the return value so we don't screw up
2073 * people calling discard_extent.
2074 */
2075 ret = 0;
5d4f98a2 2076 }
6e9606d2 2077 btrfs_put_bbio(bbio);
5d4f98a2 2078 }
0b246afa 2079 btrfs_bio_counter_dec(fs_info);
5378e607
LD
2080
2081 if (actual_bytes)
2082 *actual_bytes = discarded_bytes;
2083
5d4f98a2 2084
53b381b3
DW
2085 if (ret == -EOPNOTSUPP)
2086 ret = 0;
5d4f98a2 2087 return ret;
5d4f98a2
YZ
2088}
2089
79787eaa 2090/* Can return -ENOMEM */
5d4f98a2 2091int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2ff7e61e 2092 struct btrfs_fs_info *fs_info,
5d4f98a2 2093 u64 bytenr, u64 num_bytes, u64 parent,
b06c4bf5 2094 u64 root_objectid, u64 owner, u64 offset)
5d4f98a2
YZ
2095{
2096 int ret;
66d7e7f0 2097
5d4f98a2
YZ
2098 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2099 root_objectid == BTRFS_TREE_LOG_OBJECTID);
2100
2101 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
66d7e7f0
AJ
2102 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2103 num_bytes,
5d4f98a2 2104 parent, root_objectid, (int)owner,
b06c4bf5 2105 BTRFS_ADD_DELAYED_REF, NULL);
5d4f98a2 2106 } else {
66d7e7f0 2107 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5846a3c2
QW
2108 num_bytes, parent, root_objectid,
2109 owner, offset, 0,
fef394f7 2110 BTRFS_ADD_DELAYED_REF);
5d4f98a2
YZ
2111 }
2112 return ret;
2113}
2114
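/*
 * Add refs_to_add references to an existing extent. Try to record the
 * new references as an inline back ref first; if that returns -EAGAIN
 * the inline area is full, so bump the ref count on the extent item
 * and insert a separate keyed back ref instead.
 */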
2115static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2ff7e61e 2116 struct btrfs_fs_info *fs_info,
c682f9b3 2117 struct btrfs_delayed_ref_node *node,
5d4f98a2
YZ
2118 u64 parent, u64 root_objectid,
2119 u64 owner, u64 offset, int refs_to_add,
2120 struct btrfs_delayed_extent_op *extent_op)
2121{
2122 struct btrfs_path *path;
2123 struct extent_buffer *leaf;
2124 struct btrfs_extent_item *item;
fcebe456 2125 struct btrfs_key key;
c682f9b3
QW
2126 u64 bytenr = node->bytenr;
2127 u64 num_bytes = node->num_bytes;
5d4f98a2
YZ
2128 u64 refs;
2129 int ret;
5d4f98a2
YZ
2130
2131 path = btrfs_alloc_path();
2132 if (!path)
2133 return -ENOMEM;
2134
e4058b54 2135 path->reada = READA_FORWARD;
5d4f98a2
YZ
2136 path->leave_spinning = 1;
2137 /* this will set up the path even if it fails to insert the back ref */
87bde3cd
JM
2138 ret = insert_inline_extent_backref(trans, fs_info, path, bytenr,
2139 num_bytes, parent, root_objectid,
2140 owner, offset,
5d4f98a2 2141 refs_to_add, extent_op);
0ed4792a 2142 if ((ret < 0 && ret != -EAGAIN) || !ret)
5d4f98a2 2143 goto out;
fcebe456
JB
2144
2145 /*
2146 * Ok we had -EAGAIN which means we didn't have space to insert an
2147 * inline extent ref, so just update the reference count and add a
2148 * normal backref.
2149 */
5d4f98a2 2150 leaf = path->nodes[0];
fcebe456 2151 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5d4f98a2
YZ
2152 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2153 refs = btrfs_extent_refs(leaf, item);
2154 btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2155 if (extent_op)
2156 __run_delayed_extent_op(extent_op, leaf, item);
56bec294 2157
5d4f98a2 2158 btrfs_mark_buffer_dirty(leaf);
b3b4aa74 2159 btrfs_release_path(path);
56bec294 2160
e4058b54 2161 path->reada = READA_FORWARD;
b9473439 2162 path->leave_spinning = 1;
56bec294 2163 /* now insert the actual backref */
87bde3cd
JM
2164 ret = insert_extent_backref(trans, fs_info, path, bytenr, parent,
2165 root_objectid, owner, offset, refs_to_add);
79787eaa 2166 if (ret)
66642832 2167 btrfs_abort_transaction(trans, ret);
5d4f98a2 2168out:
56bec294 2169 btrfs_free_path(path);
30d133fc 2170 return ret;
56bec294
CM
2171}
2172
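/*
 * Apply a single delayed data ref: for a new extent that still holds
 * its reservation, insert the extent item and its first ref; otherwise
 * add or drop one reference depending on the delayed action.
 */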
5d4f98a2 2173static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2ff7e61e 2174 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
2175 struct btrfs_delayed_ref_node *node,
2176 struct btrfs_delayed_extent_op *extent_op,
2177 int insert_reserved)
56bec294 2178{
5d4f98a2
YZ
2179 int ret = 0;
2180 struct btrfs_delayed_data_ref *ref;
2181 struct btrfs_key ins;
2182 u64 parent = 0;
2183 u64 ref_root = 0;
2184 u64 flags = 0;
2185
2186 ins.objectid = node->bytenr;
2187 ins.offset = node->num_bytes;
2188 ins.type = BTRFS_EXTENT_ITEM_KEY;
2189
2190 ref = btrfs_delayed_node_to_data_ref(node);
0b246afa 2191 trace_run_delayed_data_ref(fs_info, node, ref, node->action);
599c75ec 2192
5d4f98a2
YZ
2193 if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2194 parent = ref->parent;
fcebe456 2195 ref_root = ref->root;
5d4f98a2
YZ
2196
2197 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
3173a18f 2198 if (extent_op)
5d4f98a2 2199 flags |= extent_op->flags_to_set;
2ff7e61e 2200 ret = alloc_reserved_file_extent(trans, fs_info,
5d4f98a2
YZ
2201 parent, ref_root, flags,
2202 ref->objectid, ref->offset,
2203 &ins, node->ref_mod);
5d4f98a2 2204 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2ff7e61e 2205 ret = __btrfs_inc_extent_ref(trans, fs_info, node, parent,
5d4f98a2
YZ
2206 ref_root, ref->objectid,
2207 ref->offset, node->ref_mod,
c682f9b3 2208 extent_op);
5d4f98a2 2209 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2ff7e61e 2210 ret = __btrfs_free_extent(trans, fs_info, node, parent,
5d4f98a2
YZ
2211 ref_root, ref->objectid,
2212 ref->offset, node->ref_mod,
c682f9b3 2213 extent_op);
5d4f98a2
YZ
2214 } else {
2215 BUG();
2216 }
2217 return ret;
2218}
2219
2220static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2221 struct extent_buffer *leaf,
2222 struct btrfs_extent_item *ei)
2223{
2224 u64 flags = btrfs_extent_flags(leaf, ei);
2225 if (extent_op->update_flags) {
2226 flags |= extent_op->flags_to_set;
2227 btrfs_set_extent_flags(leaf, ei, flags);
2228 }
2229
2230 if (extent_op->update_key) {
2231 struct btrfs_tree_block_info *bi;
2232 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2233 bi = (struct btrfs_tree_block_info *)(ei + 1);
2234 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2235 }
2236}
2237
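/*
 * Apply a pending extent_op (flags and/or tree block key update)
 * directly to the extent item on disk.
 */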
2238static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2ff7e61e 2239 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
2240 struct btrfs_delayed_ref_node *node,
2241 struct btrfs_delayed_extent_op *extent_op)
2242{
2243 struct btrfs_key key;
2244 struct btrfs_path *path;
2245 struct btrfs_extent_item *ei;
2246 struct extent_buffer *leaf;
2247 u32 item_size;
56bec294 2248 int ret;
5d4f98a2 2249 int err = 0;
b1c79e09 2250 int metadata = !extent_op->is_data;
5d4f98a2 2251
79787eaa
JM
2252 if (trans->aborted)
2253 return 0;
2254
0b246afa 2255 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3173a18f
JB
2256 metadata = 0;
2257
5d4f98a2
YZ
2258 path = btrfs_alloc_path();
2259 if (!path)
2260 return -ENOMEM;
2261
2262 key.objectid = node->bytenr;
5d4f98a2 2263
3173a18f 2264 if (metadata) {
3173a18f 2265 key.type = BTRFS_METADATA_ITEM_KEY;
b1c79e09 2266 key.offset = extent_op->level;
3173a18f
JB
2267 } else {
2268 key.type = BTRFS_EXTENT_ITEM_KEY;
2269 key.offset = node->num_bytes;
2270 }
2271
2272again:
e4058b54 2273 path->reada = READA_FORWARD;
5d4f98a2 2274 path->leave_spinning = 1;
0b246afa 2275 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
5d4f98a2
YZ
2276 if (ret < 0) {
2277 err = ret;
2278 goto out;
2279 }
2280 if (ret > 0) {
3173a18f 2281 if (metadata) {
55994887
FDBM
2282 if (path->slots[0] > 0) {
2283 path->slots[0]--;
2284 btrfs_item_key_to_cpu(path->nodes[0], &key,
2285 path->slots[0]);
2286 if (key.objectid == node->bytenr &&
2287 key.type == BTRFS_EXTENT_ITEM_KEY &&
2288 key.offset == node->num_bytes)
2289 ret = 0;
2290 }
2291 if (ret > 0) {
2292 btrfs_release_path(path);
2293 metadata = 0;
3173a18f 2294
55994887
FDBM
2295 key.objectid = node->bytenr;
2296 key.offset = node->num_bytes;
2297 key.type = BTRFS_EXTENT_ITEM_KEY;
2298 goto again;
2299 }
2300 } else {
2301 err = -EIO;
2302 goto out;
3173a18f 2303 }
5d4f98a2
YZ
2304 }
2305
2306 leaf = path->nodes[0];
2307 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2308#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2309 if (item_size < sizeof(*ei)) {
87bde3cd 2310 ret = convert_extent_item_v0(trans, fs_info, path, (u64)-1, 0);
5d4f98a2
YZ
2311 if (ret < 0) {
2312 err = ret;
2313 goto out;
2314 }
2315 leaf = path->nodes[0];
2316 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2317 }
2318#endif
2319 BUG_ON(item_size < sizeof(*ei));
2320 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2321 __run_delayed_extent_op(extent_op, leaf, ei);
56bec294 2322
5d4f98a2
YZ
2323 btrfs_mark_buffer_dirty(leaf);
2324out:
2325 btrfs_free_path(path);
2326 return err;
56bec294
CM
2327}
2328
5d4f98a2 2329static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2ff7e61e 2330 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
2331 struct btrfs_delayed_ref_node *node,
2332 struct btrfs_delayed_extent_op *extent_op,
2333 int insert_reserved)
56bec294
CM
2334{
2335 int ret = 0;
5d4f98a2
YZ
2336 struct btrfs_delayed_tree_ref *ref;
2337 struct btrfs_key ins;
2338 u64 parent = 0;
2339 u64 ref_root = 0;
0b246afa 2340 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
56bec294 2341
5d4f98a2 2342 ref = btrfs_delayed_node_to_tree_ref(node);
0b246afa 2343 trace_run_delayed_tree_ref(fs_info, node, ref, node->action);
599c75ec 2344
5d4f98a2
YZ
2345 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2346 parent = ref->parent;
fcebe456 2347 ref_root = ref->root;
5d4f98a2 2348
3173a18f
JB
2349 ins.objectid = node->bytenr;
2350 if (skinny_metadata) {
2351 ins.offset = ref->level;
2352 ins.type = BTRFS_METADATA_ITEM_KEY;
2353 } else {
2354 ins.offset = node->num_bytes;
2355 ins.type = BTRFS_EXTENT_ITEM_KEY;
2356 }
2357
02794222 2358 if (node->ref_mod != 1) {
2ff7e61e 2359 btrfs_err(fs_info,
02794222
LB
2360 "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2361 node->bytenr, node->ref_mod, node->action, ref_root,
2362 parent);
2363 return -EIO;
2364 }
5d4f98a2 2365 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
3173a18f 2366 BUG_ON(!extent_op || !extent_op->update_flags);
2ff7e61e 2367 ret = alloc_reserved_tree_block(trans, fs_info,
5d4f98a2
YZ
2368 parent, ref_root,
2369 extent_op->flags_to_set,
2370 &extent_op->key,
b06c4bf5 2371 ref->level, &ins);
5d4f98a2 2372 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2ff7e61e 2373 ret = __btrfs_inc_extent_ref(trans, fs_info, node,
c682f9b3
QW
2374 parent, ref_root,
2375 ref->level, 0, 1,
fcebe456 2376 extent_op);
5d4f98a2 2377 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2ff7e61e 2378 ret = __btrfs_free_extent(trans, fs_info, node,
c682f9b3
QW
2379 parent, ref_root,
2380 ref->level, 0, 1, extent_op);
5d4f98a2
YZ
2381 } else {
2382 BUG();
2383 }
56bec294
CM
2384 return ret;
2385}
2386
2387/* helper function to actually process a single delayed ref entry */
5d4f98a2 2388static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2ff7e61e 2389 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
2390 struct btrfs_delayed_ref_node *node,
2391 struct btrfs_delayed_extent_op *extent_op,
2392 int insert_reserved)
56bec294 2393{
79787eaa
JM
2394 int ret = 0;
2395
857cc2fc
JB
2396 if (trans->aborted) {
2397 if (insert_reserved)
2ff7e61e 2398 btrfs_pin_extent(fs_info, node->bytenr,
857cc2fc 2399 node->num_bytes, 1);
79787eaa 2400 return 0;
857cc2fc 2401 }
79787eaa 2402
5d4f98a2 2403 if (btrfs_delayed_ref_is_head(node)) {
56bec294
CM
2404 struct btrfs_delayed_ref_head *head;
2405 /*
2406 * we've hit the end of the chain and we were supposed
2407 * to insert this extent into the tree. But, it got
2408 * deleted before we ever needed to insert it, so all
2409 * we have to do is clean up the accounting
2410 */
5d4f98a2
YZ
2411 BUG_ON(extent_op);
2412 head = btrfs_delayed_node_to_head(node);
0b246afa 2413 trace_run_delayed_ref_head(fs_info, node, head, node->action);
599c75ec 2414
56bec294 2415 if (insert_reserved) {
2ff7e61e 2416 btrfs_pin_extent(fs_info, node->bytenr,
f0486c68 2417 node->num_bytes, 1);
5d4f98a2 2418 if (head->is_data) {
0b246afa 2419 ret = btrfs_del_csums(trans, fs_info,
5d4f98a2
YZ
2420 node->bytenr,
2421 node->num_bytes);
5d4f98a2 2422 }
56bec294 2423 }
297d750b
QW
2424
2425 /* Also free its reserved qgroup space */
0b246afa 2426 btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
297d750b 2427 head->qgroup_reserved);
79787eaa 2428 return ret;
56bec294
CM
2429 }
2430
5d4f98a2
YZ
2431 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2432 node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2ff7e61e 2433 ret = run_delayed_tree_ref(trans, fs_info, node, extent_op,
5d4f98a2
YZ
2434 insert_reserved);
2435 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2436 node->type == BTRFS_SHARED_DATA_REF_KEY)
2ff7e61e 2437 ret = run_delayed_data_ref(trans, fs_info, node, extent_op,
5d4f98a2
YZ
2438 insert_reserved);
2439 else
2440 BUG();
2441 return ret;
56bec294
CM
2442}
2443
c6fc2454 2444static inline struct btrfs_delayed_ref_node *
56bec294
CM
2445select_delayed_ref(struct btrfs_delayed_ref_head *head)
2446{
cffc3374
FM
2447 struct btrfs_delayed_ref_node *ref;
2448
c6fc2454
QW
2449 if (list_empty(&head->ref_list))
2450 return NULL;
d7df2c79 2451
cffc3374
FM
2452 /*
2453 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2454 * This is to prevent a ref count from going down to zero, which deletes
2455 * the extent item from the extent tree, when there still are references
2456 * to add, which would fail because they would not find the extent item.
2457 */
1d57ee94
WX
2458 if (!list_empty(&head->ref_add_list))
2459 return list_first_entry(&head->ref_add_list,
2460 struct btrfs_delayed_ref_node, add_list);
2461
2462 ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
2463 list);
2464 ASSERT(list_empty(&ref->add_list));
2465 return ref;
56bec294
CM
2466}
2467
79787eaa
JM
2468/*
2469 * Returns 0 on success or if called with an already aborted transaction.
2470 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2471 */
d7df2c79 2472static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2ff7e61e 2473 struct btrfs_fs_info *fs_info,
d7df2c79 2474 unsigned long nr)
56bec294 2475{
56bec294
CM
2476 struct btrfs_delayed_ref_root *delayed_refs;
2477 struct btrfs_delayed_ref_node *ref;
2478 struct btrfs_delayed_ref_head *locked_ref = NULL;
5d4f98a2 2479 struct btrfs_delayed_extent_op *extent_op;
0a2b2a84 2480 ktime_t start = ktime_get();
56bec294 2481 int ret;
d7df2c79 2482 unsigned long count = 0;
0a2b2a84 2483 unsigned long actual_count = 0;
56bec294 2484 int must_insert_reserved = 0;
56bec294
CM
2485
2486 delayed_refs = &trans->transaction->delayed_refs;
56bec294
CM
2487 while (1) {
2488 if (!locked_ref) {
d7df2c79 2489 if (count >= nr)
56bec294 2490 break;
56bec294 2491
d7df2c79
JB
2492 spin_lock(&delayed_refs->lock);
2493 locked_ref = btrfs_select_ref_head(trans);
2494 if (!locked_ref) {
2495 spin_unlock(&delayed_refs->lock);
2496 break;
2497 }
c3e69d58
CM
2498
2499 /* grab the lock that says we are going to process
2500 * all the refs for this head */
2501 ret = btrfs_delayed_ref_lock(trans, locked_ref);
d7df2c79 2502 spin_unlock(&delayed_refs->lock);
c3e69d58
CM
2503 /*
2504 * we may have dropped the spin lock to get the head
2505 * mutex lock, and that might have given someone else
2506 * time to free the head. If that's true, it has been
2507 * removed from our list and we can move on.
2508 */
2509 if (ret == -EAGAIN) {
2510 locked_ref = NULL;
2511 count++;
2512 continue;
56bec294
CM
2513 }
2514 }
a28ec197 2515
2c3cf7d5
FM
2516 /*
2517 * We need to try and merge add/drops of the same ref since we
2518 * can run into issues with relocate dropping the implicit ref
2519 * and then it being added back again before the drop can
2520 * finish. If we merged anything we need to re-loop so we can
2521 * get a good ref.
2522 * Or we can get node references of the same type that weren't
2523 * merged when created due to bumps in the tree mod seq, and
2524 * we need to merge them to prevent adding an inline extent
2525 * backref before dropping it (triggering a BUG_ON at
2526 * insert_inline_extent_backref()).
2527 */
d7df2c79 2528 spin_lock(&locked_ref->lock);
2c3cf7d5
FM
2529 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2530 locked_ref);
ae1e206b 2531
d1270cd9
AJ
2532 /*
2533 * locked_ref is the head node, so we have to go one
2534 * node back for any delayed ref updates
2535 */
2536 ref = select_delayed_ref(locked_ref);
2537
2538 if (ref && ref->seq &&
097b8a7c 2539 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
d7df2c79 2540 spin_unlock(&locked_ref->lock);
d7df2c79
JB
2541 spin_lock(&delayed_refs->lock);
2542 locked_ref->processing = 0;
d1270cd9
AJ
2543 delayed_refs->num_heads_ready++;
2544 spin_unlock(&delayed_refs->lock);
d0280996 2545 btrfs_delayed_ref_unlock(locked_ref);
d7df2c79 2546 locked_ref = NULL;
d1270cd9 2547 cond_resched();
27a377db 2548 count++;
d1270cd9
AJ
2549 continue;
2550 }
2551
56bec294
CM
2552 /*
2553 * record the must insert reserved flag before we
2554 * drop the spin lock.
2555 */
2556 must_insert_reserved = locked_ref->must_insert_reserved;
2557 locked_ref->must_insert_reserved = 0;
7bb86316 2558
5d4f98a2
YZ
2559 extent_op = locked_ref->extent_op;
2560 locked_ref->extent_op = NULL;
2561
56bec294 2562 if (!ref) {
d7df2c79
JB
2563
2564
56bec294
CM
2565 /* All delayed refs have been processed, go ahead
2566 * and send the head node to run_one_delayed_ref,
2567 * so that any accounting fixes can happen.
2568 */
2569 ref = &locked_ref->node;
5d4f98a2
YZ
2570
2571 if (extent_op && must_insert_reserved) {
78a6184a 2572 btrfs_free_delayed_extent_op(extent_op);
5d4f98a2
YZ
2573 extent_op = NULL;
2574 }
2575
2576 if (extent_op) {
d7df2c79 2577 spin_unlock(&locked_ref->lock);
2ff7e61e 2578 ret = run_delayed_extent_op(trans, fs_info,
5d4f98a2 2579 ref, extent_op);
78a6184a 2580 btrfs_free_delayed_extent_op(extent_op);
5d4f98a2 2581
79787eaa 2582 if (ret) {
857cc2fc
JB
2583 /*
2584 * Need to reset must_insert_reserved if
2585 * there was an error so the abort stuff
2586 * can clean up the reserved space
2587 * properly.
2588 */
2589 if (must_insert_reserved)
2590 locked_ref->must_insert_reserved = 1;
aa7c8da3 2591 spin_lock(&delayed_refs->lock);
d7df2c79 2592 locked_ref->processing = 0;
aa7c8da3
JM
2593 delayed_refs->num_heads_ready++;
2594 spin_unlock(&delayed_refs->lock);
5d163e0e
JM
2595 btrfs_debug(fs_info,
2596 "run_delayed_extent_op returned %d",
2597 ret);
093486c4 2598 btrfs_delayed_ref_unlock(locked_ref);
79787eaa
JM
2599 return ret;
2600 }
d7df2c79 2601 continue;
5d4f98a2 2602 }
02217ed2 2603
d7df2c79 2604 /*
01327610 2605 * Need to drop our head ref lock and re-acquire the
d7df2c79
JB
2606 * delayed ref lock and then re-check to make sure
2607 * nobody got added.
2608 */
2609 spin_unlock(&locked_ref->lock);
2610 spin_lock(&delayed_refs->lock);
2611 spin_lock(&locked_ref->lock);
c6fc2454 2612 if (!list_empty(&locked_ref->ref_list) ||
573a0755 2613 locked_ref->extent_op) {
d7df2c79
JB
2614 spin_unlock(&locked_ref->lock);
2615 spin_unlock(&delayed_refs->lock);
2616 continue;
2617 }
2618 ref->in_tree = 0;
2619 delayed_refs->num_heads--;
c46effa6
LB
2620 rb_erase(&locked_ref->href_node,
2621 &delayed_refs->href_root);
d7df2c79
JB
2622 spin_unlock(&delayed_refs->lock);
2623 } else {
0a2b2a84 2624 actual_count++;
d7df2c79 2625 ref->in_tree = 0;
c6fc2454 2626 list_del(&ref->list);
1d57ee94
WX
2627 if (!list_empty(&ref->add_list))
2628 list_del(&ref->add_list);
c46effa6 2629 }
d7df2c79
JB
2630 atomic_dec(&delayed_refs->num_entries);
2631
093486c4 2632 if (!btrfs_delayed_ref_is_head(ref)) {
22cd2e7d
AJ
2633 /*
2634 * when we play the delayed ref, also correct the
2635 * ref_mod on head
2636 */
2637 switch (ref->action) {
2638 case BTRFS_ADD_DELAYED_REF:
2639 case BTRFS_ADD_DELAYED_EXTENT:
2640 locked_ref->node.ref_mod -= ref->ref_mod;
2641 break;
2642 case BTRFS_DROP_DELAYED_REF:
2643 locked_ref->node.ref_mod += ref->ref_mod;
2644 break;
2645 default:
2646 WARN_ON(1);
2647 }
2648 }
d7df2c79 2649 spin_unlock(&locked_ref->lock);
925baedd 2650
2ff7e61e 2651 ret = run_one_delayed_ref(trans, fs_info, ref, extent_op,
56bec294 2652 must_insert_reserved);
eb099670 2653
78a6184a 2654 btrfs_free_delayed_extent_op(extent_op);
79787eaa 2655 if (ret) {
9d1032cc 2656 spin_lock(&delayed_refs->lock);
d7df2c79 2657 locked_ref->processing = 0;
9d1032cc
WX
2658 delayed_refs->num_heads_ready++;
2659 spin_unlock(&delayed_refs->lock);
093486c4
MX
2660 btrfs_delayed_ref_unlock(locked_ref);
2661 btrfs_put_delayed_ref(ref);
5d163e0e
JM
2662 btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2663 ret);
79787eaa
JM
2664 return ret;
2665 }
2666
093486c4
MX
2667 /*
2668 * If this node is a head, that means all the refs in this head
2669 * have been dealt with, and we will pick the next head to deal
2670 * with, so we must unlock the head and drop it from the cluster
2671 * list before we release it.
2672 */
2673 if (btrfs_delayed_ref_is_head(ref)) {
1262133b
JB
2674 if (locked_ref->is_data &&
2675 locked_ref->total_ref_mod < 0) {
2676 spin_lock(&delayed_refs->lock);
2677 delayed_refs->pending_csums -= ref->num_bytes;
2678 spin_unlock(&delayed_refs->lock);
2679 }
093486c4
MX
2680 btrfs_delayed_ref_unlock(locked_ref);
2681 locked_ref = NULL;
2682 }
2683 btrfs_put_delayed_ref(ref);
2684 count++;
c3e69d58 2685 cond_resched();
c3e69d58 2686 }
0a2b2a84
JB
2687
2688 /*
2689 * We don't want to include ref heads since we can have empty ref heads
2690 * and those will drastically skew our runtime down since we just do
2691 * accounting, no actual extent tree updates.
2692 */
2693 if (actual_count > 0) {
2694 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2695 u64 avg;
2696
2697 /*
2698 * We weigh the current average higher than our current runtime
2699 * to avoid large swings in the average.
2700 */
2701 spin_lock(&delayed_refs->lock);
2702 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
f8c269d7 2703 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
0a2b2a84
JB
2704 spin_unlock(&delayed_refs->lock);
2705 }
d7df2c79 2706 return 0;
c3e69d58
CM
2707}
2708
709c0486
AJ
2709#ifdef SCRAMBLE_DELAYED_REFS
2710/*
2711 * Normally delayed refs get processed in ascending bytenr order. This
2712 * correlates in most cases to the order added. To expose dependencies on this
2713 * order, we start to process the tree in the middle instead of the beginning.
2714 */
2715static u64 find_middle(struct rb_root *root)
2716{
2717 struct rb_node *n = root->rb_node;
2718 struct btrfs_delayed_ref_node *entry;
2719 int alt = 1;
2720 u64 middle;
2721 u64 first = 0, last = 0;
2722
2723 n = rb_first(root);
2724 if (n) {
2725 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2726 first = entry->bytenr;
2727 }
2728 n = rb_last(root);
2729 if (n) {
2730 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2731 last = entry->bytenr;
2732 }
2733 n = root->rb_node;
2734
2735 while (n) {
2736 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2737 WARN_ON(!entry->in_tree);
2738
2739 middle = entry->bytenr;
2740
2741 if (alt)
2742 n = n->rb_left;
2743 else
2744 n = n->rb_right;
2745
2746 alt = 1 - alt;
2747 }
2748 return middle;
2749}
2750#endif
2751
2ff7e61e 2752static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
1be41b78
JB
2753{
2754 u64 num_bytes;
2755
2756 num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2757 sizeof(struct btrfs_extent_inline_ref));
0b246afa 2758 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1be41b78
JB
2759 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2760
2761 /*
2762 * We don't ever fill up leaves all the way so multiply by 2 just to be
01327610 2763 * closer to what we're really going to want to use.
1be41b78 2764 */
0b246afa 2765 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
1be41b78
JB
2766}
2767
1262133b
JB
2768/*
2769 * Takes the number of bytes to be checksummed and figures out how many leaves it
2770 * would require to store the csums for that many bytes.
2771 */
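/*
 * For example, with the default 4K sectorsize, 4-byte crc32c csums and
 * a 16K nodesize, one leaf holds roughly 4000 csum entries, i.e. the
 * csums for roughly 16MB of data.
 */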
2ff7e61e 2772u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
1262133b
JB
2773{
2774 u64 csum_size;
2775 u64 num_csums_per_leaf;
2776 u64 num_csums;
2777
0b246afa 2778 csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
1262133b 2779 num_csums_per_leaf = div64_u64(csum_size,
0b246afa
JM
2780 (u64)btrfs_super_csum_size(fs_info->super_copy));
2781 num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
1262133b
JB
2782 num_csums += num_csums_per_leaf - 1;
2783 num_csums = div64_u64(num_csums, num_csums_per_leaf);
2784 return num_csums;
2785}
2786
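/*
 * Estimate the worst case metadata space needed to run the delayed
 * refs that are currently queued (extent items, csum tree updates and
 * dirty block group items) and check it against the global reserve.
 * Returns 1 if space looks too tight to safely run them.
 */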
0a2b2a84 2787int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2ff7e61e 2788 struct btrfs_fs_info *fs_info)
1be41b78
JB
2789{
2790 struct btrfs_block_rsv *global_rsv;
2791 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
1262133b 2792 u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
cb723e49
JB
2793 u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
2794 u64 num_bytes, num_dirty_bgs_bytes;
1be41b78
JB
2795 int ret = 0;
2796
0b246afa 2797 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
2ff7e61e 2798 num_heads = heads_to_leaves(fs_info, num_heads);
1be41b78 2799 if (num_heads > 1)
0b246afa 2800 num_bytes += (num_heads - 1) * fs_info->nodesize;
1be41b78 2801 num_bytes <<= 1;
2ff7e61e
JM
2802 num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
2803 fs_info->nodesize;
0b246afa 2804 num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
cb723e49 2805 num_dirty_bgs);
0b246afa 2806 global_rsv = &fs_info->global_block_rsv;
1be41b78
JB
2807
2808 /*
2809 * If we can't allocate any more chunks, let's make sure we have _lots_ of
2810 * wiggle room since running delayed refs can create more delayed refs.
2811 */
cb723e49
JB
2812 if (global_rsv->space_info->full) {
2813 num_dirty_bgs_bytes <<= 1;
1be41b78 2814 num_bytes <<= 1;
cb723e49 2815 }
1be41b78
JB
2816
2817 spin_lock(&global_rsv->lock);
cb723e49 2818 if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
1be41b78
JB
2819 ret = 1;
2820 spin_unlock(&global_rsv->lock);
2821 return ret;
2822}
2823
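/*
 * Decide how aggressively callers should help run delayed refs, based
 * on the estimated time to process all queued entries: 1 for roughly a
 * second or more of work, 2 for half a second or more, otherwise fall
 * back to the space check above.
 */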
0a2b2a84 2824int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2ff7e61e 2825 struct btrfs_fs_info *fs_info)
0a2b2a84 2826{
0a2b2a84
JB
2827 u64 num_entries =
2828 atomic_read(&trans->transaction->delayed_refs.num_entries);
2829 u64 avg_runtime;
a79b7d4b 2830 u64 val;
0a2b2a84
JB
2831
2832 smp_mb();
2833 avg_runtime = fs_info->avg_delayed_ref_runtime;
a79b7d4b 2834 val = num_entries * avg_runtime;
dc1a90c6 2835 if (val >= NSEC_PER_SEC)
0a2b2a84 2836 return 1;
a79b7d4b
CM
2837 if (val >= NSEC_PER_SEC / 2)
2838 return 2;
0a2b2a84 2839
2ff7e61e 2840 return btrfs_check_space_for_delayed_refs(trans, fs_info);
0a2b2a84
JB
2841}
2842
a79b7d4b
CM
2843struct async_delayed_refs {
2844 struct btrfs_root *root;
31b9655f 2845 u64 transid;
a79b7d4b
CM
2846 int count;
2847 int error;
2848 int sync;
2849 struct completion wait;
2850 struct btrfs_work work;
2851};
2852
2ff7e61e
JM
2853static inline struct async_delayed_refs *
2854to_async_delayed_refs(struct btrfs_work *work)
2855{
2856 return container_of(work, struct async_delayed_refs, work);
2857}
2858
a79b7d4b
CM
2859static void delayed_ref_async_start(struct btrfs_work *work)
2860{
2ff7e61e 2861 struct async_delayed_refs *async = to_async_delayed_refs(work);
a79b7d4b 2862 struct btrfs_trans_handle *trans;
2ff7e61e 2863 struct btrfs_fs_info *fs_info = async->root->fs_info;
a79b7d4b
CM
2864 int ret;
2865
0f873eca 2866 /* if the commit is already started, we don't need to wait here */
2ff7e61e 2867 if (btrfs_transaction_blocked(fs_info))
31b9655f 2868 goto done;
31b9655f 2869
0f873eca
CM
2870 trans = btrfs_join_transaction(async->root);
2871 if (IS_ERR(trans)) {
2872 async->error = PTR_ERR(trans);
a79b7d4b
CM
2873 goto done;
2874 }
2875
2876 /*
01327610 2877 * trans->sync means that when we call end_transaction, we won't
a79b7d4b
CM
2878 * wait on delayed refs
2879 */
2880 trans->sync = true;
0f873eca
CM
2881
2882 /* Don't bother flushing if we got into a different transaction */
2883 if (trans->transid > async->transid)
2884 goto end;
2885
2ff7e61e 2886 ret = btrfs_run_delayed_refs(trans, fs_info, async->count);
a79b7d4b
CM
2887 if (ret)
2888 async->error = ret;
0f873eca 2889end:
3a45bb20 2890 ret = btrfs_end_transaction(trans);
a79b7d4b
CM
2891 if (ret && !async->error)
2892 async->error = ret;
2893done:
2894 if (async->sync)
2895 complete(&async->wait);
2896 else
2897 kfree(async);
2898}
2899
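/*
 * Queue a worker that joins the current transaction and runs up to
 * 'count' delayed refs in the background; with 'wait' set the caller
 * blocks until the worker finishes and gets its return value.
 */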
2ff7e61e 2900int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
31b9655f 2901 unsigned long count, u64 transid, int wait)
a79b7d4b
CM
2902{
2903 struct async_delayed_refs *async;
2904 int ret;
2905
2906 async = kmalloc(sizeof(*async), GFP_NOFS);
2907 if (!async)
2908 return -ENOMEM;
2909
0b246afa 2910 async->root = fs_info->tree_root;
a79b7d4b
CM
2911 async->count = count;
2912 async->error = 0;
31b9655f 2913 async->transid = transid;
a79b7d4b
CM
2914 if (wait)
2915 async->sync = 1;
2916 else
2917 async->sync = 0;
2918 init_completion(&async->wait);
2919
9e0af237
LB
2920 btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2921 delayed_ref_async_start, NULL, NULL);
a79b7d4b 2922
0b246afa 2923 btrfs_queue_work(fs_info->extent_workers, &async->work);
a79b7d4b
CM
2924
2925 if (wait) {
2926 wait_for_completion(&async->wait);
2927 ret = async->error;
2928 kfree(async);
2929 return ret;
2930 }
2931 return 0;
2932}
2933
c3e69d58
CM
2934/*
2935 * this starts processing the delayed reference count updates and
2936 * extent insertions we have queued up so far. count can be
2937 * 0, which means to process everything in the tree at the start
2938 * of the run (but not newly added entries), or it can be some target
2939 * number you'd like to process.
79787eaa
JM
2940 *
2941 * Returns 0 on success or if called with an aborted transaction
2942 * Returns <0 on error and aborts the transaction
c3e69d58
CM
2943 */
2944int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2ff7e61e 2945 struct btrfs_fs_info *fs_info, unsigned long count)
c3e69d58
CM
2946{
2947 struct rb_node *node;
2948 struct btrfs_delayed_ref_root *delayed_refs;
c46effa6 2949 struct btrfs_delayed_ref_head *head;
c3e69d58
CM
2950 int ret;
2951 int run_all = count == (unsigned long)-1;
d9a0540a 2952 bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
c3e69d58 2953
79787eaa
JM
2954 /* We'll clean this up in btrfs_cleanup_transaction */
2955 if (trans->aborted)
2956 return 0;
2957
0b246afa 2958 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
511711af
CM
2959 return 0;
2960
c3e69d58 2961 delayed_refs = &trans->transaction->delayed_refs;
26455d33 2962 if (count == 0)
d7df2c79 2963 count = atomic_read(&delayed_refs->num_entries) * 2;
bb721703 2964
c3e69d58 2965again:
709c0486
AJ
2966#ifdef SCRAMBLE_DELAYED_REFS
2967 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2968#endif
d9a0540a 2969 trans->can_flush_pending_bgs = false;
2ff7e61e 2970 ret = __btrfs_run_delayed_refs(trans, fs_info, count);
d7df2c79 2971 if (ret < 0) {
66642832 2972 btrfs_abort_transaction(trans, ret);
d7df2c79 2973 return ret;
eb099670 2974 }
c3e69d58 2975
56bec294 2976 if (run_all) {
d7df2c79 2977 if (!list_empty(&trans->new_bgs))
2ff7e61e 2978 btrfs_create_pending_block_groups(trans, fs_info);
ea658bad 2979
d7df2c79 2980 spin_lock(&delayed_refs->lock);
c46effa6 2981 node = rb_first(&delayed_refs->href_root);
d7df2c79
JB
2982 if (!node) {
2983 spin_unlock(&delayed_refs->lock);
56bec294 2984 goto out;
d7df2c79 2985 }
e9d0b13b 2986
56bec294 2987 while (node) {
c46effa6
LB
2988 head = rb_entry(node, struct btrfs_delayed_ref_head,
2989 href_node);
2990 if (btrfs_delayed_ref_is_head(&head->node)) {
2991 struct btrfs_delayed_ref_node *ref;
5caf2a00 2992
c46effa6 2993 ref = &head->node;
6df8cdf5 2994 refcount_inc(&ref->refs);
56bec294
CM
2995
2996 spin_unlock(&delayed_refs->lock);
8cc33e5c
DS
2997 /*
2998 * Mutex was contended, block until it's
2999 * released and try again
3000 */
56bec294
CM
3001 mutex_lock(&head->mutex);
3002 mutex_unlock(&head->mutex);
3003
3004 btrfs_put_delayed_ref(ref);
1887be66 3005 cond_resched();
56bec294 3006 goto again;
c46effa6
LB
3007 } else {
3008 WARN_ON(1);
56bec294
CM
3009 }
3010 node = rb_next(node);
3011 }
3012 spin_unlock(&delayed_refs->lock);
d7df2c79 3013 cond_resched();
56bec294 3014 goto again;
5f39d397 3015 }
54aa1f4d 3016out:
d9a0540a 3017 trans->can_flush_pending_bgs = can_flush_pending_bgs;
a28ec197
CM
3018 return 0;
3019}
3020
5d4f98a2 3021int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2ff7e61e 3022 struct btrfs_fs_info *fs_info,
5d4f98a2 3023 u64 bytenr, u64 num_bytes, u64 flags,
b1c79e09 3024 int level, int is_data)
5d4f98a2
YZ
3025{
3026 struct btrfs_delayed_extent_op *extent_op;
3027 int ret;
3028
78a6184a 3029 extent_op = btrfs_alloc_delayed_extent_op();
5d4f98a2
YZ
3030 if (!extent_op)
3031 return -ENOMEM;
3032
3033 extent_op->flags_to_set = flags;
35b3ad50
DS
3034 extent_op->update_flags = true;
3035 extent_op->update_key = false;
3036 extent_op->is_data = is_data ? true : false;
b1c79e09 3037 extent_op->level = level;
5d4f98a2 3038
0b246afa 3039 ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
66d7e7f0 3040 num_bytes, extent_op);
5d4f98a2 3041 if (ret)
78a6184a 3042 btrfs_free_delayed_extent_op(extent_op);
5d4f98a2
YZ
3043 return ret;
3044}
3045
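/*
 * Check the delayed refs of the running transaction for evidence that
 * this extent is shared: any shared ref, or a data ref from a
 * different root/inode/offset, makes it a cross reference (returns 1).
 * Returns -EAGAIN if the ref head is locked and the caller must retry.
 */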
e4c3b2dc 3046static noinline int check_delayed_ref(struct btrfs_root *root,
5d4f98a2
YZ
3047 struct btrfs_path *path,
3048 u64 objectid, u64 offset, u64 bytenr)
3049{
3050 struct btrfs_delayed_ref_head *head;
3051 struct btrfs_delayed_ref_node *ref;
3052 struct btrfs_delayed_data_ref *data_ref;
3053 struct btrfs_delayed_ref_root *delayed_refs;
e4c3b2dc 3054 struct btrfs_transaction *cur_trans;
5d4f98a2
YZ
3055 int ret = 0;
3056
e4c3b2dc
LB
3057 cur_trans = root->fs_info->running_transaction;
3058 if (!cur_trans)
3059 return 0;
3060
3061 delayed_refs = &cur_trans->delayed_refs;
5d4f98a2 3062 spin_lock(&delayed_refs->lock);
f72ad18e 3063 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
d7df2c79
JB
3064 if (!head) {
3065 spin_unlock(&delayed_refs->lock);
3066 return 0;
3067 }
5d4f98a2
YZ
3068
3069 if (!mutex_trylock(&head->mutex)) {
6df8cdf5 3070 refcount_inc(&head->node.refs);
5d4f98a2
YZ
3071 spin_unlock(&delayed_refs->lock);
3072
b3b4aa74 3073 btrfs_release_path(path);
5d4f98a2 3074
8cc33e5c
DS
3075 /*
3076 * Mutex was contended, block until it's released and let
3077 * caller try again
3078 */
5d4f98a2
YZ
3079 mutex_lock(&head->mutex);
3080 mutex_unlock(&head->mutex);
3081 btrfs_put_delayed_ref(&head->node);
3082 return -EAGAIN;
3083 }
d7df2c79 3084 spin_unlock(&delayed_refs->lock);
5d4f98a2 3085
d7df2c79 3086 spin_lock(&head->lock);
c6fc2454 3087 list_for_each_entry(ref, &head->ref_list, list) {
d7df2c79
JB
3088 /* If it's a shared ref we know a cross reference exists */
3089 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3090 ret = 1;
3091 break;
3092 }
5d4f98a2 3093
d7df2c79 3094 data_ref = btrfs_delayed_node_to_data_ref(ref);
5d4f98a2 3095
d7df2c79
JB
3096 /*
3097 * If our ref doesn't match the one we're currently looking at
3098 * then we have a cross reference.
3099 */
3100 if (data_ref->root != root->root_key.objectid ||
3101 data_ref->objectid != objectid ||
3102 data_ref->offset != offset) {
3103 ret = 1;
3104 break;
3105 }
5d4f98a2 3106 }
d7df2c79 3107 spin_unlock(&head->lock);
5d4f98a2 3108 mutex_unlock(&head->mutex);
5d4f98a2
YZ
3109 return ret;
3110}
3111
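/*
 * Check the committed extent tree: the extent counts as not shared
 * (return 0) only if it has a single inline data ref that belongs to
 * this root/objectid/offset and it was created after the last snapshot
 * of the root. Anything else is treated as a possible cross reference.
 */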
e4c3b2dc 3112static noinline int check_committed_ref(struct btrfs_root *root,
5d4f98a2
YZ
3113 struct btrfs_path *path,
3114 u64 objectid, u64 offset, u64 bytenr)
be20aa9d 3115{
0b246afa
JM
3116 struct btrfs_fs_info *fs_info = root->fs_info;
3117 struct btrfs_root *extent_root = fs_info->extent_root;
f321e491 3118 struct extent_buffer *leaf;
5d4f98a2
YZ
3119 struct btrfs_extent_data_ref *ref;
3120 struct btrfs_extent_inline_ref *iref;
3121 struct btrfs_extent_item *ei;
f321e491 3122 struct btrfs_key key;
5d4f98a2 3123 u32 item_size;
be20aa9d 3124 int ret;
925baedd 3125
be20aa9d 3126 key.objectid = bytenr;
31840ae1 3127 key.offset = (u64)-1;
f321e491 3128 key.type = BTRFS_EXTENT_ITEM_KEY;
be20aa9d 3129
be20aa9d
CM
3130 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3131 if (ret < 0)
3132 goto out;
79787eaa 3133 BUG_ON(ret == 0); /* Corruption */
80ff3856
YZ
3134
3135 ret = -ENOENT;
3136 if (path->slots[0] == 0)
31840ae1 3137 goto out;
be20aa9d 3138
31840ae1 3139 path->slots[0]--;
f321e491 3140 leaf = path->nodes[0];
5d4f98a2 3141 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
be20aa9d 3142
5d4f98a2 3143 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
be20aa9d 3144 goto out;
f321e491 3145
5d4f98a2
YZ
3146 ret = 1;
3147 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3148#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3149 if (item_size < sizeof(*ei)) {
3150 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3151 goto out;
3152 }
3153#endif
3154 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
bd09835d 3155
5d4f98a2
YZ
3156 if (item_size != sizeof(*ei) +
3157 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3158 goto out;
be20aa9d 3159
5d4f98a2
YZ
3160 if (btrfs_extent_generation(leaf, ei) <=
3161 btrfs_root_last_snapshot(&root->root_item))
3162 goto out;
3163
3164 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3165 if (btrfs_extent_inline_ref_type(leaf, iref) !=
3166 BTRFS_EXTENT_DATA_REF_KEY)
3167 goto out;
3168
3169 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3170 if (btrfs_extent_refs(leaf, ei) !=
3171 btrfs_extent_data_ref_count(leaf, ref) ||
3172 btrfs_extent_data_ref_root(leaf, ref) !=
3173 root->root_key.objectid ||
3174 btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3175 btrfs_extent_data_ref_offset(leaf, ref) != offset)
3176 goto out;
3177
3178 ret = 0;
3179out:
3180 return ret;
3181}
3182
e4c3b2dc
LB
3183int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3184 u64 bytenr)
5d4f98a2
YZ
3185{
3186 struct btrfs_path *path;
3187 int ret;
3188 int ret2;
3189
3190 path = btrfs_alloc_path();
3191 if (!path)
3192 return -ENOENT;
3193
3194 do {
e4c3b2dc 3195 ret = check_committed_ref(root, path, objectid,
5d4f98a2
YZ
3196 offset, bytenr);
3197 if (ret && ret != -ENOENT)
f321e491 3198 goto out;
80ff3856 3199
e4c3b2dc 3200 ret2 = check_delayed_ref(root, path, objectid,
5d4f98a2
YZ
3201 offset, bytenr);
3202 } while (ret2 == -EAGAIN);
3203
3204 if (ret2 && ret2 != -ENOENT) {
3205 ret = ret2;
3206 goto out;
f321e491 3207 }
5d4f98a2
YZ
3208
3209 if (ret != -ENOENT || ret2 != -ENOENT)
3210 ret = 0;
be20aa9d 3211out:
80ff3856 3212 btrfs_free_path(path);
f0486c68
YZ
3213 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3214 WARN_ON(ret > 0);
f321e491 3215 return ret;
be20aa9d 3216}
c5739bba 3217
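/*
 * Walk one tree block and add or drop a reference for everything it
 * points to: each referenced data extent for leaves, each child block
 * for nodes. Used by btrfs_inc_ref()/btrfs_dec_ref() below.
 */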
5d4f98a2 3218static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
b7a9f29f 3219 struct btrfs_root *root,
5d4f98a2 3220 struct extent_buffer *buf,
e339a6b0 3221 int full_backref, int inc)
31840ae1 3222{
0b246afa 3223 struct btrfs_fs_info *fs_info = root->fs_info;
31840ae1 3224 u64 bytenr;
5d4f98a2
YZ
3225 u64 num_bytes;
3226 u64 parent;
31840ae1 3227 u64 ref_root;
31840ae1 3228 u32 nritems;
31840ae1
ZY
3229 struct btrfs_key key;
3230 struct btrfs_file_extent_item *fi;
3231 int i;
3232 int level;
3233 int ret = 0;
2ff7e61e
JM
3234 int (*process_func)(struct btrfs_trans_handle *,
3235 struct btrfs_fs_info *,
b06c4bf5 3236 u64, u64, u64, u64, u64, u64);
31840ae1 3237
fccb84c9 3238
0b246afa 3239 if (btrfs_is_testing(fs_info))
faa2dbf0 3240 return 0;
fccb84c9 3241
31840ae1 3242 ref_root = btrfs_header_owner(buf);
31840ae1
ZY
3243 nritems = btrfs_header_nritems(buf);
3244 level = btrfs_header_level(buf);
3245
27cdeb70 3246 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
5d4f98a2 3247 return 0;
31840ae1 3248
5d4f98a2
YZ
3249 if (inc)
3250 process_func = btrfs_inc_extent_ref;
3251 else
3252 process_func = btrfs_free_extent;
31840ae1 3253
5d4f98a2
YZ
3254 if (full_backref)
3255 parent = buf->start;
3256 else
3257 parent = 0;
3258
3259 for (i = 0; i < nritems; i++) {
31840ae1 3260 if (level == 0) {
5d4f98a2 3261 btrfs_item_key_to_cpu(buf, &key, i);
962a298f 3262 if (key.type != BTRFS_EXTENT_DATA_KEY)
31840ae1 3263 continue;
5d4f98a2 3264 fi = btrfs_item_ptr(buf, i,
31840ae1
ZY
3265 struct btrfs_file_extent_item);
3266 if (btrfs_file_extent_type(buf, fi) ==
3267 BTRFS_FILE_EXTENT_INLINE)
3268 continue;
3269 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3270 if (bytenr == 0)
3271 continue;
5d4f98a2
YZ
3272
3273 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3274 key.offset -= btrfs_file_extent_offset(buf, fi);
2ff7e61e 3275 ret = process_func(trans, fs_info, bytenr, num_bytes,
5d4f98a2 3276 parent, ref_root, key.objectid,
b06c4bf5 3277 key.offset);
31840ae1
ZY
3278 if (ret)
3279 goto fail;
3280 } else {
5d4f98a2 3281 bytenr = btrfs_node_blockptr(buf, i);
0b246afa 3282 num_bytes = fs_info->nodesize;
2ff7e61e 3283 ret = process_func(trans, fs_info, bytenr, num_bytes,
b06c4bf5 3284 parent, ref_root, level - 1, 0);
31840ae1
ZY
3285 if (ret)
3286 goto fail;
3287 }
3288 }
3289 return 0;
3290fail:
5d4f98a2
YZ
3291 return ret;
3292}
3293
3294int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
e339a6b0 3295 struct extent_buffer *buf, int full_backref)
5d4f98a2 3296{
e339a6b0 3297 return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
5d4f98a2
YZ
3298}
3299
3300int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
e339a6b0 3301 struct extent_buffer *buf, int full_backref)
5d4f98a2 3302{
e339a6b0 3303 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
31840ae1
ZY
3304}
3305
9078a3e1 3306static int write_one_cache_group(struct btrfs_trans_handle *trans,
2ff7e61e 3307 struct btrfs_fs_info *fs_info,
9078a3e1
CM
3308 struct btrfs_path *path,
3309 struct btrfs_block_group_cache *cache)
3310{
3311 int ret;
0b246afa 3312 struct btrfs_root *extent_root = fs_info->extent_root;
5f39d397
CM
3313 unsigned long bi;
3314 struct extent_buffer *leaf;
9078a3e1 3315
9078a3e1 3316 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
df95e7f0
JB
3317 if (ret) {
3318 if (ret > 0)
3319 ret = -ENOENT;
54aa1f4d 3320 goto fail;
df95e7f0 3321 }
5f39d397
CM
3322
3323 leaf = path->nodes[0];
3324 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3325 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3326 btrfs_mark_buffer_dirty(leaf);
54aa1f4d 3327fail:
24b89d08 3328 btrfs_release_path(path);
df95e7f0 3329 return ret;
9078a3e1
CM
3330
3331}
3332
4a8c9a62 3333static struct btrfs_block_group_cache *
2ff7e61e 3334next_block_group(struct btrfs_fs_info *fs_info,
4a8c9a62
YZ
3335 struct btrfs_block_group_cache *cache)
3336{
3337 struct rb_node *node;
292cbd51 3338
0b246afa 3339 spin_lock(&fs_info->block_group_cache_lock);
292cbd51
FM
3340
3341 /* If our block group was removed, we need a full search. */
3342 if (RB_EMPTY_NODE(&cache->cache_node)) {
3343 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3344
0b246afa 3345 spin_unlock(&fs_info->block_group_cache_lock);
292cbd51 3346 btrfs_put_block_group(cache);
0b246afa 3347 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
 return cache;
292cbd51 3348 }
4a8c9a62
YZ
3349 node = rb_next(&cache->cache_node);
3350 btrfs_put_block_group(cache);
3351 if (node) {
3352 cache = rb_entry(node, struct btrfs_block_group_cache,
3353 cache_node);
11dfe35a 3354 btrfs_get_block_group(cache);
4a8c9a62
YZ
3355 } else
3356 cache = NULL;
0b246afa 3357 spin_unlock(&fs_info->block_group_cache_lock);
4a8c9a62
YZ
3358 return cache;
3359}
3360
0af3d00b
JB
3361static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3362 struct btrfs_trans_handle *trans,
3363 struct btrfs_path *path)
3364{
0b246afa
JM
3365 struct btrfs_fs_info *fs_info = block_group->fs_info;
3366 struct btrfs_root *root = fs_info->tree_root;
0af3d00b
JB
3367 struct inode *inode = NULL;
3368 u64 alloc_hint = 0;
2b20982e 3369 int dcs = BTRFS_DC_ERROR;
f8c269d7 3370 u64 num_pages = 0;
0af3d00b
JB
3371 int retries = 0;
3372 int ret = 0;
3373
3374 /*
3375 * If this block group is smaller than 100 MiB, don't bother caching the
3376 * block group.
3377 */
ee22184b 3378 if (block_group->key.offset < (100 * SZ_1M)) {
0af3d00b
JB
3379 spin_lock(&block_group->lock);
3380 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3381 spin_unlock(&block_group->lock);
3382 return 0;
3383 }
3384
0c0ef4bc
JB
3385 if (trans->aborted)
3386 return 0;
0af3d00b 3387again:
77ab86bf 3388 inode = lookup_free_space_inode(fs_info, block_group, path);
0af3d00b
JB
3389 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3390 ret = PTR_ERR(inode);
b3b4aa74 3391 btrfs_release_path(path);
0af3d00b
JB
3392 goto out;
3393 }
3394
3395 if (IS_ERR(inode)) {
3396 BUG_ON(retries);
3397 retries++;
3398
3399 if (block_group->ro)
3400 goto out_free;
3401
77ab86bf
JM
3402 ret = create_free_space_inode(fs_info, trans, block_group,
3403 path);
0af3d00b
JB
3404 if (ret)
3405 goto out_free;
3406 goto again;
3407 }
3408
5b0e95bf
JB
3409 /* We've already setup this transaction, go ahead and exit */
3410 if (block_group->cache_generation == trans->transid &&
3411 i_size_read(inode)) {
3412 dcs = BTRFS_DC_SETUP;
3413 goto out_put;
3414 }
3415
0af3d00b
JB
3416 /*
3417 * We want to set the generation to 0, that way if anything goes wrong
3418 * from here on out we know not to trust this cache when we load up next
3419 * time.
3420 */
3421 BTRFS_I(inode)->generation = 0;
3422 ret = btrfs_update_inode(trans, root, inode);
0c0ef4bc
JB
3423 if (ret) {
3424 /*
3425 * So theoretically we could recover from this, simply set the
3426 * super cache generation to 0 so we know to invalidate the
3427 * cache, but then we'd have to keep track of the block groups
3428 * that fail this way so we know we _have_ to reset this cache
3429 * before the next commit or risk reading stale cache. So to
3430 * limit our exposure to horrible edge cases, let's just abort the
3431 * transaction; this only happens in really bad situations
3432 * anyway.
3433 */
66642832 3434 btrfs_abort_transaction(trans, ret);
0c0ef4bc
JB
3435 goto out_put;
3436 }
0af3d00b
JB
3437 WARN_ON(ret);
3438
3439 if (i_size_read(inode) > 0) {
2ff7e61e 3440 ret = btrfs_check_trunc_cache_free_space(fs_info,
0b246afa 3441 &fs_info->global_block_rsv);
7b61cd92
MX
3442 if (ret)
3443 goto out_put;
3444
77ab86bf 3445 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
0af3d00b
JB
3446 if (ret)
3447 goto out_put;
3448 }
3449
3450 spin_lock(&block_group->lock);
cf7c1ef6 3451 if (block_group->cached != BTRFS_CACHE_FINISHED ||
0b246afa 3452 !btrfs_test_opt(fs_info, SPACE_CACHE)) {
cf7c1ef6
LB
3453 /*
3454 * don't bother trying to write stuff out _if_
3455 * a) we're not cached,
1a79c1f2
LB
3456 * b) we're mounted with the nospace_cache option,
3457 * c) we're using the v2 space cache (FREE_SPACE_TREE).
cf7c1ef6 3458 */
2b20982e 3459 dcs = BTRFS_DC_WRITTEN;
0af3d00b
JB
3460 spin_unlock(&block_group->lock);
3461 goto out_put;
3462 }
3463 spin_unlock(&block_group->lock);
3464
2968b1f4
JB
3465 /*
3466 * We hit an ENOSPC when setting up the cache in this transaction, so just
3467 * skip doing the setup; we've already cleared the cache and we're safe.
3468 */
3469 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3470 ret = -ENOSPC;
3471 goto out_put;
3472 }
3473
6fc823b1
JB
3474 /*
3475 * Try to preallocate enough space based on how big the block group is.
3476 * Keep in mind this has to include any pinned space which could end up
3477 * taking up quite a bit since it's not folded into the other space
3478 * cache.
3479 */
ee22184b 3480 num_pages = div_u64(block_group->key.offset, SZ_256M);
0af3d00b
JB
3481 if (!num_pages)
3482 num_pages = 1;
3483
0af3d00b 3484 num_pages *= 16;
09cbfeaf 3485 num_pages *= PAGE_SIZE;
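 /*
  * Rough worked example (assumes 4 KiB pages): a 1 GiB block group gives
  * div_u64(1 GiB, SZ_256M) = 4, times 16 is 64 pages, times PAGE_SIZE is
  * 256 KiB preallocated for this block group's free space cache file.
  */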
0af3d00b 3486
7cf5b976 3487 ret = btrfs_check_data_free_space(inode, 0, num_pages);
0af3d00b
JB
3488 if (ret)
3489 goto out_put;
3490
3491 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3492 num_pages, num_pages,
3493 &alloc_hint);
2968b1f4
JB
3494 /*
3495 * Our cache requires contiguous chunks so that we don't modify a bunch
3496 * of metadata or split extents when writing the cache out, which means
3497 * we can hit ENOSPC if we are heavily fragmented, in addition to normal
3498 * out of space conditions. So if we hit this, just skip setting up any
3499 * other block groups for this transaction; maybe we'll unpin enough
3500 * space the next time around.
3501 */
2b20982e
JB
3502 if (!ret)
3503 dcs = BTRFS_DC_SETUP;
2968b1f4
JB
3504 else if (ret == -ENOSPC)
3505 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
c09544e0 3506
0af3d00b
JB
3507out_put:
3508 iput(inode);
3509out_free:
b3b4aa74 3510 btrfs_release_path(path);
0af3d00b
JB
3511out:
3512 spin_lock(&block_group->lock);
e65cbb94 3513 if (!ret && dcs == BTRFS_DC_SETUP)
5b0e95bf 3514 block_group->cache_generation = trans->transid;
2b20982e 3515 block_group->disk_cache_state = dcs;
0af3d00b
JB
3516 spin_unlock(&block_group->lock);
3517
3518 return ret;
3519}
3520
dcdf7f6d 3521int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
2ff7e61e 3522 struct btrfs_fs_info *fs_info)
dcdf7f6d
JB
3523{
3524 struct btrfs_block_group_cache *cache, *tmp;
3525 struct btrfs_transaction *cur_trans = trans->transaction;
3526 struct btrfs_path *path;
3527
3528 if (list_empty(&cur_trans->dirty_bgs) ||
0b246afa 3529 !btrfs_test_opt(fs_info, SPACE_CACHE))
dcdf7f6d
JB
3530 return 0;
3531
3532 path = btrfs_alloc_path();
3533 if (!path)
3534 return -ENOMEM;
3535
3536 /* Could add new block groups, use _safe just in case */
3537 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3538 dirty_list) {
3539 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3540 cache_save_setup(cache, trans, path);
3541 }
3542
3543 btrfs_free_path(path);
3544 return 0;
3545}
3546
1bbc621e
CM
3547/*
3548 * transaction commit does final block group cache writeback during a
3549 * critical section where nothing is allowed to change the FS. This is
3550 * required in order for the cache to actually match the block group,
3551 * but can introduce a lot of latency into the commit.
3552 *
3553 * So, btrfs_start_dirty_block_groups is here to kick off block group
3554 * cache IO. There's a chance we'll have to redo some of it if the
3555 * block group changes again during the commit, but it greatly reduces
3556 * the commit latency by getting rid of the easy block groups while
3557 * we're still allowing others to join the commit.
3558 */
3559int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
2ff7e61e 3560 struct btrfs_fs_info *fs_info)
9078a3e1 3561{
4a8c9a62 3562 struct btrfs_block_group_cache *cache;
ce93ec54
JB
3563 struct btrfs_transaction *cur_trans = trans->transaction;
3564 int ret = 0;
c9dc4c65 3565 int should_put;
1bbc621e
CM
3566 struct btrfs_path *path = NULL;
3567 LIST_HEAD(dirty);
3568 struct list_head *io = &cur_trans->io_bgs;
c9dc4c65 3569 int num_started = 0;
1bbc621e
CM
3570 int loops = 0;
3571
3572 spin_lock(&cur_trans->dirty_bgs_lock);
b58d1a9e
FM
3573 if (list_empty(&cur_trans->dirty_bgs)) {
3574 spin_unlock(&cur_trans->dirty_bgs_lock);
3575 return 0;
1bbc621e 3576 }
b58d1a9e 3577 list_splice_init(&cur_trans->dirty_bgs, &dirty);
1bbc621e 3578 spin_unlock(&cur_trans->dirty_bgs_lock);
ce93ec54 3579
1bbc621e 3580again:
1bbc621e
CM
3581 /*
3582 * make sure all the block groups on our dirty list actually
3583 * exist
3584 */
2ff7e61e 3585 btrfs_create_pending_block_groups(trans, fs_info);
1bbc621e
CM
3586
3587 if (!path) {
3588 path = btrfs_alloc_path();
3589 if (!path)
3590 return -ENOMEM;
3591 }
3592
b58d1a9e
FM
3593 /*
3594 * cache_write_mutex is here only to save us from balance or automatic
3595 * removal of empty block groups deleting this block group while we are
3596 * writing out the cache
3597 */
3598 mutex_lock(&trans->transaction->cache_write_mutex);
1bbc621e
CM
3599 while (!list_empty(&dirty)) {
3600 cache = list_first_entry(&dirty,
3601 struct btrfs_block_group_cache,
3602 dirty_list);
1bbc621e
CM
3603 /*
3604 * this can happen if something re-dirties a block
3605 * group that is already under IO. Just wait for it to
3606 * finish and then do it all again
3607 */
3608 if (!list_empty(&cache->io_list)) {
3609 list_del_init(&cache->io_list);
afdb5718 3610 btrfs_wait_cache_io(trans, cache, path);
1bbc621e
CM
3611 btrfs_put_block_group(cache);
3612 }
3613
3614
3615 /*
3616 * btrfs_wait_cache_io uses the cache->dirty_list to decide
3617 * if it should update the cache_state. Don't delete
3618 * until after we wait.
3619 *
3620 * Since we're not running in the commit critical section
3621 * we need the dirty_bgs_lock to protect from update_block_group
3622 */
3623 spin_lock(&cur_trans->dirty_bgs_lock);
3624 list_del_init(&cache->dirty_list);
3625 spin_unlock(&cur_trans->dirty_bgs_lock);
3626
3627 should_put = 1;
3628
3629 cache_save_setup(cache, trans, path);
3630
3631 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3632 cache->io_ctl.inode = NULL;
0b246afa 3633 ret = btrfs_write_out_cache(fs_info, trans,
5b4aacef 3634 cache, path);
1bbc621e
CM
3635 if (ret == 0 && cache->io_ctl.inode) {
3636 num_started++;
3637 should_put = 0;
3638
3639 /*
3640 * the cache_write_mutex is protecting
3641 * the io_list
3642 */
3643 list_add_tail(&cache->io_list, io);
3644 } else {
3645 /*
3646 * if we failed to write the cache, the
3647 * generation will be bad and life goes on
3648 */
3649 ret = 0;
3650 }
3651 }
ff1f8250 3652 if (!ret) {
2ff7e61e
JM
3653 ret = write_one_cache_group(trans, fs_info,
3654 path, cache);
ff1f8250
FM
3655 /*
3656 * Our block group might still be attached to the list
3657 * of new block groups in the transaction handle of some
3658 * other task (struct btrfs_trans_handle->new_bgs). This
3659 * means its block group item isn't yet in the extent
3660 * tree. If this happens ignore the error, as we will
3661 * try again later in the critical section of the
3662 * transaction commit.
3663 */
3664 if (ret == -ENOENT) {
3665 ret = 0;
3666 spin_lock(&cur_trans->dirty_bgs_lock);
3667 if (list_empty(&cache->dirty_list)) {
3668 list_add_tail(&cache->dirty_list,
3669 &cur_trans->dirty_bgs);
3670 btrfs_get_block_group(cache);
3671 }
3672 spin_unlock(&cur_trans->dirty_bgs_lock);
3673 } else if (ret) {
66642832 3674 btrfs_abort_transaction(trans, ret);
ff1f8250
FM
3675 }
3676 }
1bbc621e
CM
3677
3678 /* if it's not on the io list, we need to put the block group */
3679 if (should_put)
3680 btrfs_put_block_group(cache);
3681
3682 if (ret)
3683 break;
b58d1a9e
FM
3684
3685 /*
3686 * Avoid blocking other tasks for too long. It might even save
3687 * us from writing caches for block groups that are going to be
3688 * removed.
3689 */
3690 mutex_unlock(&trans->transaction->cache_write_mutex);
3691 mutex_lock(&trans->transaction->cache_write_mutex);
1bbc621e 3692 }
b58d1a9e 3693 mutex_unlock(&trans->transaction->cache_write_mutex);
1bbc621e
CM
3694
3695 /*
3696 * go through delayed refs for all the stuff we've just kicked off
3697 * and then loop back (just once)
3698 */
2ff7e61e 3699 ret = btrfs_run_delayed_refs(trans, fs_info, 0);
1bbc621e
CM
3700 if (!ret && loops == 0) {
3701 loops++;
3702 spin_lock(&cur_trans->dirty_bgs_lock);
3703 list_splice_init(&cur_trans->dirty_bgs, &dirty);
b58d1a9e
FM
3704 /*
3705 * dirty_bgs_lock protects us from concurrent block group
3706 * deletes too (not just cache_write_mutex).
3707 */
3708 if (!list_empty(&dirty)) {
3709 spin_unlock(&cur_trans->dirty_bgs_lock);
3710 goto again;
3711 }
1bbc621e 3712 spin_unlock(&cur_trans->dirty_bgs_lock);
c79a1751 3713 } else if (ret < 0) {
2ff7e61e 3714 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
1bbc621e
CM
3715 }
3716
3717 btrfs_free_path(path);
3718 return ret;
3719}
3720
3721int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2ff7e61e 3722 struct btrfs_fs_info *fs_info)
1bbc621e
CM
3723{
3724 struct btrfs_block_group_cache *cache;
3725 struct btrfs_transaction *cur_trans = trans->transaction;
3726 int ret = 0;
3727 int should_put;
3728 struct btrfs_path *path;
3729 struct list_head *io = &cur_trans->io_bgs;
3730 int num_started = 0;
9078a3e1
CM
3731
3732 path = btrfs_alloc_path();
3733 if (!path)
3734 return -ENOMEM;
3735
ce93ec54 3736 /*
e44081ef
FM
3737 * Even though we are in the critical section of the transaction commit,
3738 * we can still have concurrent tasks adding elements to this
3739 * transaction's list of dirty block groups. These tasks correspond to
3740 * endio free space workers started when writeback finishes for a
3741 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3742 * allocate new block groups as a result of COWing nodes of the root
3743 * tree when updating the free space inode. The writeback for the space
3744 * caches is triggered by an earlier call to
3745 * btrfs_start_dirty_block_groups() and iterations of the following
3746 * loop.
3747 * Also we want to do the cache_save_setup first and then run the
ce93ec54
JB
3748 * delayed refs to make sure we have the best chance at doing this all
3749 * in one shot.
3750 */
e44081ef 3751 spin_lock(&cur_trans->dirty_bgs_lock);
ce93ec54
JB
3752 while (!list_empty(&cur_trans->dirty_bgs)) {
3753 cache = list_first_entry(&cur_trans->dirty_bgs,
3754 struct btrfs_block_group_cache,
3755 dirty_list);
c9dc4c65
CM
3756
3757 /*
3758 * this can happen if cache_save_setup re-dirties a block
3759 * group that is already under IO. Just wait for it to
3760 * finish and then do it all again
3761 */
3762 if (!list_empty(&cache->io_list)) {
e44081ef 3763 spin_unlock(&cur_trans->dirty_bgs_lock);
c9dc4c65 3764 list_del_init(&cache->io_list);
afdb5718 3765 btrfs_wait_cache_io(trans, cache, path);
c9dc4c65 3766 btrfs_put_block_group(cache);
e44081ef 3767 spin_lock(&cur_trans->dirty_bgs_lock);
c9dc4c65
CM
3768 }
3769
1bbc621e
CM
3770 /*
3771 * don't remove from the dirty list until after we've waited
3772 * on any pending IO
3773 */
ce93ec54 3774 list_del_init(&cache->dirty_list);
e44081ef 3775 spin_unlock(&cur_trans->dirty_bgs_lock);
c9dc4c65
CM
3776 should_put = 1;
3777
1bbc621e 3778 cache_save_setup(cache, trans, path);
c9dc4c65 3779
ce93ec54 3780 if (!ret)
2ff7e61e
JM
3781 ret = btrfs_run_delayed_refs(trans, fs_info,
3782 (unsigned long) -1);
c9dc4c65
CM
3783
3784 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3785 cache->io_ctl.inode = NULL;
0b246afa 3786 ret = btrfs_write_out_cache(fs_info, trans,
5b4aacef 3787 cache, path);
c9dc4c65
CM
3788 if (ret == 0 && cache->io_ctl.inode) {
3789 num_started++;
3790 should_put = 0;
1bbc621e 3791 list_add_tail(&cache->io_list, io);
c9dc4c65
CM
3792 } else {
3793 /*
3794 * if we failed to write the cache, the
3795 * generation will be bad and life goes on
3796 */
3797 ret = 0;
3798 }
3799 }
ff1f8250 3800 if (!ret) {
2ff7e61e
JM
3801 ret = write_one_cache_group(trans, fs_info,
3802 path, cache);
2bc0bb5f
FM
3803 /*
3804 * One of the free space endio workers might have
3805 * created a new block group while updating a free space
3806 * cache's inode (at inode.c:btrfs_finish_ordered_io())
3807 * and hasn't released its transaction handle yet, in
3808 * which case the new block group is still attached to
3809 * its transaction handle and its creation has not
3810 * finished yet (no block group item in the extent tree
3811 * yet, etc). If this is the case, wait for all free
3812 * space endio workers to finish and retry. This is a
3813 * very rare case, so there is no need for a more efficient and
3814 * complex approach.
3815 */
3816 if (ret == -ENOENT) {
3817 wait_event(cur_trans->writer_wait,
3818 atomic_read(&cur_trans->num_writers) == 1);
2ff7e61e
JM
3819 ret = write_one_cache_group(trans, fs_info,
3820 path, cache);
2bc0bb5f 3821 }
ff1f8250 3822 if (ret)
66642832 3823 btrfs_abort_transaction(trans, ret);
ff1f8250 3824 }
c9dc4c65
CM
3825
3826 /* if it's not on the io list, we need to put the block group */
3827 if (should_put)
3828 btrfs_put_block_group(cache);
e44081ef 3829 spin_lock(&cur_trans->dirty_bgs_lock);
c9dc4c65 3830 }
e44081ef 3831 spin_unlock(&cur_trans->dirty_bgs_lock);
c9dc4c65 3832
1bbc621e
CM
3833 while (!list_empty(io)) {
3834 cache = list_first_entry(io, struct btrfs_block_group_cache,
c9dc4c65
CM
3835 io_list);
3836 list_del_init(&cache->io_list);
afdb5718 3837 btrfs_wait_cache_io(trans, cache, path);
0cb59c99
JB
3838 btrfs_put_block_group(cache);
3839 }
3840
9078a3e1 3841 btrfs_free_path(path);
ce93ec54 3842 return ret;
9078a3e1
CM
3843}
3844
2ff7e61e 3845int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
d2fb3437
YZ
3846{
3847 struct btrfs_block_group_cache *block_group;
3848 int readonly = 0;
3849
0b246afa 3850 block_group = btrfs_lookup_block_group(fs_info, bytenr);
d2fb3437
YZ
3851 if (!block_group || block_group->ro)
3852 readonly = 1;
3853 if (block_group)
fa9c0d79 3854 btrfs_put_block_group(block_group);
d2fb3437
YZ
3855 return readonly;
3856}
3857
f78c436c
FM
3858bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3859{
3860 struct btrfs_block_group_cache *bg;
3861 bool ret = true;
3862
3863 bg = btrfs_lookup_block_group(fs_info, bytenr);
3864 if (!bg)
3865 return false;
3866
3867 spin_lock(&bg->lock);
3868 if (bg->ro)
3869 ret = false;
3870 else
3871 atomic_inc(&bg->nocow_writers);
3872 spin_unlock(&bg->lock);
3873
3874 /* no put on block group, done by btrfs_dec_nocow_writers */
3875 if (!ret)
3876 btrfs_put_block_group(bg);
3877
3878 return ret;
3879
3880}
3881
3882void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3883{
3884 struct btrfs_block_group_cache *bg;
3885
3886 bg = btrfs_lookup_block_group(fs_info, bytenr);
3887 ASSERT(bg);
3888 if (atomic_dec_and_test(&bg->nocow_writers))
3889 wake_up_atomic_t(&bg->nocow_writers);
3890 /*
3891 * Once for our lookup and once for the lookup done by a previous call
3892 * to btrfs_inc_nocow_writers()
3893 */
3894 btrfs_put_block_group(bg);
3895 btrfs_put_block_group(bg);
3896}
3897
3898static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
3899{
3900 schedule();
3901 return 0;
3902}
3903
3904void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3905{
3906 wait_on_atomic_t(&bg->nocow_writers,
3907 btrfs_wait_nocow_writers_atomic_t,
3908 TASK_UNINTERRUPTIBLE);
3909}
3910
6ab0a202
JM
3911static const char *alloc_name(u64 flags)
3912{
3913 switch (flags) {
3914 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3915 return "mixed";
3916 case BTRFS_BLOCK_GROUP_METADATA:
3917 return "metadata";
3918 case BTRFS_BLOCK_GROUP_DATA:
3919 return "data";
3920 case BTRFS_BLOCK_GROUP_SYSTEM:
3921 return "system";
3922 default:
3923 WARN_ON(1);
3924 return "invalid-combination";
3925 }
3926}
3927
593060d7
CM
3928static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3929 u64 total_bytes, u64 bytes_used,
e40edf2d 3930 u64 bytes_readonly,
593060d7
CM
3931 struct btrfs_space_info **space_info)
3932{
3933 struct btrfs_space_info *found;
b742bb82
YZ
3934 int i;
3935 int factor;
b150a4f1 3936 int ret;
b742bb82
YZ
3937
3938 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3939 BTRFS_BLOCK_GROUP_RAID10))
3940 factor = 2;
3941 else
3942 factor = 1;
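 /*
  * Illustration: factor converts logical usage into raw disk usage, so with
  * DUP/RAID1/RAID10 1 GiB of bytes_used accounts for roughly 2 GiB of
  * disk_used below.
  */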
593060d7
CM
3943
3944 found = __find_space_info(info, flags);
3945 if (found) {
25179201 3946 spin_lock(&found->lock);
593060d7 3947 found->total_bytes += total_bytes;
89a55897 3948 found->disk_total += total_bytes * factor;
593060d7 3949 found->bytes_used += bytes_used;
b742bb82 3950 found->disk_used += bytes_used * factor;
e40edf2d 3951 found->bytes_readonly += bytes_readonly;
2e6e5183
FM
3952 if (total_bytes > 0)
3953 found->full = 0;
957780eb
JB
3954 space_info_add_new_bytes(info, found, total_bytes -
3955 bytes_used - bytes_readonly);
25179201 3956 spin_unlock(&found->lock);
593060d7
CM
3957 *space_info = found;
3958 return 0;
3959 }
c146afad 3960 found = kzalloc(sizeof(*found), GFP_NOFS);
593060d7
CM
3961 if (!found)
3962 return -ENOMEM;
3963
908c7f19 3964 ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
b150a4f1
JB
3965 if (ret) {
3966 kfree(found);
3967 return ret;
3968 }
3969
c1895442 3970 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
b742bb82 3971 INIT_LIST_HEAD(&found->block_groups[i]);
80eb234a 3972 init_rwsem(&found->groups_sem);
0f9dd46c 3973 spin_lock_init(&found->lock);
52ba6929 3974 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
593060d7 3975 found->total_bytes = total_bytes;
89a55897 3976 found->disk_total = total_bytes * factor;
593060d7 3977 found->bytes_used = bytes_used;
b742bb82 3978 found->disk_used = bytes_used * factor;
593060d7 3979 found->bytes_pinned = 0;
e8569813 3980 found->bytes_reserved = 0;
e40edf2d 3981 found->bytes_readonly = bytes_readonly;
f0486c68 3982 found->bytes_may_use = 0;
6af3e3ad 3983 found->full = 0;
4f4db217 3984 found->max_extent_size = 0;
0e4f8f88 3985 found->force_alloc = CHUNK_ALLOC_NO_FORCE;
6d74119f 3986 found->chunk_alloc = 0;
fdb5effd
JB
3987 found->flush = 0;
3988 init_waitqueue_head(&found->wait);
633c0aad 3989 INIT_LIST_HEAD(&found->ro_bgs);
957780eb
JB
3990 INIT_LIST_HEAD(&found->tickets);
3991 INIT_LIST_HEAD(&found->priority_tickets);
6ab0a202
JM
3992
3993 ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
3994 info->space_info_kobj, "%s",
3995 alloc_name(found->flags));
3996 if (ret) {
896533a7 3997 percpu_counter_destroy(&found->total_bytes_pinned);
6ab0a202
JM
3998 kfree(found);
3999 return ret;
4000 }
4001
593060d7 4002 *space_info = found;
4184ea7f 4003 list_add_rcu(&found->list, &info->space_info);
b4d7c3c9
LZ
4004 if (flags & BTRFS_BLOCK_GROUP_DATA)
4005 info->data_sinfo = found;
6ab0a202
JM
4006
4007 return ret;
593060d7
CM
4008}
4009
8790d502
CM
4010static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
4011{
899c81ea
ID
4012 u64 extra_flags = chunk_to_extended(flags) &
4013 BTRFS_EXTENDED_PROFILE_MASK;
a46d11a8 4014
de98ced9 4015 write_seqlock(&fs_info->profiles_lock);
a46d11a8
ID
4016 if (flags & BTRFS_BLOCK_GROUP_DATA)
4017 fs_info->avail_data_alloc_bits |= extra_flags;
4018 if (flags & BTRFS_BLOCK_GROUP_METADATA)
4019 fs_info->avail_metadata_alloc_bits |= extra_flags;
4020 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4021 fs_info->avail_system_alloc_bits |= extra_flags;
de98ced9 4022 write_sequnlock(&fs_info->profiles_lock);
8790d502 4023}
593060d7 4024
fc67c450
ID
4025/*
4026 * returns target flags in extended format or 0 if restripe for this
4027 * chunk_type is not in progress
c6664b42
ID
4028 *
4029 * should be called with either volume_mutex or balance_lock held
fc67c450
ID
4030 */
4031static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
4032{
4033 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4034 u64 target = 0;
4035
fc67c450
ID
4036 if (!bctl)
4037 return 0;
4038
4039 if (flags & BTRFS_BLOCK_GROUP_DATA &&
4040 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4041 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
4042 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
4043 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4044 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
4045 } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
4046 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4047 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
4048 }
4049
4050 return target;
4051}
4052
a46d11a8
ID
4053/*
4054 * @flags: available profiles in extended format (see ctree.h)
4055 *
e4d8ec0f
ID
4056 * Returns reduced profile in chunk format. If profile changing is in
4057 * progress (either running or paused) picks the target profile (if it's
4058 * already available), otherwise falls back to plain reducing.
a46d11a8 4059 */
2ff7e61e 4060static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
ec44a35c 4061{
0b246afa 4062 u64 num_devices = fs_info->fs_devices->rw_devices;
fc67c450 4063 u64 target;
9c170b26
ZL
4064 u64 raid_type;
4065 u64 allowed = 0;
a061fc8d 4066
fc67c450
ID
4067 /*
4068 * see if restripe for this chunk_type is in progress, if so
4069 * try to reduce to the target profile
4070 */
0b246afa
JM
4071 spin_lock(&fs_info->balance_lock);
4072 target = get_restripe_target(fs_info, flags);
fc67c450
ID
4073 if (target) {
4074 /* pick target profile only if it's already available */
4075 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
0b246afa 4076 spin_unlock(&fs_info->balance_lock);
fc67c450 4077 return extended_to_chunk(target);
e4d8ec0f
ID
4078 }
4079 }
0b246afa 4080 spin_unlock(&fs_info->balance_lock);
e4d8ec0f 4081
53b381b3 4082 /* First, mask out the RAID levels which aren't possible */
9c170b26
ZL
4083 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4084 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
4085 allowed |= btrfs_raid_group[raid_type];
4086 }
4087 allowed &= flags;
4088
4089 if (allowed & BTRFS_BLOCK_GROUP_RAID6)
4090 allowed = BTRFS_BLOCK_GROUP_RAID6;
4091 else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
4092 allowed = BTRFS_BLOCK_GROUP_RAID5;
4093 else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
4094 allowed = BTRFS_BLOCK_GROUP_RAID10;
4095 else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
4096 allowed = BTRFS_BLOCK_GROUP_RAID1;
4097 else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
4098 allowed = BTRFS_BLOCK_GROUP_RAID0;
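 /*
  * Example: if both the RAID10 and RAID1 bits survived the device-count
  * mask above, RAID10 wins because this chain prefers the most capable
  * profile that is still allowed.
  */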
4099
4100 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
4101
4102 return extended_to_chunk(flags | allowed);
ec44a35c
CM
4103}
4104
2ff7e61e 4105static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
6a63209f 4106{
de98ced9 4107 unsigned seq;
f8213bdc 4108 u64 flags;
de98ced9
MX
4109
4110 do {
f8213bdc 4111 flags = orig_flags;
0b246afa 4112 seq = read_seqbegin(&fs_info->profiles_lock);
de98ced9
MX
4113
4114 if (flags & BTRFS_BLOCK_GROUP_DATA)
0b246afa 4115 flags |= fs_info->avail_data_alloc_bits;
de98ced9 4116 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
0b246afa 4117 flags |= fs_info->avail_system_alloc_bits;
de98ced9 4118 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
0b246afa
JM
4119 flags |= fs_info->avail_metadata_alloc_bits;
4120 } while (read_seqretry(&fs_info->profiles_lock, seq));
6fef8df1 4121
2ff7e61e 4122 return btrfs_reduce_alloc_profile(fs_info, flags);
6a63209f
JB
4123}
4124
1b86826d 4125static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
9ed74f2d 4126{
0b246afa 4127 struct btrfs_fs_info *fs_info = root->fs_info;
b742bb82 4128 u64 flags;
53b381b3 4129 u64 ret;
9ed74f2d 4130
b742bb82
YZ
4131 if (data)
4132 flags = BTRFS_BLOCK_GROUP_DATA;
0b246afa 4133 else if (root == fs_info->chunk_root)
b742bb82 4134 flags = BTRFS_BLOCK_GROUP_SYSTEM;
9ed74f2d 4135 else
b742bb82 4136 flags = BTRFS_BLOCK_GROUP_METADATA;
9ed74f2d 4137
2ff7e61e 4138 ret = get_alloc_profile(fs_info, flags);
53b381b3 4139 return ret;
6a63209f 4140}
9ed74f2d 4141
1b86826d
JM
4142u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
4143{
4144 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
4145}
4146
4147u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
4148{
4149 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4150}
4151
4152u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
4153{
4154 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4155}
4156
4136135b
LB
4157static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
4158 bool may_use_included)
4159{
4160 ASSERT(s_info);
4161 return s_info->bytes_used + s_info->bytes_reserved +
4162 s_info->bytes_pinned + s_info->bytes_readonly +
4163 (may_use_included ? s_info->bytes_may_use : 0);
4164}
4165
04f4f916 4166int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
6a63209f 4167{
6a63209f 4168 struct btrfs_space_info *data_sinfo;
04f4f916 4169 struct btrfs_root *root = inode->root;
b4d7c3c9 4170 struct btrfs_fs_info *fs_info = root->fs_info;
ab6e2410 4171 u64 used;
94b947b2 4172 int ret = 0;
c99f1b0c
ZL
4173 int need_commit = 2;
4174 int have_pinned_space;
6a63209f 4175
6a63209f 4176 /* make sure bytes are sectorsize aligned */
0b246afa 4177 bytes = ALIGN(bytes, fs_info->sectorsize);
6a63209f 4178
9dced186 4179 if (btrfs_is_free_space_inode(inode)) {
c99f1b0c 4180 need_commit = 0;
9dced186 4181 ASSERT(current->journal_info);
0af3d00b
JB
4182 }
4183
b4d7c3c9 4184 data_sinfo = fs_info->data_sinfo;
33b4d47f
CM
4185 if (!data_sinfo)
4186 goto alloc;
9ed74f2d 4187
6a63209f
JB
4188again:
4189 /* make sure we have enough space to handle the data first */
4190 spin_lock(&data_sinfo->lock);
4136135b 4191 used = btrfs_space_info_used(data_sinfo, true);
ab6e2410
JB
4192
4193 if (used + bytes > data_sinfo->total_bytes) {
4e06bdd6 4194 struct btrfs_trans_handle *trans;
9ed74f2d 4195
6a63209f
JB
4196 /*
4197 * if we don't have enough free bytes in this space then we need
4198 * to alloc a new chunk.
4199 */
b9fd47cd 4200 if (!data_sinfo->full) {
6a63209f 4201 u64 alloc_target;
9ed74f2d 4202
0e4f8f88 4203 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
6a63209f 4204 spin_unlock(&data_sinfo->lock);
33b4d47f 4205alloc:
1b86826d 4206 alloc_target = btrfs_data_alloc_profile(fs_info);
9dced186
MX
4207 /*
4208 * It is ugly that we don't call nolock join
4209 * transaction for the free space inode case here.
4210 * But it is safe because we only do the data space
4211 * reservation for the free space cache in the
4212 * transaction context; the common join transaction just
4213 * increases the counter of the current transaction
4214 * handle and doesn't try to acquire the trans_lock of
4215 * the fs.
4216 */
7a7eaa40 4217 trans = btrfs_join_transaction(root);
a22285a6
YZ
4218 if (IS_ERR(trans))
4219 return PTR_ERR(trans);
9ed74f2d 4220
2ff7e61e 4221 ret = do_chunk_alloc(trans, fs_info, alloc_target,
0e4f8f88 4222 CHUNK_ALLOC_NO_FORCE);
3a45bb20 4223 btrfs_end_transaction(trans);
d52a5b5f
MX
4224 if (ret < 0) {
4225 if (ret != -ENOSPC)
4226 return ret;
c99f1b0c
ZL
4227 else {
4228 have_pinned_space = 1;
d52a5b5f 4229 goto commit_trans;
c99f1b0c 4230 }
d52a5b5f 4231 }
9ed74f2d 4232
b4d7c3c9
LZ
4233 if (!data_sinfo)
4234 data_sinfo = fs_info->data_sinfo;
4235
6a63209f
JB
4236 goto again;
4237 }
f2bb8f5c
JB
4238
4239 /*
b150a4f1 4240 * If we don't have enough pinned space to deal with this
94b947b2
ZL
4241 * allocation, and no removed chunk in current transaction,
4242 * don't bother committing the transaction.
f2bb8f5c 4243 */
c99f1b0c
ZL
4244 have_pinned_space = percpu_counter_compare(
4245 &data_sinfo->total_bytes_pinned,
4246 used + bytes - data_sinfo->total_bytes);
6a63209f 4247 spin_unlock(&data_sinfo->lock);
6a63209f 4248
4e06bdd6 4249 /* commit the current transaction and try again */
d52a5b5f 4250commit_trans:
c99f1b0c 4251 if (need_commit &&
0b246afa 4252 !atomic_read(&fs_info->open_ioctl_trans)) {
c99f1b0c 4253 need_commit--;
b150a4f1 4254
e1746e83
ZL
4255 if (need_commit > 0) {
4256 btrfs_start_delalloc_roots(fs_info, 0, -1);
0b246afa
JM
4257 btrfs_wait_ordered_roots(fs_info, -1, 0,
4258 (u64)-1);
e1746e83 4259 }
9a4e7276 4260
7a7eaa40 4261 trans = btrfs_join_transaction(root);
a22285a6
YZ
4262 if (IS_ERR(trans))
4263 return PTR_ERR(trans);
c99f1b0c 4264 if (have_pinned_space >= 0 ||
3204d33c
JB
4265 test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4266 &trans->transaction->flags) ||
c99f1b0c 4267 need_commit > 0) {
3a45bb20 4268 ret = btrfs_commit_transaction(trans);
94b947b2
ZL
4269 if (ret)
4270 return ret;
d7c15171 4271 /*
c2d6cb16
FM
4272 * The cleaner kthread might still be doing iput
4273 * operations. Wait for it to finish so that
4274 * more space is released.
d7c15171 4275 */
0b246afa
JM
4276 mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
4277 mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
94b947b2
ZL
4278 goto again;
4279 } else {
3a45bb20 4280 btrfs_end_transaction(trans);
94b947b2 4281 }
4e06bdd6 4282 }
9ed74f2d 4283
0b246afa 4284 trace_btrfs_space_reservation(fs_info,
cab45e22
JM
4285 "space_info:enospc",
4286 data_sinfo->flags, bytes, 1);
6a63209f
JB
4287 return -ENOSPC;
4288 }
4289 data_sinfo->bytes_may_use += bytes;
0b246afa 4290 trace_btrfs_space_reservation(fs_info, "space_info",
2bcc0328 4291 data_sinfo->flags, bytes, 1);
6a63209f 4292 spin_unlock(&data_sinfo->lock);
6a63209f 4293
237c0e9f 4294 return ret;
9ed74f2d 4295}
6a63209f 4296
4ceff079
QW
4297/*
4298 * New check_data_free_space() with the ability for precise data reservation.
4299 * Will replace old btrfs_check_data_free_space(), but for patch split,
4300 * add a new function first and then replace it.
4301 */
7cf5b976 4302int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
4ceff079 4303{
0b246afa 4304 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4ceff079
QW
4305 int ret;
4306
4307 /* align the range */
0b246afa
JM
4308 len = round_up(start + len, fs_info->sectorsize) -
4309 round_down(start, fs_info->sectorsize);
4310 start = round_down(start, fs_info->sectorsize);
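 /*
  * Example with a 4 KiB sectorsize: a request for start=5000, len=3000 is
  * widened to the aligned range [4096, 8192), i.e. start=4096 and len=4096,
  * before the space is reserved below.
  */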
4ceff079 4311
04f4f916 4312 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4ceff079
QW
4313 if (ret < 0)
4314 return ret;
4315
1e5ec2e7 4316 /* Use new btrfs_qgroup_reserve_data to reserve precise data space. */
4ceff079 4317 ret = btrfs_qgroup_reserve_data(inode, start, len);
1e5ec2e7
JB
4318 if (ret)
4319 btrfs_free_reserved_data_space_noquota(inode, start, len);
4ceff079
QW
4320 return ret;
4321}
4322
4ceff079
QW
4323/*
4324 * Called if we need to clear a data reservation for this inode,
4325 * normally in an error case.
4326 *
51773bec
QW
4327 * This one will *NOT* use the accurate qgroup reserved space API; it is only
4328 * for cases where we can't sleep and are sure it won't affect qgroup reserved space.
4329 * Like clear_bit_hook().
4ceff079 4330 */
51773bec
QW
4331void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4332 u64 len)
4ceff079 4333{
0b246afa 4334 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4ceff079
QW
4335 struct btrfs_space_info *data_sinfo;
4336
4337 /* Make sure the range is aligned to sectorsize */
0b246afa
JM
4338 len = round_up(start + len, fs_info->sectorsize) -
4339 round_down(start, fs_info->sectorsize);
4340 start = round_down(start, fs_info->sectorsize);
4ceff079 4341
0b246afa 4342 data_sinfo = fs_info->data_sinfo;
4ceff079
QW
4343 spin_lock(&data_sinfo->lock);
4344 if (WARN_ON(data_sinfo->bytes_may_use < len))
4345 data_sinfo->bytes_may_use = 0;
4346 else
4347 data_sinfo->bytes_may_use -= len;
0b246afa 4348 trace_btrfs_space_reservation(fs_info, "space_info",
4ceff079
QW
4349 data_sinfo->flags, len, 0);
4350 spin_unlock(&data_sinfo->lock);
4351}
4352
51773bec
QW
4353/*
4354 * Called if we need to clear a data reservation for this inode,
4355 * normally in an error case.
4356 *
01327610 4357 * This one will handle the per-inode data rsv map for the accurate reserved
51773bec
QW
4358 * space framework.
4359 */
4360void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
4361{
0c476a5d
JM
4362 struct btrfs_root *root = BTRFS_I(inode)->root;
4363
4364 /* Make sure the range is aligned to sectorsize */
da17066c
JM
4365 len = round_up(start + len, root->fs_info->sectorsize) -
4366 round_down(start, root->fs_info->sectorsize);
4367 start = round_down(start, root->fs_info->sectorsize);
0c476a5d 4368
51773bec
QW
4369 btrfs_free_reserved_data_space_noquota(inode, start, len);
4370 btrfs_qgroup_free_data(inode, start, len);
4371}
4372
97e728d4 4373static void force_metadata_allocation(struct btrfs_fs_info *info)
e3ccfa98 4374{
97e728d4
JB
4375 struct list_head *head = &info->space_info;
4376 struct btrfs_space_info *found;
e3ccfa98 4377
97e728d4
JB
4378 rcu_read_lock();
4379 list_for_each_entry_rcu(found, head, list) {
4380 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
0e4f8f88 4381 found->force_alloc = CHUNK_ALLOC_FORCE;
e3ccfa98 4382 }
97e728d4 4383 rcu_read_unlock();
e3ccfa98
JB
4384}
4385
3c76cd84
MX
4386static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4387{
4388 return (global->size << 1);
4389}
4390
2ff7e61e 4391static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
698d0082 4392 struct btrfs_space_info *sinfo, int force)
32c00aff 4393{
0b246afa 4394 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
424499db 4395 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
0e4f8f88 4396 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
e5bc2458 4397 u64 thresh;
e3ccfa98 4398
0e4f8f88
CM
4399 if (force == CHUNK_ALLOC_FORCE)
4400 return 1;
4401
fb25e914
JB
4402 /*
4403 * We need to take into account the global rsv because for all intents
4404 * and purposes it's used space. Don't worry about locking the
4405 * global_rsv, it doesn't change except when the transaction commits.
4406 */
54338b5c 4407 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3c76cd84 4408 num_allocated += calc_global_rsv_need_space(global_rsv);
fb25e914 4409
0e4f8f88
CM
4410 /*
4411 * in limited mode, we want to have some free space up to
4412 * about 1% of the FS size.
4413 */
4414 if (force == CHUNK_ALLOC_LIMITED) {
0b246afa 4415 thresh = btrfs_super_total_bytes(fs_info->super_copy);
ee22184b 4416 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
0e4f8f88
CM
4417
4418 if (num_bytes - num_allocated < thresh)
4419 return 1;
4420 }
0e4f8f88 4421
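 /*
  * Rough illustration: on a 1 TiB filesystem CHUNK_ALLOC_LIMITED keeps about
  * 10 GiB (1%, but at least 64 MiB) unallocated, while the check below only
  * suggests a new chunk once allocations reach roughly 80% of the space
  * (minus a 2 MiB slack).
  */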
ee22184b 4422 if (num_allocated + SZ_2M < div_factor(num_bytes, 8))
14ed0ca6 4423 return 0;
424499db 4424 return 1;
32c00aff
JB
4425}
4426
2ff7e61e 4427static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
15d1ff81
LB
4428{
4429 u64 num_dev;
4430
53b381b3
DW
4431 if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4432 BTRFS_BLOCK_GROUP_RAID0 |
4433 BTRFS_BLOCK_GROUP_RAID5 |
4434 BTRFS_BLOCK_GROUP_RAID6))
0b246afa 4435 num_dev = fs_info->fs_devices->rw_devices;
15d1ff81
LB
4436 else if (type & BTRFS_BLOCK_GROUP_RAID1)
4437 num_dev = 2;
4438 else
4439 num_dev = 1; /* DUP or single */
4440
39c2d7fa 4441 return num_dev;
15d1ff81
LB
4442}
4443
39c2d7fa
FM
4444/*
4445 * Reserve space in the system space_info that is needed for allocating or
4446 * removing a chunk of the given @type: num_devs device items have to be
4447 * updated and one chunk item added or removed.
4448 */
4449void check_system_chunk(struct btrfs_trans_handle *trans,
2ff7e61e 4450 struct btrfs_fs_info *fs_info, u64 type)
15d1ff81
LB
4451{
4452 struct btrfs_space_info *info;
4453 u64 left;
4454 u64 thresh;
4fbcdf66 4455 int ret = 0;
39c2d7fa 4456 u64 num_devs;
4fbcdf66
FM
4457
4458 /*
4459 * Needed because we can end up allocating a system chunk and for an
4460 * atomic and race free space reservation in the chunk block reserve.
4461 */
0b246afa 4462 ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
15d1ff81 4463
0b246afa 4464 info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
15d1ff81 4465 spin_lock(&info->lock);
4136135b 4466 left = info->total_bytes - btrfs_space_info_used(info, true);
15d1ff81
LB
4467 spin_unlock(&info->lock);
4468
2ff7e61e 4469 num_devs = get_profile_num_devs(fs_info, type);
39c2d7fa
FM
4470
4471 /* num_devs device items to update and 1 chunk item to add or remove */
0b246afa
JM
4472 thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4473 btrfs_calc_trans_metadata_size(fs_info, 1);
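 /*
  * Hedged example: with 16 KiB nodes both helpers charge a few tree levels
  * worth of metadata per item, so for two devices this threshold typically
  * lands in the hundreds-of-KiB range.
  */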
39c2d7fa 4474
0b246afa
JM
4475 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4476 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4477 left, thresh, type);
4478 dump_space_info(fs_info, info, 0, 0);
15d1ff81
LB
4479 }
4480
4481 if (left < thresh) {
1b86826d 4482 u64 flags = btrfs_system_alloc_profile(fs_info);
15d1ff81 4483
4fbcdf66
FM
4484 /*
4485 * Ignore failure to create system chunk. We might end up not
4486 * needing it, as we might not need to COW all nodes/leafs from
4487 * the paths we visit in the chunk tree (they were already COWed
4488 * or created in the current transaction for example).
4489 */
2ff7e61e 4490 ret = btrfs_alloc_chunk(trans, fs_info, flags);
4fbcdf66
FM
4491 }
4492
4493 if (!ret) {
0b246afa
JM
4494 ret = btrfs_block_rsv_add(fs_info->chunk_root,
4495 &fs_info->chunk_block_rsv,
4fbcdf66
FM
4496 thresh, BTRFS_RESERVE_NO_FLUSH);
4497 if (!ret)
4498 trans->chunk_bytes_reserved += thresh;
15d1ff81
LB
4499 }
4500}
4501
28b737f6
LB
4502/*
4503 * If force is CHUNK_ALLOC_FORCE:
4504 * - return 1 if it successfully allocates a chunk,
4505 * - return errors including -ENOSPC otherwise.
4506 * If force is NOT CHUNK_ALLOC_FORCE:
4507 * - return 0 if it doesn't need to allocate a new chunk,
4508 * - return 1 if it successfully allocates a chunk,
4509 * - return errors including -ENOSPC otherwise.
4510 */
6324fbf3 4511static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2ff7e61e 4512 struct btrfs_fs_info *fs_info, u64 flags, int force)
9ed74f2d 4513{
6324fbf3 4514 struct btrfs_space_info *space_info;
6d74119f 4515 int wait_for_alloc = 0;
9ed74f2d 4516 int ret = 0;
9ed74f2d 4517
c6b305a8
JB
4518 /* Don't re-enter if we're already allocating a chunk */
4519 if (trans->allocating_chunk)
4520 return -ENOSPC;
4521
0b246afa 4522 space_info = __find_space_info(fs_info, flags);
593060d7 4523 if (!space_info) {
0b246afa 4524 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
79787eaa 4525 BUG_ON(ret); /* -ENOMEM */
9ed74f2d 4526 }
79787eaa 4527 BUG_ON(!space_info); /* Logic error */
9ed74f2d 4528
6d74119f 4529again:
25179201 4530 spin_lock(&space_info->lock);
9e622d6b 4531 if (force < space_info->force_alloc)
0e4f8f88 4532 force = space_info->force_alloc;
25179201 4533 if (space_info->full) {
2ff7e61e 4534 if (should_alloc_chunk(fs_info, space_info, force))
09fb99a6
FDBM
4535 ret = -ENOSPC;
4536 else
4537 ret = 0;
25179201 4538 spin_unlock(&space_info->lock);
09fb99a6 4539 return ret;
9ed74f2d
JB
4540 }
4541
2ff7e61e 4542 if (!should_alloc_chunk(fs_info, space_info, force)) {
25179201 4543 spin_unlock(&space_info->lock);
6d74119f
JB
4544 return 0;
4545 } else if (space_info->chunk_alloc) {
4546 wait_for_alloc = 1;
4547 } else {
4548 space_info->chunk_alloc = 1;
9ed74f2d 4549 }
0e4f8f88 4550
25179201 4551 spin_unlock(&space_info->lock);
9ed74f2d 4552
6d74119f
JB
4553 mutex_lock(&fs_info->chunk_mutex);
4554
4555 /*
4556 * The chunk_mutex is held throughout the entirety of a chunk
4557 * allocation, so once we've acquired the chunk_mutex we know that the
4558 * other guy is done and we need to recheck and see if we should
4559 * allocate.
4560 */
4561 if (wait_for_alloc) {
4562 mutex_unlock(&fs_info->chunk_mutex);
4563 wait_for_alloc = 0;
4564 goto again;
4565 }
4566
c6b305a8
JB
4567 trans->allocating_chunk = true;
4568
67377734
JB
4569 /*
4570 * If we have mixed data/metadata chunks we want to make sure we keep
4571 * allocating mixed chunks instead of individual chunks.
4572 */
4573 if (btrfs_mixed_space_info(space_info))
4574 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4575
97e728d4
JB
4576 /*
4577 * if we're doing a data chunk, go ahead and make sure that
4578 * we keep a reasonable number of metadata chunks allocated in the
4579 * FS as well.
4580 */
9ed74f2d 4581 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
97e728d4
JB
4582 fs_info->data_chunk_allocations++;
4583 if (!(fs_info->data_chunk_allocations %
4584 fs_info->metadata_ratio))
4585 force_metadata_allocation(fs_info);
9ed74f2d
JB
4586 }
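 /*
  * Example: with the metadata_ratio=8 mount option, every 8th data chunk
  * allocation also forces a metadata chunk allocation via
  * force_metadata_allocation().
  */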
4587
15d1ff81
LB
4588 /*
4589 * Check if we have enough space in SYSTEM chunk because we may need
4590 * to update devices.
4591 */
2ff7e61e 4592 check_system_chunk(trans, fs_info, flags);
15d1ff81 4593
2ff7e61e 4594 ret = btrfs_alloc_chunk(trans, fs_info, flags);
c6b305a8 4595 trans->allocating_chunk = false;
92b8e897 4596
9ed74f2d 4597 spin_lock(&space_info->lock);
a81cb9a2
AO
4598 if (ret < 0 && ret != -ENOSPC)
4599 goto out;
9ed74f2d 4600 if (ret)
6324fbf3 4601 space_info->full = 1;
424499db
YZ
4602 else
4603 ret = 1;
6d74119f 4604
0e4f8f88 4605 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
a81cb9a2 4606out:
6d74119f 4607 space_info->chunk_alloc = 0;
9ed74f2d 4608 spin_unlock(&space_info->lock);
a25c75d5 4609 mutex_unlock(&fs_info->chunk_mutex);
00d80e34
FM
4610 /*
4611 * When we allocate a new chunk we reserve space in the chunk block
4612 * reserve to make sure we can COW nodes/leafs in the chunk tree or
4613 * add new nodes/leafs to it if we end up needing to do it when
4614 * inserting the chunk item and updating device items as part of the
4615 * second phase of chunk allocation, performed by
4616 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4617 * large number of new block groups to create in our transaction
4618 * handle's new_bgs list to avoid exhausting the chunk block reserve
4619 * in extreme cases - like having a single transaction create many new
4620 * block groups when starting to write out the free space caches of all
4621 * the block groups that were made dirty during the lifetime of the
4622 * transaction.
4623 */
d9a0540a 4624 if (trans->can_flush_pending_bgs &&
ee22184b 4625 trans->chunk_bytes_reserved >= (u64)SZ_2M) {
2ff7e61e 4626 btrfs_create_pending_block_groups(trans, fs_info);
00d80e34
FM
4627 btrfs_trans_release_chunk_metadata(trans);
4628 }
0f9dd46c 4629 return ret;
6324fbf3 4630}
9ed74f2d 4631
c1c4919b 4632static int can_overcommit(struct btrfs_fs_info *fs_info,
a80c8dcf 4633 struct btrfs_space_info *space_info, u64 bytes,
c1c4919b
JM
4634 enum btrfs_reserve_flush_enum flush,
4635 bool system_chunk)
a80c8dcf 4636{
0b246afa 4637 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
957780eb 4638 u64 profile;
3c76cd84 4639 u64 space_size;
a80c8dcf
JB
4640 u64 avail;
4641 u64 used;
4642
957780eb
JB
4643 /* Don't overcommit when in mixed mode. */
4644 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4645 return 0;
4646
c1c4919b
JM
4647 if (system_chunk)
4648 profile = btrfs_system_alloc_profile(fs_info);
4649 else
4650 profile = btrfs_metadata_alloc_profile(fs_info);
4651
4136135b 4652 used = btrfs_space_info_used(space_info, false);
96f1bb57 4653
96f1bb57
JB
4654 /*
4655 * We only want to allow over committing if we have lots of actual space
4656 * free, but if we don't have enough space to handle the global reserve
4657 * space then we could end up having a real enospc problem when trying
4658 * to allocate a chunk or some other such important allocation.
4659 */
3c76cd84
MX
4660 spin_lock(&global_rsv->lock);
4661 space_size = calc_global_rsv_need_space(global_rsv);
4662 spin_unlock(&global_rsv->lock);
4663 if (used + space_size >= space_info->total_bytes)
96f1bb57
JB
4664 return 0;
4665
4666 used += space_info->bytes_may_use;
a80c8dcf 4667
a5ed45f8 4668 avail = atomic64_read(&fs_info->free_chunk_space);
a80c8dcf
JB
4669
4670 /*
4671 * If we have dup, raid1 or raid10 then only half of the free
53b381b3
DW
4672 * space is actually usable. For raid56, the space info used
4673 * doesn't include the parity drive, so we don't have to
4674 * change the math
a80c8dcf
JB
4675 */
4676 if (profile & (BTRFS_BLOCK_GROUP_DUP |
4677 BTRFS_BLOCK_GROUP_RAID1 |
4678 BTRFS_BLOCK_GROUP_RAID10))
4679 avail >>= 1;
4680
4681 /*
561c294d
MX
4682 * If we aren't flushing all things, let us overcommit up to
4683 * half of the space. If we can flush, don't let us overcommit
4684 * too much; only let it overcommit up to 1/8 of the space.
a80c8dcf 4685 */
08e007d2 4686 if (flush == BTRFS_RESERVE_FLUSH_ALL)
14575aef 4687 avail >>= 3;
a80c8dcf 4688 else
14575aef 4689 avail >>= 1;
a80c8dcf 4690
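 /*
  * Example: with 8 GiB of unallocated device space left, a FLUSH_ALL
  * reservation may overcommit by up to 1 GiB (1/8), while a no-flush
  * reservation may overcommit by up to 4 GiB (1/2).
  */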
14575aef 4691 if (used + bytes < space_info->total_bytes + avail)
a80c8dcf
JB
4692 return 1;
4693 return 0;
4694}
4695
2ff7e61e 4696static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
6c255e67 4697 unsigned long nr_pages, int nr_items)
da633a42 4698{
0b246afa 4699 struct super_block *sb = fs_info->sb;
da633a42 4700
925a6efb
JB
4701 if (down_read_trylock(&sb->s_umount)) {
4702 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4703 up_read(&sb->s_umount);
4704 } else {
da633a42
MX
4705 /*
4706 * We needn't worry about the filesystem going from r/w to r/o even
4707 * though we don't acquire the ->s_umount mutex, because the filesystem
4708 * should guarantee that the delalloc inode list is empty after the
4709 * filesystem becomes read-only (all dirty pages have been written to
4710 * disk).
4711 */
0b246afa 4712 btrfs_start_delalloc_roots(fs_info, 0, nr_items);
98ad69cf 4713 if (!current->journal_info)
0b246afa 4714 btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
da633a42
MX
4715 }
4716}
4717
2ff7e61e
JM
4718static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4719 u64 to_reclaim)
18cd8ea6
MX
4720{
4721 u64 bytes;
4722 int nr;
4723
2ff7e61e 4724 bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
18cd8ea6
MX
4725 nr = (int)div64_u64(to_reclaim, bytes);
4726 if (!nr)
4727 nr = 1;
4728 return nr;
4729}
4730
ee22184b 4731#define EXTENT_SIZE_PER_ITEM SZ_256K
c61a16a7 4732
9ed74f2d 4733/*
5da9d01b 4734 * shrink metadata reservation for delalloc
9ed74f2d 4735 */
c1c4919b
JM
4736static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4737 u64 orig, bool wait_ordered)
5da9d01b 4738{
0ca1f7ce 4739 struct btrfs_block_rsv *block_rsv;
0019f10d 4740 struct btrfs_space_info *space_info;
663350ac 4741 struct btrfs_trans_handle *trans;
f4c738c2 4742 u64 delalloc_bytes;
5da9d01b 4743 u64 max_reclaim;
b1953bce 4744 long time_left;
d3ee29e3
MX
4745 unsigned long nr_pages;
4746 int loops;
b0244199 4747 int items;
08e007d2 4748 enum btrfs_reserve_flush_enum flush;
5da9d01b 4749
c61a16a7 4750 /* Calc the number of items we need to flush for this space reservation */
2ff7e61e 4751 items = calc_reclaim_items_nr(fs_info, to_reclaim);
8eb0dfdb 4752 to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
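 /*
  * Hedged example: with 16 KiB nodes one metadata item reserves roughly
  * 256 KiB, so a 1 MiB reservation maps to about 4 items here and thus to
  * about 1 MiB (4 * EXTENT_SIZE_PER_ITEM) of delalloc to flush.
  */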
c61a16a7 4753
663350ac 4754 trans = (struct btrfs_trans_handle *)current->journal_info;
0b246afa 4755 block_rsv = &fs_info->delalloc_block_rsv;
0019f10d 4756 space_info = block_rsv->space_info;
bf9022e0 4757
963d678b 4758 delalloc_bytes = percpu_counter_sum_positive(
0b246afa 4759 &fs_info->delalloc_bytes);
f4c738c2 4760 if (delalloc_bytes == 0) {
fdb5effd 4761 if (trans)
f4c738c2 4762 return;
38c135af 4763 if (wait_ordered)
0b246afa 4764 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
f4c738c2 4765 return;
fdb5effd
JB
4766 }
4767
d3ee29e3 4768 loops = 0;
f4c738c2
JB
4769 while (delalloc_bytes && loops < 3) {
4770 max_reclaim = min(delalloc_bytes, to_reclaim);
09cbfeaf 4771 nr_pages = max_reclaim >> PAGE_SHIFT;
2ff7e61e 4772 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
dea31f52
JB
4773 /*
4774 * We need to wait for the async pages to actually start before
4775 * we do anything.
4776 */
0b246afa 4777 max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
9f3a074d
MX
4778 if (!max_reclaim)
4779 goto skip_async;
4780
4781 if (max_reclaim <= nr_pages)
4782 max_reclaim = 0;
4783 else
4784 max_reclaim -= nr_pages;
dea31f52 4785
0b246afa
JM
4786 wait_event(fs_info->async_submit_wait,
4787 atomic_read(&fs_info->async_delalloc_pages) <=
9f3a074d
MX
4788 (int)max_reclaim);
4789skip_async:
08e007d2
MX
4790 if (!trans)
4791 flush = BTRFS_RESERVE_FLUSH_ALL;
4792 else
4793 flush = BTRFS_RESERVE_NO_FLUSH;
0019f10d 4794 spin_lock(&space_info->lock);
c1c4919b 4795 if (can_overcommit(fs_info, space_info, orig, flush, false)) {
f4c738c2
JB
4796 spin_unlock(&space_info->lock);
4797 break;
4798 }
957780eb
JB
4799 if (list_empty(&space_info->tickets) &&
4800 list_empty(&space_info->priority_tickets)) {
4801 spin_unlock(&space_info->lock);
4802 break;
4803 }
0019f10d 4804 spin_unlock(&space_info->lock);
5da9d01b 4805
36e39c40 4806 loops++;
f104d044 4807 if (wait_ordered && !trans) {
0b246afa 4808 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
f104d044 4809 } else {
f4c738c2 4810 time_left = schedule_timeout_killable(1);
f104d044
JB
4811 if (time_left)
4812 break;
4813 }
963d678b 4814 delalloc_bytes = percpu_counter_sum_positive(
0b246afa 4815 &fs_info->delalloc_bytes);
5da9d01b 4816 }
5da9d01b
YZ
4817}
4818
663350ac
JB
4819/**
4820 * may_commit_transaction - possibly commit the transaction if it's ok to
4821 * @space_info - the space_info we're allocating for
4822 * @bytes - the number of bytes we want to reserve
4823 * @force - force the commit
8bb8ab2e 4824 *
663350ac
JB
4825 * This will check to make sure that committing the transaction will actually
4826 * get us somewhere and then commit the transaction if it does. Otherwise it
4827 * will return -ENOSPC.
8bb8ab2e 4828 */
0c9ab349 4829static int may_commit_transaction(struct btrfs_fs_info *fs_info,
663350ac
JB
4830 struct btrfs_space_info *space_info,
4831 u64 bytes, int force)
4832{
0b246afa 4833 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
663350ac
JB
4834 struct btrfs_trans_handle *trans;
4835
4836 trans = (struct btrfs_trans_handle *)current->journal_info;
4837 if (trans)
4838 return -EAGAIN;
4839
4840 if (force)
4841 goto commit;
4842
4843 /* See if there is enough pinned space to make this reservation */
b150a4f1 4844 if (percpu_counter_compare(&space_info->total_bytes_pinned,
0424c548 4845 bytes) >= 0)
663350ac 4846 goto commit;
663350ac
JB
4847
4848 /*
4849 * See if there is some space in the delayed insertion reservation for
4850 * this reservation.
4851 */
4852 if (space_info != delayed_rsv->space_info)
4853 return -ENOSPC;
4854
4855 spin_lock(&delayed_rsv->lock);
b150a4f1 4856 if (percpu_counter_compare(&space_info->total_bytes_pinned,
28785f70 4857 bytes - delayed_rsv->size) < 0) {
663350ac
JB
4858 spin_unlock(&delayed_rsv->lock);
4859 return -ENOSPC;
4860 }
4861 spin_unlock(&delayed_rsv->lock);
4862
4863commit:
a9b3311e 4864 trans = btrfs_join_transaction(fs_info->extent_root);
663350ac
JB
4865 if (IS_ERR(trans))
4866 return -ENOSPC;
4867
3a45bb20 4868 return btrfs_commit_transaction(trans);
663350ac
JB
4869}
4870
957780eb
JB
4871struct reserve_ticket {
4872 u64 bytes;
4873 int error;
4874 struct list_head list;
4875 wait_queue_head_t wait;
96c3f433
JB
4876};
4877
0c9ab349 4878static int flush_space(struct btrfs_fs_info *fs_info,
96c3f433
JB
4879 struct btrfs_space_info *space_info, u64 num_bytes,
4880 u64 orig_bytes, int state)
4881{
a9b3311e 4882 struct btrfs_root *root = fs_info->extent_root;
96c3f433
JB
4883 struct btrfs_trans_handle *trans;
4884 int nr;
f4c738c2 4885 int ret = 0;
96c3f433
JB
4886
4887 switch (state) {
96c3f433
JB
4888 case FLUSH_DELAYED_ITEMS_NR:
4889 case FLUSH_DELAYED_ITEMS:
18cd8ea6 4890 if (state == FLUSH_DELAYED_ITEMS_NR)
2ff7e61e 4891 nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
18cd8ea6 4892 else
96c3f433 4893 nr = -1;
18cd8ea6 4894
96c3f433
JB
4895 trans = btrfs_join_transaction(root);
4896 if (IS_ERR(trans)) {
4897 ret = PTR_ERR(trans);
4898 break;
4899 }
2ff7e61e 4900 ret = btrfs_run_delayed_items_nr(trans, fs_info, nr);
3a45bb20 4901 btrfs_end_transaction(trans);
96c3f433 4902 break;
67b0fd63
JB
4903 case FLUSH_DELALLOC:
4904 case FLUSH_DELALLOC_WAIT:
c1c4919b 4905 shrink_delalloc(fs_info, num_bytes * 2, orig_bytes,
67b0fd63
JB
4906 state == FLUSH_DELALLOC_WAIT);
4907 break;
ea658bad
JB
4908 case ALLOC_CHUNK:
4909 trans = btrfs_join_transaction(root);
4910 if (IS_ERR(trans)) {
4911 ret = PTR_ERR(trans);
4912 break;
4913 }
2ff7e61e 4914 ret = do_chunk_alloc(trans, fs_info,
1b86826d 4915 btrfs_metadata_alloc_profile(fs_info),
ea658bad 4916 CHUNK_ALLOC_NO_FORCE);
3a45bb20 4917 btrfs_end_transaction(trans);
eecba891 4918 if (ret > 0 || ret == -ENOSPC)
ea658bad
JB
4919 ret = 0;
4920 break;
96c3f433 4921 case COMMIT_TRANS:
0c9ab349
JM
4922 ret = may_commit_transaction(fs_info, space_info,
4923 orig_bytes, 0);
96c3f433
JB
4924 break;
4925 default:
4926 ret = -ENOSPC;
4927 break;
4928 }
4929
0b246afa 4930 trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes,
f376df2b 4931 orig_bytes, state, ret);
96c3f433
JB
4932 return ret;
4933}
21c7e756
MX
4934
4935static inline u64
c1c4919b
JM
4936btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
4937 struct btrfs_space_info *space_info,
4938 bool system_chunk)
21c7e756 4939{
957780eb 4940 struct reserve_ticket *ticket;
21c7e756
MX
4941 u64 used;
4942 u64 expected;
957780eb 4943 u64 to_reclaim = 0;
21c7e756 4944
957780eb
JB
4945 list_for_each_entry(ticket, &space_info->tickets, list)
4946 to_reclaim += ticket->bytes;
4947 list_for_each_entry(ticket, &space_info->priority_tickets, list)
4948 to_reclaim += ticket->bytes;
4949 if (to_reclaim)
4950 return to_reclaim;
21c7e756 4951
e0af2484 4952 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
c1c4919b
JM
4953 if (can_overcommit(fs_info, space_info, to_reclaim,
4954 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
e0af2484
WX
4955 return 0;
4956
21c7e756
MX
4957 used = space_info->bytes_used + space_info->bytes_reserved +
4958 space_info->bytes_pinned + space_info->bytes_readonly +
4959 space_info->bytes_may_use;
c1c4919b
JM
4960 if (can_overcommit(fs_info, space_info, SZ_1M,
4961 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
21c7e756
MX
4962 expected = div_factor_fine(space_info->total_bytes, 95);
4963 else
4964 expected = div_factor_fine(space_info->total_bytes, 90);
4965
4966 if (used > expected)
4967 to_reclaim = used - expected;
4968 else
4969 to_reclaim = 0;
4970 to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4971 space_info->bytes_reserved);
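 /*
  * Example: with total_bytes = 10 GiB and 9.8 GiB in use, if we can no
  * longer overcommit even 1 MiB then expected is 90% (9 GiB), so to_reclaim
  * starts at roughly 0.8 GiB before being clamped to what is actually
  * reclaimable (bytes_may_use + bytes_reserved).
  */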
21c7e756
MX
4972 return to_reclaim;
4973}
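/*
 * The reclaim target above is picked in stages: if any tickets are queued
 * (normal or priority) the target is simply the sum of their outstanding
 * bytes.  Otherwise, if a small amount can still be overcommitted, nothing is
 * reclaimed at all; failing that, the target aims to bring usage back under
 * roughly 90-95% of total_bytes, capped by what is actually reclaimable
 * (bytes_may_use + bytes_reserved).
 */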
4974
c1c4919b
JM
4975static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
4976 struct btrfs_space_info *space_info,
4977 u64 used, bool system_chunk)
21c7e756 4978{
365c5313
JB
4979 u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4980
4981 /* If we're just plain full then async reclaim just slows us down. */
baee8790 4982 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
365c5313
JB
4983 return 0;
4984
c1c4919b
JM
4985 if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4986 system_chunk))
d38b349c
JB
4987 return 0;
4988
0b246afa
JM
4989 return (used >= thresh && !btrfs_fs_closing(fs_info) &&
4990 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
21c7e756
MX
4991}
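/*
 * Preemptive background reclaim is only kicked off when the aggregate usage
 * passed in by the caller has crossed the 98% threshold while the space that
 * is truly consumed (bytes_used + bytes_reserved) has not, i.e. most of the
 * pressure comes from reservations that flushing can actually give back.  It
 * is also skipped when there is nothing to reclaim, while the filesystem is
 * closing, or during a remount.
 */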
4992
957780eb 4993static void wake_all_tickets(struct list_head *head)
21c7e756 4994{
957780eb 4995 struct reserve_ticket *ticket;
25ce459c 4996
957780eb
JB
4997 while (!list_empty(head)) {
4998 ticket = list_first_entry(head, struct reserve_ticket, list);
4999 list_del_init(&ticket->list);
5000 ticket->error = -ENOSPC;
5001 wake_up(&ticket->wait);
21c7e756 5002 }
21c7e756
MX
5003}
5004
957780eb
JB
5005/*
5006 * This is for normal flushers; we can wait all goddamned day if we want to. We
5007 * will loop and continuously try to flush as long as we are making progress.
5008 * We count progress as clearing off tickets each time we have to loop.
5009 */
21c7e756
MX
5010static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
5011{
5012 struct btrfs_fs_info *fs_info;
5013 struct btrfs_space_info *space_info;
5014 u64 to_reclaim;
5015 int flush_state;
957780eb 5016 int commit_cycles = 0;
ce129655 5017 u64 last_tickets_id;
21c7e756
MX
5018
5019 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
5020 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5021
957780eb 5022 spin_lock(&space_info->lock);
c1c4919b
JM
5023 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5024 false);
957780eb
JB
5025 if (!to_reclaim) {
5026 space_info->flush = 0;
5027 spin_unlock(&space_info->lock);
21c7e756 5028 return;
957780eb 5029 }
ce129655 5030 last_tickets_id = space_info->tickets_id;
957780eb 5031 spin_unlock(&space_info->lock);
21c7e756
MX
5032
5033 flush_state = FLUSH_DELAYED_ITEMS_NR;
957780eb
JB
5034 do {
5035 struct reserve_ticket *ticket;
5036 int ret;
5037
0c9ab349
JM
5038 ret = flush_space(fs_info, space_info, to_reclaim, to_reclaim,
5039 flush_state);
957780eb
JB
5040 spin_lock(&space_info->lock);
5041 if (list_empty(&space_info->tickets)) {
5042 space_info->flush = 0;
5043 spin_unlock(&space_info->lock);
5044 return;
5045 }
c1c4919b
JM
5046 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
5047 space_info,
5048 false);
957780eb
JB
5049 ticket = list_first_entry(&space_info->tickets,
5050 struct reserve_ticket, list);
ce129655 5051 if (last_tickets_id == space_info->tickets_id) {
957780eb
JB
5052 flush_state++;
5053 } else {
ce129655 5054 last_tickets_id = space_info->tickets_id;
957780eb
JB
5055 flush_state = FLUSH_DELAYED_ITEMS_NR;
5056 if (commit_cycles)
5057 commit_cycles--;
5058 }
5059
5060 if (flush_state > COMMIT_TRANS) {
5061 commit_cycles++;
5062 if (commit_cycles > 2) {
5063 wake_all_tickets(&space_info->tickets);
5064 space_info->flush = 0;
5065 } else {
5066 flush_state = FLUSH_DELAYED_ITEMS_NR;
5067 }
5068 }
5069 spin_unlock(&space_info->lock);
5070 } while (flush_state <= COMMIT_TRANS);
5071}
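/*
 * Progress in the loop above is tracked with space_info->tickets_id: if a
 * flush pass satisfied at least one ticket the id moves and the state machine
 * restarts from FLUSH_DELAYED_ITEMS_NR, otherwise the next (more expensive)
 * flush state is tried.  Once the ladder has run all the way past
 * COMMIT_TRANS enough times without net progress (commit_cycles > 2), every
 * remaining waiter is failed with -ENOSPC via wake_all_tickets().
 */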
5072
5073void btrfs_init_async_reclaim_work(struct work_struct *work)
5074{
5075 INIT_WORK(work, btrfs_async_reclaim_metadata_space);
5076}
5077
5078static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5079 struct btrfs_space_info *space_info,
5080 struct reserve_ticket *ticket)
5081{
5082 u64 to_reclaim;
5083 int flush_state = FLUSH_DELAYED_ITEMS_NR;
5084
5085 spin_lock(&space_info->lock);
c1c4919b
JM
5086 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5087 false);
957780eb
JB
5088 if (!to_reclaim) {
5089 spin_unlock(&space_info->lock);
5090 return;
5091 }
5092 spin_unlock(&space_info->lock);
5093
21c7e756 5094 do {
0c9ab349
JM
5095 flush_space(fs_info, space_info, to_reclaim, to_reclaim,
5096 flush_state);
21c7e756 5097 flush_state++;
957780eb
JB
5098 spin_lock(&space_info->lock);
5099 if (ticket->bytes == 0) {
5100 spin_unlock(&space_info->lock);
21c7e756 5101 return;
957780eb
JB
5102 }
5103 spin_unlock(&space_info->lock);
5104
5105 /*
5106 * Priority flushers can't wait on delalloc without
5107 * deadlocking.
5108 */
5109 if (flush_state == FLUSH_DELALLOC ||
5110 flush_state == FLUSH_DELALLOC_WAIT)
5111 flush_state = ALLOC_CHUNK;
365c5313 5112 } while (flush_state < COMMIT_TRANS);
21c7e756
MX
5113}
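/*
 * Priority flushers run the same ladder inline, with two differences: the
 * delalloc states are skipped (waiting on delalloc from here could deadlock,
 * as noted above) and the loop stops before COMMIT_TRANS, so a priority
 * reservation never forces a transaction commit on its own.
 */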
5114
957780eb
JB
5115static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5116 struct btrfs_space_info *space_info,
5117 struct reserve_ticket *ticket, u64 orig_bytes)
5118
21c7e756 5119{
957780eb
JB
5120 DEFINE_WAIT(wait);
5121 int ret = 0;
5122
5123 spin_lock(&space_info->lock);
5124 while (ticket->bytes > 0 && ticket->error == 0) {
5125 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5126 if (ret) {
5127 ret = -EINTR;
5128 break;
5129 }
5130 spin_unlock(&space_info->lock);
5131
5132 schedule();
5133
5134 finish_wait(&ticket->wait, &wait);
5135 spin_lock(&space_info->lock);
5136 }
5137 if (!ret)
5138 ret = ticket->error;
5139 if (!list_empty(&ticket->list))
5140 list_del_init(&ticket->list);
5141 if (ticket->bytes && ticket->bytes < orig_bytes) {
5142 u64 num_bytes = orig_bytes - ticket->bytes;
5143 space_info->bytes_may_use -= num_bytes;
5144 trace_btrfs_space_reservation(fs_info, "space_info",
5145 space_info->flags, num_bytes, 0);
5146 }
5147 spin_unlock(&space_info->lock);
5148
5149 return ret;
21c7e756
MX
5150}
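/*
 * wait_reserve_ticket() sleeps killably until the ticket is either fully
 * satisfied (ticket->bytes reaches zero) or failed by the flusher.  If the
 * ticket was only partially filled, the partially granted amount is handed
 * back out of bytes_may_use here, because the caller treats the reservation
 * as having failed as a whole.
 */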
5151
4a92b1b8
JB
5152/**
5153 * __reserve_metadata_bytes - try to reserve bytes from the space_info's space
5154 * @fs_info - the filesystem we're allocating for
957780eb 5155 * @space_info - the space info we want to allocate from
4a92b1b8 5156 * @orig_bytes - the number of bytes we want
48fc7f7e 5157 * @flush - whether or not we can flush to make our reservation
8bb8ab2e 5158 *
01327610 5159 * This will reserve orig_bytes number of bytes from the space info associated
4a92b1b8
JB
5160 * with the block_rsv. If there is not enough space it will make an attempt to
5161 * flush out space to make room. It will do this by flushing delalloc if
5162 * possible or committing the transaction. If flush is 0 then no attempts to
5163 * regain reservations will be made and this will fail if there is not enough
5164 * space already.
8bb8ab2e 5165 */
c1c4919b 5166static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
957780eb
JB
5167 struct btrfs_space_info *space_info,
5168 u64 orig_bytes,
c1c4919b
JM
5169 enum btrfs_reserve_flush_enum flush,
5170 bool system_chunk)
9ed74f2d 5171{
957780eb 5172 struct reserve_ticket ticket;
2bf64758 5173 u64 used;
8bb8ab2e 5174 int ret = 0;
9ed74f2d 5175
957780eb 5176 ASSERT(orig_bytes);
8ca17f0f 5177 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
fdb5effd 5178
8bb8ab2e 5179 spin_lock(&space_info->lock);
fdb5effd 5180 ret = -ENOSPC;
4136135b 5181 used = btrfs_space_info_used(space_info, true);
9ed74f2d 5182
8bb8ab2e 5183 /*
957780eb
JB
5184 * If we have enough space then hooray, make our reservation and carry
5185 * on. If not see if we can overcommit, and if we can, hooray carry on.
5186 * If not things get more complicated.
8bb8ab2e 5187 */
957780eb
JB
5188 if (used + orig_bytes <= space_info->total_bytes) {
5189 space_info->bytes_may_use += orig_bytes;
0b246afa
JM
5190 trace_btrfs_space_reservation(fs_info, "space_info",
5191 space_info->flags, orig_bytes, 1);
957780eb 5192 ret = 0;
c1c4919b
JM
5193 } else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
5194 system_chunk)) {
44734ed1 5195 space_info->bytes_may_use += orig_bytes;
0b246afa
JM
5196 trace_btrfs_space_reservation(fs_info, "space_info",
5197 space_info->flags, orig_bytes, 1);
44734ed1 5198 ret = 0;
2bf64758
JB
5199 }
5200
8bb8ab2e 5201 /*
957780eb
JB
5202 * If we couldn't make a reservation then set up our reservation ticket
5203 * and kick the async worker if it's not already running.
08e007d2 5204 *
957780eb
JB
5205 * If we are a priority flusher then we just need to add our ticket to
5206 * the list and we will do our own flushing further down.
8bb8ab2e 5207 */
72bcd99d 5208 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
957780eb
JB
5209 ticket.bytes = orig_bytes;
5210 ticket.error = 0;
5211 init_waitqueue_head(&ticket.wait);
5212 if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5213 list_add_tail(&ticket.list, &space_info->tickets);
5214 if (!space_info->flush) {
5215 space_info->flush = 1;
0b246afa 5216 trace_btrfs_trigger_flush(fs_info,
f376df2b
JB
5217 space_info->flags,
5218 orig_bytes, flush,
5219 "enospc");
957780eb 5220 queue_work(system_unbound_wq,
c1c4919b 5221 &fs_info->async_reclaim_work);
957780eb
JB
5222 }
5223 } else {
5224 list_add_tail(&ticket.list,
5225 &space_info->priority_tickets);
5226 }
21c7e756
MX
5227 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5228 used += orig_bytes;
f6acfd50
JB
5229 /*
5230 * We will do the space reservation dance during log replay,
5231 * which means we won't have fs_info->fs_root set, so don't do
5232 * the async reclaim as we will panic.
5233 */
0b246afa 5234 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
c1c4919b
JM
5235 need_do_async_reclaim(fs_info, space_info,
5236 used, system_chunk) &&
0b246afa
JM
5237 !work_busy(&fs_info->async_reclaim_work)) {
5238 trace_btrfs_trigger_flush(fs_info, space_info->flags,
5239 orig_bytes, flush, "preempt");
21c7e756 5240 queue_work(system_unbound_wq,
0b246afa 5241 &fs_info->async_reclaim_work);
f376df2b 5242 }
8bb8ab2e 5243 }
f0486c68 5244 spin_unlock(&space_info->lock);
08e007d2 5245 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
957780eb 5246 return ret;
f0486c68 5247
957780eb 5248 if (flush == BTRFS_RESERVE_FLUSH_ALL)
0b246afa 5249 return wait_reserve_ticket(fs_info, space_info, &ticket,
957780eb 5250 orig_bytes);
08e007d2 5251
957780eb 5252 ret = 0;
0b246afa 5253 priority_reclaim_metadata_space(fs_info, space_info, &ticket);
957780eb
JB
5254 spin_lock(&space_info->lock);
5255 if (ticket.bytes) {
5256 if (ticket.bytes < orig_bytes) {
5257 u64 num_bytes = orig_bytes - ticket.bytes;
5258 space_info->bytes_may_use -= num_bytes;
0b246afa
JM
5259 trace_btrfs_space_reservation(fs_info, "space_info",
5260 space_info->flags,
5261 num_bytes, 0);
08e007d2 5262
957780eb
JB
5263 }
5264 list_del_init(&ticket.list);
5265 ret = -ENOSPC;
5266 }
5267 spin_unlock(&space_info->lock);
5268 ASSERT(list_empty(&ticket.list));
5269 return ret;
5270}
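/*
 * To summarize the fast and slow paths above: the reservation succeeds
 * immediately if it fits within total_bytes or can be overcommitted.
 * Otherwise a ticket is queued - on the normal list (kicking the async
 * reclaim worker) for BTRFS_RESERVE_FLUSH_ALL, or on the priority list for
 * the limited-flush cases, which then flush for themselves.  A priority
 * ticket that still has bytes outstanding afterwards gives back anything it
 * was partially granted and returns -ENOSPC.
 */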
8bb8ab2e 5271
957780eb
JB
5272/**
5273 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5274 * @root - the root we're allocating for
5275 * @block_rsv - the block_rsv we're allocating for
5276 * @orig_bytes - the number of bytes we want
5277 * @flush - whether or not we can flush to make our reservation
5278 *
5279 * This will reserve orig_bytes number of bytes from the space info associated
5280 * with the block_rsv. If there is not enough space it will make an attempt to
5281 * flush out space to make room. It will do this by flushing delalloc if
5282 * possible or committing the transaction. If flush is 0 then no attempts to
5283 * regain reservations will be made and this will fail if there is not enough
5284 * space already.
5285 */
5286static int reserve_metadata_bytes(struct btrfs_root *root,
5287 struct btrfs_block_rsv *block_rsv,
5288 u64 orig_bytes,
5289 enum btrfs_reserve_flush_enum flush)
5290{
0b246afa
JM
5291 struct btrfs_fs_info *fs_info = root->fs_info;
5292 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
957780eb 5293 int ret;
c1c4919b 5294 bool system_chunk = (root == fs_info->chunk_root);
957780eb 5295
c1c4919b
JM
5296 ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5297 orig_bytes, flush, system_chunk);
5d80366e
JB
5298 if (ret == -ENOSPC &&
5299 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5d80366e
JB
5300 if (block_rsv != global_rsv &&
5301 !block_rsv_use_bytes(global_rsv, orig_bytes))
5302 ret = 0;
5303 }
cab45e22 5304 if (ret == -ENOSPC)
0b246afa 5305 trace_btrfs_space_reservation(fs_info, "space_info:enospc",
957780eb
JB
5306 block_rsv->space_info->flags,
5307 orig_bytes, 1);
f0486c68
YZ
5308 return ret;
5309}
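/*
 * Two details of the wrapper above: reservations made on behalf of the chunk
 * root are flagged as system_chunk, which is passed down to the overcommit
 * checks, and while orphan cleanup is running an -ENOSPC result is retried
 * against the global block reserve before being reported to the caller.
 */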
5310
79787eaa
JM
5311static struct btrfs_block_rsv *get_block_rsv(
5312 const struct btrfs_trans_handle *trans,
5313 const struct btrfs_root *root)
f0486c68 5314{
0b246afa 5315 struct btrfs_fs_info *fs_info = root->fs_info;
4c13d758
JB
5316 struct btrfs_block_rsv *block_rsv = NULL;
5317
e9cf439f 5318 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
0b246afa
JM
5319 (root == fs_info->csum_root && trans->adding_csums) ||
5320 (root == fs_info->uuid_root))
f7a81ea4
SB
5321 block_rsv = trans->block_rsv;
5322
4c13d758 5323 if (!block_rsv)
f0486c68
YZ
5324 block_rsv = root->block_rsv;
5325
5326 if (!block_rsv)
0b246afa 5327 block_rsv = &fs_info->empty_block_rsv;
f0486c68
YZ
5328
5329 return block_rsv;
5330}
5331
5332static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5333 u64 num_bytes)
5334{
5335 int ret = -ENOSPC;
5336 spin_lock(&block_rsv->lock);
5337 if (block_rsv->reserved >= num_bytes) {
5338 block_rsv->reserved -= num_bytes;
5339 if (block_rsv->reserved < block_rsv->size)
5340 block_rsv->full = 0;
5341 ret = 0;
5342 }
5343 spin_unlock(&block_rsv->lock);
5344 return ret;
5345}
5346
5347static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5348 u64 num_bytes, int update_size)
5349{
5350 spin_lock(&block_rsv->lock);
5351 block_rsv->reserved += num_bytes;
5352 if (update_size)
5353 block_rsv->size += num_bytes;
5354 else if (block_rsv->reserved >= block_rsv->size)
5355 block_rsv->full = 1;
5356 spin_unlock(&block_rsv->lock);
5357}
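/*
 * block_rsv_use_bytes() and block_rsv_add_bytes() are the two primitives the
 * rest of the block_rsv code in this file is built on: the former takes bytes
 * out of ->reserved (failing with -ENOSPC if not enough is reserved), the
 * latter puts bytes in and, when update_size is set, grows ->size along with
 * ->reserved.
 */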
5358
d52be818
JB
5359int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5360 struct btrfs_block_rsv *dest, u64 num_bytes,
5361 int min_factor)
5362{
5363 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5364 u64 min_bytes;
5365
5366 if (global_rsv->space_info != dest->space_info)
5367 return -ENOSPC;
5368
5369 spin_lock(&global_rsv->lock);
5370 min_bytes = div_factor(global_rsv->size, min_factor);
5371 if (global_rsv->reserved < min_bytes + num_bytes) {
5372 spin_unlock(&global_rsv->lock);
5373 return -ENOSPC;
5374 }
5375 global_rsv->reserved -= num_bytes;
5376 if (global_rsv->reserved < global_rsv->size)
5377 global_rsv->full = 0;
5378 spin_unlock(&global_rsv->lock);
5379
5380 block_rsv_add_bytes(dest, num_bytes, 1);
5381 return 0;
5382}
5383
957780eb
JB
5384/*
5385 * This is for space we already have accounted in space_info->bytes_may_use, so
5386 * basically when we're returning space from block_rsvs.
5387 */
5388static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5389 struct btrfs_space_info *space_info,
5390 u64 num_bytes)
5391{
5392 struct reserve_ticket *ticket;
5393 struct list_head *head;
5394 u64 used;
5395 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5396 bool check_overcommit = false;
5397
5398 spin_lock(&space_info->lock);
5399 head = &space_info->priority_tickets;
5400
5401 /*
5402 * If we are over our limit then we need to check and see if we can
5403 * overcommit, and if we can't then we just need to free up our space
5404 * and not satisfy any requests.
5405 */
5406 used = space_info->bytes_used + space_info->bytes_reserved +
5407 space_info->bytes_pinned + space_info->bytes_readonly +
5408 space_info->bytes_may_use;
5409 if (used - num_bytes >= space_info->total_bytes)
5410 check_overcommit = true;
5411again:
5412 while (!list_empty(head) && num_bytes) {
5413 ticket = list_first_entry(head, struct reserve_ticket,
5414 list);
5415 /*
5416 * We use 0 bytes because this space is already reserved, so
5417 * adding the ticket space would be a double count.
5418 */
5419 if (check_overcommit &&
c1c4919b 5420 !can_overcommit(fs_info, space_info, 0, flush, false))
957780eb
JB
5421 break;
5422 if (num_bytes >= ticket->bytes) {
5423 list_del_init(&ticket->list);
5424 num_bytes -= ticket->bytes;
5425 ticket->bytes = 0;
ce129655 5426 space_info->tickets_id++;
957780eb
JB
5427 wake_up(&ticket->wait);
5428 } else {
5429 ticket->bytes -= num_bytes;
5430 num_bytes = 0;
5431 }
5432 }
5433
5434 if (num_bytes && head == &space_info->priority_tickets) {
5435 head = &space_info->tickets;
5436 flush = BTRFS_RESERVE_FLUSH_ALL;
5437 goto again;
5438 }
5439 space_info->bytes_may_use -= num_bytes;
5440 trace_btrfs_space_reservation(fs_info, "space_info",
5441 space_info->flags, num_bytes, 0);
5442 spin_unlock(&space_info->lock);
5443}
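/*
 * Space returned here is used to satisfy waiting tickets before anything
 * else: priority tickets are walked first, then the normal list (where the
 * overcommit check is relaxed to BTRFS_RESERVE_FLUSH_ALL).  Whatever is left
 * over is dropped from bytes_may_use, since this space was already accounted
 * there by the original reservation.
 */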
5444
5445/*
5446 * This is for newly allocated space that isn't accounted in
5447 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
5448 * we use this helper.
5449 */
5450static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5451 struct btrfs_space_info *space_info,
5452 u64 num_bytes)
5453{
5454 struct reserve_ticket *ticket;
5455 struct list_head *head = &space_info->priority_tickets;
5456
5457again:
5458 while (!list_empty(head) && num_bytes) {
5459 ticket = list_first_entry(head, struct reserve_ticket,
5460 list);
5461 if (num_bytes >= ticket->bytes) {
5462 trace_btrfs_space_reservation(fs_info, "space_info",
5463 space_info->flags,
5464 ticket->bytes, 1);
5465 list_del_init(&ticket->list);
5466 num_bytes -= ticket->bytes;
5467 space_info->bytes_may_use += ticket->bytes;
5468 ticket->bytes = 0;
ce129655 5469 space_info->tickets_id++;
957780eb
JB
5470 wake_up(&ticket->wait);
5471 } else {
5472 trace_btrfs_space_reservation(fs_info, "space_info",
5473 space_info->flags,
5474 num_bytes, 1);
5475 space_info->bytes_may_use += num_bytes;
5476 ticket->bytes -= num_bytes;
5477 num_bytes = 0;
5478 }
5479 }
5480
5481 if (num_bytes && head == &space_info->priority_tickets) {
5482 head = &space_info->tickets;
5483 goto again;
5484 }
5485}
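/*
 * Unlike space_info_add_old_bytes() above, the space handed out here is not
 * yet part of bytes_may_use, so every byte granted to a ticket is added to
 * bytes_may_use as the ticket is (partially) satisfied; any remainder is
 * simply left for the normal allocation paths to use.
 */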
5486
8c2a3ca2
JB
5487static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5488 struct btrfs_block_rsv *block_rsv,
62a45b60 5489 struct btrfs_block_rsv *dest, u64 num_bytes)
f0486c68
YZ
5490{
5491 struct btrfs_space_info *space_info = block_rsv->space_info;
5492
5493 spin_lock(&block_rsv->lock);
5494 if (num_bytes == (u64)-1)
5495 num_bytes = block_rsv->size;
5496 block_rsv->size -= num_bytes;
5497 if (block_rsv->reserved >= block_rsv->size) {
5498 num_bytes = block_rsv->reserved - block_rsv->size;
5499 block_rsv->reserved = block_rsv->size;
5500 block_rsv->full = 1;
5501 } else {
5502 num_bytes = 0;
5503 }
5504 spin_unlock(&block_rsv->lock);
5505
5506 if (num_bytes > 0) {
5507 if (dest) {
e9e22899
JB
5508 spin_lock(&dest->lock);
5509 if (!dest->full) {
5510 u64 bytes_to_add;
5511
5512 bytes_to_add = dest->size - dest->reserved;
5513 bytes_to_add = min(num_bytes, bytes_to_add);
5514 dest->reserved += bytes_to_add;
5515 if (dest->reserved >= dest->size)
5516 dest->full = 1;
5517 num_bytes -= bytes_to_add;
5518 }
5519 spin_unlock(&dest->lock);
5520 }
957780eb
JB
5521 if (num_bytes)
5522 space_info_add_old_bytes(fs_info, space_info,
5523 num_bytes);
9ed74f2d 5524 }
f0486c68 5525}
4e06bdd6 5526
25d609f8
JB
5527int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5528 struct btrfs_block_rsv *dst, u64 num_bytes,
5529 int update_size)
f0486c68
YZ
5530{
5531 int ret;
9ed74f2d 5532
f0486c68
YZ
5533 ret = block_rsv_use_bytes(src, num_bytes);
5534 if (ret)
5535 return ret;
9ed74f2d 5536
25d609f8 5537 block_rsv_add_bytes(dst, num_bytes, update_size);
9ed74f2d
JB
5538 return 0;
5539}
5540
66d8f3dd 5541void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
9ed74f2d 5542{
f0486c68
YZ
5543 memset(rsv, 0, sizeof(*rsv));
5544 spin_lock_init(&rsv->lock);
66d8f3dd 5545 rsv->type = type;
f0486c68
YZ
5546}
5547
2ff7e61e 5548struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
66d8f3dd 5549 unsigned short type)
f0486c68
YZ
5550{
5551 struct btrfs_block_rsv *block_rsv;
9ed74f2d 5552
f0486c68
YZ
5553 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5554 if (!block_rsv)
5555 return NULL;
9ed74f2d 5556
66d8f3dd 5557 btrfs_init_block_rsv(block_rsv, type);
f0486c68
YZ
5558 block_rsv->space_info = __find_space_info(fs_info,
5559 BTRFS_BLOCK_GROUP_METADATA);
f0486c68
YZ
5560 return block_rsv;
5561}
9ed74f2d 5562
2ff7e61e 5563void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
f0486c68
YZ
5564 struct btrfs_block_rsv *rsv)
5565{
2aaa6655
JB
5566 if (!rsv)
5567 return;
2ff7e61e 5568 btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
dabdb640 5569 kfree(rsv);
9ed74f2d
JB
5570}
5571
cdfb080e
CM
5572void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
5573{
5574 kfree(rsv);
5575}
5576
08e007d2
MX
5577int btrfs_block_rsv_add(struct btrfs_root *root,
5578 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5579 enum btrfs_reserve_flush_enum flush)
9ed74f2d 5580{
f0486c68 5581 int ret;
9ed74f2d 5582
f0486c68
YZ
5583 if (num_bytes == 0)
5584 return 0;
8bb8ab2e 5585
61b520a9 5586 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
f0486c68
YZ
5587 if (!ret) {
5588 block_rsv_add_bytes(block_rsv, num_bytes, 1);
5589 return 0;
5590 }
9ed74f2d 5591
f0486c68 5592 return ret;
f0486c68 5593}
9ed74f2d 5594
2ff7e61e 5595int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
f0486c68
YZ
5596{
5597 u64 num_bytes = 0;
f0486c68 5598 int ret = -ENOSPC;
9ed74f2d 5599
f0486c68
YZ
5600 if (!block_rsv)
5601 return 0;
9ed74f2d 5602
f0486c68 5603 spin_lock(&block_rsv->lock);
36ba022a
JB
5604 num_bytes = div_factor(block_rsv->size, min_factor);
5605 if (block_rsv->reserved >= num_bytes)
5606 ret = 0;
5607 spin_unlock(&block_rsv->lock);
9ed74f2d 5608
36ba022a
JB
5609 return ret;
5610}
5611
08e007d2
MX
5612int btrfs_block_rsv_refill(struct btrfs_root *root,
5613 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5614 enum btrfs_reserve_flush_enum flush)
36ba022a
JB
5615{
5616 u64 num_bytes = 0;
5617 int ret = -ENOSPC;
5618
5619 if (!block_rsv)
5620 return 0;
5621
5622 spin_lock(&block_rsv->lock);
5623 num_bytes = min_reserved;
13553e52 5624 if (block_rsv->reserved >= num_bytes)
f0486c68 5625 ret = 0;
13553e52 5626 else
f0486c68 5627 num_bytes -= block_rsv->reserved;
f0486c68 5628 spin_unlock(&block_rsv->lock);
13553e52 5629
f0486c68
YZ
5630 if (!ret)
5631 return 0;
5632
aa38a711 5633 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
dabdb640
JB
5634 if (!ret) {
5635 block_rsv_add_bytes(block_rsv, num_bytes, 0);
f0486c68 5636 return 0;
6a63209f 5637 }
9ed74f2d 5638
13553e52 5639 return ret;
f0486c68
YZ
5640}
5641
2ff7e61e 5642void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
f0486c68
YZ
5643 struct btrfs_block_rsv *block_rsv,
5644 u64 num_bytes)
5645{
0b246afa
JM
5646 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5647
17504584 5648 if (global_rsv == block_rsv ||
f0486c68
YZ
5649 block_rsv->space_info != global_rsv->space_info)
5650 global_rsv = NULL;
0b246afa 5651 block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes);
6a63209f
JB
5652}
5653
8929ecfa
YZ
5654static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5655{
5656 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5657 struct btrfs_space_info *sinfo = block_rsv->space_info;
5658 u64 num_bytes;
6a63209f 5659
ae2e4728
JB
5660 /*
5661 * The global block rsv is based on the size of the extent tree, the
5662 * checksum tree and the root tree. If the fs is empty we want to set
5663 * it to a minimal amount for safety.
5664 */
5665 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5666 btrfs_root_used(&fs_info->csum_root->root_item) +
5667 btrfs_root_used(&fs_info->tree_root->root_item);
5668 num_bytes = max_t(u64, num_bytes, SZ_16M);
33b4d47f 5669
8929ecfa 5670 spin_lock(&sinfo->lock);
1f699d38 5671 spin_lock(&block_rsv->lock);
4e06bdd6 5672
ee22184b 5673 block_rsv->size = min_t(u64, num_bytes, SZ_512M);
4e06bdd6 5674
fb4b10e5 5675 if (block_rsv->reserved < block_rsv->size) {
4136135b 5676 num_bytes = btrfs_space_info_used(sinfo, true);
fb4b10e5
JB
5677 if (sinfo->total_bytes > num_bytes) {
5678 num_bytes = sinfo->total_bytes - num_bytes;
5679 num_bytes = min(num_bytes,
5680 block_rsv->size - block_rsv->reserved);
5681 block_rsv->reserved += num_bytes;
5682 sinfo->bytes_may_use += num_bytes;
5683 trace_btrfs_space_reservation(fs_info, "space_info",
5684 sinfo->flags, num_bytes,
5685 1);
5686 }
5687 } else if (block_rsv->reserved > block_rsv->size) {
8929ecfa 5688 num_bytes = block_rsv->reserved - block_rsv->size;
fb25e914 5689 sinfo->bytes_may_use -= num_bytes;
8c2a3ca2 5690 trace_btrfs_space_reservation(fs_info, "space_info",
2bcc0328 5691 sinfo->flags, num_bytes, 0);
8929ecfa 5692 block_rsv->reserved = block_rsv->size;
8929ecfa 5693 }
182608c8 5694
fb4b10e5
JB
5695 if (block_rsv->reserved == block_rsv->size)
5696 block_rsv->full = 1;
5697 else
5698 block_rsv->full = 0;
5699
8929ecfa 5700 spin_unlock(&block_rsv->lock);
1f699d38 5701 spin_unlock(&sinfo->lock);
6a63209f
JB
5702}
5703
f0486c68 5704static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
6a63209f 5705{
f0486c68 5706 struct btrfs_space_info *space_info;
6a63209f 5707
f0486c68
YZ
5708 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5709 fs_info->chunk_block_rsv.space_info = space_info;
6a63209f 5710
f0486c68 5711 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
8929ecfa 5712 fs_info->global_block_rsv.space_info = space_info;
8929ecfa 5713 fs_info->delalloc_block_rsv.space_info = space_info;
f0486c68
YZ
5714 fs_info->trans_block_rsv.space_info = space_info;
5715 fs_info->empty_block_rsv.space_info = space_info;
6d668dda 5716 fs_info->delayed_block_rsv.space_info = space_info;
f0486c68 5717
8929ecfa
YZ
5718 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5719 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5720 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5721 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3a6cad90
SB
5722 if (fs_info->quota_root)
5723 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
f0486c68 5724 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
8929ecfa 5725
8929ecfa 5726 update_global_block_rsv(fs_info);
6a63209f
JB
5727}
5728
8929ecfa 5729static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
6a63209f 5730{
8c2a3ca2
JB
5731 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5732 (u64)-1);
8929ecfa
YZ
5733 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
5734 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
5735 WARN_ON(fs_info->trans_block_rsv.size > 0);
5736 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5737 WARN_ON(fs_info->chunk_block_rsv.size > 0);
5738 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
6d668dda
JB
5739 WARN_ON(fs_info->delayed_block_rsv.size > 0);
5740 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
fcb80c2a
JB
5741}
5742
a22285a6 5743void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
2ff7e61e 5744 struct btrfs_fs_info *fs_info)
6a63209f 5745{
0e721106
JB
5746 if (!trans->block_rsv)
5747 return;
5748
a22285a6
YZ
5749 if (!trans->bytes_reserved)
5750 return;
6a63209f 5751
0b246afa 5752 trace_btrfs_space_reservation(fs_info, "transaction",
2bcc0328 5753 trans->transid, trans->bytes_reserved, 0);
2ff7e61e
JM
5754 btrfs_block_rsv_release(fs_info, trans->block_rsv,
5755 trans->bytes_reserved);
a22285a6
YZ
5756 trans->bytes_reserved = 0;
5757}
6a63209f 5758
4fbcdf66
FM
5759/*
5760 * To be called after all the new block groups attached to the transaction
5761 * handle have been created (btrfs_create_pending_block_groups()).
5762 */
5763void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5764{
64b63580 5765 struct btrfs_fs_info *fs_info = trans->fs_info;
4fbcdf66
FM
5766
5767 if (!trans->chunk_bytes_reserved)
5768 return;
5769
5770 WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5771
5772 block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5773 trans->chunk_bytes_reserved);
5774 trans->chunk_bytes_reserved = 0;
5775}
5776
79787eaa 5777/* Can only return 0 or -ENOSPC */
d68fc57b 5778int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
8ed7a2a0 5779 struct btrfs_inode *inode)
d68fc57b 5780{
8ed7a2a0
NB
5781 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
5782 struct btrfs_root *root = inode->root;
40acc3ee
JB
5783 /*
5784 * We always use trans->block_rsv here as we will have reserved space
5785 * for our orphan when starting the transaction; using get_block_rsv()
5786 * here would sometimes make us choose the wrong block rsv, as we could be
5787 * doing a reloc inode for a non-refcounted root.
5788 */
5789 struct btrfs_block_rsv *src_rsv = trans->block_rsv;
d68fc57b
YZ
5790 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
5791
5792 /*
fcb80c2a
JB
5793 * We need to hold space in order to delete our orphan item once we've
5794 * added it, so this takes the reservation so we can release it later
5795 * when we are truly done with the orphan item.
d68fc57b 5796 */
0b246afa
JM
5797 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
5798
8ed7a2a0
NB
5799 trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
5800 num_bytes, 1);
25d609f8 5801 return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
6a63209f
JB
5802}
5803
703b391a 5804void btrfs_orphan_release_metadata(struct btrfs_inode *inode)
97e728d4 5805{
703b391a
NB
5806 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
5807 struct btrfs_root *root = inode->root;
0b246afa
JM
5808 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
5809
703b391a
NB
5810 trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
5811 num_bytes, 0);
2ff7e61e 5812 btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes);
d68fc57b 5813}
97e728d4 5814
d5c12070
MX
5815/*
5816 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5817 * root: the root of the parent directory
5818 * rsv: block reservation
5819 * items: the number of items that we need to reserve for
5820 * qgroup_reserved: used to return the reserved size in qgroup
5821 *
5822 * This function is used to reserve the space for snapshot/subvolume
5823 * creation and deletion. Those operations differ from the
5824 * common file/directory operations: they change two fs/file trees
5825 * and the root tree, and the number of items that the qgroup reserves
5826 * differs from the free space reservation, so we cannot use
01327610 5827 * the space reservation mechanism in start_transaction().
d5c12070
MX
5828 */
5829int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5830 struct btrfs_block_rsv *rsv,
5831 int items,
ee3441b4
JM
5832 u64 *qgroup_reserved,
5833 bool use_global_rsv)
a22285a6 5834{
d5c12070
MX
5835 u64 num_bytes;
5836 int ret;
0b246afa
JM
5837 struct btrfs_fs_info *fs_info = root->fs_info;
5838 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
d5c12070 5839
0b246afa 5840 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
d5c12070 5841 /* One for parent inode, two for dir entries */
0b246afa 5842 num_bytes = 3 * fs_info->nodesize;
003d7c59 5843 ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
d5c12070
MX
5844 if (ret)
5845 return ret;
5846 } else {
5847 num_bytes = 0;
5848 }
5849
5850 *qgroup_reserved = num_bytes;
5851
0b246afa
JM
5852 num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
5853 rsv->space_info = __find_space_info(fs_info,
d5c12070
MX
5854 BTRFS_BLOCK_GROUP_METADATA);
5855 ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5856 BTRFS_RESERVE_FLUSH_ALL);
ee3441b4
JM
5857
5858 if (ret == -ENOSPC && use_global_rsv)
25d609f8 5859 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
ee3441b4 5860
7174109c
QW
5861 if (ret && *qgroup_reserved)
5862 btrfs_qgroup_free_meta(root, *qgroup_reserved);
d5c12070
MX
5863
5864 return ret;
5865}
5866
2ff7e61e 5867void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
7775c818 5868 struct btrfs_block_rsv *rsv)
d5c12070 5869{
2ff7e61e 5870 btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
97e728d4
JB
5871}
5872
7709cde3
JB
5873/**
5874 * drop_outstanding_extent - drop an outstanding extent
5875 * @inode: the inode we're dropping the extent for
01327610 5876 * @num_bytes: the number of bytes we're releasing.
7709cde3
JB
5877 *
5878 * This is called when we are freeing up an outstanding extent, either called
5879 * after an error or after an extent is written. This will return the number of
5880 * reserved extents that need to be freed. This must be called with
5881 * BTRFS_I(inode)->lock held.
5882 */
baa3ba39
NB
5883static unsigned drop_outstanding_extent(struct btrfs_inode *inode,
5884 u64 num_bytes)
9e0baf60 5885{
7fd2ae21 5886 unsigned drop_inode_space = 0;
9e0baf60 5887 unsigned dropped_extents = 0;
823bb20a 5888 unsigned num_extents;
9e0baf60 5889
823bb20a 5890 num_extents = count_max_extents(num_bytes);
dcab6a3b 5891 ASSERT(num_extents);
baa3ba39
NB
5892 ASSERT(inode->outstanding_extents >= num_extents);
5893 inode->outstanding_extents -= num_extents;
9e0baf60 5894
baa3ba39 5895 if (inode->outstanding_extents == 0 &&
72ac3c0d 5896 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
baa3ba39 5897 &inode->runtime_flags))
7fd2ae21 5898 drop_inode_space = 1;
7fd2ae21 5899
9e0baf60 5900 /*
01327610 5901 * If we have at least as many outstanding extents as we have
9e0baf60
JB
5902 * reserved then we need to leave the reserved extents count alone.
5903 */
baa3ba39 5904 if (inode->outstanding_extents >= inode->reserved_extents)
7fd2ae21 5905 return drop_inode_space;
9e0baf60 5906
baa3ba39
NB
5907 dropped_extents = inode->reserved_extents - inode->outstanding_extents;
5908 inode->reserved_extents -= dropped_extents;
7fd2ae21 5909 return dropped_extents + drop_inode_space;
9e0baf60
JB
5910}
5911
7709cde3 5912/**
01327610
NS
5913 * calc_csum_metadata_size - return the amount of metadata space that must be
5914 * reserved/freed for the given bytes.
7709cde3
JB
5915 * @inode: the inode we're manipulating
5916 * @num_bytes: the number of bytes in question
5917 * @reserve: 1 if we are reserving space, 0 if we are freeing space
5918 *
5919 * This adjusts the number of csum_bytes in the inode and then returns the
5920 * correct amount of metadata that must either be reserved or freed. We
5921 * calculate how many checksums we can fit into one leaf and then divide the
5922 * number of bytes that will need to be checksummed by this value to figure out
5923 * how many checksums will be required. If we are adding bytes then the number
5924 * may go up and we will return the number of additional bytes that must be
5925 * reserved. If it is going down we will return the number of bytes that must
5926 * be freed.
5927 *
5928 * This must be called with BTRFS_I(inode)->lock held.
5929 */
0e6bf9b1 5930static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes,
7709cde3 5931 int reserve)
6324fbf3 5932{
0e6bf9b1 5933 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1262133b 5934 u64 old_csums, num_csums;
7709cde3 5935
0e6bf9b1 5936 if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0)
7709cde3
JB
5937 return 0;
5938
0e6bf9b1 5939 old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
7709cde3 5940 if (reserve)
0e6bf9b1 5941 inode->csum_bytes += num_bytes;
7709cde3 5942 else
0e6bf9b1
NB
5943 inode->csum_bytes -= num_bytes;
5944 num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
7709cde3
JB
5945
5946 /* No change, no need to reserve more */
5947 if (old_csums == num_csums)
5948 return 0;
5949
5950 if (reserve)
0b246afa 5951 return btrfs_calc_trans_metadata_size(fs_info,
7709cde3
JB
5952 num_csums - old_csums);
5953
0b246afa 5954 return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums);
0ca1f7ce 5955}
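/*
 * In other words: the inode's checksum bytes are translated into a number of
 * csum tree leaves before and after the change, and only the difference in
 * leaves, converted to bytes with btrfs_calc_trans_metadata_size(), needs to
 * be reserved or can be freed.
 */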
c146afad 5956
9f3db423 5957int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
0ca1f7ce 5958{
9f3db423
NB
5959 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
5960 struct btrfs_root *root = inode->root;
0b246afa 5961 struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv;
9e0baf60 5962 u64 to_reserve = 0;
660d3f6c 5963 u64 csum_bytes;
823bb20a 5964 unsigned nr_extents;
08e007d2 5965 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
eb6b88d9 5966 int ret = 0;
c64c2bd8 5967 bool delalloc_lock = true;
88e081bf
WS
5968 u64 to_free = 0;
5969 unsigned dropped;
48c3d480 5970 bool release_extra = false;
6324fbf3 5971
c64c2bd8
JB
5972 /* If we are a free space inode we need to not flush since we will be in
5973 * the middle of a transaction commit. We also don't need the delalloc
5974 * mutex since we won't race with anybody. We need this mostly to make
5975 * lockdep shut its filthy mouth.
bac357dc
JB
5976 *
5977 * If we have a transaction open (can happen if we call truncate_block
5978 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
c64c2bd8
JB
5979 */
5980 if (btrfs_is_free_space_inode(inode)) {
08e007d2 5981 flush = BTRFS_RESERVE_NO_FLUSH;
c64c2bd8 5982 delalloc_lock = false;
bac357dc
JB
5983 } else if (current->journal_info) {
5984 flush = BTRFS_RESERVE_FLUSH_LIMIT;
c64c2bd8 5985 }
c09544e0 5986
08e007d2 5987 if (flush != BTRFS_RESERVE_NO_FLUSH &&
0b246afa 5988 btrfs_transaction_in_commit(fs_info))
0ca1f7ce 5989 schedule_timeout(1);
ec44a35c 5990
c64c2bd8 5991 if (delalloc_lock)
9f3db423 5992 mutex_lock(&inode->delalloc_mutex);
c64c2bd8 5993
0b246afa 5994 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
8bb8ab2e 5995
9f3db423 5996 spin_lock(&inode->lock);
823bb20a 5997 nr_extents = count_max_extents(num_bytes);
9f3db423 5998 inode->outstanding_extents += nr_extents;
9e0baf60 5999
48c3d480 6000 nr_extents = 0;
9f3db423
NB
6001 if (inode->outstanding_extents > inode->reserved_extents)
6002 nr_extents += inode->outstanding_extents -
6003 inode->reserved_extents;
57a45ced 6004
48c3d480 6005 /* We always want to reserve a slot for updating the inode. */
0b246afa 6006 to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1);
7709cde3 6007 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
9f3db423
NB
6008 csum_bytes = inode->csum_bytes;
6009 spin_unlock(&inode->lock);
57a45ced 6010
0b246afa 6011 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
7174109c 6012 ret = btrfs_qgroup_reserve_meta(root,
003d7c59 6013 nr_extents * fs_info->nodesize, true);
88e081bf
WS
6014 if (ret)
6015 goto out_fail;
6016 }
c5567237 6017
48c3d480 6018 ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
88e081bf 6019 if (unlikely(ret)) {
da17066c 6020 btrfs_qgroup_free_meta(root,
0b246afa 6021 nr_extents * fs_info->nodesize);
88e081bf 6022 goto out_fail;
9e0baf60 6023 }
25179201 6024
9f3db423 6025 spin_lock(&inode->lock);
48c3d480 6026 if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
9f3db423 6027 &inode->runtime_flags)) {
0b246afa 6028 to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1);
48c3d480 6029 release_extra = true;
660d3f6c 6030 }
9f3db423
NB
6031 inode->reserved_extents += nr_extents;
6032 spin_unlock(&inode->lock);
c64c2bd8
JB
6033
6034 if (delalloc_lock)
9f3db423 6035 mutex_unlock(&inode->delalloc_mutex);
660d3f6c 6036
8c2a3ca2 6037 if (to_reserve)
0b246afa 6038 trace_btrfs_space_reservation(fs_info, "delalloc",
9f3db423 6039 btrfs_ino(inode), to_reserve, 1);
48c3d480 6040 if (release_extra)
2ff7e61e 6041 btrfs_block_rsv_release(fs_info, block_rsv,
0b246afa 6042 btrfs_calc_trans_metadata_size(fs_info, 1));
0ca1f7ce 6043 return 0;
88e081bf
WS
6044
6045out_fail:
9f3db423 6046 spin_lock(&inode->lock);
dcab6a3b 6047 dropped = drop_outstanding_extent(inode, num_bytes);
88e081bf
WS
6048 /*
6049 * If the inode's csum_bytes is the same as the original
6050 * csum_bytes then we know we haven't raced with any free()ers,
6051 * so we can just reduce our inode's csum bytes and carry on.
88e081bf 6052 */
9f3db423 6053 if (inode->csum_bytes == csum_bytes) {
88e081bf 6054 calc_csum_metadata_size(inode, num_bytes, 0);
f4881bc7 6055 } else {
9f3db423 6056 u64 orig_csum_bytes = inode->csum_bytes;
f4881bc7
JB
6057 u64 bytes;
6058
6059 /*
6060 * This is tricky, but first we need to figure out how much we
01327610 6061 * freed from any free-ers that occurred during this
f4881bc7
JB
6062 * reservation, so we reset ->csum_bytes to the csum_bytes
6063 * before we dropped our lock, and then call the free for the
6064 * number of bytes that were freed while we were trying our
6065 * reservation.
6066 */
9f3db423
NB
6067 bytes = csum_bytes - inode->csum_bytes;
6068 inode->csum_bytes = csum_bytes;
f4881bc7
JB
6069 to_free = calc_csum_metadata_size(inode, bytes, 0);
6070
6071
6072 /*
6073 * Now we need to see how much we would have freed had we not
6074 * been making this reservation and our ->csum_bytes were not
6075 * artificially inflated.
6076 */
9f3db423 6077 inode->csum_bytes = csum_bytes - num_bytes;
f4881bc7
JB
6078 bytes = csum_bytes - orig_csum_bytes;
6079 bytes = calc_csum_metadata_size(inode, bytes, 0);
6080
6081 /*
6082 * Now reset ->csum_bytes to what it should be. If bytes is
01327610 6083 * more than to_free then we would have freed more space had we
f4881bc7
JB
6084 * not had an artificially high ->csum_bytes, so we need to free
6085 * the remainder. If bytes is the same or less then we don't
6086 * need to do anything, the other free-ers did the correct
6087 * thing.
6088 */
9f3db423 6089 inode->csum_bytes = orig_csum_bytes - num_bytes;
f4881bc7
JB
6090 if (bytes > to_free)
6091 to_free = bytes - to_free;
6092 else
6093 to_free = 0;
6094 }
9f3db423 6095 spin_unlock(&inode->lock);
e2d1f923 6096 if (dropped)
0b246afa 6097 to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
88e081bf
WS
6098
6099 if (to_free) {
2ff7e61e 6100 btrfs_block_rsv_release(fs_info, block_rsv, to_free);
0b246afa 6101 trace_btrfs_space_reservation(fs_info, "delalloc",
9f3db423 6102 btrfs_ino(inode), to_free, 0);
88e081bf
WS
6103 }
6104 if (delalloc_lock)
9f3db423 6105 mutex_unlock(&inode->delalloc_mutex);
88e081bf 6106 return ret;
0ca1f7ce
YZ
6107}
6108
7709cde3
JB
6109/**
6110 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
6111 * @inode: the inode to release the reservation for
6112 * @num_bytes: the number of bytes we're releasing
6113 *
6114 * This will release the metadata reservation for an inode. This can be called
6115 * once we complete IO for a given set of bytes to release their metadata
6116 * reservations.
6117 */
691fa059 6118void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
0ca1f7ce 6119{
691fa059 6120 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
9e0baf60
JB
6121 u64 to_free = 0;
6122 unsigned dropped;
0ca1f7ce 6123
0b246afa 6124 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
691fa059 6125 spin_lock(&inode->lock);
dcab6a3b 6126 dropped = drop_outstanding_extent(inode, num_bytes);
97e728d4 6127
0934856d
MX
6128 if (num_bytes)
6129 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
691fa059 6130 spin_unlock(&inode->lock);
9e0baf60 6131 if (dropped > 0)
0b246afa 6132 to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
0ca1f7ce 6133
0b246afa 6134 if (btrfs_is_testing(fs_info))
6a3891c5
JB
6135 return;
6136
691fa059
NB
6137 trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
6138 to_free, 0);
c5567237 6139
2ff7e61e 6140 btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
0ca1f7ce
YZ
6141}
6142
1ada3a62 6143/**
7cf5b976 6144 * btrfs_delalloc_reserve_space - reserve data and metadata space for
1ada3a62
QW
6145 * delalloc
6146 * @inode: inode we're writing to
6147 * @start: start range we are writing to
6148 * @len: how long the range we are writing to
6149 *
1ada3a62
QW
6150 * This will do the following things
6151 *
6152 * o reserve space in data space info for num bytes
6153 * and reserve precious corresponding qgroup space
6154 * (Done in check_data_free_space)
6155 *
6156 * o reserve space for metadata space, based on the number of outstanding
6157 * extents and how much csums will be needed
6158 * also reserve metadata space in a per root over-reserve method.
6159 * o add to the inode's delalloc_bytes
6160 * o add it to the fs_info's delalloc inodes list.
6161 * (Above 3 all done in delalloc_reserve_metadata)
6162 *
6163 * Return 0 for success
6165 * Return <0 for error (-ENOSPC or -EDQUOT)
6165 */
7cf5b976 6166int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
1ada3a62
QW
6167{
6168 int ret;
6169
7cf5b976 6170 ret = btrfs_check_data_free_space(inode, start, len);
1ada3a62
QW
6171 if (ret < 0)
6172 return ret;
9f3db423 6173 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
1ada3a62 6174 if (ret < 0)
7cf5b976 6175 btrfs_free_reserved_data_space(inode, start, len);
1ada3a62
QW
6176 return ret;
6177}
6178
7709cde3 6179/**
7cf5b976 6180 * btrfs_delalloc_release_space - release data and metadata space for delalloc
1ada3a62
QW
6181 * @inode: inode we're releasing space for
6182 * @start: start position of the space already reserved
6183 * @len: the length of the space already reserved
6184 *
6185 * This must be matched with a call to btrfs_delalloc_reserve_space. This is
6186 * called in the case that we don't need the metadata AND data reservations
6187 * anymore, for example if there is an error or we insert an inline extent.
6188 *
6189 * This function will release the metadata space that was not used and will
6190 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
6191 * list if there are no delalloc bytes left.
6192 * Also it will handle the qgroup reserved space.
6193 */
7cf5b976 6194void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
1ada3a62 6195{
691fa059 6196 btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
7cf5b976 6197 btrfs_free_reserved_data_space(inode, start, len);
6324fbf3
CM
6198}
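/*
 * A minimal usage sketch of the two helpers above (a hypothetical caller,
 * not taken from this file): reserve data and metadata before dirtying a
 * range and give everything back if the write is aborted.  Page locking and
 * the actual copy are omitted.
 */
static int __maybe_unused example_reserve_for_write(struct inode *inode,
						    u64 start, u64 len,
						    bool aborted)
{
	int ret;

	ret = btrfs_delalloc_reserve_space(inode, start, len);
	if (ret < 0)
		return ret;

	/* ... dirty the pages in [start, start + len) here ... */

	if (aborted) {
		/* Nothing will reach disk: return both reservations. */
		btrfs_delalloc_release_space(inode, start, len);
		return -ECANCELED;
	}
	return 0;
}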
6199
ce93ec54 6200static int update_block_group(struct btrfs_trans_handle *trans,
6202df69 6201 struct btrfs_fs_info *info, u64 bytenr,
ce93ec54 6202 u64 num_bytes, int alloc)
9078a3e1 6203{
0af3d00b 6204 struct btrfs_block_group_cache *cache = NULL;
db94535d 6205 u64 total = num_bytes;
9078a3e1 6206 u64 old_val;
db94535d 6207 u64 byte_in_group;
0af3d00b 6208 int factor;
3e1ad54f 6209
5d4f98a2 6210 /* block accounting for super block */
eb73c1b7 6211 spin_lock(&info->delalloc_root_lock);
6c41761f 6212 old_val = btrfs_super_bytes_used(info->super_copy);
5d4f98a2
YZ
6213 if (alloc)
6214 old_val += num_bytes;
6215 else
6216 old_val -= num_bytes;
6c41761f 6217 btrfs_set_super_bytes_used(info->super_copy, old_val);
eb73c1b7 6218 spin_unlock(&info->delalloc_root_lock);
5d4f98a2 6219
d397712b 6220 while (total) {
db94535d 6221 cache = btrfs_lookup_block_group(info, bytenr);
f3465ca4 6222 if (!cache)
79787eaa 6223 return -ENOENT;
b742bb82
YZ
6224 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
6225 BTRFS_BLOCK_GROUP_RAID1 |
6226 BTRFS_BLOCK_GROUP_RAID10))
6227 factor = 2;
6228 else
6229 factor = 1;
9d66e233
JB
6230 /*
6231 * If this block group has free space cache written out, we
6232 * need to make sure to load it if we are removing space. This
6233 * is because we need the unpinning stage to actually add the
6234 * space back to the block group, otherwise we will leak space.
6235 */
6236 if (!alloc && cache->cached == BTRFS_CACHE_NO)
f6373bf3 6237 cache_block_group(cache, 1);
0af3d00b 6238
db94535d
CM
6239 byte_in_group = bytenr - cache->key.objectid;
6240 WARN_ON(byte_in_group > cache->key.offset);
9078a3e1 6241
25179201 6242 spin_lock(&cache->space_info->lock);
c286ac48 6243 spin_lock(&cache->lock);
0af3d00b 6244
6202df69 6245 if (btrfs_test_opt(info, SPACE_CACHE) &&
0af3d00b
JB
6246 cache->disk_cache_state < BTRFS_DC_CLEAR)
6247 cache->disk_cache_state = BTRFS_DC_CLEAR;
6248
9078a3e1 6249 old_val = btrfs_block_group_used(&cache->item);
db94535d 6250 num_bytes = min(total, cache->key.offset - byte_in_group);
cd1bc465 6251 if (alloc) {
db94535d 6252 old_val += num_bytes;
11833d66
YZ
6253 btrfs_set_block_group_used(&cache->item, old_val);
6254 cache->reserved -= num_bytes;
11833d66 6255 cache->space_info->bytes_reserved -= num_bytes;
b742bb82
YZ
6256 cache->space_info->bytes_used += num_bytes;
6257 cache->space_info->disk_used += num_bytes * factor;
c286ac48 6258 spin_unlock(&cache->lock);
25179201 6259 spin_unlock(&cache->space_info->lock);
cd1bc465 6260 } else {
db94535d 6261 old_val -= num_bytes;
ae0ab003
FM
6262 btrfs_set_block_group_used(&cache->item, old_val);
6263 cache->pinned += num_bytes;
6264 cache->space_info->bytes_pinned += num_bytes;
6265 cache->space_info->bytes_used -= num_bytes;
6266 cache->space_info->disk_used -= num_bytes * factor;
6267 spin_unlock(&cache->lock);
6268 spin_unlock(&cache->space_info->lock);
47ab2a6c 6269
0b246afa 6270 trace_btrfs_space_reservation(info, "pinned",
c51e7bb1
JB
6271 cache->space_info->flags,
6272 num_bytes, 1);
ae0ab003
FM
6273 set_extent_dirty(info->pinned_extents,
6274 bytenr, bytenr + num_bytes - 1,
6275 GFP_NOFS | __GFP_NOFAIL);
cd1bc465 6276 }
1bbc621e
CM
6277
6278 spin_lock(&trans->transaction->dirty_bgs_lock);
6279 if (list_empty(&cache->dirty_list)) {
6280 list_add_tail(&cache->dirty_list,
6281 &trans->transaction->dirty_bgs);
6282 trans->transaction->num_dirty_bgs++;
6283 btrfs_get_block_group(cache);
6284 }
6285 spin_unlock(&trans->transaction->dirty_bgs_lock);
6286
036a9348
FM
6287 /*
6288 * No longer have used bytes in this block group, queue it for
6289 * deletion. We do this after adding the block group to the
6290 * dirty list to avoid races between cleaner kthread and space
6291 * cache writeout.
6292 */
6293 if (!alloc && old_val == 0) {
6294 spin_lock(&info->unused_bgs_lock);
6295 if (list_empty(&cache->bg_list)) {
6296 btrfs_get_block_group(cache);
6297 list_add_tail(&cache->bg_list,
6298 &info->unused_bgs);
6299 }
6300 spin_unlock(&info->unused_bgs_lock);
6301 }
6302
fa9c0d79 6303 btrfs_put_block_group(cache);
db94535d
CM
6304 total -= num_bytes;
6305 bytenr += num_bytes;
9078a3e1
CM
6306 }
6307 return 0;
6308}
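/*
 * Note on the two branches above: allocation moves bytes from ->reserved to
 * ->bytes_used (with disk_used scaled by the RAID factor), while freeing
 * moves them to ->pinned and marks the range dirty in pinned_extents, so the
 * space only becomes reusable after the transaction commits and the extents
 * are unpinned.  A block group whose used bytes reach zero is queued on
 * unused_bgs for the cleaner thread to consider removing.
 */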
6324fbf3 6309
2ff7e61e 6310static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
a061fc8d 6311{
0f9dd46c 6312 struct btrfs_block_group_cache *cache;
d2fb3437 6313 u64 bytenr;
0f9dd46c 6314
0b246afa
JM
6315 spin_lock(&fs_info->block_group_cache_lock);
6316 bytenr = fs_info->first_logical_byte;
6317 spin_unlock(&fs_info->block_group_cache_lock);
a1897fdd
LB
6318
6319 if (bytenr < (u64)-1)
6320 return bytenr;
6321
0b246afa 6322 cache = btrfs_lookup_first_block_group(fs_info, search_start);
0f9dd46c 6323 if (!cache)
a061fc8d 6324 return 0;
0f9dd46c 6325
d2fb3437 6326 bytenr = cache->key.objectid;
fa9c0d79 6327 btrfs_put_block_group(cache);
d2fb3437
YZ
6328
6329 return bytenr;
a061fc8d
CM
6330}
6331
2ff7e61e 6332static int pin_down_extent(struct btrfs_fs_info *fs_info,
f0486c68
YZ
6333 struct btrfs_block_group_cache *cache,
6334 u64 bytenr, u64 num_bytes, int reserved)
324ae4df 6335{
11833d66
YZ
6336 spin_lock(&cache->space_info->lock);
6337 spin_lock(&cache->lock);
6338 cache->pinned += num_bytes;
6339 cache->space_info->bytes_pinned += num_bytes;
6340 if (reserved) {
6341 cache->reserved -= num_bytes;
6342 cache->space_info->bytes_reserved -= num_bytes;
6343 }
6344 spin_unlock(&cache->lock);
6345 spin_unlock(&cache->space_info->lock);
68b38550 6346
0b246afa 6347 trace_btrfs_space_reservation(fs_info, "pinned",
c51e7bb1 6348 cache->space_info->flags, num_bytes, 1);
0b246afa 6349 set_extent_dirty(fs_info->pinned_extents, bytenr,
f0486c68
YZ
6350 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6351 return 0;
6352}
68b38550 6353
f0486c68
YZ
6354/*
6355 * this function must be called within transaction
6356 */
2ff7e61e 6357int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
f0486c68
YZ
6358 u64 bytenr, u64 num_bytes, int reserved)
6359{
6360 struct btrfs_block_group_cache *cache;
68b38550 6361
0b246afa 6362 cache = btrfs_lookup_block_group(fs_info, bytenr);
79787eaa 6363 BUG_ON(!cache); /* Logic error */
f0486c68 6364
2ff7e61e 6365 pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
f0486c68
YZ
6366
6367 btrfs_put_block_group(cache);
11833d66
YZ
6368 return 0;
6369}
6370
f0486c68 6371/*
e688b725
CM
6372 * this function must be called within transaction
6373 */
2ff7e61e 6374int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
e688b725
CM
6375 u64 bytenr, u64 num_bytes)
6376{
6377 struct btrfs_block_group_cache *cache;
b50c6e25 6378 int ret;
e688b725 6379
0b246afa 6380 cache = btrfs_lookup_block_group(fs_info, bytenr);
b50c6e25
JB
6381 if (!cache)
6382 return -EINVAL;
e688b725
CM
6383
6384 /*
6385 * pull in the free space cache (if any) so that our pin
6386 * removes the free space from the cache. We have load_only set
6387 * to one because the slow code to read in the free extents does check
6388 * the pinned extents.
6389 */
f6373bf3 6390 cache_block_group(cache, 1);
e688b725 6391
2ff7e61e 6392 pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
e688b725
CM
6393
6394 /* remove us from the free space cache (if we're there at all) */
b50c6e25 6395 ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
e688b725 6396 btrfs_put_block_group(cache);
b50c6e25 6397 return ret;
e688b725
CM
6398}
6399
2ff7e61e
JM
6400static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
6401 u64 start, u64 num_bytes)
8c2a1a30
JB
6402{
6403 int ret;
6404 struct btrfs_block_group_cache *block_group;
6405 struct btrfs_caching_control *caching_ctl;
6406
0b246afa 6407 block_group = btrfs_lookup_block_group(fs_info, start);
8c2a1a30
JB
6408 if (!block_group)
6409 return -EINVAL;
6410
6411 cache_block_group(block_group, 0);
6412 caching_ctl = get_caching_control(block_group);
6413
6414 if (!caching_ctl) {
6415 /* Logic error */
6416 BUG_ON(!block_group_cache_done(block_group));
6417 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6418 } else {
6419 mutex_lock(&caching_ctl->mutex);
6420
6421 if (start >= caching_ctl->progress) {
2ff7e61e 6422 ret = add_excluded_extent(fs_info, start, num_bytes);
8c2a1a30
JB
6423 } else if (start + num_bytes <= caching_ctl->progress) {
6424 ret = btrfs_remove_free_space(block_group,
6425 start, num_bytes);
6426 } else {
6427 num_bytes = caching_ctl->progress - start;
6428 ret = btrfs_remove_free_space(block_group,
6429 start, num_bytes);
6430 if (ret)
6431 goto out_lock;
6432
6433 num_bytes = (start + num_bytes) -
6434 caching_ctl->progress;
6435 start = caching_ctl->progress;
2ff7e61e 6436 ret = add_excluded_extent(fs_info, start, num_bytes);
8c2a1a30
JB
6437 }
6438out_lock:
6439 mutex_unlock(&caching_ctl->mutex);
6440 put_caching_control(caching_ctl);
6441 }
6442 btrfs_put_block_group(block_group);
6443 return ret;
6444}
6445
2ff7e61e 6446int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
8c2a1a30
JB
6447 struct extent_buffer *eb)
6448{
6449 struct btrfs_file_extent_item *item;
6450 struct btrfs_key key;
6451 int found_type;
6452 int i;
6453
2ff7e61e 6454 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
8c2a1a30
JB
6455 return 0;
6456
6457 for (i = 0; i < btrfs_header_nritems(eb); i++) {
6458 btrfs_item_key_to_cpu(eb, &key, i);
6459 if (key.type != BTRFS_EXTENT_DATA_KEY)
6460 continue;
6461 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6462 found_type = btrfs_file_extent_type(eb, item);
6463 if (found_type == BTRFS_FILE_EXTENT_INLINE)
6464 continue;
6465 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6466 continue;
6467 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6468 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2ff7e61e 6469 __exclude_logged_extent(fs_info, key.objectid, key.offset);
8c2a1a30
JB
6470 }
6471
6472 return 0;
6473}
6474
9cfa3e34
FM
6475static void
6476btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6477{
6478 atomic_inc(&bg->reservations);
6479}
6480
6481void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6482 const u64 start)
6483{
6484 struct btrfs_block_group_cache *bg;
6485
6486 bg = btrfs_lookup_block_group(fs_info, start);
6487 ASSERT(bg);
6488 if (atomic_dec_and_test(&bg->reservations))
6489 wake_up_atomic_t(&bg->reservations);
6490 btrfs_put_block_group(bg);
6491}
6492
6493static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
6494{
6495 schedule();
6496 return 0;
6497}
6498
6499void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6500{
6501 struct btrfs_space_info *space_info = bg->space_info;
6502
6503 ASSERT(bg->ro);
6504
6505 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6506 return;
6507
6508 /*
6509 * Our block group is read only but before we set it to read only,
 6510 * some task might have allocated an extent from it already, but it
 6511 * has not yet created the corresponding ordered extent (and added it to a
6512 * root's list of ordered extents).
6513 * Therefore wait for any task currently allocating extents, since the
6514 * block group's reservations counter is incremented while a read lock
6515 * on the groups' semaphore is held and decremented after releasing
6516 * the read access on that semaphore and creating the ordered extent.
6517 */
6518 down_write(&space_info->groups_sem);
6519 up_write(&space_info->groups_sem);
6520
6521 wait_on_atomic_t(&bg->reservations,
6522 btrfs_wait_bg_reservations_atomic_t,
6523 TASK_UNINTERRUPTIBLE);
6524}
6525
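The empty down_write()/up_write() pair in btrfs_wait_block_group_reservations() acts as a barrier: the write lock cannot be taken until every reader that already held groups_sem when the group went read only has dropped it, so any allocation that was in flight has bumped bg->reservations by the time we reach wait_on_atomic_t(). A minimal user-space sketch of the same idea, using a pthread rwlock purely as an illustration (names here are hypothetical, not btrfs code):

#include <pthread.h>

static pthread_rwlock_t groups_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Returns only after every reader that currently holds groups_lock is gone. */
static void flush_existing_readers(void)
{
	pthread_rwlock_wrlock(&groups_lock);	/* blocks until current readers drain */
	pthread_rwlock_unlock(&groups_lock);	/* we never needed to hold it */
}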
fb25e914 6526/**
4824f1f4 6527 * btrfs_add_reserved_bytes - update the block_group and space info counters
fb25e914 6528 * @cache: The cache we are manipulating
18513091
WX
 6529 * @ram_bytes: The number of bytes of file content, and will be the same as
 6530 * @num_bytes except for the compression path.
fb25e914 6531 * @num_bytes: The number of bytes in question
e570fd27 6532 * @delalloc: The blocks are allocated for the delalloc write
fb25e914 6533 *
745699ef
XW
 6534 * This is called by the allocator when it reserves space. If the block
 6535 * group has become read only we cannot make the reservation, so we return
 6536 * -EAGAIN; otherwise this function always succeeds.
f0486c68 6537 */
4824f1f4 6538static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
18513091 6539 u64 ram_bytes, u64 num_bytes, int delalloc)
11833d66 6540{
fb25e914 6541 struct btrfs_space_info *space_info = cache->space_info;
f0486c68 6542 int ret = 0;
79787eaa 6543
fb25e914
JB
6544 spin_lock(&space_info->lock);
6545 spin_lock(&cache->lock);
4824f1f4
WX
6546 if (cache->ro) {
6547 ret = -EAGAIN;
fb25e914 6548 } else {
4824f1f4
WX
6549 cache->reserved += num_bytes;
6550 space_info->bytes_reserved += num_bytes;
e570fd27 6551
18513091
WX
6552 trace_btrfs_space_reservation(cache->fs_info,
6553 "space_info", space_info->flags,
6554 ram_bytes, 0);
6555 space_info->bytes_may_use -= ram_bytes;
e570fd27 6556 if (delalloc)
4824f1f4 6557 cache->delalloc_bytes += num_bytes;
324ae4df 6558 }
fb25e914
JB
6559 spin_unlock(&cache->lock);
6560 spin_unlock(&space_info->lock);
f0486c68 6561 return ret;
324ae4df 6562}
9078a3e1 6563
4824f1f4
WX
6564/**
6565 * btrfs_free_reserved_bytes - update the block_group and space info counters
6566 * @cache: The cache we are manipulating
6567 * @num_bytes: The number of bytes in question
6568 * @delalloc: The blocks are allocated for the delalloc write
6569 *
6570 * This is called by somebody who is freeing space that was never actually used
 6571 * on disk. For example, if you reserve some space for a new leaf in transaction
 6572 * A and free that leaf before transaction A commits, you call this to clear
 6573 * the reservation.
6574 */
6575
6576static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6577 u64 num_bytes, int delalloc)
6578{
6579 struct btrfs_space_info *space_info = cache->space_info;
6580 int ret = 0;
6581
6582 spin_lock(&space_info->lock);
6583 spin_lock(&cache->lock);
6584 if (cache->ro)
6585 space_info->bytes_readonly += num_bytes;
6586 cache->reserved -= num_bytes;
6587 space_info->bytes_reserved -= num_bytes;
6588
6589 if (delalloc)
6590 cache->delalloc_bytes -= num_bytes;
6591 spin_unlock(&cache->lock);
6592 spin_unlock(&space_info->lock);
6593 return ret;
6594}
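Taken together, btrfs_add_reserved_bytes() and btrfs_free_reserved_bytes() move space between the space_info counters: reserving converts speculative bytes_may_use into bytes_reserved tied to a block group, and releasing an unused reservation gives bytes_reserved back (crediting bytes_readonly instead if the group went read only in the meantime). A stripped-down sketch of that bookkeeping, with hypothetical names and no locking, just to make the arithmetic explicit:

struct counters_sketch {
	u64 bytes_may_use;	/* speculative reservations */
	u64 bytes_reserved;	/* reservations tied to a block group */
	u64 bytes_readonly;	/* space trapped in read-only groups */
};

static void sketch_reserve(struct counters_sketch *c, u64 ram_bytes, u64 num_bytes)
{
	c->bytes_reserved += num_bytes;	/* now backed by a specific group */
	c->bytes_may_use -= ram_bytes;	/* speculative reservation consumed */
}

static void sketch_release_unused(struct counters_sketch *c, u64 num_bytes, bool ro)
{
	if (ro)
		c->bytes_readonly += num_bytes;
	c->bytes_reserved -= num_bytes;
}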
8b74c03e 6595void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
e8569813 6596{
11833d66
YZ
6597 struct btrfs_caching_control *next;
6598 struct btrfs_caching_control *caching_ctl;
6599 struct btrfs_block_group_cache *cache;
e8569813 6600
9e351cc8 6601 down_write(&fs_info->commit_root_sem);
25179201 6602
11833d66
YZ
6603 list_for_each_entry_safe(caching_ctl, next,
6604 &fs_info->caching_block_groups, list) {
6605 cache = caching_ctl->block_group;
6606 if (block_group_cache_done(cache)) {
6607 cache->last_byte_to_unpin = (u64)-1;
6608 list_del_init(&caching_ctl->list);
6609 put_caching_control(caching_ctl);
e8569813 6610 } else {
11833d66 6611 cache->last_byte_to_unpin = caching_ctl->progress;
e8569813 6612 }
e8569813 6613 }
11833d66
YZ
6614
6615 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6616 fs_info->pinned_extents = &fs_info->freed_extents[1];
6617 else
6618 fs_info->pinned_extents = &fs_info->freed_extents[0];
6619
9e351cc8 6620 up_write(&fs_info->commit_root_sem);
8929ecfa
YZ
6621
6622 update_global_block_rsv(fs_info);
e8569813
ZY
6623}
6624
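The swap of fs_info->pinned_extents between freed_extents[0] and freed_extents[1] above is a double-buffering scheme: extents pinned by the transaction that is about to commit stay in one tree and are drained by btrfs_finish_extent_commit(), while frees from the next transaction accumulate in the other tree. The same pattern in a generic, hypothetical form (not btrfs code):

struct pinned_buffers {
	struct list_head sets[2];	/* two accumulation buffers */
	struct list_head *active;	/* where new entries currently go */
};

/* Called at commit time: new entries flow into the other buffer while
 * the caller drains the one returned here. */
static struct list_head *swap_pinned(struct pinned_buffers *pb)
{
	struct list_head *old = pb->active;

	pb->active = (old == &pb->sets[0]) ? &pb->sets[1] : &pb->sets[0];
	return old;
}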
c759c4e1
JB
6625/*
6626 * Returns the free cluster for the given space info and sets empty_cluster to
6627 * what it should be based on the mount options.
6628 */
6629static struct btrfs_free_cluster *
2ff7e61e
JM
6630fetch_cluster_info(struct btrfs_fs_info *fs_info,
6631 struct btrfs_space_info *space_info, u64 *empty_cluster)
c759c4e1
JB
6632{
6633 struct btrfs_free_cluster *ret = NULL;
0b246afa 6634 bool ssd = btrfs_test_opt(fs_info, SSD);
c759c4e1
JB
6635
6636 *empty_cluster = 0;
6637 if (btrfs_mixed_space_info(space_info))
6638 return ret;
6639
6640 if (ssd)
ee22184b 6641 *empty_cluster = SZ_2M;
c759c4e1 6642 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
0b246afa 6643 ret = &fs_info->meta_alloc_cluster;
c759c4e1 6644 if (!ssd)
ee22184b 6645 *empty_cluster = SZ_64K;
c759c4e1 6646 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
0b246afa 6647 ret = &fs_info->data_alloc_cluster;
c759c4e1
JB
6648 }
6649
6650 return ret;
6651}
6652
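In practice this means: with the ssd mount option both data and metadata allocations start from a 2MiB empty_cluster; without it only metadata allocations cluster, with a 64KiB target, and mixed block groups never cluster. A hypothetical caller, mirroring how find_free_extent() uses it below:

	u64 empty_cluster = 0;
	struct btrfs_free_cluster *last_ptr;

	last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
	if (last_ptr)
		btrfs_info(fs_info, "clustered alloc, target %llu bytes",
			   empty_cluster);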
2ff7e61e
JM
6653static int unpin_extent_range(struct btrfs_fs_info *fs_info,
6654 u64 start, u64 end,
678886bd 6655 const bool return_free_space)
ccd467d6 6656{
11833d66 6657 struct btrfs_block_group_cache *cache = NULL;
7b398f8e
JB
6658 struct btrfs_space_info *space_info;
6659 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
c759c4e1 6660 struct btrfs_free_cluster *cluster = NULL;
11833d66 6661 u64 len;
c759c4e1
JB
6662 u64 total_unpinned = 0;
6663 u64 empty_cluster = 0;
7b398f8e 6664 bool readonly;
ccd467d6 6665
11833d66 6666 while (start <= end) {
7b398f8e 6667 readonly = false;
11833d66
YZ
6668 if (!cache ||
6669 start >= cache->key.objectid + cache->key.offset) {
6670 if (cache)
6671 btrfs_put_block_group(cache);
c759c4e1 6672 total_unpinned = 0;
11833d66 6673 cache = btrfs_lookup_block_group(fs_info, start);
79787eaa 6674 BUG_ON(!cache); /* Logic error */
c759c4e1 6675
2ff7e61e 6676 cluster = fetch_cluster_info(fs_info,
c759c4e1
JB
6677 cache->space_info,
6678 &empty_cluster);
6679 empty_cluster <<= 1;
11833d66
YZ
6680 }
6681
6682 len = cache->key.objectid + cache->key.offset - start;
6683 len = min(len, end + 1 - start);
6684
6685 if (start < cache->last_byte_to_unpin) {
6686 len = min(len, cache->last_byte_to_unpin - start);
678886bd
FM
6687 if (return_free_space)
6688 btrfs_add_free_space(cache, start, len);
11833d66
YZ
6689 }
6690
f0486c68 6691 start += len;
c759c4e1 6692 total_unpinned += len;
7b398f8e 6693 space_info = cache->space_info;
f0486c68 6694
c759c4e1
JB
6695 /*
6696 * If this space cluster has been marked as fragmented and we've
6697 * unpinned enough in this block group to potentially allow a
 6698 * cluster to be created inside of it, go ahead and clear the
 6699 * fragmented flag.
6700 */
6701 if (cluster && cluster->fragmented &&
6702 total_unpinned > empty_cluster) {
6703 spin_lock(&cluster->lock);
6704 cluster->fragmented = 0;
6705 spin_unlock(&cluster->lock);
6706 }
6707
7b398f8e 6708 spin_lock(&space_info->lock);
11833d66
YZ
6709 spin_lock(&cache->lock);
6710 cache->pinned -= len;
7b398f8e 6711 space_info->bytes_pinned -= len;
c51e7bb1
JB
6712
6713 trace_btrfs_space_reservation(fs_info, "pinned",
6714 space_info->flags, len, 0);
4f4db217 6715 space_info->max_extent_size = 0;
d288db5d 6716 percpu_counter_add(&space_info->total_bytes_pinned, -len);
7b398f8e
JB
6717 if (cache->ro) {
6718 space_info->bytes_readonly += len;
6719 readonly = true;
6720 }
11833d66 6721 spin_unlock(&cache->lock);
957780eb
JB
6722 if (!readonly && return_free_space &&
6723 global_rsv->space_info == space_info) {
6724 u64 to_add = len;
6725 WARN_ON(!return_free_space);
7b398f8e
JB
6726 spin_lock(&global_rsv->lock);
6727 if (!global_rsv->full) {
957780eb
JB
6728 to_add = min(len, global_rsv->size -
6729 global_rsv->reserved);
6730 global_rsv->reserved += to_add;
6731 space_info->bytes_may_use += to_add;
7b398f8e
JB
6732 if (global_rsv->reserved >= global_rsv->size)
6733 global_rsv->full = 1;
957780eb
JB
6734 trace_btrfs_space_reservation(fs_info,
6735 "space_info",
6736 space_info->flags,
6737 to_add, 1);
6738 len -= to_add;
7b398f8e
JB
6739 }
6740 spin_unlock(&global_rsv->lock);
957780eb
JB
6741 /* Add to any tickets we may have */
6742 if (len)
6743 space_info_add_new_bytes(fs_info, space_info,
6744 len);
7b398f8e
JB
6745 }
6746 spin_unlock(&space_info->lock);
ccd467d6 6747 }
11833d66
YZ
6748
6749 if (cache)
6750 btrfs_put_block_group(cache);
ccd467d6
CM
6751 return 0;
6752}
6753
6754int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2ff7e61e 6755 struct btrfs_fs_info *fs_info)
a28ec197 6756{
e33e17ee
JM
6757 struct btrfs_block_group_cache *block_group, *tmp;
6758 struct list_head *deleted_bgs;
11833d66 6759 struct extent_io_tree *unpin;
1a5bc167
CM
6760 u64 start;
6761 u64 end;
a28ec197 6762 int ret;
a28ec197 6763
11833d66
YZ
6764 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6765 unpin = &fs_info->freed_extents[1];
6766 else
6767 unpin = &fs_info->freed_extents[0];
6768
e33e17ee 6769 while (!trans->aborted) {
d4b450cd 6770 mutex_lock(&fs_info->unused_bg_unpin_mutex);
1a5bc167 6771 ret = find_first_extent_bit(unpin, 0, &start, &end,
e6138876 6772 EXTENT_DIRTY, NULL);
d4b450cd
FM
6773 if (ret) {
6774 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
a28ec197 6775 break;
d4b450cd 6776 }
1f3c79a2 6777
0b246afa 6778 if (btrfs_test_opt(fs_info, DISCARD))
2ff7e61e 6779 ret = btrfs_discard_extent(fs_info, start,
5378e607 6780 end + 1 - start, NULL);
1f3c79a2 6781
af6f8f60 6782 clear_extent_dirty(unpin, start, end);
2ff7e61e 6783 unpin_extent_range(fs_info, start, end, true);
d4b450cd 6784 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
b9473439 6785 cond_resched();
a28ec197 6786 }
817d52f8 6787
e33e17ee
JM
6788 /*
6789 * Transaction is finished. We don't need the lock anymore. We
6790 * do need to clean up the block groups in case of a transaction
6791 * abort.
6792 */
6793 deleted_bgs = &trans->transaction->deleted_bgs;
6794 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6795 u64 trimmed = 0;
6796
6797 ret = -EROFS;
6798 if (!trans->aborted)
2ff7e61e 6799 ret = btrfs_discard_extent(fs_info,
e33e17ee
JM
6800 block_group->key.objectid,
6801 block_group->key.offset,
6802 &trimmed);
6803
6804 list_del_init(&block_group->bg_list);
6805 btrfs_put_block_group_trimming(block_group);
6806 btrfs_put_block_group(block_group);
6807
6808 if (ret) {
6809 const char *errstr = btrfs_decode_error(ret);
6810 btrfs_warn(fs_info,
6811 "Discard failed while removing blockgroup: errno=%d %s\n",
6812 ret, errstr);
6813 }
6814 }
6815
e20d96d6
CM
6816 return 0;
6817}
6818
b150a4f1
JB
6819static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
6820 u64 owner, u64 root_objectid)
6821{
6822 struct btrfs_space_info *space_info;
6823 u64 flags;
6824
6825 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6826 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
6827 flags = BTRFS_BLOCK_GROUP_SYSTEM;
6828 else
6829 flags = BTRFS_BLOCK_GROUP_METADATA;
6830 } else {
6831 flags = BTRFS_BLOCK_GROUP_DATA;
6832 }
6833
6834 space_info = __find_space_info(fs_info, flags);
6835 BUG_ON(!space_info); /* Logic bug */
6836 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
6837}
6838
6839
5d4f98a2 6840static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2ff7e61e 6841 struct btrfs_fs_info *info,
c682f9b3 6842 struct btrfs_delayed_ref_node *node, u64 parent,
5d4f98a2
YZ
6843 u64 root_objectid, u64 owner_objectid,
6844 u64 owner_offset, int refs_to_drop,
c682f9b3 6845 struct btrfs_delayed_extent_op *extent_op)
a28ec197 6846{
e2fa7227 6847 struct btrfs_key key;
5d4f98a2 6848 struct btrfs_path *path;
1261ec42 6849 struct btrfs_root *extent_root = info->extent_root;
5f39d397 6850 struct extent_buffer *leaf;
5d4f98a2
YZ
6851 struct btrfs_extent_item *ei;
6852 struct btrfs_extent_inline_ref *iref;
a28ec197 6853 int ret;
5d4f98a2 6854 int is_data;
952fccac
CM
6855 int extent_slot = 0;
6856 int found_extent = 0;
6857 int num_to_del = 1;
5d4f98a2
YZ
6858 u32 item_size;
6859 u64 refs;
c682f9b3
QW
6860 u64 bytenr = node->bytenr;
6861 u64 num_bytes = node->num_bytes;
fcebe456 6862 int last_ref = 0;
0b246afa 6863 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
037e6390 6864
5caf2a00 6865 path = btrfs_alloc_path();
54aa1f4d
CM
6866 if (!path)
6867 return -ENOMEM;
5f26f772 6868
e4058b54 6869 path->reada = READA_FORWARD;
b9473439 6870 path->leave_spinning = 1;
5d4f98a2
YZ
6871
6872 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6873 BUG_ON(!is_data && refs_to_drop != 1);
6874
3173a18f
JB
6875 if (is_data)
6876 skinny_metadata = 0;
6877
87bde3cd 6878 ret = lookup_extent_backref(trans, info, path, &iref,
5d4f98a2
YZ
6879 bytenr, num_bytes, parent,
6880 root_objectid, owner_objectid,
6881 owner_offset);
7bb86316 6882 if (ret == 0) {
952fccac 6883 extent_slot = path->slots[0];
5d4f98a2
YZ
6884 while (extent_slot >= 0) {
6885 btrfs_item_key_to_cpu(path->nodes[0], &key,
952fccac 6886 extent_slot);
5d4f98a2 6887 if (key.objectid != bytenr)
952fccac 6888 break;
5d4f98a2
YZ
6889 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6890 key.offset == num_bytes) {
952fccac
CM
6891 found_extent = 1;
6892 break;
6893 }
3173a18f
JB
6894 if (key.type == BTRFS_METADATA_ITEM_KEY &&
6895 key.offset == owner_objectid) {
6896 found_extent = 1;
6897 break;
6898 }
952fccac
CM
6899 if (path->slots[0] - extent_slot > 5)
6900 break;
5d4f98a2 6901 extent_slot--;
952fccac 6902 }
5d4f98a2
YZ
6903#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6904 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
6905 if (found_extent && item_size < sizeof(*ei))
6906 found_extent = 0;
6907#endif
31840ae1 6908 if (!found_extent) {
5d4f98a2 6909 BUG_ON(iref);
87bde3cd
JM
6910 ret = remove_extent_backref(trans, info, path, NULL,
6911 refs_to_drop,
fcebe456 6912 is_data, &last_ref);
005d6427 6913 if (ret) {
66642832 6914 btrfs_abort_transaction(trans, ret);
005d6427
DS
6915 goto out;
6916 }
b3b4aa74 6917 btrfs_release_path(path);
b9473439 6918 path->leave_spinning = 1;
5d4f98a2
YZ
6919
6920 key.objectid = bytenr;
6921 key.type = BTRFS_EXTENT_ITEM_KEY;
6922 key.offset = num_bytes;
6923
3173a18f
JB
6924 if (!is_data && skinny_metadata) {
6925 key.type = BTRFS_METADATA_ITEM_KEY;
6926 key.offset = owner_objectid;
6927 }
6928
31840ae1
ZY
6929 ret = btrfs_search_slot(trans, extent_root,
6930 &key, path, -1, 1);
3173a18f
JB
6931 if (ret > 0 && skinny_metadata && path->slots[0]) {
6932 /*
6933 * Couldn't find our skinny metadata item,
6934 * see if we have ye olde extent item.
6935 */
6936 path->slots[0]--;
6937 btrfs_item_key_to_cpu(path->nodes[0], &key,
6938 path->slots[0]);
6939 if (key.objectid == bytenr &&
6940 key.type == BTRFS_EXTENT_ITEM_KEY &&
6941 key.offset == num_bytes)
6942 ret = 0;
6943 }
6944
6945 if (ret > 0 && skinny_metadata) {
6946 skinny_metadata = false;
9ce49a0b 6947 key.objectid = bytenr;
3173a18f
JB
6948 key.type = BTRFS_EXTENT_ITEM_KEY;
6949 key.offset = num_bytes;
6950 btrfs_release_path(path);
6951 ret = btrfs_search_slot(trans, extent_root,
6952 &key, path, -1, 1);
6953 }
6954
f3465ca4 6955 if (ret) {
5d163e0e
JM
6956 btrfs_err(info,
6957 "umm, got %d back from search, was looking for %llu",
6958 ret, bytenr);
b783e62d 6959 if (ret > 0)
2ff7e61e 6960 btrfs_print_leaf(info, path->nodes[0]);
f3465ca4 6961 }
005d6427 6962 if (ret < 0) {
66642832 6963 btrfs_abort_transaction(trans, ret);
005d6427
DS
6964 goto out;
6965 }
31840ae1
ZY
6966 extent_slot = path->slots[0];
6967 }
fae7f21c 6968 } else if (WARN_ON(ret == -ENOENT)) {
2ff7e61e 6969 btrfs_print_leaf(info, path->nodes[0]);
c2cf52eb
SK
6970 btrfs_err(info,
6971 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
c1c9ff7c
GU
6972 bytenr, parent, root_objectid, owner_objectid,
6973 owner_offset);
66642832 6974 btrfs_abort_transaction(trans, ret);
c4a050bb 6975 goto out;
79787eaa 6976 } else {
66642832 6977 btrfs_abort_transaction(trans, ret);
005d6427 6978 goto out;
7bb86316 6979 }
5f39d397
CM
6980
6981 leaf = path->nodes[0];
5d4f98a2
YZ
6982 item_size = btrfs_item_size_nr(leaf, extent_slot);
6983#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6984 if (item_size < sizeof(*ei)) {
6985 BUG_ON(found_extent || extent_slot != path->slots[0]);
87bde3cd
JM
6986 ret = convert_extent_item_v0(trans, info, path, owner_objectid,
6987 0);
005d6427 6988 if (ret < 0) {
66642832 6989 btrfs_abort_transaction(trans, ret);
005d6427
DS
6990 goto out;
6991 }
5d4f98a2 6992
b3b4aa74 6993 btrfs_release_path(path);
5d4f98a2
YZ
6994 path->leave_spinning = 1;
6995
6996 key.objectid = bytenr;
6997 key.type = BTRFS_EXTENT_ITEM_KEY;
6998 key.offset = num_bytes;
6999
7000 ret = btrfs_search_slot(trans, extent_root, &key, path,
7001 -1, 1);
7002 if (ret) {
5d163e0e
JM
7003 btrfs_err(info,
7004 "umm, got %d back from search, was looking for %llu",
c1c9ff7c 7005 ret, bytenr);
2ff7e61e 7006 btrfs_print_leaf(info, path->nodes[0]);
5d4f98a2 7007 }
005d6427 7008 if (ret < 0) {
66642832 7009 btrfs_abort_transaction(trans, ret);
005d6427
DS
7010 goto out;
7011 }
7012
5d4f98a2
YZ
7013 extent_slot = path->slots[0];
7014 leaf = path->nodes[0];
7015 item_size = btrfs_item_size_nr(leaf, extent_slot);
7016 }
7017#endif
7018 BUG_ON(item_size < sizeof(*ei));
952fccac 7019 ei = btrfs_item_ptr(leaf, extent_slot,
123abc88 7020 struct btrfs_extent_item);
3173a18f
JB
7021 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
7022 key.type == BTRFS_EXTENT_ITEM_KEY) {
5d4f98a2
YZ
7023 struct btrfs_tree_block_info *bi;
7024 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
7025 bi = (struct btrfs_tree_block_info *)(ei + 1);
7026 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
7027 }
56bec294 7028
5d4f98a2 7029 refs = btrfs_extent_refs(leaf, ei);
32b02538 7030 if (refs < refs_to_drop) {
5d163e0e
JM
7031 btrfs_err(info,
7032 "trying to drop %d refs but we only have %Lu for bytenr %Lu",
7033 refs_to_drop, refs, bytenr);
32b02538 7034 ret = -EINVAL;
66642832 7035 btrfs_abort_transaction(trans, ret);
32b02538
JB
7036 goto out;
7037 }
56bec294 7038 refs -= refs_to_drop;
5f39d397 7039
5d4f98a2
YZ
7040 if (refs > 0) {
7041 if (extent_op)
7042 __run_delayed_extent_op(extent_op, leaf, ei);
7043 /*
7044 * In the case of inline back ref, reference count will
7045 * be updated by remove_extent_backref
952fccac 7046 */
5d4f98a2
YZ
7047 if (iref) {
7048 BUG_ON(!found_extent);
7049 } else {
7050 btrfs_set_extent_refs(leaf, ei, refs);
7051 btrfs_mark_buffer_dirty(leaf);
7052 }
7053 if (found_extent) {
87bde3cd 7054 ret = remove_extent_backref(trans, info, path,
5d4f98a2 7055 iref, refs_to_drop,
fcebe456 7056 is_data, &last_ref);
005d6427 7057 if (ret) {
66642832 7058 btrfs_abort_transaction(trans, ret);
005d6427
DS
7059 goto out;
7060 }
952fccac 7061 }
0b246afa 7062 add_pinned_bytes(info, -num_bytes, owner_objectid,
b150a4f1 7063 root_objectid);
5d4f98a2 7064 } else {
5d4f98a2
YZ
7065 if (found_extent) {
7066 BUG_ON(is_data && refs_to_drop !=
9ed0dea0 7067 extent_data_ref_count(path, iref));
5d4f98a2
YZ
7068 if (iref) {
7069 BUG_ON(path->slots[0] != extent_slot);
7070 } else {
7071 BUG_ON(path->slots[0] != extent_slot + 1);
7072 path->slots[0] = extent_slot;
7073 num_to_del = 2;
7074 }
78fae27e 7075 }
b9473439 7076
fcebe456 7077 last_ref = 1;
952fccac
CM
7078 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
7079 num_to_del);
005d6427 7080 if (ret) {
66642832 7081 btrfs_abort_transaction(trans, ret);
005d6427
DS
7082 goto out;
7083 }
b3b4aa74 7084 btrfs_release_path(path);
21af804c 7085
5d4f98a2 7086 if (is_data) {
5b4aacef 7087 ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
005d6427 7088 if (ret) {
66642832 7089 btrfs_abort_transaction(trans, ret);
005d6427
DS
7090 goto out;
7091 }
459931ec
CM
7092 }
7093
0b246afa 7094 ret = add_to_free_space_tree(trans, info, bytenr, num_bytes);
1e144fb8 7095 if (ret) {
66642832 7096 btrfs_abort_transaction(trans, ret);
1e144fb8
OS
7097 goto out;
7098 }
7099
0b246afa 7100 ret = update_block_group(trans, info, bytenr, num_bytes, 0);
005d6427 7101 if (ret) {
66642832 7102 btrfs_abort_transaction(trans, ret);
005d6427
DS
7103 goto out;
7104 }
a28ec197 7105 }
fcebe456
JB
7106 btrfs_release_path(path);
7107
79787eaa 7108out:
5caf2a00 7109 btrfs_free_path(path);
a28ec197
CM
7110 return ret;
7111}
7112
1887be66 7113/*
f0486c68 7114 * when we free a block, it is possible (and likely) that we free the last
1887be66
CM
7115 * delayed ref for that extent as well. This searches the delayed ref tree for
7116 * a given extent, and if there are no other delayed refs to be processed, it
7117 * removes it from the tree.
7118 */
7119static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
2ff7e61e 7120 u64 bytenr)
1887be66
CM
7121{
7122 struct btrfs_delayed_ref_head *head;
7123 struct btrfs_delayed_ref_root *delayed_refs;
f0486c68 7124 int ret = 0;
1887be66
CM
7125
7126 delayed_refs = &trans->transaction->delayed_refs;
7127 spin_lock(&delayed_refs->lock);
f72ad18e 7128 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
1887be66 7129 if (!head)
cf93da7b 7130 goto out_delayed_unlock;
1887be66 7131
d7df2c79 7132 spin_lock(&head->lock);
c6fc2454 7133 if (!list_empty(&head->ref_list))
1887be66
CM
7134 goto out;
7135
5d4f98a2
YZ
7136 if (head->extent_op) {
7137 if (!head->must_insert_reserved)
7138 goto out;
78a6184a 7139 btrfs_free_delayed_extent_op(head->extent_op);
5d4f98a2
YZ
7140 head->extent_op = NULL;
7141 }
7142
1887be66
CM
7143 /*
7144 * waiting for the lock here would deadlock. If someone else has it
7145 * locked they are already in the process of dropping it anyway
7146 */
7147 if (!mutex_trylock(&head->mutex))
7148 goto out;
7149
7150 /*
7151 * at this point we have a head with no other entries. Go
7152 * ahead and process it.
7153 */
7154 head->node.in_tree = 0;
c46effa6 7155 rb_erase(&head->href_node, &delayed_refs->href_root);
c3e69d58 7156
d7df2c79 7157 atomic_dec(&delayed_refs->num_entries);
1887be66
CM
7158
7159 /*
7160 * we don't take a ref on the node because we're removing it from the
7161 * tree, so we just steal the ref the tree was holding.
7162 */
c3e69d58 7163 delayed_refs->num_heads--;
d7df2c79 7164 if (head->processing == 0)
c3e69d58 7165 delayed_refs->num_heads_ready--;
d7df2c79
JB
7166 head->processing = 0;
7167 spin_unlock(&head->lock);
1887be66
CM
7168 spin_unlock(&delayed_refs->lock);
7169
f0486c68
YZ
7170 BUG_ON(head->extent_op);
7171 if (head->must_insert_reserved)
7172 ret = 1;
7173
7174 mutex_unlock(&head->mutex);
1887be66 7175 btrfs_put_delayed_ref(&head->node);
f0486c68 7176 return ret;
1887be66 7177out:
d7df2c79 7178 spin_unlock(&head->lock);
cf93da7b
CM
7179
7180out_delayed_unlock:
1887be66
CM
7181 spin_unlock(&delayed_refs->lock);
7182 return 0;
7183}
7184
f0486c68
YZ
7185void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7186 struct btrfs_root *root,
7187 struct extent_buffer *buf,
5581a51a 7188 u64 parent, int last_ref)
f0486c68 7189{
0b246afa 7190 struct btrfs_fs_info *fs_info = root->fs_info;
b150a4f1 7191 int pin = 1;
f0486c68
YZ
7192 int ret;
7193
7194 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
0b246afa
JM
7195 ret = btrfs_add_delayed_tree_ref(fs_info, trans,
7196 buf->start, buf->len,
7197 parent,
7198 root->root_key.objectid,
7199 btrfs_header_level(buf),
7200 BTRFS_DROP_DELAYED_REF, NULL);
79787eaa 7201 BUG_ON(ret); /* -ENOMEM */
f0486c68
YZ
7202 }
7203
7204 if (!last_ref)
7205 return;
7206
f0486c68 7207 if (btrfs_header_generation(buf) == trans->transid) {
6219872d
FM
7208 struct btrfs_block_group_cache *cache;
7209
f0486c68 7210 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
2ff7e61e 7211 ret = check_ref_cleanup(trans, buf->start);
f0486c68 7212 if (!ret)
37be25bc 7213 goto out;
f0486c68
YZ
7214 }
7215
0b246afa 7216 cache = btrfs_lookup_block_group(fs_info, buf->start);
6219872d 7217
f0486c68 7218 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2ff7e61e
JM
7219 pin_down_extent(fs_info, cache, buf->start,
7220 buf->len, 1);
6219872d 7221 btrfs_put_block_group(cache);
37be25bc 7222 goto out;
f0486c68
YZ
7223 }
7224
7225 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
7226
7227 btrfs_add_free_space(cache, buf->start, buf->len);
4824f1f4 7228 btrfs_free_reserved_bytes(cache, buf->len, 0);
6219872d 7229 btrfs_put_block_group(cache);
71ff6437 7230 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
b150a4f1 7231 pin = 0;
f0486c68
YZ
7232 }
7233out:
b150a4f1 7234 if (pin)
0b246afa 7235 add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf),
b150a4f1
JB
7236 root->root_key.objectid);
7237
a826d6dc
JB
7238 /*
7239 * Deleting the buffer, clear the corrupt flag since it doesn't matter
7240 * anymore.
7241 */
7242 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
f0486c68
YZ
7243}
7244
79787eaa 7245/* Can return -ENOMEM */
2ff7e61e
JM
7246int btrfs_free_extent(struct btrfs_trans_handle *trans,
7247 struct btrfs_fs_info *fs_info,
66d7e7f0 7248 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
b06c4bf5 7249 u64 owner, u64 offset)
925baedd
CM
7250{
7251 int ret;
7252
f5ee5c9a 7253 if (btrfs_is_testing(fs_info))
faa2dbf0 7254 return 0;
fccb84c9 7255
0b246afa 7256 add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
b150a4f1 7257
56bec294
CM
7258 /*
7259 * tree log blocks never actually go into the extent allocation
7260 * tree, just update pinning info and exit early.
56bec294 7261 */
5d4f98a2
YZ
7262 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
7263 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
b9473439 7264 /* unlocks the pinned mutex */
2ff7e61e 7265 btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
56bec294 7266 ret = 0;
5d4f98a2 7267 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
66d7e7f0
AJ
7268 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
7269 num_bytes,
5d4f98a2 7270 parent, root_objectid, (int)owner,
b06c4bf5 7271 BTRFS_DROP_DELAYED_REF, NULL);
5d4f98a2 7272 } else {
66d7e7f0
AJ
7273 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
7274 num_bytes,
7275 parent, root_objectid, owner,
5846a3c2 7276 offset, 0,
fef394f7 7277 BTRFS_DROP_DELAYED_REF);
56bec294 7278 }
925baedd
CM
7279 return ret;
7280}
7281
817d52f8
JB
7282/*
 7283 * when we wait for progress in the block group caching, it's because
7284 * our allocation attempt failed at least once. So, we must sleep
7285 * and let some progress happen before we try again.
7286 *
7287 * This function will sleep at least once waiting for new free space to
7288 * show up, and then it will check the block group free space numbers
7289 * for our min num_bytes. Another option is to have it go ahead
7290 * and look in the rbtree for a free extent of a given size, but this
7291 * is a good start.
36cce922
JB
7292 *
7293 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7294 * any of the information in this block group.
817d52f8 7295 */
36cce922 7296static noinline void
817d52f8
JB
7297wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7298 u64 num_bytes)
7299{
11833d66 7300 struct btrfs_caching_control *caching_ctl;
817d52f8 7301
11833d66
YZ
7302 caching_ctl = get_caching_control(cache);
7303 if (!caching_ctl)
36cce922 7304 return;
817d52f8 7305
11833d66 7306 wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
34d52cb6 7307 (cache->free_space_ctl->free_space >= num_bytes));
11833d66
YZ
7308
7309 put_caching_control(caching_ctl);
11833d66
YZ
7310}
7311
7312static noinline int
7313wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7314{
7315 struct btrfs_caching_control *caching_ctl;
36cce922 7316 int ret = 0;
11833d66
YZ
7317
7318 caching_ctl = get_caching_control(cache);
7319 if (!caching_ctl)
36cce922 7320 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
11833d66
YZ
7321
7322 wait_event(caching_ctl->wait, block_group_cache_done(cache));
36cce922
JB
7323 if (cache->cached == BTRFS_CACHE_ERROR)
7324 ret = -EIO;
11833d66 7325 put_caching_control(caching_ctl);
36cce922 7326 return ret;
817d52f8
JB
7327}
7328
31e50229 7329int __get_raid_index(u64 flags)
b742bb82 7330{
7738a53a 7331 if (flags & BTRFS_BLOCK_GROUP_RAID10)
e6ec716f 7332 return BTRFS_RAID_RAID10;
7738a53a 7333 else if (flags & BTRFS_BLOCK_GROUP_RAID1)
e6ec716f 7334 return BTRFS_RAID_RAID1;
7738a53a 7335 else if (flags & BTRFS_BLOCK_GROUP_DUP)
e6ec716f 7336 return BTRFS_RAID_DUP;
7738a53a 7337 else if (flags & BTRFS_BLOCK_GROUP_RAID0)
e6ec716f 7338 return BTRFS_RAID_RAID0;
53b381b3 7339 else if (flags & BTRFS_BLOCK_GROUP_RAID5)
e942f883 7340 return BTRFS_RAID_RAID5;
53b381b3 7341 else if (flags & BTRFS_BLOCK_GROUP_RAID6)
e942f883 7342 return BTRFS_RAID_RAID6;
7738a53a 7343
e942f883 7344 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
b742bb82
YZ
7345}
7346
6ab0a202 7347int get_block_group_index(struct btrfs_block_group_cache *cache)
7738a53a 7348{
31e50229 7349 return __get_raid_index(cache->flags);
7738a53a
ID
7350}
7351
6ab0a202
JM
7352static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
7353 [BTRFS_RAID_RAID10] = "raid10",
7354 [BTRFS_RAID_RAID1] = "raid1",
7355 [BTRFS_RAID_DUP] = "dup",
7356 [BTRFS_RAID_RAID0] = "raid0",
7357 [BTRFS_RAID_SINGLE] = "single",
7358 [BTRFS_RAID_RAID5] = "raid5",
7359 [BTRFS_RAID_RAID6] = "raid6",
7360};
7361
1b8e5df6 7362static const char *get_raid_name(enum btrfs_raid_types type)
6ab0a202
JM
7363{
7364 if (type >= BTRFS_NR_RAID_TYPES)
7365 return NULL;
7366
7367 return btrfs_raid_type_names[type];
7368}
7369
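A quick illustration of how these helpers compose (a hypothetical snippet, not code from this file): the block group flags select a btrfs_raid_types index, which then maps to a printable name.

	int index = __get_raid_index(BTRFS_BLOCK_GROUP_RAID1);	/* BTRFS_RAID_RAID1 */
	const char *name = get_raid_name(index);		/* "raid1" */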
817d52f8 7370enum btrfs_loop_type {
285ff5af
JB
7371 LOOP_CACHING_NOWAIT = 0,
7372 LOOP_CACHING_WAIT = 1,
7373 LOOP_ALLOC_CHUNK = 2,
7374 LOOP_NO_EMPTY_SIZE = 3,
817d52f8
JB
7375};
7376
e570fd27
MX
7377static inline void
7378btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
7379 int delalloc)
7380{
7381 if (delalloc)
7382 down_read(&cache->data_rwsem);
7383}
7384
7385static inline void
7386btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
7387 int delalloc)
7388{
7389 btrfs_get_block_group(cache);
7390 if (delalloc)
7391 down_read(&cache->data_rwsem);
7392}
7393
7394static struct btrfs_block_group_cache *
7395btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7396 struct btrfs_free_cluster *cluster,
7397 int delalloc)
7398{
89771cc9 7399 struct btrfs_block_group_cache *used_bg = NULL;
6719afdc 7400
e570fd27 7401 spin_lock(&cluster->refill_lock);
6719afdc
GU
7402 while (1) {
7403 used_bg = cluster->block_group;
7404 if (!used_bg)
7405 return NULL;
7406
7407 if (used_bg == block_group)
e570fd27
MX
7408 return used_bg;
7409
6719afdc 7410 btrfs_get_block_group(used_bg);
e570fd27 7411
6719afdc
GU
7412 if (!delalloc)
7413 return used_bg;
e570fd27 7414
6719afdc
GU
7415 if (down_read_trylock(&used_bg->data_rwsem))
7416 return used_bg;
e570fd27 7417
6719afdc 7418 spin_unlock(&cluster->refill_lock);
e570fd27 7419
e321f8a8
LB
7420 /* We should only have one-level nested. */
7421 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
e570fd27 7422
6719afdc
GU
7423 spin_lock(&cluster->refill_lock);
7424 if (used_bg == cluster->block_group)
7425 return used_bg;
e570fd27 7426
6719afdc
GU
7427 up_read(&used_bg->data_rwsem);
7428 btrfs_put_block_group(used_bg);
7429 }
e570fd27
MX
7430}
7431
7432static inline void
7433btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7434 int delalloc)
7435{
7436 if (delalloc)
7437 up_read(&cache->data_rwsem);
7438 btrfs_put_block_group(cache);
7439}
7440
fec577fb
CM
7441/*
 7442 * walks the btree of allocated extents and finds a hole of a given size.
7443 * The key ins is changed to record the hole:
a4820398 7444 * ins->objectid == start position
62e2749e 7445 * ins->type == BTRFS_EXTENT_ITEM_KEY
a4820398 7446 * ins->offset == the size of the hole.
fec577fb 7447 * Any available blocks before search_start are skipped.
a4820398
MX
7448 *
 7449 * If there is no suitable free space, we will record the max size of
 7450 * the free space extent we found (in ins->offset).
fec577fb 7451 */
87bde3cd 7452static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
18513091
WX
7453 u64 ram_bytes, u64 num_bytes, u64 empty_size,
7454 u64 hint_byte, struct btrfs_key *ins,
7455 u64 flags, int delalloc)
fec577fb 7456{
80eb234a 7457 int ret = 0;
0b246afa 7458 struct btrfs_root *root = fs_info->extent_root;
fa9c0d79 7459 struct btrfs_free_cluster *last_ptr = NULL;
80eb234a 7460 struct btrfs_block_group_cache *block_group = NULL;
81c9ad23 7461 u64 search_start = 0;
a4820398 7462 u64 max_extent_size = 0;
c759c4e1 7463 u64 empty_cluster = 0;
80eb234a 7464 struct btrfs_space_info *space_info;
fa9c0d79 7465 int loop = 0;
b6919a58 7466 int index = __get_raid_index(flags);
0a24325e 7467 bool failed_cluster_refill = false;
1cdda9b8 7468 bool failed_alloc = false;
67377734 7469 bool use_cluster = true;
60d2adbb 7470 bool have_caching_bg = false;
13a0db5a 7471 bool orig_have_caching_bg = false;
a5e681d9 7472 bool full_search = false;
fec577fb 7473
0b246afa 7474 WARN_ON(num_bytes < fs_info->sectorsize);
962a298f 7475 ins->type = BTRFS_EXTENT_ITEM_KEY;
80eb234a
JB
7476 ins->objectid = 0;
7477 ins->offset = 0;
b1a4d965 7478
71ff6437 7479 trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
3f7de037 7480
0b246afa 7481 space_info = __find_space_info(fs_info, flags);
1b1d1f66 7482 if (!space_info) {
0b246afa 7483 btrfs_err(fs_info, "No space info for %llu", flags);
1b1d1f66
JB
7484 return -ENOSPC;
7485 }
2552d17e 7486
67377734 7487 /*
4f4db217
JB
7488 * If our free space is heavily fragmented we may not be able to make
7489 * big contiguous allocations, so instead of doing the expensive search
7490 * for free space, simply return ENOSPC with our max_extent_size so we
7491 * can go ahead and search for a more manageable chunk.
7492 *
7493 * If our max_extent_size is large enough for our allocation simply
7494 * disable clustering since we will likely not be able to find enough
7495 * space to create a cluster and induce latency trying.
67377734 7496 */
4f4db217
JB
7497 if (unlikely(space_info->max_extent_size)) {
7498 spin_lock(&space_info->lock);
7499 if (space_info->max_extent_size &&
7500 num_bytes > space_info->max_extent_size) {
7501 ins->offset = space_info->max_extent_size;
7502 spin_unlock(&space_info->lock);
7503 return -ENOSPC;
7504 } else if (space_info->max_extent_size) {
7505 use_cluster = false;
7506 }
7507 spin_unlock(&space_info->lock);
fa9c0d79 7508 }
0f9dd46c 7509
2ff7e61e 7510 last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
239b14b3 7511 if (last_ptr) {
fa9c0d79
CM
7512 spin_lock(&last_ptr->lock);
7513 if (last_ptr->block_group)
7514 hint_byte = last_ptr->window_start;
c759c4e1
JB
7515 if (last_ptr->fragmented) {
7516 /*
7517 * We still set window_start so we can keep track of the
7518 * last place we found an allocation to try and save
7519 * some time.
7520 */
7521 hint_byte = last_ptr->window_start;
7522 use_cluster = false;
7523 }
fa9c0d79 7524 spin_unlock(&last_ptr->lock);
239b14b3 7525 }
fa9c0d79 7526
2ff7e61e 7527 search_start = max(search_start, first_logical_byte(fs_info, 0));
239b14b3 7528 search_start = max(search_start, hint_byte);
2552d17e 7529 if (search_start == hint_byte) {
0b246afa 7530 block_group = btrfs_lookup_block_group(fs_info, search_start);
817d52f8
JB
7531 /*
7532 * we don't want to use the block group if it doesn't match our
 7533 * allocation bits, or if it's not cached.
ccf0e725
JB
7534 *
7535 * However if we are re-searching with an ideal block group
7536 * picked out then we don't care that the block group is cached.
817d52f8 7537 */
b6919a58 7538 if (block_group && block_group_bits(block_group, flags) &&
285ff5af 7539 block_group->cached != BTRFS_CACHE_NO) {
2552d17e 7540 down_read(&space_info->groups_sem);
44fb5511
CM
7541 if (list_empty(&block_group->list) ||
7542 block_group->ro) {
7543 /*
7544 * someone is removing this block group,
7545 * we can't jump into the have_block_group
7546 * target because our list pointers are not
7547 * valid
7548 */
7549 btrfs_put_block_group(block_group);
7550 up_read(&space_info->groups_sem);
ccf0e725 7551 } else {
b742bb82 7552 index = get_block_group_index(block_group);
e570fd27 7553 btrfs_lock_block_group(block_group, delalloc);
44fb5511 7554 goto have_block_group;
ccf0e725 7555 }
2552d17e 7556 } else if (block_group) {
fa9c0d79 7557 btrfs_put_block_group(block_group);
2552d17e 7558 }
42e70e7a 7559 }
2552d17e 7560search:
60d2adbb 7561 have_caching_bg = false;
a5e681d9
JB
7562 if (index == 0 || index == __get_raid_index(flags))
7563 full_search = true;
80eb234a 7564 down_read(&space_info->groups_sem);
b742bb82
YZ
7565 list_for_each_entry(block_group, &space_info->block_groups[index],
7566 list) {
6226cb0a 7567 u64 offset;
817d52f8 7568 int cached;
8a1413a2 7569
e570fd27 7570 btrfs_grab_block_group(block_group, delalloc);
2552d17e 7571 search_start = block_group->key.objectid;
42e70e7a 7572
83a50de9
CM
7573 /*
7574 * this can happen if we end up cycling through all the
7575 * raid types, but we want to make sure we only allocate
7576 * for the proper type.
7577 */
b6919a58 7578 if (!block_group_bits(block_group, flags)) {
83a50de9
CM
7579 u64 extra = BTRFS_BLOCK_GROUP_DUP |
7580 BTRFS_BLOCK_GROUP_RAID1 |
53b381b3
DW
7581 BTRFS_BLOCK_GROUP_RAID5 |
7582 BTRFS_BLOCK_GROUP_RAID6 |
83a50de9
CM
7583 BTRFS_BLOCK_GROUP_RAID10;
7584
7585 /*
7586 * if they asked for extra copies and this block group
7587 * doesn't provide them, bail. This does allow us to
7588 * fill raid0 from raid1.
7589 */
b6919a58 7590 if ((flags & extra) && !(block_group->flags & extra))
83a50de9
CM
7591 goto loop;
7592 }
7593
2552d17e 7594have_block_group:
291c7d2f
JB
7595 cached = block_group_cache_done(block_group);
7596 if (unlikely(!cached)) {
a5e681d9 7597 have_caching_bg = true;
f6373bf3 7598 ret = cache_block_group(block_group, 0);
1d4284bd
CM
7599 BUG_ON(ret < 0);
7600 ret = 0;
817d52f8
JB
7601 }
7602
36cce922
JB
7603 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7604 goto loop;
ea6a478e 7605 if (unlikely(block_group->ro))
2552d17e 7606 goto loop;
0f9dd46c 7607
0a24325e 7608 /*
062c05c4
AO
7609 * Ok we want to try and use the cluster allocator, so
7610 * lets look there
0a24325e 7611 */
c759c4e1 7612 if (last_ptr && use_cluster) {
215a63d1 7613 struct btrfs_block_group_cache *used_block_group;
8de972b4 7614 unsigned long aligned_cluster;
fa9c0d79
CM
7615 /*
7616 * the refill lock keeps out other
7617 * people trying to start a new cluster
7618 */
e570fd27
MX
7619 used_block_group = btrfs_lock_cluster(block_group,
7620 last_ptr,
7621 delalloc);
7622 if (!used_block_group)
44fb5511 7623 goto refill_cluster;
274bd4fb 7624
e570fd27
MX
7625 if (used_block_group != block_group &&
7626 (used_block_group->ro ||
7627 !block_group_bits(used_block_group, flags)))
7628 goto release_cluster;
44fb5511 7629
274bd4fb 7630 offset = btrfs_alloc_from_cluster(used_block_group,
a4820398
MX
7631 last_ptr,
7632 num_bytes,
7633 used_block_group->key.objectid,
7634 &max_extent_size);
fa9c0d79
CM
7635 if (offset) {
7636 /* we have a block, we're done */
7637 spin_unlock(&last_ptr->refill_lock);
71ff6437 7638 trace_btrfs_reserve_extent_cluster(fs_info,
89d4346a
MX
7639 used_block_group,
7640 search_start, num_bytes);
215a63d1 7641 if (used_block_group != block_group) {
e570fd27
MX
7642 btrfs_release_block_group(block_group,
7643 delalloc);
215a63d1
MX
7644 block_group = used_block_group;
7645 }
fa9c0d79
CM
7646 goto checks;
7647 }
7648
274bd4fb 7649 WARN_ON(last_ptr->block_group != used_block_group);
e570fd27 7650release_cluster:
062c05c4
AO
7651 /* If we are on LOOP_NO_EMPTY_SIZE, we can't
 7652 * set up a new cluster, so let's just skip it
7653 * and let the allocator find whatever block
7654 * it can find. If we reach this point, we
7655 * will have tried the cluster allocator
7656 * plenty of times and not have found
7657 * anything, so we are likely way too
7658 * fragmented for the clustering stuff to find
a5f6f719
AO
7659 * anything.
7660 *
7661 * However, if the cluster is taken from the
7662 * current block group, release the cluster
7663 * first, so that we stand a better chance of
7664 * succeeding in the unclustered
7665 * allocation. */
7666 if (loop >= LOOP_NO_EMPTY_SIZE &&
e570fd27 7667 used_block_group != block_group) {
062c05c4 7668 spin_unlock(&last_ptr->refill_lock);
e570fd27
MX
7669 btrfs_release_block_group(used_block_group,
7670 delalloc);
062c05c4
AO
7671 goto unclustered_alloc;
7672 }
7673
fa9c0d79
CM
7674 /*
7675 * this cluster didn't work out, free it and
7676 * start over
7677 */
7678 btrfs_return_cluster_to_free_space(NULL, last_ptr);
7679
e570fd27
MX
7680 if (used_block_group != block_group)
7681 btrfs_release_block_group(used_block_group,
7682 delalloc);
7683refill_cluster:
a5f6f719
AO
7684 if (loop >= LOOP_NO_EMPTY_SIZE) {
7685 spin_unlock(&last_ptr->refill_lock);
7686 goto unclustered_alloc;
7687 }
7688
8de972b4
CM
7689 aligned_cluster = max_t(unsigned long,
7690 empty_cluster + empty_size,
7691 block_group->full_stripe_len);
7692
fa9c0d79 7693 /* allocate a cluster in this block group */
2ff7e61e 7694 ret = btrfs_find_space_cluster(fs_info, block_group,
00361589
JB
7695 last_ptr, search_start,
7696 num_bytes,
7697 aligned_cluster);
fa9c0d79
CM
7698 if (ret == 0) {
7699 /*
7700 * now pull our allocation out of this
7701 * cluster
7702 */
7703 offset = btrfs_alloc_from_cluster(block_group,
a4820398
MX
7704 last_ptr,
7705 num_bytes,
7706 search_start,
7707 &max_extent_size);
fa9c0d79
CM
7708 if (offset) {
7709 /* we found one, proceed */
7710 spin_unlock(&last_ptr->refill_lock);
71ff6437 7711 trace_btrfs_reserve_extent_cluster(fs_info,
3f7de037
JB
7712 block_group, search_start,
7713 num_bytes);
fa9c0d79
CM
7714 goto checks;
7715 }
0a24325e
JB
7716 } else if (!cached && loop > LOOP_CACHING_NOWAIT
7717 && !failed_cluster_refill) {
817d52f8
JB
7718 spin_unlock(&last_ptr->refill_lock);
7719
0a24325e 7720 failed_cluster_refill = true;
817d52f8
JB
7721 wait_block_group_cache_progress(block_group,
7722 num_bytes + empty_cluster + empty_size);
7723 goto have_block_group;
fa9c0d79 7724 }
817d52f8 7725
fa9c0d79
CM
7726 /*
7727 * at this point we either didn't find a cluster
7728 * or we weren't able to allocate a block from our
7729 * cluster. Free the cluster we've been trying
7730 * to use, and go to the next block group
7731 */
0a24325e 7732 btrfs_return_cluster_to_free_space(NULL, last_ptr);
fa9c0d79 7733 spin_unlock(&last_ptr->refill_lock);
0a24325e 7734 goto loop;
fa9c0d79
CM
7735 }
7736
062c05c4 7737unclustered_alloc:
c759c4e1
JB
7738 /*
7739 * We are doing an unclustered alloc, set the fragmented flag so
 7740 * we don't bother trying to set up a cluster again until we get
7741 * more space.
7742 */
7743 if (unlikely(last_ptr)) {
7744 spin_lock(&last_ptr->lock);
7745 last_ptr->fragmented = 1;
7746 spin_unlock(&last_ptr->lock);
7747 }
0c9b36e0
LB
7748 if (cached) {
7749 struct btrfs_free_space_ctl *ctl =
7750 block_group->free_space_ctl;
7751
7752 spin_lock(&ctl->tree_lock);
7753 if (ctl->free_space <
7754 num_bytes + empty_cluster + empty_size) {
7755 if (ctl->free_space > max_extent_size)
7756 max_extent_size = ctl->free_space;
7757 spin_unlock(&ctl->tree_lock);
7758 goto loop;
7759 }
7760 spin_unlock(&ctl->tree_lock);
a5f6f719 7761 }
a5f6f719 7762
6226cb0a 7763 offset = btrfs_find_space_for_alloc(block_group, search_start,
a4820398
MX
7764 num_bytes, empty_size,
7765 &max_extent_size);
1cdda9b8
JB
7766 /*
7767 * If we didn't find a chunk, and we haven't failed on this
7768 * block group before, and this block group is in the middle of
7769 * caching and we are ok with waiting, then go ahead and wait
7770 * for progress to be made, and set failed_alloc to true.
7771 *
7772 * If failed_alloc is true then we've already waited on this
7773 * block group once and should move on to the next block group.
7774 */
7775 if (!offset && !failed_alloc && !cached &&
7776 loop > LOOP_CACHING_NOWAIT) {
817d52f8 7777 wait_block_group_cache_progress(block_group,
1cdda9b8
JB
7778 num_bytes + empty_size);
7779 failed_alloc = true;
817d52f8 7780 goto have_block_group;
1cdda9b8
JB
7781 } else if (!offset) {
7782 goto loop;
817d52f8 7783 }
fa9c0d79 7784checks:
0b246afa 7785 search_start = ALIGN(offset, fs_info->stripesize);
25179201 7786
2552d17e
JB
7787 /* move on to the next group */
7788 if (search_start + num_bytes >
215a63d1
MX
7789 block_group->key.objectid + block_group->key.offset) {
7790 btrfs_add_free_space(block_group, offset, num_bytes);
2552d17e 7791 goto loop;
6226cb0a 7792 }
f5a31e16 7793
f0486c68 7794 if (offset < search_start)
215a63d1 7795 btrfs_add_free_space(block_group, offset,
f0486c68
YZ
7796 search_start - offset);
7797 BUG_ON(offset > search_start);
2552d17e 7798
18513091
WX
7799 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
7800 num_bytes, delalloc);
f0486c68 7801 if (ret == -EAGAIN) {
215a63d1 7802 btrfs_add_free_space(block_group, offset, num_bytes);
2552d17e 7803 goto loop;
0f9dd46c 7804 }
9cfa3e34 7805 btrfs_inc_block_group_reservations(block_group);
0b86a832 7806
f0486c68 7807 /* we are all good, lets return */
2552d17e
JB
7808 ins->objectid = search_start;
7809 ins->offset = num_bytes;
d2fb3437 7810
71ff6437 7811 trace_btrfs_reserve_extent(fs_info, block_group,
3f7de037 7812 search_start, num_bytes);
e570fd27 7813 btrfs_release_block_group(block_group, delalloc);
2552d17e
JB
7814 break;
7815loop:
0a24325e 7816 failed_cluster_refill = false;
1cdda9b8 7817 failed_alloc = false;
b742bb82 7818 BUG_ON(index != get_block_group_index(block_group));
e570fd27 7819 btrfs_release_block_group(block_group, delalloc);
2552d17e
JB
7820 }
7821 up_read(&space_info->groups_sem);
7822
13a0db5a 7823 if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
7824 && !orig_have_caching_bg)
7825 orig_have_caching_bg = true;
7826
60d2adbb
MX
7827 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
7828 goto search;
7829
b742bb82
YZ
7830 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7831 goto search;
7832
285ff5af 7833 /*
ccf0e725
JB
7834 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7835 * caching kthreads as we move along
817d52f8
JB
7836 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7837 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7838 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7839 * again
fa9c0d79 7840 */
723bda20 7841 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
b742bb82 7842 index = 0;
a5e681d9
JB
7843 if (loop == LOOP_CACHING_NOWAIT) {
7844 /*
7845 * We want to skip the LOOP_CACHING_WAIT step if we
01327610 7846 * don't have any uncached bgs and we've already done a
a5e681d9
JB
7847 * full search through.
7848 */
13a0db5a 7849 if (orig_have_caching_bg || !full_search)
a5e681d9
JB
7850 loop = LOOP_CACHING_WAIT;
7851 else
7852 loop = LOOP_ALLOC_CHUNK;
7853 } else {
7854 loop++;
7855 }
7856
817d52f8 7857 if (loop == LOOP_ALLOC_CHUNK) {
00361589 7858 struct btrfs_trans_handle *trans;
f017f15f
WS
7859 int exist = 0;
7860
7861 trans = current->journal_info;
7862 if (trans)
7863 exist = 1;
7864 else
7865 trans = btrfs_join_transaction(root);
00361589 7866
00361589
JB
7867 if (IS_ERR(trans)) {
7868 ret = PTR_ERR(trans);
7869 goto out;
7870 }
7871
2ff7e61e 7872 ret = do_chunk_alloc(trans, fs_info, flags,
ea658bad 7873 CHUNK_ALLOC_FORCE);
a5e681d9
JB
7874
7875 /*
7876 * If we can't allocate a new chunk we've already looped
7877 * through at least once, move on to the NO_EMPTY_SIZE
7878 * case.
7879 */
7880 if (ret == -ENOSPC)
7881 loop = LOOP_NO_EMPTY_SIZE;
7882
ea658bad
JB
7883 /*
7884 * Do not bail out on ENOSPC since we
7885 * can do more things.
7886 */
00361589 7887 if (ret < 0 && ret != -ENOSPC)
66642832 7888 btrfs_abort_transaction(trans, ret);
00361589
JB
7889 else
7890 ret = 0;
f017f15f 7891 if (!exist)
3a45bb20 7892 btrfs_end_transaction(trans);
00361589 7893 if (ret)
ea658bad 7894 goto out;
2552d17e
JB
7895 }
7896
723bda20 7897 if (loop == LOOP_NO_EMPTY_SIZE) {
a5e681d9
JB
7898 /*
7899 * Don't loop again if we already have no empty_size and
7900 * no empty_cluster.
7901 */
7902 if (empty_size == 0 &&
7903 empty_cluster == 0) {
7904 ret = -ENOSPC;
7905 goto out;
7906 }
723bda20
JB
7907 empty_size = 0;
7908 empty_cluster = 0;
fa9c0d79 7909 }
723bda20
JB
7910
7911 goto search;
2552d17e
JB
7912 } else if (!ins->objectid) {
7913 ret = -ENOSPC;
d82a6f1d 7914 } else if (ins->objectid) {
c759c4e1
JB
7915 if (!use_cluster && last_ptr) {
7916 spin_lock(&last_ptr->lock);
7917 last_ptr->window_start = ins->objectid;
7918 spin_unlock(&last_ptr->lock);
7919 }
80eb234a 7920 ret = 0;
be744175 7921 }
79787eaa 7922out:
4f4db217
JB
7923 if (ret == -ENOSPC) {
7924 spin_lock(&space_info->lock);
7925 space_info->max_extent_size = max_extent_size;
7926 spin_unlock(&space_info->lock);
a4820398 7927 ins->offset = max_extent_size;
4f4db217 7928 }
0f70abe2 7929 return ret;
fec577fb 7930}
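The retry policy in find_free_extent() is easiest to read as a small state machine: every full pass over the block groups that fails to place the extent escalates the loop stage and relaxes one constraint (wait on caching kthreads, force a chunk allocation, finally drop the empty_size/empty_cluster padding). A compressed, hypothetical sketch of that escalation (the helper names are made up; error handling omitted):

static int find_hole_sketch(struct btrfs_space_info *space_info,
			    u64 num_bytes, u64 empty_size, u64 empty_cluster)
{
	int loop;

	for (loop = LOOP_CACHING_NOWAIT; loop <= LOOP_NO_EMPTY_SIZE; loop++) {
		bool wait_for_caching = loop >= LOOP_CACHING_WAIT;

		if (loop == LOOP_ALLOC_CHUNK)
			force_chunk_alloc(space_info);		/* hypothetical */
		if (loop == LOOP_NO_EMPTY_SIZE)
			empty_size = empty_cluster = 0;		/* last resort */

		if (try_all_block_groups(space_info, num_bytes,
					 empty_size + empty_cluster,
					 wait_for_caching))	/* hypothetical */
			return 0;
	}
	return -ENOSPC;
}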
ec44a35c 7931
ab8d0fc4
JM
7932static void dump_space_info(struct btrfs_fs_info *fs_info,
7933 struct btrfs_space_info *info, u64 bytes,
9ed74f2d 7934 int dump_block_groups)
0f9dd46c
JB
7935{
7936 struct btrfs_block_group_cache *cache;
b742bb82 7937 int index = 0;
0f9dd46c 7938
9ed74f2d 7939 spin_lock(&info->lock);
ab8d0fc4
JM
7940 btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
7941 info->flags,
4136135b
LB
7942 info->total_bytes - btrfs_space_info_used(info, true),
7943 info->full ? "" : "not ");
ab8d0fc4
JM
7944 btrfs_info(fs_info,
7945 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
7946 info->total_bytes, info->bytes_used, info->bytes_pinned,
7947 info->bytes_reserved, info->bytes_may_use,
7948 info->bytes_readonly);
9ed74f2d
JB
7949 spin_unlock(&info->lock);
7950
7951 if (!dump_block_groups)
7952 return;
0f9dd46c 7953
80eb234a 7954 down_read(&info->groups_sem);
b742bb82
YZ
7955again:
7956 list_for_each_entry(cache, &info->block_groups[index], list) {
0f9dd46c 7957 spin_lock(&cache->lock);
ab8d0fc4
JM
7958 btrfs_info(fs_info,
7959 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
7960 cache->key.objectid, cache->key.offset,
7961 btrfs_block_group_used(&cache->item), cache->pinned,
7962 cache->reserved, cache->ro ? "[readonly]" : "");
0f9dd46c
JB
7963 btrfs_dump_free_space(cache, bytes);
7964 spin_unlock(&cache->lock);
7965 }
b742bb82
YZ
7966 if (++index < BTRFS_NR_RAID_TYPES)
7967 goto again;
80eb234a 7968 up_read(&info->groups_sem);
0f9dd46c 7969}
e8569813 7970
18513091 7971int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
11833d66
YZ
7972 u64 num_bytes, u64 min_alloc_size,
7973 u64 empty_size, u64 hint_byte,
e570fd27 7974 struct btrfs_key *ins, int is_data, int delalloc)
fec577fb 7975{
ab8d0fc4 7976 struct btrfs_fs_info *fs_info = root->fs_info;
36af4e07 7977 bool final_tried = num_bytes == min_alloc_size;
b6919a58 7978 u64 flags;
fec577fb 7979 int ret;
925baedd 7980
1b86826d 7981 flags = get_alloc_profile_by_root(root, is_data);
98d20f67 7982again:
0b246afa 7983 WARN_ON(num_bytes < fs_info->sectorsize);
87bde3cd 7984 ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
18513091 7985 hint_byte, ins, flags, delalloc);
9cfa3e34 7986 if (!ret && !is_data) {
ab8d0fc4 7987 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
9cfa3e34 7988 } else if (ret == -ENOSPC) {
a4820398
MX
7989 if (!final_tried && ins->offset) {
7990 num_bytes = min(num_bytes >> 1, ins->offset);
da17066c 7991 num_bytes = round_down(num_bytes,
0b246afa 7992 fs_info->sectorsize);
9e622d6b 7993 num_bytes = max(num_bytes, min_alloc_size);
18513091 7994 ram_bytes = num_bytes;
9e622d6b
MX
7995 if (num_bytes == min_alloc_size)
7996 final_tried = true;
7997 goto again;
ab8d0fc4 7998 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
9e622d6b
MX
7999 struct btrfs_space_info *sinfo;
8000
ab8d0fc4 8001 sinfo = __find_space_info(fs_info, flags);
0b246afa 8002 btrfs_err(fs_info,
5d163e0e
JM
8003 "allocation failed flags %llu, wanted %llu",
8004 flags, num_bytes);
53804280 8005 if (sinfo)
ab8d0fc4 8006 dump_space_info(fs_info, sinfo, num_bytes, 1);
9e622d6b 8007 }
925baedd 8008 }
0f9dd46c
JB
8009
8010 return ret;
e6dcd2dc
CM
8011}
8012
2ff7e61e 8013static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
e570fd27
MX
8014 u64 start, u64 len,
8015 int pin, int delalloc)
65b51a00 8016{
0f9dd46c 8017 struct btrfs_block_group_cache *cache;
1f3c79a2 8018 int ret = 0;
0f9dd46c 8019
0b246afa 8020 cache = btrfs_lookup_block_group(fs_info, start);
0f9dd46c 8021 if (!cache) {
0b246afa
JM
8022 btrfs_err(fs_info, "Unable to find block group for %llu",
8023 start);
0f9dd46c
JB
8024 return -ENOSPC;
8025 }
1f3c79a2 8026
e688b725 8027 if (pin)
2ff7e61e 8028 pin_down_extent(fs_info, cache, start, len, 1);
e688b725 8029 else {
0b246afa 8030 if (btrfs_test_opt(fs_info, DISCARD))
2ff7e61e 8031 ret = btrfs_discard_extent(fs_info, start, len, NULL);
e688b725 8032 btrfs_add_free_space(cache, start, len);
4824f1f4 8033 btrfs_free_reserved_bytes(cache, len, delalloc);
71ff6437 8034 trace_btrfs_reserved_extent_free(fs_info, start, len);
e688b725 8035 }
31193213 8036
fa9c0d79 8037 btrfs_put_block_group(cache);
e6dcd2dc
CM
8038 return ret;
8039}
8040
2ff7e61e 8041int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
e570fd27 8042 u64 start, u64 len, int delalloc)
e688b725 8043{
2ff7e61e 8044 return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
e688b725
CM
8045}
8046
2ff7e61e 8047int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
e688b725
CM
8048 u64 start, u64 len)
8049{
2ff7e61e 8050 return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
e688b725
CM
8051}
8052
5d4f98a2 8053static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
2ff7e61e 8054 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
8055 u64 parent, u64 root_objectid,
8056 u64 flags, u64 owner, u64 offset,
8057 struct btrfs_key *ins, int ref_mod)
e6dcd2dc
CM
8058{
8059 int ret;
e6dcd2dc 8060 struct btrfs_extent_item *extent_item;
5d4f98a2 8061 struct btrfs_extent_inline_ref *iref;
e6dcd2dc 8062 struct btrfs_path *path;
5d4f98a2
YZ
8063 struct extent_buffer *leaf;
8064 int type;
8065 u32 size;
26b8003f 8066
5d4f98a2
YZ
8067 if (parent > 0)
8068 type = BTRFS_SHARED_DATA_REF_KEY;
8069 else
8070 type = BTRFS_EXTENT_DATA_REF_KEY;
58176a96 8071
5d4f98a2 8072 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
7bb86316
CM
8073
8074 path = btrfs_alloc_path();
db5b493a
TI
8075 if (!path)
8076 return -ENOMEM;
47e4bb98 8077
b9473439 8078 path->leave_spinning = 1;
5d4f98a2
YZ
8079 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8080 ins, size);
79787eaa
JM
8081 if (ret) {
8082 btrfs_free_path(path);
8083 return ret;
8084 }
0f9dd46c 8085
5d4f98a2
YZ
8086 leaf = path->nodes[0];
8087 extent_item = btrfs_item_ptr(leaf, path->slots[0],
47e4bb98 8088 struct btrfs_extent_item);
5d4f98a2
YZ
8089 btrfs_set_extent_refs(leaf, extent_item, ref_mod);
8090 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8091 btrfs_set_extent_flags(leaf, extent_item,
8092 flags | BTRFS_EXTENT_FLAG_DATA);
8093
8094 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8095 btrfs_set_extent_inline_ref_type(leaf, iref, type);
8096 if (parent > 0) {
8097 struct btrfs_shared_data_ref *ref;
8098 ref = (struct btrfs_shared_data_ref *)(iref + 1);
8099 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8100 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
8101 } else {
8102 struct btrfs_extent_data_ref *ref;
8103 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
8104 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
8105 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
8106 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
8107 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
8108 }
47e4bb98
CM
8109
8110 btrfs_mark_buffer_dirty(path->nodes[0]);
7bb86316 8111 btrfs_free_path(path);
f510cfec 8112
1e144fb8
OS
8113 ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
8114 ins->offset);
8115 if (ret)
8116 return ret;
8117
6202df69 8118 ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
79787eaa 8119 if (ret) { /* -ENOENT, logic error */
c2cf52eb 8120 btrfs_err(fs_info, "update block group failed for %llu %llu",
c1c9ff7c 8121 ins->objectid, ins->offset);
f5947066
CM
8122 BUG();
8123 }
71ff6437 8124 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
e6dcd2dc
CM
8125 return ret;
8126}
8127
5d4f98a2 8128static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
2ff7e61e 8129 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
8130 u64 parent, u64 root_objectid,
8131 u64 flags, struct btrfs_disk_key *key,
b06c4bf5 8132 int level, struct btrfs_key *ins)
e6dcd2dc
CM
8133{
8134 int ret;
5d4f98a2
YZ
8135 struct btrfs_extent_item *extent_item;
8136 struct btrfs_tree_block_info *block_info;
8137 struct btrfs_extent_inline_ref *iref;
8138 struct btrfs_path *path;
8139 struct extent_buffer *leaf;
3173a18f 8140 u32 size = sizeof(*extent_item) + sizeof(*iref);
fcebe456 8141 u64 num_bytes = ins->offset;
0b246afa 8142 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3173a18f
JB
8143
8144 if (!skinny_metadata)
8145 size += sizeof(*block_info);
1c2308f8 8146
5d4f98a2 8147 path = btrfs_alloc_path();
857cc2fc 8148 if (!path) {
2ff7e61e 8149 btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
0b246afa 8150 fs_info->nodesize);
d8926bb3 8151 return -ENOMEM;
857cc2fc 8152 }
56bec294 8153
5d4f98a2
YZ
8154 path->leave_spinning = 1;
8155 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8156 ins, size);
79787eaa 8157 if (ret) {
dd825259 8158 btrfs_free_path(path);
2ff7e61e 8159 btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
0b246afa 8160 fs_info->nodesize);
79787eaa
JM
8161 return ret;
8162 }
5d4f98a2
YZ
8163
8164 leaf = path->nodes[0];
8165 extent_item = btrfs_item_ptr(leaf, path->slots[0],
8166 struct btrfs_extent_item);
8167 btrfs_set_extent_refs(leaf, extent_item, 1);
8168 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8169 btrfs_set_extent_flags(leaf, extent_item,
8170 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
5d4f98a2 8171
3173a18f
JB
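/*
 * With the SKINNY_METADATA incompat feature the key itself carries the
 * level, so no btrfs_tree_block_info is embedded and the inline ref
 * follows the extent item directly; num_bytes is then the nodesize
 * rather than the key's offset.
 */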
8172 if (skinny_metadata) {
8173 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
0b246afa 8174 num_bytes = fs_info->nodesize;
3173a18f
JB
8175 } else {
8176 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
8177 btrfs_set_tree_block_key(leaf, block_info, key);
8178 btrfs_set_tree_block_level(leaf, block_info, level);
8179 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
8180 }
5d4f98a2 8181
5d4f98a2
YZ
8182 if (parent > 0) {
8183 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
8184 btrfs_set_extent_inline_ref_type(leaf, iref,
8185 BTRFS_SHARED_BLOCK_REF_KEY);
8186 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8187 } else {
8188 btrfs_set_extent_inline_ref_type(leaf, iref,
8189 BTRFS_TREE_BLOCK_REF_KEY);
8190 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
8191 }
8192
8193 btrfs_mark_buffer_dirty(leaf);
8194 btrfs_free_path(path);
8195
1e144fb8
OS
8196 ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
8197 num_bytes);
8198 if (ret)
8199 return ret;
8200
6202df69
JM
8201 ret = update_block_group(trans, fs_info, ins->objectid,
8202 fs_info->nodesize, 1);
79787eaa 8203 if (ret) { /* -ENOENT, logic error */
c2cf52eb 8204 btrfs_err(fs_info, "update block group failed for %llu %llu",
c1c9ff7c 8205 ins->objectid, ins->offset);
5d4f98a2
YZ
8206 BUG();
8207 }
0be5dc67 8208
71ff6437 8209 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid,
0b246afa 8210 fs_info->nodesize);
5d4f98a2
YZ
8211 return ret;
8212}
8213
8214int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5d4f98a2 8215 u64 root_objectid, u64 owner,
5846a3c2
QW
8216 u64 offset, u64 ram_bytes,
8217 struct btrfs_key *ins)
5d4f98a2 8218{
2ff7e61e 8219 struct btrfs_fs_info *fs_info = trans->fs_info;
5d4f98a2
YZ
8220 int ret;
8221
8222 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
8223
0b246afa 8224 ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid,
66d7e7f0
AJ
8225 ins->offset, 0,
8226 root_objectid, owner, offset,
fef394f7 8227 ram_bytes, BTRFS_ADD_DELAYED_EXTENT);
e6dcd2dc
CM
8228 return ret;
8229}
e02119d5
CM
8230
8231/*
8232 * this is used by the tree logging recovery code. It records that
8233 * an extent has been allocated and makes sure to clear the free
8234 * space cache bits as well
8235 */
5d4f98a2 8236int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
2ff7e61e 8237 struct btrfs_fs_info *fs_info,
5d4f98a2
YZ
8238 u64 root_objectid, u64 owner, u64 offset,
8239 struct btrfs_key *ins)
e02119d5
CM
8240{
8241 int ret;
8242 struct btrfs_block_group_cache *block_group;
ed7a6948 8243 struct btrfs_space_info *space_info;
11833d66 8244
8c2a1a30
JB
8245 /*
8246 * Mixed block groups will exclude before processing the log so we only
01327610 8247 * need to do the exclude dance if this fs isn't mixed.
8c2a1a30 8248 */
0b246afa 8249 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
2ff7e61e
JM
8250 ret = __exclude_logged_extent(fs_info, ins->objectid,
8251 ins->offset);
b50c6e25 8252 if (ret)
8c2a1a30 8253 return ret;
11833d66
YZ
8254 }
8255
0b246afa 8256 block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8c2a1a30
JB
8257 if (!block_group)
8258 return -EINVAL;
8259
ed7a6948
WX
8260 space_info = block_group->space_info;
8261 spin_lock(&space_info->lock);
8262 spin_lock(&block_group->lock);
8263 space_info->bytes_reserved += ins->offset;
8264 block_group->reserved += ins->offset;
8265 spin_unlock(&block_group->lock);
8266 spin_unlock(&space_info->lock);
8267
2ff7e61e 8268 ret = alloc_reserved_file_extent(trans, fs_info, 0, root_objectid,
5d4f98a2 8269 0, owner, offset, ins, 1);
b50c6e25 8270 btrfs_put_block_group(block_group);
e02119d5
CM
8271 return ret;
8272}
8273
48a3b636
ES
8274static struct extent_buffer *
8275btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
fe864576 8276 u64 bytenr, int level)
65b51a00 8277{
0b246afa 8278 struct btrfs_fs_info *fs_info = root->fs_info;
65b51a00
CM
8279 struct extent_buffer *buf;
8280
2ff7e61e 8281 buf = btrfs_find_create_tree_block(fs_info, bytenr);
c871b0f2
LB
8282 if (IS_ERR(buf))
8283 return buf;
8284
65b51a00 8285 btrfs_set_header_generation(buf, trans->transid);
85d4e461 8286 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
65b51a00 8287 btrfs_tree_lock(buf);
7c302b49 8288 clean_tree_block(fs_info, buf);
3083ee2e 8289 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
b4ce94de
CM
8290
8291 btrfs_set_lock_blocking(buf);
4db8c528 8292 set_extent_buffer_uptodate(buf);
b4ce94de 8293
d0c803c4 8294 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
656f30db 8295 buf->log_index = root->log_transid % 2;
8cef4e16
YZ
8296 /*
8297 * we allow two log transactions at a time, use different
8298 * EXTENT bits to differentiate dirty pages.
8299 */
656f30db 8300 if (buf->log_index == 0)
8cef4e16
YZ
8301 set_extent_dirty(&root->dirty_log_pages, buf->start,
8302 buf->start + buf->len - 1, GFP_NOFS);
8303 else
8304 set_extent_new(&root->dirty_log_pages, buf->start,
3744dbeb 8305 buf->start + buf->len - 1);
d0c803c4 8306 } else {
656f30db 8307 buf->log_index = -1;
d0c803c4 8308 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
65b51a00 8309 buf->start + buf->len - 1, GFP_NOFS);
d0c803c4 8310 }
64c12921 8311 trans->dirty = true;
b4ce94de 8312 /* this returns a buffer locked for blocking */
65b51a00
CM
8313 return buf;
8314}
8315
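/*
 * Choose which reservation a new tree block is charged to: first the
 * root's own block_rsv (retried once after refreshing the global rsv),
 * then a fresh BTRFS_RESERVE_NO_FLUSH reservation, and as a last resort
 * bytes are taken from the global reserve when it shares the same
 * space_info.
 */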
f0486c68
YZ
8316static struct btrfs_block_rsv *
8317use_block_rsv(struct btrfs_trans_handle *trans,
8318 struct btrfs_root *root, u32 blocksize)
8319{
0b246afa 8320 struct btrfs_fs_info *fs_info = root->fs_info;
f0486c68 8321 struct btrfs_block_rsv *block_rsv;
0b246afa 8322 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
f0486c68 8323 int ret;
d88033db 8324 bool global_updated = false;
f0486c68
YZ
8325
8326 block_rsv = get_block_rsv(trans, root);
8327
b586b323
MX
8328 if (unlikely(block_rsv->size == 0))
8329 goto try_reserve;
d88033db 8330again:
f0486c68
YZ
8331 ret = block_rsv_use_bytes(block_rsv, blocksize);
8332 if (!ret)
8333 return block_rsv;
8334
b586b323
MX
8335 if (block_rsv->failfast)
8336 return ERR_PTR(ret);
8337
d88033db
MX
8338 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8339 global_updated = true;
0b246afa 8340 update_global_block_rsv(fs_info);
d88033db
MX
8341 goto again;
8342 }
8343
0b246afa 8344 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
b586b323
MX
8345 static DEFINE_RATELIMIT_STATE(_rs,
8346 DEFAULT_RATELIMIT_INTERVAL * 10,
8347 /*DEFAULT_RATELIMIT_BURST*/ 1);
8348 if (__ratelimit(&_rs))
8349 WARN(1, KERN_DEBUG
efe120a0 8350 "BTRFS: block rsv returned %d\n", ret);
b586b323
MX
8351 }
8352try_reserve:
8353 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8354 BTRFS_RESERVE_NO_FLUSH);
8355 if (!ret)
8356 return block_rsv;
8357 /*
8358 * If we couldn't reserve metadata bytes, try to use some from
5881cfc9
MX
8359 * the global reserve, as long as its space_info is the same as
8360 * this reservation's.
b586b323 8361 */
5881cfc9
MX
8362 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8363 block_rsv->space_info == global_rsv->space_info) {
b586b323
MX
8364 ret = block_rsv_use_bytes(global_rsv, blocksize);
8365 if (!ret)
8366 return global_rsv;
8367 }
8368 return ERR_PTR(ret);
f0486c68
YZ
8369}
8370
8c2a3ca2
JB
8371static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8372 struct btrfs_block_rsv *block_rsv, u32 blocksize)
f0486c68
YZ
8373{
8374 block_rsv_add_bytes(block_rsv, blocksize, 0);
8c2a3ca2 8375 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
f0486c68
YZ
8376}
8377
fec577fb 8378/*
f0486c68 8379 * finds a free extent and does all the dirty work required for allocation
67b7859e 8380 * returns the tree buffer or an ERR_PTR on error.
fec577fb 8381 */
4d75f8a9 8382struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
310712b2
OS
8383 struct btrfs_root *root,
8384 u64 parent, u64 root_objectid,
8385 const struct btrfs_disk_key *key,
8386 int level, u64 hint,
8387 u64 empty_size)
fec577fb 8388{
0b246afa 8389 struct btrfs_fs_info *fs_info = root->fs_info;
e2fa7227 8390 struct btrfs_key ins;
f0486c68 8391 struct btrfs_block_rsv *block_rsv;
5f39d397 8392 struct extent_buffer *buf;
67b7859e 8393 struct btrfs_delayed_extent_op *extent_op;
f0486c68
YZ
8394 u64 flags = 0;
8395 int ret;
0b246afa
JM
8396 u32 blocksize = fs_info->nodesize;
8397 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
fec577fb 8398
05653ef3 8399#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
0b246afa 8400 if (btrfs_is_testing(fs_info)) {
faa2dbf0 8401 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
fe864576 8402 level);
faa2dbf0
JB
8403 if (!IS_ERR(buf))
8404 root->alloc_bytenr += blocksize;
8405 return buf;
8406 }
05653ef3 8407#endif
fccb84c9 8408
f0486c68
YZ
8409 block_rsv = use_block_rsv(trans, root, blocksize);
8410 if (IS_ERR(block_rsv))
8411 return ERR_CAST(block_rsv);
8412
18513091 8413 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
e570fd27 8414 empty_size, hint, &ins, 0, 0);
67b7859e
OS
8415 if (ret)
8416 goto out_unuse;
55c69072 8417
fe864576 8418 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
67b7859e
OS
8419 if (IS_ERR(buf)) {
8420 ret = PTR_ERR(buf);
8421 goto out_free_reserved;
8422 }
f0486c68
YZ
8423
8424 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
8425 if (parent == 0)
8426 parent = ins.objectid;
8427 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8428 } else
8429 BUG_ON(parent > 0);
8430
8431 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
78a6184a 8432 extent_op = btrfs_alloc_delayed_extent_op();
67b7859e
OS
8433 if (!extent_op) {
8434 ret = -ENOMEM;
8435 goto out_free_buf;
8436 }
f0486c68
YZ
8437 if (key)
8438 memcpy(&extent_op->key, key, sizeof(extent_op->key));
8439 else
8440 memset(&extent_op->key, 0, sizeof(extent_op->key));
8441 extent_op->flags_to_set = flags;
35b3ad50
DS
8442 extent_op->update_key = skinny_metadata ? false : true;
8443 extent_op->update_flags = true;
8444 extent_op->is_data = false;
b1c79e09 8445 extent_op->level = level;
f0486c68 8446
0b246afa 8447 ret = btrfs_add_delayed_tree_ref(fs_info, trans,
67b7859e
OS
8448 ins.objectid, ins.offset,
8449 parent, root_objectid, level,
8450 BTRFS_ADD_DELAYED_EXTENT,
b06c4bf5 8451 extent_op);
67b7859e
OS
8452 if (ret)
8453 goto out_free_delayed;
f0486c68 8454 }
fec577fb 8455 return buf;
67b7859e
OS
8456
8457out_free_delayed:
8458 btrfs_free_delayed_extent_op(extent_op);
8459out_free_buf:
8460 free_extent_buffer(buf);
8461out_free_reserved:
2ff7e61e 8462 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
67b7859e 8463out_unuse:
0b246afa 8464 unuse_block_rsv(fs_info, block_rsv, blocksize);
67b7859e 8465 return ERR_PTR(ret);
fec577fb 8466}
a28ec197 8467
2c47e605
YZ
8468struct walk_control {
8469 u64 refs[BTRFS_MAX_LEVEL];
8470 u64 flags[BTRFS_MAX_LEVEL];
8471 struct btrfs_key update_progress;
8472 int stage;
8473 int level;
8474 int shared_level;
8475 int update_ref;
8476 int keep_locks;
1c4850e2
YZ
8477 int reada_slot;
8478 int reada_count;
66d7e7f0 8479 int for_reloc;
2c47e605
YZ
8480};
8481
8482#define DROP_REFERENCE 1
8483#define UPDATE_BACKREF 2
8484
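/*
 * The two walk stages drive the snapshot drop below: DROP_REFERENCE
 * frees blocks whose last reference belongs to this root, while
 * UPDATE_BACKREF is entered for a shared subtree so its back references
 * can be converted before the walk goes back to dropping; refs[] and
 * flags[] cache the extent refcount and flags for the node held at each
 * level, and update_progress tracks how far that conversion has come.
 */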
1c4850e2
YZ
8485static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8486 struct btrfs_root *root,
8487 struct walk_control *wc,
8488 struct btrfs_path *path)
6407bf6d 8489{
0b246afa 8490 struct btrfs_fs_info *fs_info = root->fs_info;
1c4850e2
YZ
8491 u64 bytenr;
8492 u64 generation;
8493 u64 refs;
94fcca9f 8494 u64 flags;
5d4f98a2 8495 u32 nritems;
1c4850e2
YZ
8496 struct btrfs_key key;
8497 struct extent_buffer *eb;
6407bf6d 8498 int ret;
1c4850e2
YZ
8499 int slot;
8500 int nread = 0;
6407bf6d 8501
1c4850e2
YZ
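/*
 * Size the readahead window adaptively: shrink it by a third if we are
 * called again before the previous window was consumed, otherwise grow
 * it by half, capped at one node's worth of block pointers.
 */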
8502 if (path->slots[wc->level] < wc->reada_slot) {
8503 wc->reada_count = wc->reada_count * 2 / 3;
8504 wc->reada_count = max(wc->reada_count, 2);
8505 } else {
8506 wc->reada_count = wc->reada_count * 3 / 2;
8507 wc->reada_count = min_t(int, wc->reada_count,
0b246afa 8508 BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1c4850e2 8509 }
7bb86316 8510
1c4850e2
YZ
8511 eb = path->nodes[wc->level];
8512 nritems = btrfs_header_nritems(eb);
bd56b302 8513
1c4850e2
YZ
8514 for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8515 if (nread >= wc->reada_count)
8516 break;
bd56b302 8517
2dd3e67b 8518 cond_resched();
1c4850e2
YZ
8519 bytenr = btrfs_node_blockptr(eb, slot);
8520 generation = btrfs_node_ptr_generation(eb, slot);
2dd3e67b 8521
1c4850e2
YZ
8522 if (slot == path->slots[wc->level])
8523 goto reada;
5d4f98a2 8524
1c4850e2
YZ
8525 if (wc->stage == UPDATE_BACKREF &&
8526 generation <= root->root_key.offset)
bd56b302
CM
8527 continue;
8528
94fcca9f 8529 /* We don't lock the tree block, it's OK to be racy here */
2ff7e61e 8530 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
3173a18f
JB
8531 wc->level - 1, 1, &refs,
8532 &flags);
79787eaa
JM
8533 /* We don't care about errors in readahead. */
8534 if (ret < 0)
8535 continue;
94fcca9f
YZ
8536 BUG_ON(refs == 0);
8537
1c4850e2 8538 if (wc->stage == DROP_REFERENCE) {
1c4850e2
YZ
8539 if (refs == 1)
8540 goto reada;
bd56b302 8541
94fcca9f
YZ
8542 if (wc->level == 1 &&
8543 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8544 continue;
1c4850e2
YZ
8545 if (!wc->update_ref ||
8546 generation <= root->root_key.offset)
8547 continue;
8548 btrfs_node_key_to_cpu(eb, &key, slot);
8549 ret = btrfs_comp_cpu_keys(&key,
8550 &wc->update_progress);
8551 if (ret < 0)
8552 continue;
94fcca9f
YZ
8553 } else {
8554 if (wc->level == 1 &&
8555 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8556 continue;
6407bf6d 8557 }
1c4850e2 8558reada:
2ff7e61e 8559 readahead_tree_block(fs_info, bytenr);
1c4850e2 8560 nread++;
20524f02 8561 }
1c4850e2 8562 wc->reada_slot = slot;
20524f02 8563}
2c47e605 8564
f82d02d9 8565/*
2c016dc2 8566 * helper to process tree block while walking down the tree.
2c47e605 8567 *
2c47e605
YZ
8568 * when wc->stage == UPDATE_BACKREF, this function updates
8569 * back refs for pointers in the block.
8570 *
8571 * NOTE: return value 1 means we should stop walking down.
f82d02d9 8572 */
2c47e605 8573static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5d4f98a2 8574 struct btrfs_root *root,
2c47e605 8575 struct btrfs_path *path,
94fcca9f 8576 struct walk_control *wc, int lookup_info)
f82d02d9 8577{
2ff7e61e 8578 struct btrfs_fs_info *fs_info = root->fs_info;
2c47e605
YZ
8579 int level = wc->level;
8580 struct extent_buffer *eb = path->nodes[level];
2c47e605 8581 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
f82d02d9
YZ
8582 int ret;
8583
2c47e605
YZ
8584 if (wc->stage == UPDATE_BACKREF &&
8585 btrfs_header_owner(eb) != root->root_key.objectid)
8586 return 1;
f82d02d9 8587
2c47e605
YZ
8588 /*
8589 * when the reference count of a tree block is 1, it won't increase
8590 * again. once the full backref flag is set, we never clear it.
8591 */
94fcca9f
YZ
8592 if (lookup_info &&
8593 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8594 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
2c47e605 8595 BUG_ON(!path->locks[level]);
2ff7e61e 8596 ret = btrfs_lookup_extent_info(trans, fs_info,
3173a18f 8597 eb->start, level, 1,
2c47e605
YZ
8598 &wc->refs[level],
8599 &wc->flags[level]);
79787eaa
JM
8600 BUG_ON(ret == -ENOMEM);
8601 if (ret)
8602 return ret;
2c47e605
YZ
8603 BUG_ON(wc->refs[level] == 0);
8604 }
5d4f98a2 8605
2c47e605
YZ
8606 if (wc->stage == DROP_REFERENCE) {
8607 if (wc->refs[level] > 1)
8608 return 1;
f82d02d9 8609
2c47e605 8610 if (path->locks[level] && !wc->keep_locks) {
bd681513 8611 btrfs_tree_unlock_rw(eb, path->locks[level]);
2c47e605
YZ
8612 path->locks[level] = 0;
8613 }
8614 return 0;
8615 }
f82d02d9 8616
2c47e605
YZ
8617 /* wc->stage == UPDATE_BACKREF */
8618 if (!(wc->flags[level] & flag)) {
8619 BUG_ON(!path->locks[level]);
e339a6b0 8620 ret = btrfs_inc_ref(trans, root, eb, 1);
79787eaa 8621 BUG_ON(ret); /* -ENOMEM */
e339a6b0 8622 ret = btrfs_dec_ref(trans, root, eb, 0);
79787eaa 8623 BUG_ON(ret); /* -ENOMEM */
2ff7e61e 8624 ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
b1c79e09
JB
8625 eb->len, flag,
8626 btrfs_header_level(eb), 0);
79787eaa 8627 BUG_ON(ret); /* -ENOMEM */
2c47e605
YZ
8628 wc->flags[level] |= flag;
8629 }
8630
8631 /*
8632 * the block is shared by multiple trees, so it's not good to
8633 * keep the tree lock
8634 */
8635 if (path->locks[level] && level > 0) {
bd681513 8636 btrfs_tree_unlock_rw(eb, path->locks[level]);
2c47e605
YZ
8637 path->locks[level] = 0;
8638 }
8639 return 0;
8640}
8641
1c4850e2 8642/*
2c016dc2 8643 * helper to process tree block pointer.
1c4850e2
YZ
8644 *
8645 * when wc->stage == DROP_REFERENCE, this function checks
8646 * the reference count of the block pointed to. if the block
8647 * is shared and we need to update back refs for the subtree
8648 * rooted at the block, this function changes wc->stage to
8649 * UPDATE_BACKREF. if the block is shared and there is no
8650 * need to update back refs, this function drops the reference
8651 * to the block.
8652 *
8653 * NOTE: return value 1 means we should stop walking down.
8654 */
8655static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8656 struct btrfs_root *root,
8657 struct btrfs_path *path,
94fcca9f 8658 struct walk_control *wc, int *lookup_info)
1c4850e2 8659{
0b246afa 8660 struct btrfs_fs_info *fs_info = root->fs_info;
1c4850e2
YZ
8661 u64 bytenr;
8662 u64 generation;
8663 u64 parent;
8664 u32 blocksize;
8665 struct btrfs_key key;
8666 struct extent_buffer *next;
8667 int level = wc->level;
8668 int reada = 0;
8669 int ret = 0;
1152651a 8670 bool need_account = false;
1c4850e2
YZ
8671
8672 generation = btrfs_node_ptr_generation(path->nodes[level],
8673 path->slots[level]);
8674 /*
8675 * if the lower level block was created before the snapshot
8676 * was created, we know there is no need to update back refs
8677 * for the subtree
8678 */
8679 if (wc->stage == UPDATE_BACKREF &&
94fcca9f
YZ
8680 generation <= root->root_key.offset) {
8681 *lookup_info = 1;
1c4850e2 8682 return 1;
94fcca9f 8683 }
1c4850e2
YZ
8684
8685 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
0b246afa 8686 blocksize = fs_info->nodesize;
1c4850e2 8687
0b246afa 8688 next = find_extent_buffer(fs_info, bytenr);
1c4850e2 8689 if (!next) {
2ff7e61e 8690 next = btrfs_find_create_tree_block(fs_info, bytenr);
c871b0f2
LB
8691 if (IS_ERR(next))
8692 return PTR_ERR(next);
8693
b2aaaa3b
JB
8694 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8695 level - 1);
1c4850e2
YZ
8696 reada = 1;
8697 }
8698 btrfs_tree_lock(next);
8699 btrfs_set_lock_blocking(next);
8700
2ff7e61e 8701 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
94fcca9f
YZ
8702 &wc->refs[level - 1],
8703 &wc->flags[level - 1]);
4867268c
JB
8704 if (ret < 0)
8705 goto out_unlock;
79787eaa 8706
c2cf52eb 8707 if (unlikely(wc->refs[level - 1] == 0)) {
0b246afa 8708 btrfs_err(fs_info, "Missing references.");
4867268c
JB
8709 ret = -EIO;
8710 goto out_unlock;
c2cf52eb 8711 }
94fcca9f 8712 *lookup_info = 0;
1c4850e2 8713
94fcca9f 8714 if (wc->stage == DROP_REFERENCE) {
1c4850e2 8715 if (wc->refs[level - 1] > 1) {
1152651a 8716 need_account = true;
94fcca9f
YZ
8717 if (level == 1 &&
8718 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8719 goto skip;
8720
1c4850e2
YZ
8721 if (!wc->update_ref ||
8722 generation <= root->root_key.offset)
8723 goto skip;
8724
8725 btrfs_node_key_to_cpu(path->nodes[level], &key,
8726 path->slots[level]);
8727 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8728 if (ret < 0)
8729 goto skip;
8730
8731 wc->stage = UPDATE_BACKREF;
8732 wc->shared_level = level - 1;
8733 }
94fcca9f
YZ
8734 } else {
8735 if (level == 1 &&
8736 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8737 goto skip;
1c4850e2
YZ
8738 }
8739
b9fab919 8740 if (!btrfs_buffer_uptodate(next, generation, 0)) {
1c4850e2
YZ
8741 btrfs_tree_unlock(next);
8742 free_extent_buffer(next);
8743 next = NULL;
94fcca9f 8744 *lookup_info = 1;
1c4850e2
YZ
8745 }
8746
8747 if (!next) {
8748 if (reada && level == 1)
8749 reada_walk_down(trans, root, wc, path);
2ff7e61e 8750 next = read_tree_block(fs_info, bytenr, generation);
64c043de
LB
8751 if (IS_ERR(next)) {
8752 return PTR_ERR(next);
8753 } else if (!extent_buffer_uptodate(next)) {
416bc658 8754 free_extent_buffer(next);
97d9a8a4 8755 return -EIO;
416bc658 8756 }
1c4850e2
YZ
8757 btrfs_tree_lock(next);
8758 btrfs_set_lock_blocking(next);
8759 }
8760
8761 level--;
4867268c
JB
8762 ASSERT(level == btrfs_header_level(next));
8763 if (level != btrfs_header_level(next)) {
8764 btrfs_err(root->fs_info, "mismatched level");
8765 ret = -EIO;
8766 goto out_unlock;
8767 }
1c4850e2
YZ
8768 path->nodes[level] = next;
8769 path->slots[level] = 0;
bd681513 8770 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
1c4850e2
YZ
8771 wc->level = level;
8772 if (wc->level == 1)
8773 wc->reada_slot = 0;
8774 return 0;
8775skip:
8776 wc->refs[level - 1] = 0;
8777 wc->flags[level - 1] = 0;
94fcca9f
YZ
8778 if (wc->stage == DROP_REFERENCE) {
8779 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
8780 parent = path->nodes[level]->start;
8781 } else {
4867268c 8782 ASSERT(root->root_key.objectid ==
94fcca9f 8783 btrfs_header_owner(path->nodes[level]));
4867268c
JB
8784 if (root->root_key.objectid !=
8785 btrfs_header_owner(path->nodes[level])) {
8786 btrfs_err(root->fs_info,
8787 "mismatched block owner");
8788 ret = -EIO;
8789 goto out_unlock;
8790 }
94fcca9f
YZ
8791 parent = 0;
8792 }
1c4850e2 8793
1152651a 8794 if (need_account) {
33d1f05c
QW
8795 ret = btrfs_qgroup_trace_subtree(trans, root, next,
8796 generation, level - 1);
1152651a 8797 if (ret) {
0b246afa 8798 btrfs_err_rl(fs_info,
5d163e0e
JM
8799 "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
8800 ret);
1152651a
MF
8801 }
8802 }
2ff7e61e
JM
8803 ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize,
8804 parent, root->root_key.objectid,
8805 level - 1, 0);
4867268c
JB
8806 if (ret)
8807 goto out_unlock;
1c4850e2 8808 }
4867268c
JB
8809
8810 *lookup_info = 1;
8811 ret = 1;
8812
8813out_unlock:
1c4850e2
YZ
8814 btrfs_tree_unlock(next);
8815 free_extent_buffer(next);
4867268c
JB
8816
8817 return ret;
1c4850e2
YZ
8818}
8819
2c47e605 8820/*
2c016dc2 8821 * helper to process tree block while walking up the tree.
2c47e605
YZ
8822 *
8823 * when wc->stage == DROP_REFERENCE, this function drops
8824 * the reference count on the block.
8825 *
8826 * when wc->stage == UPDATE_BACKREF, this function changes
8827 * wc->stage back to DROP_REFERENCE if we changed wc->stage
8828 * to UPDATE_BACKREF previously while processing the block.
8829 *
8830 * NOTE: return value 1 means we should stop walking up.
8831 */
8832static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8833 struct btrfs_root *root,
8834 struct btrfs_path *path,
8835 struct walk_control *wc)
8836{
0b246afa 8837 struct btrfs_fs_info *fs_info = root->fs_info;
f0486c68 8838 int ret;
2c47e605
YZ
8839 int level = wc->level;
8840 struct extent_buffer *eb = path->nodes[level];
8841 u64 parent = 0;
8842
8843 if (wc->stage == UPDATE_BACKREF) {
8844 BUG_ON(wc->shared_level < level);
8845 if (level < wc->shared_level)
8846 goto out;
8847
2c47e605
YZ
8848 ret = find_next_key(path, level + 1, &wc->update_progress);
8849 if (ret > 0)
8850 wc->update_ref = 0;
8851
8852 wc->stage = DROP_REFERENCE;
8853 wc->shared_level = -1;
8854 path->slots[level] = 0;
8855
8856 /*
8857 * check reference count again if the block isn't locked.
8858 * we should start walking down the tree again if reference
8859 * count is one.
8860 */
8861 if (!path->locks[level]) {
8862 BUG_ON(level == 0);
8863 btrfs_tree_lock(eb);
8864 btrfs_set_lock_blocking(eb);
bd681513 8865 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605 8866
2ff7e61e 8867 ret = btrfs_lookup_extent_info(trans, fs_info,
3173a18f 8868 eb->start, level, 1,
2c47e605
YZ
8869 &wc->refs[level],
8870 &wc->flags[level]);
79787eaa
JM
8871 if (ret < 0) {
8872 btrfs_tree_unlock_rw(eb, path->locks[level]);
3268a246 8873 path->locks[level] = 0;
79787eaa
JM
8874 return ret;
8875 }
2c47e605
YZ
8876 BUG_ON(wc->refs[level] == 0);
8877 if (wc->refs[level] == 1) {
bd681513 8878 btrfs_tree_unlock_rw(eb, path->locks[level]);
3268a246 8879 path->locks[level] = 0;
2c47e605
YZ
8880 return 1;
8881 }
f82d02d9 8882 }
2c47e605 8883 }
f82d02d9 8884
2c47e605
YZ
8885 /* wc->stage == DROP_REFERENCE */
8886 BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5d4f98a2 8887
2c47e605
YZ
8888 if (wc->refs[level] == 1) {
8889 if (level == 0) {
8890 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
e339a6b0 8891 ret = btrfs_dec_ref(trans, root, eb, 1);
2c47e605 8892 else
e339a6b0 8893 ret = btrfs_dec_ref(trans, root, eb, 0);
79787eaa 8894 BUG_ON(ret); /* -ENOMEM */
2ff7e61e 8895 ret = btrfs_qgroup_trace_leaf_items(trans, fs_info, eb);
1152651a 8896 if (ret) {
0b246afa 8897 btrfs_err_rl(fs_info,
5d163e0e
JM
8898 "error %d accounting leaf items. Quota is out of sync, rescan required.",
8899 ret);
1152651a 8900 }
2c47e605
YZ
8901 }
8902 /* make block locked assertion in clean_tree_block happy */
8903 if (!path->locks[level] &&
8904 btrfs_header_generation(eb) == trans->transid) {
8905 btrfs_tree_lock(eb);
8906 btrfs_set_lock_blocking(eb);
bd681513 8907 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605 8908 }
7c302b49 8909 clean_tree_block(fs_info, eb);
2c47e605
YZ
8910 }
8911
8912 if (eb == root->node) {
8913 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8914 parent = eb->start;
8915 else
8916 BUG_ON(root->root_key.objectid !=
8917 btrfs_header_owner(eb));
8918 } else {
8919 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8920 parent = path->nodes[level + 1]->start;
8921 else
8922 BUG_ON(root->root_key.objectid !=
8923 btrfs_header_owner(path->nodes[level + 1]));
f82d02d9 8924 }
f82d02d9 8925
5581a51a 8926 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
2c47e605
YZ
8927out:
8928 wc->refs[level] = 0;
8929 wc->flags[level] = 0;
f0486c68 8930 return 0;
2c47e605
YZ
8931}
8932
8933static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
8934 struct btrfs_root *root,
8935 struct btrfs_path *path,
8936 struct walk_control *wc)
8937{
2c47e605 8938 int level = wc->level;
94fcca9f 8939 int lookup_info = 1;
2c47e605
YZ
8940 int ret;
8941
8942 while (level >= 0) {
94fcca9f 8943 ret = walk_down_proc(trans, root, path, wc, lookup_info);
2c47e605
YZ
8944 if (ret > 0)
8945 break;
8946
8947 if (level == 0)
8948 break;
8949
7a7965f8
YZ
8950 if (path->slots[level] >=
8951 btrfs_header_nritems(path->nodes[level]))
8952 break;
8953
94fcca9f 8954 ret = do_walk_down(trans, root, path, wc, &lookup_info);
1c4850e2
YZ
8955 if (ret > 0) {
8956 path->slots[level]++;
8957 continue;
90d2c51d
MX
8958 } else if (ret < 0)
8959 return ret;
1c4850e2 8960 level = wc->level;
f82d02d9 8961 }
f82d02d9
YZ
8962 return 0;
8963}
8964
d397712b 8965static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
98ed5174 8966 struct btrfs_root *root,
f82d02d9 8967 struct btrfs_path *path,
2c47e605 8968 struct walk_control *wc, int max_level)
20524f02 8969{
2c47e605 8970 int level = wc->level;
20524f02 8971 int ret;
9f3a7427 8972
2c47e605
YZ
8973 path->slots[level] = btrfs_header_nritems(path->nodes[level]);
8974 while (level < max_level && path->nodes[level]) {
8975 wc->level = level;
8976 if (path->slots[level] + 1 <
8977 btrfs_header_nritems(path->nodes[level])) {
8978 path->slots[level]++;
20524f02
CM
8979 return 0;
8980 } else {
2c47e605
YZ
8981 ret = walk_up_proc(trans, root, path, wc);
8982 if (ret > 0)
8983 return 0;
bd56b302 8984
2c47e605 8985 if (path->locks[level]) {
bd681513
CM
8986 btrfs_tree_unlock_rw(path->nodes[level],
8987 path->locks[level]);
2c47e605 8988 path->locks[level] = 0;
f82d02d9 8989 }
2c47e605
YZ
8990 free_extent_buffer(path->nodes[level]);
8991 path->nodes[level] = NULL;
8992 level++;
20524f02
CM
8993 }
8994 }
8995 return 1;
8996}
8997
9aca1d51 8998/*
2c47e605
YZ
8999 * drop a subvolume tree.
9000 *
9001 * this function traverses the tree, freeing any blocks that are only
9002 * referenced by the tree.
9003 *
9004 * when a shared tree block is found, this function decreases its
9005 * reference count by one. if update_ref is true, this function
9006 * also makes sure backrefs for the shared block and all lower level
9007 * blocks are properly updated.
9d1a2a3a
DS
9008 *
9009 * If called with for_reloc == 0, may exit early with -EAGAIN
9aca1d51 9010 */
2c536799 9011int btrfs_drop_snapshot(struct btrfs_root *root,
66d7e7f0
AJ
9012 struct btrfs_block_rsv *block_rsv, int update_ref,
9013 int for_reloc)
20524f02 9014{
ab8d0fc4 9015 struct btrfs_fs_info *fs_info = root->fs_info;
5caf2a00 9016 struct btrfs_path *path;
2c47e605 9017 struct btrfs_trans_handle *trans;
ab8d0fc4 9018 struct btrfs_root *tree_root = fs_info->tree_root;
9f3a7427 9019 struct btrfs_root_item *root_item = &root->root_item;
2c47e605
YZ
9020 struct walk_control *wc;
9021 struct btrfs_key key;
9022 int err = 0;
9023 int ret;
9024 int level;
d29a9f62 9025 bool root_dropped = false;
20524f02 9026
ab8d0fc4 9027 btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
1152651a 9028
5caf2a00 9029 path = btrfs_alloc_path();
cb1b69f4
TI
9030 if (!path) {
9031 err = -ENOMEM;
9032 goto out;
9033 }
20524f02 9034
2c47e605 9035 wc = kzalloc(sizeof(*wc), GFP_NOFS);
38a1a919
MF
9036 if (!wc) {
9037 btrfs_free_path(path);
cb1b69f4
TI
9038 err = -ENOMEM;
9039 goto out;
38a1a919 9040 }
2c47e605 9041
a22285a6 9042 trans = btrfs_start_transaction(tree_root, 0);
79787eaa
JM
9043 if (IS_ERR(trans)) {
9044 err = PTR_ERR(trans);
9045 goto out_free;
9046 }
98d5dc13 9047
3fd0a558
YZ
9048 if (block_rsv)
9049 trans->block_rsv = block_rsv;
2c47e605 9050
9f3a7427 9051 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2c47e605 9052 level = btrfs_header_level(root->node);
5d4f98a2
YZ
9053 path->nodes[level] = btrfs_lock_root_node(root);
9054 btrfs_set_lock_blocking(path->nodes[level]);
9f3a7427 9055 path->slots[level] = 0;
bd681513 9056 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605
YZ
9057 memset(&wc->update_progress, 0,
9058 sizeof(wc->update_progress));
9f3a7427 9059 } else {
9f3a7427 9060 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2c47e605
YZ
9061 memcpy(&wc->update_progress, &key,
9062 sizeof(wc->update_progress));
9063
6702ed49 9064 level = root_item->drop_level;
2c47e605 9065 BUG_ON(level == 0);
6702ed49 9066 path->lowest_level = level;
2c47e605
YZ
9067 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9068 path->lowest_level = 0;
9069 if (ret < 0) {
9070 err = ret;
79787eaa 9071 goto out_end_trans;
9f3a7427 9072 }
1c4850e2 9073 WARN_ON(ret > 0);
2c47e605 9074
7d9eb12c
CM
9075 /*
9076 * unlock our path, this is safe because only this
9077 * function is allowed to delete this snapshot
9078 */
5d4f98a2 9079 btrfs_unlock_up_safe(path, 0);
2c47e605
YZ
9080
9081 level = btrfs_header_level(root->node);
9082 while (1) {
9083 btrfs_tree_lock(path->nodes[level]);
9084 btrfs_set_lock_blocking(path->nodes[level]);
fec386ac 9085 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605 9086
2ff7e61e 9087 ret = btrfs_lookup_extent_info(trans, fs_info,
2c47e605 9088 path->nodes[level]->start,
3173a18f 9089 level, 1, &wc->refs[level],
2c47e605 9090 &wc->flags[level]);
79787eaa
JM
9091 if (ret < 0) {
9092 err = ret;
9093 goto out_end_trans;
9094 }
2c47e605
YZ
9095 BUG_ON(wc->refs[level] == 0);
9096
9097 if (level == root_item->drop_level)
9098 break;
9099
9100 btrfs_tree_unlock(path->nodes[level]);
fec386ac 9101 path->locks[level] = 0;
2c47e605
YZ
9102 WARN_ON(wc->refs[level] != 1);
9103 level--;
9104 }
9f3a7427 9105 }
2c47e605
YZ
9106
9107 wc->level = level;
9108 wc->shared_level = -1;
9109 wc->stage = DROP_REFERENCE;
9110 wc->update_ref = update_ref;
9111 wc->keep_locks = 0;
66d7e7f0 9112 wc->for_reloc = for_reloc;
0b246afa 9113 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
2c47e605 9114
d397712b 9115 while (1) {
9d1a2a3a 9116
2c47e605
YZ
9117 ret = walk_down_tree(trans, root, path, wc);
9118 if (ret < 0) {
9119 err = ret;
20524f02 9120 break;
2c47e605 9121 }
9aca1d51 9122
2c47e605
YZ
9123 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
9124 if (ret < 0) {
9125 err = ret;
20524f02 9126 break;
2c47e605
YZ
9127 }
9128
9129 if (ret > 0) {
9130 BUG_ON(wc->stage != DROP_REFERENCE);
e7a84565
CM
9131 break;
9132 }
2c47e605
YZ
9133
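/*
 * Record how far the walk has come so that a later transaction (or a
 * mount after a crash) can resume the drop from drop_progress and
 * drop_level instead of starting from scratch.
 */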
9134 if (wc->stage == DROP_REFERENCE) {
9135 level = wc->level;
9136 btrfs_node_key(path->nodes[level],
9137 &root_item->drop_progress,
9138 path->slots[level]);
9139 root_item->drop_level = level;
9140 }
9141
9142 BUG_ON(wc->level == 0);
3a45bb20 9143 if (btrfs_should_end_transaction(trans) ||
2ff7e61e 9144 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
2c47e605
YZ
9145 ret = btrfs_update_root(trans, tree_root,
9146 &root->root_key,
9147 root_item);
79787eaa 9148 if (ret) {
66642832 9149 btrfs_abort_transaction(trans, ret);
79787eaa
JM
9150 err = ret;
9151 goto out_end_trans;
9152 }
2c47e605 9153
3a45bb20 9154 btrfs_end_transaction_throttle(trans);
2ff7e61e 9155 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
ab8d0fc4
JM
9156 btrfs_debug(fs_info,
9157 "drop snapshot early exit");
3c8f2422
JB
9158 err = -EAGAIN;
9159 goto out_free;
9160 }
9161
a22285a6 9162 trans = btrfs_start_transaction(tree_root, 0);
79787eaa
JM
9163 if (IS_ERR(trans)) {
9164 err = PTR_ERR(trans);
9165 goto out_free;
9166 }
3fd0a558
YZ
9167 if (block_rsv)
9168 trans->block_rsv = block_rsv;
c3e69d58 9169 }
20524f02 9170 }
b3b4aa74 9171 btrfs_release_path(path);
79787eaa
JM
9172 if (err)
9173 goto out_end_trans;
2c47e605
YZ
9174
9175 ret = btrfs_del_root(trans, tree_root, &root->root_key);
79787eaa 9176 if (ret) {
66642832 9177 btrfs_abort_transaction(trans, ret);
79787eaa
JM
9178 goto out_end_trans;
9179 }
2c47e605 9180
76dda93c 9181 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
cb517eab
MX
9182 ret = btrfs_find_root(tree_root, &root->root_key, path,
9183 NULL, NULL);
79787eaa 9184 if (ret < 0) {
66642832 9185 btrfs_abort_transaction(trans, ret);
79787eaa
JM
9186 err = ret;
9187 goto out_end_trans;
9188 } else if (ret > 0) {
84cd948c
JB
9189 /* if we fail to delete the orphan item this time
9190 * around, it'll get picked up the next time.
9191 *
9192 * The most common failure here is just -ENOENT.
9193 */
9194 btrfs_del_orphan_item(trans, tree_root,
9195 root->root_key.objectid);
76dda93c
YZ
9196 }
9197 }
9198
27cdeb70 9199 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
2b9dbef2 9200 btrfs_add_dropped_root(trans, root);
76dda93c
YZ
9201 } else {
9202 free_extent_buffer(root->node);
9203 free_extent_buffer(root->commit_root);
b0feb9d9 9204 btrfs_put_fs_root(root);
76dda93c 9205 }
d29a9f62 9206 root_dropped = true;
79787eaa 9207out_end_trans:
3a45bb20 9208 btrfs_end_transaction_throttle(trans);
79787eaa 9209out_free:
2c47e605 9210 kfree(wc);
5caf2a00 9211 btrfs_free_path(path);
cb1b69f4 9212out:
d29a9f62
JB
9213 /*
9214 * So if we need to stop dropping the snapshot for whatever reason we
9215 * need to make sure to add it back to the dead root list so that we
9216 * keep trying to do the work later. This also cleans up roots if we
9217 * don't have them in the radix (like when we recover after a power fail
9218 * or unmount) so we don't leak memory.
9219 */
b37b39cd 9220 if (!for_reloc && root_dropped == false)
d29a9f62 9221 btrfs_add_dead_root(root);
90515e7f 9222 if (err && err != -EAGAIN)
ab8d0fc4 9223 btrfs_handle_fs_error(fs_info, err, NULL);
2c536799 9224 return err;
20524f02 9225}
9078a3e1 9226
2c47e605
YZ
9227/*
9228 * drop subtree rooted at tree block 'node'.
9229 *
9230 * NOTE: this function will unlock and release tree block 'node'
66d7e7f0 9231 * only used by relocation code
2c47e605 9232 */
f82d02d9
YZ
9233int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9234 struct btrfs_root *root,
9235 struct extent_buffer *node,
9236 struct extent_buffer *parent)
9237{
0b246afa 9238 struct btrfs_fs_info *fs_info = root->fs_info;
f82d02d9 9239 struct btrfs_path *path;
2c47e605 9240 struct walk_control *wc;
f82d02d9
YZ
9241 int level;
9242 int parent_level;
9243 int ret = 0;
9244 int wret;
9245
2c47e605
YZ
9246 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9247
f82d02d9 9248 path = btrfs_alloc_path();
db5b493a
TI
9249 if (!path)
9250 return -ENOMEM;
f82d02d9 9251
2c47e605 9252 wc = kzalloc(sizeof(*wc), GFP_NOFS);
db5b493a
TI
9253 if (!wc) {
9254 btrfs_free_path(path);
9255 return -ENOMEM;
9256 }
2c47e605 9257
b9447ef8 9258 btrfs_assert_tree_locked(parent);
f82d02d9
YZ
9259 parent_level = btrfs_header_level(parent);
9260 extent_buffer_get(parent);
9261 path->nodes[parent_level] = parent;
9262 path->slots[parent_level] = btrfs_header_nritems(parent);
9263
b9447ef8 9264 btrfs_assert_tree_locked(node);
f82d02d9 9265 level = btrfs_header_level(node);
f82d02d9
YZ
9266 path->nodes[level] = node;
9267 path->slots[level] = 0;
bd681513 9268 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605
YZ
9269
9270 wc->refs[parent_level] = 1;
9271 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9272 wc->level = level;
9273 wc->shared_level = -1;
9274 wc->stage = DROP_REFERENCE;
9275 wc->update_ref = 0;
9276 wc->keep_locks = 1;
66d7e7f0 9277 wc->for_reloc = 1;
0b246afa 9278 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
f82d02d9
YZ
9279
9280 while (1) {
2c47e605
YZ
9281 wret = walk_down_tree(trans, root, path, wc);
9282 if (wret < 0) {
f82d02d9 9283 ret = wret;
f82d02d9 9284 break;
2c47e605 9285 }
f82d02d9 9286
2c47e605 9287 wret = walk_up_tree(trans, root, path, wc, parent_level);
f82d02d9
YZ
9288 if (wret < 0)
9289 ret = wret;
9290 if (wret != 0)
9291 break;
9292 }
9293
2c47e605 9294 kfree(wc);
f82d02d9
YZ
9295 btrfs_free_path(path);
9296 return ret;
9297}
9298
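/*
 * A minimal, self-contained model (not compiled, hypothetical names) of
 * the profile reduction that update_block_group_flags() below performs
 * when only one writable device remains:
 */
#if 0
enum example_profile { EX_SINGLE, EX_DUP, EX_RAID0, EX_RAID1, EX_RAID10 };

static enum example_profile reduce_for_one_device(enum example_profile p)
{
	switch (p) {
	case EX_RAID0:		/* striping needs more than one device */
		return EX_SINGLE;
	case EX_RAID1:
	case EX_RAID10:		/* mirroring degrades to duplication */
		return EX_DUP;
	default:		/* single and dup already fit */
		return p;
	}
}
#endif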
6202df69 9299static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
ec44a35c
CM
9300{
9301 u64 num_devices;
fc67c450 9302 u64 stripped;
e4d8ec0f 9303
fc67c450
ID
9304 /*
9305 * if restripe for this chunk_type is on pick target profile and
9306 * return, otherwise do the usual balance
9307 */
6202df69 9308 stripped = get_restripe_target(fs_info, flags);
fc67c450
ID
9309 if (stripped)
9310 return extended_to_chunk(stripped);
e4d8ec0f 9311
6202df69 9312 num_devices = fs_info->fs_devices->rw_devices;
cd02dca5 9313
fc67c450 9314 stripped = BTRFS_BLOCK_GROUP_RAID0 |
53b381b3 9315 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
fc67c450
ID
9316 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9317
ec44a35c
CM
9318 if (num_devices == 1) {
9319 stripped |= BTRFS_BLOCK_GROUP_DUP;
9320 stripped = flags & ~stripped;
9321
9322 /* turn raid0 into single device chunks */
9323 if (flags & BTRFS_BLOCK_GROUP_RAID0)
9324 return stripped;
9325
9326 /* turn mirroring into duplication */
9327 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9328 BTRFS_BLOCK_GROUP_RAID10))
9329 return stripped | BTRFS_BLOCK_GROUP_DUP;
ec44a35c
CM
9330 } else {
9331 /* they already had raid on here, just return */
ec44a35c
CM
9332 if (flags & stripped)
9333 return flags;
9334
9335 stripped |= BTRFS_BLOCK_GROUP_DUP;
9336 stripped = flags & ~stripped;
9337
9338 /* switch duplicated blocks with raid1 */
9339 if (flags & BTRFS_BLOCK_GROUP_DUP)
9340 return stripped | BTRFS_BLOCK_GROUP_RAID1;
9341
e3176ca2 9342 /* this is drive concat, leave it alone */
ec44a35c 9343 }
e3176ca2 9344
ec44a35c
CM
9345 return flags;
9346}
9347
868f401a 9348static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
0ef3e66b 9349{
f0486c68
YZ
9350 struct btrfs_space_info *sinfo = cache->space_info;
9351 u64 num_bytes;
199c36ea 9352 u64 min_allocable_bytes;
f0486c68 9353 int ret = -ENOSPC;
0ef3e66b 9354
199c36ea
MX
9355 /*
9356 * We need some metadata space and system metadata space for
9357 * allocating chunks in some corner cases, so keep some spare space
9358 * unless we are forced to set the group readonly.
9359 */
9360 if ((sinfo->flags &
9361 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9362 !force)
ee22184b 9363 min_allocable_bytes = SZ_1M;
199c36ea
MX
9364 else
9365 min_allocable_bytes = 0;
9366
f0486c68
YZ
9367 spin_lock(&sinfo->lock);
9368 spin_lock(&cache->lock);
61cfea9b
W
9369
9370 if (cache->ro) {
868f401a 9371 cache->ro++;
61cfea9b
W
9372 ret = 0;
9373 goto out;
9374 }
9375
f0486c68
YZ
9376 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9377 cache->bytes_super - btrfs_block_group_used(&cache->item);
9378
4136135b 9379 if (btrfs_space_info_used(sinfo, true) + num_bytes +
37be25bc 9380 min_allocable_bytes <= sinfo->total_bytes) {
f0486c68 9381 sinfo->bytes_readonly += num_bytes;
868f401a 9382 cache->ro++;
633c0aad 9383 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
f0486c68
YZ
9384 ret = 0;
9385 }
61cfea9b 9386out:
f0486c68
YZ
9387 spin_unlock(&cache->lock);
9388 spin_unlock(&sinfo->lock);
9389 return ret;
9390}
7d9eb12c 9391
5e00f193 9392int btrfs_inc_block_group_ro(struct btrfs_fs_info *fs_info,
f0486c68 9393 struct btrfs_block_group_cache *cache)
c286ac48 9394
f0486c68
YZ
9395{
9396 struct btrfs_trans_handle *trans;
9397 u64 alloc_flags;
9398 int ret;
7d9eb12c 9399
1bbc621e 9400again:
5e00f193 9401 trans = btrfs_join_transaction(fs_info->extent_root);
79787eaa
JM
9402 if (IS_ERR(trans))
9403 return PTR_ERR(trans);
5d4f98a2 9404
1bbc621e
CM
9405 /*
9406 * we're not allowed to set block groups readonly after the dirty
9407 * block groups cache has started writing. If it already started,
9408 * back off and let this transaction commit
9409 */
0b246afa 9410 mutex_lock(&fs_info->ro_block_group_mutex);
3204d33c 9411 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
1bbc621e
CM
9412 u64 transid = trans->transid;
9413
0b246afa 9414 mutex_unlock(&fs_info->ro_block_group_mutex);
3a45bb20 9415 btrfs_end_transaction(trans);
1bbc621e 9416
2ff7e61e 9417 ret = btrfs_wait_for_commit(fs_info, transid);
1bbc621e
CM
9418 if (ret)
9419 return ret;
9420 goto again;
9421 }
9422
153c35b6
CM
9423 /*
9424 * if we are changing raid levels, try to allocate a corresponding
9425 * block group with the new raid level.
9426 */
0b246afa 9427 alloc_flags = update_block_group_flags(fs_info, cache->flags);
153c35b6 9428 if (alloc_flags != cache->flags) {
2ff7e61e 9429 ret = do_chunk_alloc(trans, fs_info, alloc_flags,
153c35b6
CM
9430 CHUNK_ALLOC_FORCE);
9431 /*
9432 * ENOSPC is allowed here, we may have enough space
9433 * already allocated at the new raid level to
9434 * carry on
9435 */
9436 if (ret == -ENOSPC)
9437 ret = 0;
9438 if (ret < 0)
9439 goto out;
9440 }
1bbc621e 9441
868f401a 9442 ret = inc_block_group_ro(cache, 0);
f0486c68
YZ
9443 if (!ret)
9444 goto out;
2ff7e61e
JM
9445 alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
9446 ret = do_chunk_alloc(trans, fs_info, alloc_flags,
0e4f8f88 9447 CHUNK_ALLOC_FORCE);
f0486c68
YZ
9448 if (ret < 0)
9449 goto out;
868f401a 9450 ret = inc_block_group_ro(cache, 0);
f0486c68 9451out:
2f081088 9452 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
0b246afa 9453 alloc_flags = update_block_group_flags(fs_info, cache->flags);
34441361 9454 mutex_lock(&fs_info->chunk_mutex);
2ff7e61e 9455 check_system_chunk(trans, fs_info, alloc_flags);
34441361 9456 mutex_unlock(&fs_info->chunk_mutex);
2f081088 9457 }
0b246afa 9458 mutex_unlock(&fs_info->ro_block_group_mutex);
2f081088 9459
3a45bb20 9460 btrfs_end_transaction(trans);
f0486c68
YZ
9461 return ret;
9462}
5d4f98a2 9463
c87f08ca 9464int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
2ff7e61e 9465 struct btrfs_fs_info *fs_info, u64 type)
c87f08ca 9466{
2ff7e61e
JM
9467 u64 alloc_flags = get_alloc_profile(fs_info, type);
9468
9469 return do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE);
c87f08ca
CM
9470}
9471
6d07bcec
MX
9472/*
9473 * helper to account the unused space of all the readonly block groups in the
633c0aad 9474 * space_info. takes mirrors into account.
6d07bcec 9475 */
633c0aad 9476u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
6d07bcec
MX
9477{
9478 struct btrfs_block_group_cache *block_group;
9479 u64 free_bytes = 0;
9480 int factor;
9481
01327610 9482 /* It's df, we don't care if it's racy */
633c0aad
JB
9483 if (list_empty(&sinfo->ro_bgs))
9484 return 0;
9485
9486 spin_lock(&sinfo->lock);
9487 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
6d07bcec
MX
9488 spin_lock(&block_group->lock);
9489
9490 if (!block_group->ro) {
9491 spin_unlock(&block_group->lock);
9492 continue;
9493 }
9494
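/*
 * Mirrored profiles (RAID1, RAID10, DUP) store every byte twice, so each
 * unused logical byte in the group accounts for two bytes of raw device
 * space.
 */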
9495 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
9496 BTRFS_BLOCK_GROUP_RAID10 |
9497 BTRFS_BLOCK_GROUP_DUP))
9498 factor = 2;
9499 else
9500 factor = 1;
9501
9502 free_bytes += (block_group->key.offset -
9503 btrfs_block_group_used(&block_group->item)) *
9504 factor;
9505
9506 spin_unlock(&block_group->lock);
9507 }
6d07bcec
MX
9508 spin_unlock(&sinfo->lock);
9509
9510 return free_bytes;
9511}
9512
2ff7e61e 9513void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
5d4f98a2 9514{
f0486c68
YZ
9515 struct btrfs_space_info *sinfo = cache->space_info;
9516 u64 num_bytes;
9517
9518 BUG_ON(!cache->ro);
9519
9520 spin_lock(&sinfo->lock);
9521 spin_lock(&cache->lock);
868f401a
Z
9522 if (!--cache->ro) {
9523 num_bytes = cache->key.offset - cache->reserved -
9524 cache->pinned - cache->bytes_super -
9525 btrfs_block_group_used(&cache->item);
9526 sinfo->bytes_readonly -= num_bytes;
9527 list_del_init(&cache->ro_list);
9528 }
f0486c68
YZ
9529 spin_unlock(&cache->lock);
9530 spin_unlock(&sinfo->lock);
5d4f98a2
YZ
9531}
9532
ba1bf481
JB
9533/*
9534 * checks to see if it's even possible to relocate this block group.
9535 *
9536 * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
9537 * ok to go ahead and try.
9538 */
6bccf3ab 9539int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
1a40e23b 9540{
6bccf3ab 9541 struct btrfs_root *root = fs_info->extent_root;
ba1bf481
JB
9542 struct btrfs_block_group_cache *block_group;
9543 struct btrfs_space_info *space_info;
0b246afa 9544 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
ba1bf481 9545 struct btrfs_device *device;
6df9a95e 9546 struct btrfs_trans_handle *trans;
cdcb725c 9547 u64 min_free;
6719db6a
JB
9548 u64 dev_min = 1;
9549 u64 dev_nr = 0;
4a5e98f5 9550 u64 target;
0305bc27 9551 int debug;
cdcb725c 9552 int index;
ba1bf481
JB
9553 int full = 0;
9554 int ret = 0;
1a40e23b 9555
0b246afa 9556 debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
0305bc27 9557
0b246afa 9558 block_group = btrfs_lookup_block_group(fs_info, bytenr);
1a40e23b 9559
ba1bf481 9560 /* odd, couldn't find the block group, leave it alone */
0305bc27
QW
9561 if (!block_group) {
9562 if (debug)
0b246afa 9563 btrfs_warn(fs_info,
0305bc27
QW
9564 "can't find block group for bytenr %llu",
9565 bytenr);
ba1bf481 9566 return -1;
0305bc27 9567 }
1a40e23b 9568
cdcb725c 9569 min_free = btrfs_block_group_used(&block_group->item);
9570
ba1bf481 9571 /* no bytes used, we're good */
cdcb725c 9572 if (!min_free)
1a40e23b
ZY
9573 goto out;
9574
ba1bf481
JB
9575 space_info = block_group->space_info;
9576 spin_lock(&space_info->lock);
17d217fe 9577
ba1bf481 9578 full = space_info->full;
17d217fe 9579
ba1bf481
JB
9580 /*
9581 * if this is the last block group we have in this space, we can't
7ce618db
CM
9582 * relocate it unless we're able to allocate a new chunk below.
9583 *
9584 * Otherwise, we need to make sure we have room in the space to handle
9585 * all of the extents from this block group. If we can, we're good
ba1bf481 9586 */
7ce618db 9587 if ((space_info->total_bytes != block_group->key.offset) &&
4136135b
LB
9588 (btrfs_space_info_used(space_info, false) + min_free <
9589 space_info->total_bytes)) {
ba1bf481
JB
9590 spin_unlock(&space_info->lock);
9591 goto out;
17d217fe 9592 }
ba1bf481 9593 spin_unlock(&space_info->lock);
ea8c2819 9594
ba1bf481
JB
9595 /*
9596 * ok we don't have enough space, but maybe we have free space on our
9597 * devices to allocate new chunks for relocation, so loop through our
4a5e98f5
ID
9598 * alloc devices and guess if we have enough space. if this block
9599 * group is going to be restriped, run checks against the target
9600 * profile instead of the current one.
ba1bf481
JB
9601 */
9602 ret = -1;
ea8c2819 9603
cdcb725c 9604 /*
9605 * index:
9606 * 0: raid10
9607 * 1: raid1
9608 * 2: dup
9609 * 3: raid0
9610 * 4: single
9611 */
0b246afa 9612 target = get_restripe_target(fs_info, block_group->flags);
4a5e98f5 9613 if (target) {
31e50229 9614 index = __get_raid_index(extended_to_chunk(target));
4a5e98f5
ID
9615 } else {
9616 /*
9617 * this is just a balance, so if we were marked as full
9618 * we know there is no space for a new chunk
9619 */
0305bc27
QW
9620 if (full) {
9621 if (debug)
0b246afa
JM
9622 btrfs_warn(fs_info,
9623 "no space to alloc new chunk for block group %llu",
9624 block_group->key.objectid);
4a5e98f5 9625 goto out;
0305bc27 9626 }
4a5e98f5
ID
9627
9628 index = get_block_group_index(block_group);
9629 }
9630
e6ec716f 9631 if (index == BTRFS_RAID_RAID10) {
cdcb725c 9632 dev_min = 4;
6719db6a
JB
9633 /* Divide by 2 */
9634 min_free >>= 1;
e6ec716f 9635 } else if (index == BTRFS_RAID_RAID1) {
cdcb725c 9636 dev_min = 2;
e6ec716f 9637 } else if (index == BTRFS_RAID_DUP) {
6719db6a
JB
9638 /* Multiply by 2 */
9639 min_free <<= 1;
e6ec716f 9640 } else if (index == BTRFS_RAID_RAID0) {
cdcb725c 9641 dev_min = fs_devices->rw_devices;
47c5713f 9642 min_free = div64_u64(min_free, dev_min);
cdcb725c 9643 }
9644
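/*
 * Worked example: for a block group with 1GiB used, RAID10 needs four
 * devices with 512MiB free each, RAID1 needs two devices with 1GiB each,
 * DUP needs one device with 2GiB, and RAID0 spreads the 1GiB across all
 * writable devices.
 */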
6df9a95e
JB
9645 /* We need to do this so that we can look at pending chunks */
9646 trans = btrfs_join_transaction(root);
9647 if (IS_ERR(trans)) {
9648 ret = PTR_ERR(trans);
9649 goto out;
9650 }
9651
0b246afa 9652 mutex_lock(&fs_info->chunk_mutex);
ba1bf481 9653 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7bfc837d 9654 u64 dev_offset;
56bec294 9655
ba1bf481
JB
9656 /*
9657 * check to make sure we can actually find a chunk with enough
9658 * space to fit our block group in.
9659 */
63a212ab
SB
9660 if (device->total_bytes > device->bytes_used + min_free &&
9661 !device->is_tgtdev_for_dev_replace) {
6df9a95e 9662 ret = find_free_dev_extent(trans, device, min_free,
7bfc837d 9663 &dev_offset, NULL);
ba1bf481 9664 if (!ret)
cdcb725c 9665 dev_nr++;
9666
9667 if (dev_nr >= dev_min)
73e48b27 9668 break;
cdcb725c 9669
ba1bf481 9670 ret = -1;
725c8463 9671 }
edbd8d4e 9672 }
0305bc27 9673 if (debug && ret == -1)
0b246afa
JM
9674 btrfs_warn(fs_info,
9675 "no space to allocate a new chunk for block group %llu",
9676 block_group->key.objectid);
9677 mutex_unlock(&fs_info->chunk_mutex);
3a45bb20 9678 btrfs_end_transaction(trans);
edbd8d4e 9679out:
ba1bf481 9680 btrfs_put_block_group(block_group);
edbd8d4e
CM
9681 return ret;
9682}
9683
6bccf3ab
JM
9684static int find_first_block_group(struct btrfs_fs_info *fs_info,
9685 struct btrfs_path *path,
9686 struct btrfs_key *key)
0b86a832 9687{
6bccf3ab 9688 struct btrfs_root *root = fs_info->extent_root;
925baedd 9689 int ret = 0;
0b86a832
CM
9690 struct btrfs_key found_key;
9691 struct extent_buffer *leaf;
9692 int slot;
edbd8d4e 9693
0b86a832
CM
9694 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9695 if (ret < 0)
925baedd
CM
9696 goto out;
9697
d397712b 9698 while (1) {
0b86a832 9699 slot = path->slots[0];
edbd8d4e 9700 leaf = path->nodes[0];
0b86a832
CM
9701 if (slot >= btrfs_header_nritems(leaf)) {
9702 ret = btrfs_next_leaf(root, path);
9703 if (ret == 0)
9704 continue;
9705 if (ret < 0)
925baedd 9706 goto out;
0b86a832 9707 break;
edbd8d4e 9708 }
0b86a832 9709 btrfs_item_key_to_cpu(leaf, &found_key, slot);
edbd8d4e 9710
0b86a832 9711 if (found_key.objectid >= key->objectid &&
925baedd 9712 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
6fb37b75
LB
9713 struct extent_map_tree *em_tree;
9714 struct extent_map *em;
9715
9716 em_tree = &root->fs_info->mapping_tree.map_tree;
9717 read_lock(&em_tree->lock);
9718 em = lookup_extent_mapping(em_tree, found_key.objectid,
9719 found_key.offset);
9720 read_unlock(&em_tree->lock);
9721 if (!em) {
0b246afa 9722 btrfs_err(fs_info,
6fb37b75
LB
9723 "logical %llu len %llu found bg but no related chunk",
9724 found_key.objectid, found_key.offset);
9725 ret = -ENOENT;
9726 } else {
9727 ret = 0;
9728 }
187ee58c 9729 free_extent_map(em);
925baedd
CM
9730 goto out;
9731 }
0b86a832 9732 path->slots[0]++;
edbd8d4e 9733 }
925baedd 9734out:
0b86a832 9735 return ret;
edbd8d4e
CM
9736}
9737
0af3d00b
JB
9738void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9739{
9740 struct btrfs_block_group_cache *block_group;
9741 u64 last = 0;
9742
9743 while (1) {
9744 struct inode *inode;
9745
9746 block_group = btrfs_lookup_first_block_group(info, last);
9747 while (block_group) {
9748 spin_lock(&block_group->lock);
9749 if (block_group->iref)
9750 break;
9751 spin_unlock(&block_group->lock);
2ff7e61e 9752 block_group = next_block_group(info, block_group);
0af3d00b
JB
9753 }
9754 if (!block_group) {
9755 if (last == 0)
9756 break;
9757 last = 0;
9758 continue;
9759 }
9760
9761 inode = block_group->inode;
9762 block_group->iref = 0;
9763 block_group->inode = NULL;
9764 spin_unlock(&block_group->lock);
f3bca802 9765 ASSERT(block_group->io_ctl.inode == NULL);
0af3d00b
JB
9766 iput(inode);
9767 last = block_group->key.objectid + block_group->key.offset;
9768 btrfs_put_block_group(block_group);
9769 }
9770}
9771
5cdd7db6
FM
9772/*
9773 * Must be called only after stopping all workers, since we could have block
9774 * group caching kthreads running, and therefore they could race with us if we
9775 * freed the block groups before stopping them.
9776 */
1a40e23b
ZY
9777int btrfs_free_block_groups(struct btrfs_fs_info *info)
9778{
9779 struct btrfs_block_group_cache *block_group;
4184ea7f 9780 struct btrfs_space_info *space_info;
11833d66 9781 struct btrfs_caching_control *caching_ctl;
1a40e23b
ZY
9782 struct rb_node *n;
9783
9e351cc8 9784 down_write(&info->commit_root_sem);
11833d66
YZ
9785 while (!list_empty(&info->caching_block_groups)) {
9786 caching_ctl = list_entry(info->caching_block_groups.next,
9787 struct btrfs_caching_control, list);
9788 list_del(&caching_ctl->list);
9789 put_caching_control(caching_ctl);
9790 }
9e351cc8 9791 up_write(&info->commit_root_sem);
11833d66 9792
47ab2a6c
JB
9793 spin_lock(&info->unused_bgs_lock);
9794 while (!list_empty(&info->unused_bgs)) {
9795 block_group = list_first_entry(&info->unused_bgs,
9796 struct btrfs_block_group_cache,
9797 bg_list);
9798 list_del_init(&block_group->bg_list);
9799 btrfs_put_block_group(block_group);
9800 }
9801 spin_unlock(&info->unused_bgs_lock);
9802
1a40e23b
ZY
9803 spin_lock(&info->block_group_cache_lock);
9804 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9805 block_group = rb_entry(n, struct btrfs_block_group_cache,
9806 cache_node);
1a40e23b
ZY
9807 rb_erase(&block_group->cache_node,
9808 &info->block_group_cache_tree);
01eacb27 9809 RB_CLEAR_NODE(&block_group->cache_node);
d899e052
YZ
9810 spin_unlock(&info->block_group_cache_lock);
9811
80eb234a 9812 down_write(&block_group->space_info->groups_sem);
1a40e23b 9813 list_del(&block_group->list);
80eb234a 9814 up_write(&block_group->space_info->groups_sem);
d2fb3437 9815
3c14874a
JB
9816 /*
9817 * We haven't cached this block group, which means we could
9818 * possibly have excluded extents on this block group.
9819 */
36cce922
JB
9820 if (block_group->cached == BTRFS_CACHE_NO ||
9821 block_group->cached == BTRFS_CACHE_ERROR)
2ff7e61e 9822 free_excluded_extents(info, block_group);
3c14874a 9823
817d52f8 9824 btrfs_remove_free_space_cache(block_group);
5cdd7db6 9825 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
f3bca802
LB
9826 ASSERT(list_empty(&block_group->dirty_list));
9827 ASSERT(list_empty(&block_group->io_list));
9828 ASSERT(list_empty(&block_group->bg_list));
9829 ASSERT(atomic_read(&block_group->count) == 1);
11dfe35a 9830 btrfs_put_block_group(block_group);
d899e052
YZ
9831
9832 spin_lock(&info->block_group_cache_lock);
1a40e23b
ZY
9833 }
9834 spin_unlock(&info->block_group_cache_lock);
4184ea7f
CM
9835
9836 /* now that all the block groups are freed, go through and
9837 * free all the space_info structs. This is only called during
9838 * the final stages of unmount, and so we know nobody is
9839 * using them. We call synchronize_rcu() once before we start,
9840 * just to be on the safe side.
9841 */
9842 synchronize_rcu();
9843
8929ecfa
YZ
9844 release_global_block_rsv(info);
9845
67871254 9846 while (!list_empty(&info->space_info)) {
6ab0a202
JM
9847 int i;
9848
4184ea7f
CM
9849 space_info = list_entry(info->space_info.next,
9850 struct btrfs_space_info,
9851 list);
d555b6c3
JB
9852
9853 /*
9854 * Do not hide this behind enospc_debug; this is actually
9855 * important and indicates a real bug if this happens.
9856 */
9857 if (WARN_ON(space_info->bytes_pinned > 0 ||
b069e0c3 9858 space_info->bytes_reserved > 0 ||
d555b6c3 9859 space_info->bytes_may_use > 0))
ab8d0fc4 9860 dump_space_info(info, space_info, 0, 0);
4184ea7f 9861 list_del(&space_info->list);
6ab0a202
JM
9862 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9863 struct kobject *kobj;
c1895442
JM
9864 kobj = space_info->block_group_kobjs[i];
9865 space_info->block_group_kobjs[i] = NULL;
9866 if (kobj) {
6ab0a202
JM
9867 kobject_del(kobj);
9868 kobject_put(kobj);
9869 }
9870 }
9871 kobject_del(&space_info->kobj);
9872 kobject_put(&space_info->kobj);
4184ea7f 9873 }
1a40e23b
ZY
9874 return 0;
9875}
9876
b742bb82
YZ
9877static void __link_block_group(struct btrfs_space_info *space_info,
9878 struct btrfs_block_group_cache *cache)
9879{
9880 int index = get_block_group_index(cache);
ed55b6ac 9881 bool first = false;
b742bb82
YZ
9882
9883 down_write(&space_info->groups_sem);
ed55b6ac
JM
9884 if (list_empty(&space_info->block_groups[index]))
9885 first = true;
9886 list_add_tail(&cache->list, &space_info->block_groups[index]);
9887 up_write(&space_info->groups_sem);
9888
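	/*
	 * Illustrative note: the first block group of a given raid profile in
	 * this space_info gets its own sysfs kobject below; on a typical
	 * system it would show up under a path like
	 * /sys/fs/btrfs/<fsid>/allocation/<type>/<raid profile> (path layout
	 * assumed here for illustration only).
	 */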
9889 if (first) {
c1895442 9890 struct raid_kobject *rkobj;
6ab0a202
JM
9891 int ret;
9892
c1895442
JM
9893 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
9894 if (!rkobj)
9895 goto out_err;
9896 rkobj->raid_type = index;
9897 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
9898 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
9899 "%s", get_raid_name(index));
6ab0a202 9900 if (ret) {
c1895442
JM
9901 kobject_put(&rkobj->kobj);
9902 goto out_err;
6ab0a202 9903 }
c1895442 9904 space_info->block_group_kobjs[index] = &rkobj->kobj;
6ab0a202 9905 }
c1895442
JM
9906
9907 return;
9908out_err:
ab8d0fc4
JM
9909 btrfs_warn(cache->fs_info,
9910 "failed to add kobject for block cache, ignoring");
b742bb82
YZ
9911}
9912
920e4a58 9913static struct btrfs_block_group_cache *
2ff7e61e
JM
9914btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
9915 u64 start, u64 size)
920e4a58
MX
9916{
9917 struct btrfs_block_group_cache *cache;
9918
9919 cache = kzalloc(sizeof(*cache), GFP_NOFS);
9920 if (!cache)
9921 return NULL;
9922
9923 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
9924 GFP_NOFS);
9925 if (!cache->free_space_ctl) {
9926 kfree(cache);
9927 return NULL;
9928 }
9929
9930 cache->key.objectid = start;
9931 cache->key.offset = size;
9932 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9933
0b246afa
JM
9934 cache->sectorsize = fs_info->sectorsize;
9935 cache->fs_info = fs_info;
2ff7e61e
JM
9936 cache->full_stripe_len = btrfs_full_stripe_len(fs_info,
9937 &fs_info->mapping_tree,
9938 start);
1e144fb8
OS
9939 set_free_space_tree_thresholds(cache);
9940
920e4a58
MX
9941 atomic_set(&cache->count, 1);
9942 spin_lock_init(&cache->lock);
e570fd27 9943 init_rwsem(&cache->data_rwsem);
920e4a58
MX
9944 INIT_LIST_HEAD(&cache->list);
9945 INIT_LIST_HEAD(&cache->cluster_list);
47ab2a6c 9946 INIT_LIST_HEAD(&cache->bg_list);
633c0aad 9947 INIT_LIST_HEAD(&cache->ro_list);
ce93ec54 9948 INIT_LIST_HEAD(&cache->dirty_list);
c9dc4c65 9949 INIT_LIST_HEAD(&cache->io_list);
920e4a58 9950 btrfs_init_free_space_ctl(cache);
04216820 9951 atomic_set(&cache->trimming, 0);
a5ed9182 9952 mutex_init(&cache->free_space_lock);
0966a7b1 9953 btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
920e4a58
MX
9954
9955 return cache;
9956}
9957
5b4aacef 9958int btrfs_read_block_groups(struct btrfs_fs_info *info)
9078a3e1
CM
9959{
9960 struct btrfs_path *path;
9961 int ret;
9078a3e1 9962 struct btrfs_block_group_cache *cache;
6324fbf3 9963 struct btrfs_space_info *space_info;
9078a3e1
CM
9964 struct btrfs_key key;
9965 struct btrfs_key found_key;
5f39d397 9966 struct extent_buffer *leaf;
0af3d00b
JB
9967 int need_clear = 0;
9968 u64 cache_gen;
49303381
LB
9969 u64 feature;
9970 int mixed;
9971
9972 feature = btrfs_super_incompat_flags(info->super_copy);
9973 mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
96b5179d 9974
9078a3e1 9975 key.objectid = 0;
0b86a832 9976 key.offset = 0;
962a298f 9977 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9078a3e1
CM
9978 path = btrfs_alloc_path();
9979 if (!path)
9980 return -ENOMEM;
e4058b54 9981 path->reada = READA_FORWARD;
9078a3e1 9982
0b246afa
JM
9983 cache_gen = btrfs_super_cache_generation(info->super_copy);
9984 if (btrfs_test_opt(info, SPACE_CACHE) &&
9985 btrfs_super_generation(info->super_copy) != cache_gen)
0af3d00b 9986 need_clear = 1;
0b246afa 9987 if (btrfs_test_opt(info, CLEAR_CACHE))
88c2ba3b 9988 need_clear = 1;
0af3d00b 9989
d397712b 9990 while (1) {
6bccf3ab 9991 ret = find_first_block_group(info, path, &key);
b742bb82
YZ
9992 if (ret > 0)
9993 break;
0b86a832
CM
9994 if (ret != 0)
9995 goto error;
920e4a58 9996
5f39d397
CM
9997 leaf = path->nodes[0];
9998 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
920e4a58 9999
2ff7e61e 10000 cache = btrfs_create_block_group_cache(info, found_key.objectid,
920e4a58 10001 found_key.offset);
9078a3e1 10002 if (!cache) {
0b86a832 10003 ret = -ENOMEM;
f0486c68 10004 goto error;
9078a3e1 10005 }
96303081 10006
cf7c1ef6
LB
10007 if (need_clear) {
10008 /*
10009 * When we mount with old space cache, we need to
10010 * set BTRFS_DC_CLEAR and set dirty flag.
10011 *
10012 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10013 * truncate the old free space cache inode and
10014 * setup a new one.
10015 * b) Setting 'dirty flag' makes sure that we flush
10016 * the new space cache info onto disk.
10017 */
0b246afa 10018 if (btrfs_test_opt(info, SPACE_CACHE))
ce93ec54 10019 cache->disk_cache_state = BTRFS_DC_CLEAR;
cf7c1ef6 10020 }
0af3d00b 10021
5f39d397
CM
10022 read_extent_buffer(leaf, &cache->item,
10023 btrfs_item_ptr_offset(leaf, path->slots[0]),
10024 sizeof(cache->item));
920e4a58 10025 cache->flags = btrfs_block_group_flags(&cache->item);
49303381
LB
10026 if (!mixed &&
10027 ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10028 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10029 btrfs_err(info,
10030"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10031 cache->key.objectid);
10032 ret = -EINVAL;
10033 goto error;
10034 }
0b86a832 10035
9078a3e1 10036 key.objectid = found_key.objectid + found_key.offset;
b3b4aa74 10037 btrfs_release_path(path);
34d52cb6 10038
3c14874a
JB
10039 /*
10040 * We need to exclude the super stripes now so that the space
10041 * info has super bytes accounted for; otherwise we'll think
10042 * we have more space than we actually do.
10043 */
2ff7e61e 10044 ret = exclude_super_stripes(info, cache);
835d974f
JB
10045 if (ret) {
10046 /*
10047 * We may have excluded something, so call this just in
10048 * case.
10049 */
2ff7e61e 10050 free_excluded_extents(info, cache);
920e4a58 10051 btrfs_put_block_group(cache);
835d974f
JB
10052 goto error;
10053 }
3c14874a 10054
817d52f8
JB
10055 /*
10056 * check for two cases, either we are full, and therefore
10057 * don't need to bother with the caching work since we won't
10058 * find any space, or we are empty, and we can just add all
10059 * the space in and be done with it. This saves us a lot of
10060 * time, particularly in the full case.
10061 */
10062 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
11833d66 10063 cache->last_byte_to_unpin = (u64)-1;
817d52f8 10064 cache->cached = BTRFS_CACHE_FINISHED;
2ff7e61e 10065 free_excluded_extents(info, cache);
817d52f8 10066 } else if (btrfs_block_group_used(&cache->item) == 0) {
11833d66 10067 cache->last_byte_to_unpin = (u64)-1;
817d52f8 10068 cache->cached = BTRFS_CACHE_FINISHED;
0b246afa 10069 add_new_free_space(cache, info,
817d52f8
JB
10070 found_key.objectid,
10071 found_key.objectid +
10072 found_key.offset);
2ff7e61e 10073 free_excluded_extents(info, cache);
817d52f8 10074 }
96b5179d 10075
0b246afa 10076 ret = btrfs_add_block_group_cache(info, cache);
8c579fe7
JB
10077 if (ret) {
10078 btrfs_remove_free_space_cache(cache);
10079 btrfs_put_block_group(cache);
10080 goto error;
10081 }
10082
0b246afa 10083 trace_btrfs_add_block_group(info, cache, 0);
6324fbf3
CM
10084 ret = update_space_info(info, cache->flags, found_key.offset,
10085 btrfs_block_group_used(&cache->item),
e40edf2d 10086 cache->bytes_super, &space_info);
8c579fe7
JB
10087 if (ret) {
10088 btrfs_remove_free_space_cache(cache);
10089 spin_lock(&info->block_group_cache_lock);
10090 rb_erase(&cache->cache_node,
10091 &info->block_group_cache_tree);
01eacb27 10092 RB_CLEAR_NODE(&cache->cache_node);
8c579fe7
JB
10093 spin_unlock(&info->block_group_cache_lock);
10094 btrfs_put_block_group(cache);
10095 goto error;
10096 }
10097
6324fbf3 10098 cache->space_info = space_info;
1b2da372 10099
b742bb82 10100 __link_block_group(space_info, cache);
0f9dd46c 10101
0b246afa 10102 set_avail_alloc_bits(info, cache->flags);
2ff7e61e 10103 if (btrfs_chunk_readonly(info, cache->key.objectid)) {
868f401a 10104 inc_block_group_ro(cache, 1);
47ab2a6c
JB
10105 } else if (btrfs_block_group_used(&cache->item) == 0) {
10106 spin_lock(&info->unused_bgs_lock);
10107 /* Should always be true but just in case. */
10108 if (list_empty(&cache->bg_list)) {
10109 btrfs_get_block_group(cache);
10110 list_add_tail(&cache->bg_list,
10111 &info->unused_bgs);
10112 }
10113 spin_unlock(&info->unused_bgs_lock);
10114 }
9078a3e1 10115 }
b742bb82 10116
0b246afa 10117 list_for_each_entry_rcu(space_info, &info->space_info, list) {
2ff7e61e 10118 if (!(get_alloc_profile(info, space_info->flags) &
b742bb82
YZ
10119 (BTRFS_BLOCK_GROUP_RAID10 |
10120 BTRFS_BLOCK_GROUP_RAID1 |
53b381b3
DW
10121 BTRFS_BLOCK_GROUP_RAID5 |
10122 BTRFS_BLOCK_GROUP_RAID6 |
b742bb82
YZ
10123 BTRFS_BLOCK_GROUP_DUP)))
10124 continue;
10125 /*
10126 * avoid allocating from un-mirrored block groups if there are
10127 * mirrored block groups.
10128 */
1095cc0d 10129 list_for_each_entry(cache,
10130 &space_info->block_groups[BTRFS_RAID_RAID0],
10131 list)
868f401a 10132 inc_block_group_ro(cache, 1);
1095cc0d 10133 list_for_each_entry(cache,
10134 &space_info->block_groups[BTRFS_RAID_SINGLE],
10135 list)
868f401a 10136 inc_block_group_ro(cache, 1);
9078a3e1 10137 }
f0486c68
YZ
10138
10139 init_global_block_rsv(info);
0b86a832
CM
10140 ret = 0;
10141error:
9078a3e1 10142 btrfs_free_path(path);
0b86a832 10143 return ret;
9078a3e1 10144}
6324fbf3 10145
ea658bad 10146void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
2ff7e61e 10147 struct btrfs_fs_info *fs_info)
ea658bad
JB
10148{
10149 struct btrfs_block_group_cache *block_group, *tmp;
0b246afa 10150 struct btrfs_root *extent_root = fs_info->extent_root;
ea658bad
JB
10151 struct btrfs_block_group_item item;
10152 struct btrfs_key key;
10153 int ret = 0;
d9a0540a 10154 bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
ea658bad 10155
d9a0540a 10156 trans->can_flush_pending_bgs = false;
47ab2a6c 10157 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
ea658bad 10158 if (ret)
c92f6be3 10159 goto next;
ea658bad
JB
10160
10161 spin_lock(&block_group->lock);
10162 memcpy(&item, &block_group->item, sizeof(item));
10163 memcpy(&key, &block_group->key, sizeof(key));
10164 spin_unlock(&block_group->lock);
10165
10166 ret = btrfs_insert_item(trans, extent_root, &key, &item,
10167 sizeof(item));
10168 if (ret)
66642832 10169 btrfs_abort_transaction(trans, ret);
0b246afa
JM
10170 ret = btrfs_finish_chunk_alloc(trans, fs_info, key.objectid,
10171 key.offset);
6df9a95e 10172 if (ret)
66642832 10173 btrfs_abort_transaction(trans, ret);
0b246afa 10174 add_block_group_free_space(trans, fs_info, block_group);
1e144fb8 10175 /* already aborted the transaction if it failed. */
c92f6be3
FM
10176next:
10177 list_del_init(&block_group->bg_list);
ea658bad 10178 }
d9a0540a 10179 trans->can_flush_pending_bgs = can_flush_pending_bgs;
ea658bad
JB
10180}
10181
6324fbf3 10182int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2ff7e61e 10183 struct btrfs_fs_info *fs_info, u64 bytes_used,
e17cade2 10184 u64 type, u64 chunk_objectid, u64 chunk_offset,
6324fbf3
CM
10185 u64 size)
10186{
6324fbf3 10187 struct btrfs_block_group_cache *cache;
0b246afa 10188 int ret;
6324fbf3 10189
0b246afa 10190 btrfs_set_log_full_commit(fs_info, trans);
e02119d5 10191
2ff7e61e 10192 cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
0f9dd46c
JB
10193 if (!cache)
10194 return -ENOMEM;
34d52cb6 10195
6324fbf3 10196 btrfs_set_block_group_used(&cache->item, bytes_used);
6324fbf3 10197 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
6324fbf3
CM
10198 btrfs_set_block_group_flags(&cache->item, type);
10199
920e4a58 10200 cache->flags = type;
11833d66 10201 cache->last_byte_to_unpin = (u64)-1;
817d52f8 10202 cache->cached = BTRFS_CACHE_FINISHED;
1e144fb8 10203 cache->needs_free_space = 1;
2ff7e61e 10204 ret = exclude_super_stripes(fs_info, cache);
835d974f
JB
10205 if (ret) {
10206 /*
10207 * We may have excluded something, so call this just in
10208 * case.
10209 */
2ff7e61e 10210 free_excluded_extents(fs_info, cache);
920e4a58 10211 btrfs_put_block_group(cache);
835d974f
JB
10212 return ret;
10213 }
96303081 10214
0b246afa 10215 add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size);
817d52f8 10216
2ff7e61e 10217 free_excluded_extents(fs_info, cache);
11833d66 10218
d0bd4560 10219#ifdef CONFIG_BTRFS_DEBUG
2ff7e61e 10220 if (btrfs_should_fragment_free_space(cache)) {
d0bd4560
JB
10221 u64 new_bytes_used = size - bytes_used;
10222
10223 bytes_used += new_bytes_used >> 1;
2ff7e61e 10224 fragment_free_space(cache);
d0bd4560
JB
10225 }
10226#endif
2e6e5183
FM
10227 /*
10228 * Call to ensure the corresponding space_info object is created and
10229 * assigned to our block group, but don't update its counters just yet.
10230 * We want our bg to be added to the rbtree with its ->space_info set.
10231 */
0b246afa 10232 ret = update_space_info(fs_info, cache->flags, 0, 0, 0,
2e6e5183
FM
10233 &cache->space_info);
10234 if (ret) {
10235 btrfs_remove_free_space_cache(cache);
10236 btrfs_put_block_group(cache);
10237 return ret;
10238 }
10239
0b246afa 10240 ret = btrfs_add_block_group_cache(fs_info, cache);
8c579fe7
JB
10241 if (ret) {
10242 btrfs_remove_free_space_cache(cache);
10243 btrfs_put_block_group(cache);
10244 return ret;
10245 }
10246
2e6e5183
FM
10247 /*
10248 * Now that our block group has its ->space_info set and is inserted in
10249 * the rbtree, update the space info's counters.
10250 */
0b246afa
JM
10251 trace_btrfs_add_block_group(fs_info, cache, 1);
10252 ret = update_space_info(fs_info, cache->flags, size, bytes_used,
e40edf2d 10253 cache->bytes_super, &cache->space_info);
8c579fe7
JB
10254 if (ret) {
10255 btrfs_remove_free_space_cache(cache);
0b246afa 10256 spin_lock(&fs_info->block_group_cache_lock);
8c579fe7 10257 rb_erase(&cache->cache_node,
0b246afa 10258 &fs_info->block_group_cache_tree);
01eacb27 10259 RB_CLEAR_NODE(&cache->cache_node);
0b246afa 10260 spin_unlock(&fs_info->block_group_cache_lock);
8c579fe7
JB
10261 btrfs_put_block_group(cache);
10262 return ret;
10263 }
0b246afa 10264 update_global_block_rsv(fs_info);
1b2da372 10265
b742bb82 10266 __link_block_group(cache->space_info, cache);
6324fbf3 10267
47ab2a6c 10268 list_add_tail(&cache->bg_list, &trans->new_bgs);
6324fbf3 10269
0b246afa 10270 set_avail_alloc_bits(fs_info, type);
6324fbf3
CM
10271 return 0;
10272}
1a40e23b 10273
10ea00f5
ID
10274static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10275{
899c81ea
ID
10276 u64 extra_flags = chunk_to_extended(flags) &
10277 BTRFS_EXTENDED_PROFILE_MASK;
10ea00f5 10278
de98ced9 10279 write_seqlock(&fs_info->profiles_lock);
10ea00f5
ID
10280 if (flags & BTRFS_BLOCK_GROUP_DATA)
10281 fs_info->avail_data_alloc_bits &= ~extra_flags;
10282 if (flags & BTRFS_BLOCK_GROUP_METADATA)
10283 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10284 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10285 fs_info->avail_system_alloc_bits &= ~extra_flags;
de98ced9 10286 write_sequnlock(&fs_info->profiles_lock);
10ea00f5
ID
10287}
10288
1a40e23b 10289int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6bccf3ab 10290 struct btrfs_fs_info *fs_info, u64 group_start,
04216820 10291 struct extent_map *em)
1a40e23b 10292{
6bccf3ab 10293 struct btrfs_root *root = fs_info->extent_root;
1a40e23b
ZY
10294 struct btrfs_path *path;
10295 struct btrfs_block_group_cache *block_group;
44fb5511 10296 struct btrfs_free_cluster *cluster;
0b246afa 10297 struct btrfs_root *tree_root = fs_info->tree_root;
1a40e23b 10298 struct btrfs_key key;
0af3d00b 10299 struct inode *inode;
c1895442 10300 struct kobject *kobj = NULL;
1a40e23b 10301 int ret;
10ea00f5 10302 int index;
89a55897 10303 int factor;
4f69cb98 10304 struct btrfs_caching_control *caching_ctl = NULL;
04216820 10305 bool remove_em;
1a40e23b 10306
6bccf3ab 10307 block_group = btrfs_lookup_block_group(fs_info, group_start);
1a40e23b 10308 BUG_ON(!block_group);
c146afad 10309 BUG_ON(!block_group->ro);
1a40e23b 10310
9f7c43c9 10311 /*
10312 * Free the reserved super bytes from this block group before
10313 * remove it.
10314 */
2ff7e61e 10315 free_excluded_extents(fs_info, block_group);
9f7c43c9 10316
1a40e23b 10317 memcpy(&key, &block_group->key, sizeof(key));
10ea00f5 10318 index = get_block_group_index(block_group);
89a55897
JB
10319 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
10320 BTRFS_BLOCK_GROUP_RAID1 |
10321 BTRFS_BLOCK_GROUP_RAID10))
10322 factor = 2;
10323 else
10324 factor = 1;
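	/*
	 * For example, a 1GiB RAID1 block group occupies 2GiB of raw device
	 * space (factor == 2), so the space_info->disk_total adjustment
	 * further below scales key.offset by this factor.
	 */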
1a40e23b 10325
44fb5511 10326 /* make sure this block group isn't part of an allocation cluster */
0b246afa 10327 cluster = &fs_info->data_alloc_cluster;
44fb5511
CM
10328 spin_lock(&cluster->refill_lock);
10329 btrfs_return_cluster_to_free_space(block_group, cluster);
10330 spin_unlock(&cluster->refill_lock);
10331
10332 /*
10333 * make sure this block group isn't part of a metadata
10334 * allocation cluster
10335 */
0b246afa 10336 cluster = &fs_info->meta_alloc_cluster;
44fb5511
CM
10337 spin_lock(&cluster->refill_lock);
10338 btrfs_return_cluster_to_free_space(block_group, cluster);
10339 spin_unlock(&cluster->refill_lock);
10340
1a40e23b 10341 path = btrfs_alloc_path();
d8926bb3
MF
10342 if (!path) {
10343 ret = -ENOMEM;
10344 goto out;
10345 }
1a40e23b 10346
1bbc621e
CM
10347 /*
10348 * get the inode first so any iput calls done for the io_list
10349 * aren't the final iput (no unlinks allowed now)
10350 */
77ab86bf 10351 inode = lookup_free_space_inode(fs_info, block_group, path);
1bbc621e
CM
10352
10353 mutex_lock(&trans->transaction->cache_write_mutex);
10354 /*
10355 * make sure our free space cache IO is done before removing the
10356 * free space inode
10357 */
10358 spin_lock(&trans->transaction->dirty_bgs_lock);
10359 if (!list_empty(&block_group->io_list)) {
10360 list_del_init(&block_group->io_list);
10361
10362 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10363
10364 spin_unlock(&trans->transaction->dirty_bgs_lock);
afdb5718 10365 btrfs_wait_cache_io(trans, block_group, path);
1bbc621e
CM
10366 btrfs_put_block_group(block_group);
10367 spin_lock(&trans->transaction->dirty_bgs_lock);
10368 }
10369
10370 if (!list_empty(&block_group->dirty_list)) {
10371 list_del_init(&block_group->dirty_list);
10372 btrfs_put_block_group(block_group);
10373 }
10374 spin_unlock(&trans->transaction->dirty_bgs_lock);
10375 mutex_unlock(&trans->transaction->cache_write_mutex);
10376
0af3d00b 10377 if (!IS_ERR(inode)) {
73f2e545 10378 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
79787eaa
JM
10379 if (ret) {
10380 btrfs_add_delayed_iput(inode);
10381 goto out;
10382 }
0af3d00b
JB
10383 clear_nlink(inode);
10384 /* One for the block groups ref */
10385 spin_lock(&block_group->lock);
10386 if (block_group->iref) {
10387 block_group->iref = 0;
10388 block_group->inode = NULL;
10389 spin_unlock(&block_group->lock);
10390 iput(inode);
10391 } else {
10392 spin_unlock(&block_group->lock);
10393 }
10394 /* One for our lookup ref */
455757c3 10395 btrfs_add_delayed_iput(inode);
0af3d00b
JB
10396 }
10397
10398 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10399 key.offset = block_group->key.objectid;
10400 key.type = 0;
10401
10402 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10403 if (ret < 0)
10404 goto out;
10405 if (ret > 0)
b3b4aa74 10406 btrfs_release_path(path);
0af3d00b
JB
10407 if (ret == 0) {
10408 ret = btrfs_del_item(trans, tree_root, path);
10409 if (ret)
10410 goto out;
b3b4aa74 10411 btrfs_release_path(path);
0af3d00b
JB
10412 }
10413
0b246afa 10414 spin_lock(&fs_info->block_group_cache_lock);
1a40e23b 10415 rb_erase(&block_group->cache_node,
0b246afa 10416 &fs_info->block_group_cache_tree);
292cbd51 10417 RB_CLEAR_NODE(&block_group->cache_node);
a1897fdd 10418
0b246afa
JM
10419 if (fs_info->first_logical_byte == block_group->key.objectid)
10420 fs_info->first_logical_byte = (u64)-1;
10421 spin_unlock(&fs_info->block_group_cache_lock);
817d52f8 10422
80eb234a 10423 down_write(&block_group->space_info->groups_sem);
44fb5511
CM
10424 /*
10425 * we must use list_del_init so people can check to see if they
10426 * are still on the list after taking the semaphore
10427 */
10428 list_del_init(&block_group->list);
6ab0a202 10429 if (list_empty(&block_group->space_info->block_groups[index])) {
c1895442
JM
10430 kobj = block_group->space_info->block_group_kobjs[index];
10431 block_group->space_info->block_group_kobjs[index] = NULL;
0b246afa 10432 clear_avail_alloc_bits(fs_info, block_group->flags);
6ab0a202 10433 }
80eb234a 10434 up_write(&block_group->space_info->groups_sem);
c1895442
JM
10435 if (kobj) {
10436 kobject_del(kobj);
10437 kobject_put(kobj);
10438 }
1a40e23b 10439
4f69cb98
FM
10440 if (block_group->has_caching_ctl)
10441 caching_ctl = get_caching_control(block_group);
817d52f8 10442 if (block_group->cached == BTRFS_CACHE_STARTED)
11833d66 10443 wait_block_group_cache_done(block_group);
4f69cb98 10444 if (block_group->has_caching_ctl) {
0b246afa 10445 down_write(&fs_info->commit_root_sem);
4f69cb98
FM
10446 if (!caching_ctl) {
10447 struct btrfs_caching_control *ctl;
10448
10449 list_for_each_entry(ctl,
0b246afa 10450 &fs_info->caching_block_groups, list)
4f69cb98
FM
10451 if (ctl->block_group == block_group) {
10452 caching_ctl = ctl;
1e4f4714 10453 refcount_inc(&caching_ctl->count);
4f69cb98
FM
10454 break;
10455 }
10456 }
10457 if (caching_ctl)
10458 list_del_init(&caching_ctl->list);
0b246afa 10459 up_write(&fs_info->commit_root_sem);
4f69cb98
FM
10460 if (caching_ctl) {
10461 /* Once for the caching bgs list and once for us. */
10462 put_caching_control(caching_ctl);
10463 put_caching_control(caching_ctl);
10464 }
10465 }
817d52f8 10466
ce93ec54
JB
10467 spin_lock(&trans->transaction->dirty_bgs_lock);
10468 if (!list_empty(&block_group->dirty_list)) {
1bbc621e
CM
10469 WARN_ON(1);
10470 }
10471 if (!list_empty(&block_group->io_list)) {
10472 WARN_ON(1);
ce93ec54
JB
10473 }
10474 spin_unlock(&trans->transaction->dirty_bgs_lock);
817d52f8
JB
10475 btrfs_remove_free_space_cache(block_group);
10476
c146afad 10477 spin_lock(&block_group->space_info->lock);
75c68e9f 10478 list_del_init(&block_group->ro_list);
18d018ad 10479
0b246afa 10480 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
18d018ad
ZL
10481 WARN_ON(block_group->space_info->total_bytes
10482 < block_group->key.offset);
10483 WARN_ON(block_group->space_info->bytes_readonly
10484 < block_group->key.offset);
10485 WARN_ON(block_group->space_info->disk_total
10486 < block_group->key.offset * factor);
10487 }
c146afad
YZ
10488 block_group->space_info->total_bytes -= block_group->key.offset;
10489 block_group->space_info->bytes_readonly -= block_group->key.offset;
89a55897 10490 block_group->space_info->disk_total -= block_group->key.offset * factor;
18d018ad 10491
c146afad 10492 spin_unlock(&block_group->space_info->lock);
283bb197 10493
0af3d00b
JB
10494 memcpy(&key, &block_group->key, sizeof(key));
10495
34441361 10496 mutex_lock(&fs_info->chunk_mutex);
495e64f4
FM
10497 if (!list_empty(&em->list)) {
10498 /* We're in the transaction->pending_chunks list. */
10499 free_extent_map(em);
10500 }
04216820
FM
10501 spin_lock(&block_group->lock);
10502 block_group->removed = 1;
10503 /*
10504 * At this point trimming can't start on this block group, because we
10505 * removed the block group from the tree fs_info->block_group_cache_tree
10506 * so no one can find it anymore, and even if someone already got this
10507 * block group before we removed it from the rbtree, they have already
10508 * incremented block_group->trimming - if they didn't, they won't find
10509 * any free space entries because we already removed them all when we
10510 * called btrfs_remove_free_space_cache().
10511 *
10512 * And we must not remove the extent map from the fs_info->mapping_tree
10513 * to prevent the same logical address range and physical device space
10514 * ranges from being reused for a new block group. This is because our
10515 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10516 * completely transactionless, so while it is trimming a range the
10517 * currently running transaction might finish and a new one start,
10518 * allowing for new block groups to be created that can reuse the same
10519 * physical device locations unless we take this special care.
e33e17ee
JM
10520 *
10521 * There may also be an implicit trim operation if the file system
10522 * is mounted with -odiscard. The same protections must remain
10523 * in place until the extents have been discarded completely when
10524 * the transaction commit has completed.
04216820
FM
10525 */
10526 remove_em = (atomic_read(&block_group->trimming) == 0);
10527 /*
10528 * Make sure a trimmer task always sees the em in the pinned_chunks list
10529 * if it sees block_group->removed == 1 (needs to lock block_group->lock
10530 * before checking block_group->removed).
10531 */
10532 if (!remove_em) {
10533 /*
10534 * Our em might be in trans->transaction->pending_chunks which
10535 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10536 * and so is the fs_info->pinned_chunks list.
10537 *
10538 * So at this point we must be holding the chunk_mutex to avoid
10539 * any races with chunk allocation (more specifically at
10540 * volumes.c:contains_pending_extent()), to ensure it always
10541 * sees the em, either in the pending_chunks list or in the
10542 * pinned_chunks list.
10543 */
0b246afa 10544 list_move_tail(&em->list, &fs_info->pinned_chunks);
04216820
FM
10545 }
10546 spin_unlock(&block_group->lock);
04216820
FM
10547
10548 if (remove_em) {
10549 struct extent_map_tree *em_tree;
10550
0b246afa 10551 em_tree = &fs_info->mapping_tree.map_tree;
04216820 10552 write_lock(&em_tree->lock);
8dbcd10f
FM
10553 /*
10554 * The em might be in the pending_chunks list, so make sure the
10555 * chunk mutex is locked, since remove_extent_mapping() will
10556 * delete us from that list.
10557 */
04216820
FM
10558 remove_extent_mapping(em_tree, em);
10559 write_unlock(&em_tree->lock);
10560 /* once for the tree */
10561 free_extent_map(em);
10562 }
10563
34441361 10564 mutex_unlock(&fs_info->chunk_mutex);
8dbcd10f 10565
0b246afa 10566 ret = remove_block_group_free_space(trans, fs_info, block_group);
1e144fb8
OS
10567 if (ret)
10568 goto out;
10569
fa9c0d79
CM
10570 btrfs_put_block_group(block_group);
10571 btrfs_put_block_group(block_group);
1a40e23b
ZY
10572
10573 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10574 if (ret > 0)
10575 ret = -EIO;
10576 if (ret < 0)
10577 goto out;
10578
10579 ret = btrfs_del_item(trans, root, path);
10580out:
10581 btrfs_free_path(path);
10582 return ret;
10583}
acce952b 10584
8eab77ff 10585struct btrfs_trans_handle *
7fd01182
FM
10586btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10587 const u64 chunk_offset)
8eab77ff 10588{
7fd01182
FM
10589 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10590 struct extent_map *em;
10591 struct map_lookup *map;
10592 unsigned int num_items;
10593
10594 read_lock(&em_tree->lock);
10595 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10596 read_unlock(&em_tree->lock);
10597 ASSERT(em && em->start == chunk_offset);
10598
8eab77ff 10599 /*
7fd01182
FM
10600 * We need to reserve 3 + N units from the metadata space info in order
10601 * to remove a block group (done at btrfs_remove_chunk() and at
10602 * btrfs_remove_block_group()), which are used for:
10603 *
8eab77ff
FM
10604 * 1 unit for adding the free space inode's orphan (located in the tree
10605 * of tree roots).
7fd01182
FM
10606 * 1 unit for deleting the block group item (located in the extent
10607 * tree).
10608 * 1 unit for deleting the free space item (located in tree of tree
10609 * roots).
10610 * N units for deleting N device extent items corresponding to each
10611 * stripe (located in the device tree).
10612 *
10613 * In order to remove a block group we also need to reserve units in the
10614 * system space info in order to update the chunk tree (update one or
10615 * more device items and remove one chunk item), but this is done at
10616 * btrfs_remove_chunk() through a call to check_system_chunk().
8eab77ff 10617 */
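	/*
	 * Worked example (illustrative): removing a block group whose chunk
	 * has two stripes (e.g. a RAID1 chunk across two devices) needs
	 * 3 + 2 = 5 metadata units reserved by the computation below.
	 */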
95617d69 10618 map = em->map_lookup;
7fd01182
FM
10619 num_items = 3 + map->num_stripes;
10620 free_extent_map(em);
10621
8eab77ff 10622 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
7fd01182 10623 num_items, 1);
8eab77ff
FM
10624}
10625
47ab2a6c
JB
10626/*
10627 * Process the unused_bgs list and remove any that don't have any allocated
10628 * space inside of them.
10629 */
10630void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10631{
10632 struct btrfs_block_group_cache *block_group;
10633 struct btrfs_space_info *space_info;
47ab2a6c
JB
10634 struct btrfs_trans_handle *trans;
10635 int ret = 0;
10636
afcdd129 10637 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
47ab2a6c
JB
10638 return;
10639
10640 spin_lock(&fs_info->unused_bgs_lock);
10641 while (!list_empty(&fs_info->unused_bgs)) {
10642 u64 start, end;
e33e17ee 10643 int trimming;
47ab2a6c
JB
10644
10645 block_group = list_first_entry(&fs_info->unused_bgs,
10646 struct btrfs_block_group_cache,
10647 bg_list);
47ab2a6c 10648 list_del_init(&block_group->bg_list);
aefbe9a6
ZL
10649
10650 space_info = block_group->space_info;
10651
47ab2a6c
JB
10652 if (ret || btrfs_mixed_space_info(space_info)) {
10653 btrfs_put_block_group(block_group);
10654 continue;
10655 }
10656 spin_unlock(&fs_info->unused_bgs_lock);
10657
d5f2e33b 10658 mutex_lock(&fs_info->delete_unused_bgs_mutex);
67c5e7d4 10659
47ab2a6c
JB
10660 /* Don't want to race with allocators so take the groups_sem */
10661 down_write(&space_info->groups_sem);
10662 spin_lock(&block_group->lock);
10663 if (block_group->reserved ||
10664 btrfs_block_group_used(&block_group->item) ||
19c4d2f9 10665 block_group->ro ||
aefbe9a6 10666 list_is_singular(&block_group->list)) {
47ab2a6c
JB
10667 /*
10668 * We want to bail if we made new allocations or have
10669 * outstanding allocations in this block group. We do
10670 * the ro check in case balance is currently acting on
10671 * this block group.
10672 */
10673 spin_unlock(&block_group->lock);
10674 up_write(&space_info->groups_sem);
10675 goto next;
10676 }
10677 spin_unlock(&block_group->lock);
10678
10679 /* We don't want to force the issue, only flip if it's ok. */
868f401a 10680 ret = inc_block_group_ro(block_group, 0);
47ab2a6c
JB
10681 up_write(&space_info->groups_sem);
10682 if (ret < 0) {
10683 ret = 0;
10684 goto next;
10685 }
10686
10687 /*
10688 * Want to do this before we do anything else so we can recover
10689 * properly if we fail to join the transaction.
10690 */
7fd01182
FM
10691 trans = btrfs_start_trans_remove_block_group(fs_info,
10692 block_group->key.objectid);
47ab2a6c 10693 if (IS_ERR(trans)) {
2ff7e61e 10694 btrfs_dec_block_group_ro(block_group);
47ab2a6c
JB
10695 ret = PTR_ERR(trans);
10696 goto next;
10697 }
10698
10699 /*
10700 * We could have pending pinned extents for this block group,
10701 * just delete them, we don't care about them anymore.
10702 */
10703 start = block_group->key.objectid;
10704 end = start + block_group->key.offset - 1;
d4b450cd
FM
10705 /*
10706 * Hold the unused_bg_unpin_mutex lock to avoid racing with
10707 * btrfs_finish_extent_commit(). If we are at transaction N,
10708 * another task might be running finish_extent_commit() for the
10709 * previous transaction N - 1, and have seen a range belonging
10710 * to the block group in freed_extents[] before we were able to
10711 * clear the whole block group range from freed_extents[]. This
10713 * means that task can look up the block group after we
10713 * unpinned it from freed_extents[] and removed it, leading to
10714 * a BUG_ON() at btrfs_unpin_extent_range().
10715 */
10716 mutex_lock(&fs_info->unused_bg_unpin_mutex);
758eb51e 10717 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
91166212 10718 EXTENT_DIRTY);
758eb51e 10719 if (ret) {
d4b450cd 10720 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2ff7e61e 10721 btrfs_dec_block_group_ro(block_group);
758eb51e
FM
10722 goto end_trans;
10723 }
10724 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
91166212 10725 EXTENT_DIRTY);
758eb51e 10726 if (ret) {
d4b450cd 10727 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2ff7e61e 10728 btrfs_dec_block_group_ro(block_group);
758eb51e
FM
10729 goto end_trans;
10730 }
d4b450cd 10731 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
47ab2a6c
JB
10732
10733 /* Reset pinned so btrfs_put_block_group doesn't complain */
c30666d4
ZL
10734 spin_lock(&space_info->lock);
10735 spin_lock(&block_group->lock);
10736
10737 space_info->bytes_pinned -= block_group->pinned;
10738 space_info->bytes_readonly += block_group->pinned;
10739 percpu_counter_add(&space_info->total_bytes_pinned,
10740 -block_group->pinned);
47ab2a6c
JB
10741 block_group->pinned = 0;
10742
c30666d4
ZL
10743 spin_unlock(&block_group->lock);
10744 spin_unlock(&space_info->lock);
10745
e33e17ee 10746 /* DISCARD can flip during remount */
0b246afa 10747 trimming = btrfs_test_opt(fs_info, DISCARD);
e33e17ee
JM
10748
10749 /* Implicit trim during transaction commit. */
10750 if (trimming)
10751 btrfs_get_block_group_trimming(block_group);
10752
47ab2a6c
JB
10753 /*
10754 * Btrfs_remove_chunk will abort the transaction if things go
10755 * horribly wrong.
10756 */
5b4aacef 10757 ret = btrfs_remove_chunk(trans, fs_info,
47ab2a6c 10758 block_group->key.objectid);
e33e17ee
JM
10759
10760 if (ret) {
10761 if (trimming)
10762 btrfs_put_block_group_trimming(block_group);
10763 goto end_trans;
10764 }
10765
10766 /*
10767 * If we're not mounted with -odiscard, we can just forget
10768 * about this block group. Otherwise we'll need to wait
10769 * until transaction commit to do the actual discard.
10770 */
10771 if (trimming) {
348a0013
FM
10772 spin_lock(&fs_info->unused_bgs_lock);
10773 /*
10774 * A concurrent scrub might have added us to the list
10775 * fs_info->unused_bgs, so use a list_move operation
10776 * to add the block group to the deleted_bgs list.
10777 */
e33e17ee
JM
10778 list_move(&block_group->bg_list,
10779 &trans->transaction->deleted_bgs);
348a0013 10780 spin_unlock(&fs_info->unused_bgs_lock);
e33e17ee
JM
10781 btrfs_get_block_group(block_group);
10782 }
758eb51e 10783end_trans:
3a45bb20 10784 btrfs_end_transaction(trans);
47ab2a6c 10785next:
d5f2e33b 10786 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
47ab2a6c
JB
10787 btrfs_put_block_group(block_group);
10788 spin_lock(&fs_info->unused_bgs_lock);
10789 }
10790 spin_unlock(&fs_info->unused_bgs_lock);
10791}
10792
c59021f8 10793int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10794{
10795 struct btrfs_space_info *space_info;
1aba86d6 10796 struct btrfs_super_block *disk_super;
10797 u64 features;
10798 u64 flags;
10799 int mixed = 0;
c59021f8 10800 int ret;
10801
6c41761f 10802 disk_super = fs_info->super_copy;
1aba86d6 10803 if (!btrfs_super_root(disk_super))
0dc924c5 10804 return -EINVAL;
c59021f8 10805
1aba86d6 10806 features = btrfs_super_incompat_flags(disk_super);
10807 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10808 mixed = 1;
c59021f8 10809
1aba86d6 10810 flags = BTRFS_BLOCK_GROUP_SYSTEM;
e40edf2d 10811 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
c59021f8 10812 if (ret)
1aba86d6 10813 goto out;
c59021f8 10814
1aba86d6 10815 if (mixed) {
10816 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
e40edf2d 10817 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
1aba86d6 10818 } else {
10819 flags = BTRFS_BLOCK_GROUP_METADATA;
e40edf2d 10820 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
1aba86d6 10821 if (ret)
10822 goto out;
10823
10824 flags = BTRFS_BLOCK_GROUP_DATA;
e40edf2d 10825 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
1aba86d6 10826 }
10827out:
c59021f8 10828 return ret;
10829}
10830
2ff7e61e
JM
10831int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
10832 u64 start, u64 end)
acce952b 10833{
2ff7e61e 10834 return unpin_extent_range(fs_info, start, end, false);
acce952b 10835}
10836
499f377f
JM
10837/*
10838 * It used to be that old block groups would be left around forever.
10839 * Iterating over them would be enough to trim unused space. Since we
10840 * now automatically remove them, we also need to iterate over unallocated
10841 * space.
10842 *
10843 * We don't want a transaction for this since the discard may take a
10844 * substantial amount of time. We don't require that a transaction be
10845 * running, but we do need to take a running transaction into account
10846 * to ensure that we're not discarding chunks that were released in
10847 * the current transaction.
10848 *
10849 * Holding the chunks lock will prevent other threads from allocating
10850 * or releasing chunks, but it won't prevent a running transaction
10851 * from committing and releasing the memory that the pending chunks
10852 * list head uses. For that, we need to take a reference to the
10853 * transaction.
10854 */
10855static int btrfs_trim_free_extents(struct btrfs_device *device,
10856 u64 minlen, u64 *trimmed)
10857{
10858 u64 start = 0, len = 0;
10859 int ret;
10860
10861 *trimmed = 0;
10862
10863 /* Not writeable = nothing to do. */
10864 if (!device->writeable)
10865 return 0;
10866
10867 /* No free space = nothing to do. */
10868 if (device->total_bytes <= device->bytes_used)
10869 return 0;
10870
10871 ret = 0;
10872
10873 while (1) {
fb456252 10874 struct btrfs_fs_info *fs_info = device->fs_info;
499f377f
JM
10875 struct btrfs_transaction *trans;
10876 u64 bytes;
10877
10878 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
10879 if (ret)
10880 return ret;
10881
10882 down_read(&fs_info->commit_root_sem);
10883
10884 spin_lock(&fs_info->trans_lock);
10885 trans = fs_info->running_transaction;
10886 if (trans)
9b64f57d 10887 refcount_inc(&trans->use_count);
499f377f
JM
10888 spin_unlock(&fs_info->trans_lock);
10889
10890 ret = find_free_dev_extent_start(trans, device, minlen, start,
10891 &start, &len);
10892 if (trans)
10893 btrfs_put_transaction(trans);
10894
10895 if (ret) {
10896 up_read(&fs_info->commit_root_sem);
10897 mutex_unlock(&fs_info->chunk_mutex);
10898 if (ret == -ENOSPC)
10899 ret = 0;
10900 break;
10901 }
10902
10903 ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
10904 up_read(&fs_info->commit_root_sem);
10905 mutex_unlock(&fs_info->chunk_mutex);
10906
10907 if (ret)
10908 break;
10909
10910 start += len;
10911 *trimmed += bytes;
10912
10913 if (fatal_signal_pending(current)) {
10914 ret = -ERESTARTSYS;
10915 break;
10916 }
10917
10918 cond_resched();
10919 }
10920
10921 return ret;
10922}
10923
2ff7e61e 10924int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
f7039b1d 10925{
f7039b1d 10926 struct btrfs_block_group_cache *cache = NULL;
499f377f
JM
10927 struct btrfs_device *device;
10928 struct list_head *devices;
f7039b1d
LD
10929 u64 group_trimmed;
10930 u64 start;
10931 u64 end;
10932 u64 trimmed = 0;
2cac13e4 10933 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
f7039b1d
LD
10934 int ret = 0;
10935
2cac13e4
LB
10936 /*
10937 * try to trim all FS space; our block groups may start at a non-zero offset.
10938 */
10939 if (range->len == total_bytes)
10940 cache = btrfs_lookup_first_block_group(fs_info, range->start);
10941 else
10942 cache = btrfs_lookup_block_group(fs_info, range->start);
f7039b1d
LD
10943
10944 while (cache) {
10945 if (cache->key.objectid >= (range->start + range->len)) {
10946 btrfs_put_block_group(cache);
10947 break;
10948 }
10949
10950 start = max(range->start, cache->key.objectid);
10951 end = min(range->start + range->len,
10952 cache->key.objectid + cache->key.offset);
10953
10954 if (end - start >= range->minlen) {
10955 if (!block_group_cache_done(cache)) {
f6373bf3 10956 ret = cache_block_group(cache, 0);
1be41b78
JB
10957 if (ret) {
10958 btrfs_put_block_group(cache);
10959 break;
10960 }
10961 ret = wait_block_group_cache_done(cache);
10962 if (ret) {
10963 btrfs_put_block_group(cache);
10964 break;
10965 }
f7039b1d
LD
10966 }
10967 ret = btrfs_trim_block_group(cache,
10968 &group_trimmed,
10969 start,
10970 end,
10971 range->minlen);
10972
10973 trimmed += group_trimmed;
10974 if (ret) {
10975 btrfs_put_block_group(cache);
10976 break;
10977 }
10978 }
10979
2ff7e61e 10980 cache = next_block_group(fs_info, cache);
f7039b1d
LD
10981 }
10982
0b246afa
JM
10983 mutex_lock(&fs_info->fs_devices->device_list_mutex);
10984 devices = &fs_info->fs_devices->alloc_list;
499f377f
JM
10985 list_for_each_entry(device, devices, dev_alloc_list) {
10986 ret = btrfs_trim_free_extents(device, range->minlen,
10987 &group_trimmed);
10988 if (ret)
10989 break;
10990
10991 trimmed += group_trimmed;
10992 }
0b246afa 10993 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
499f377f 10994
f7039b1d
LD
10995 range->len = trimmed;
10996 return ret;
10997}
8257b2dc
MX
10998
10999/*
9ea24bbe
FM
11000 * btrfs_{start,end}_write_no_snapshoting() are similar to
11001 * mnt_{want,drop}_write(); they are used to prevent some tasks from writing
11002 * data into the page cache through nocow before the subvolume is snapshotted
11003 * and to flush that data to disk only after the snapshot has been created,
11004 * or to prevent operations while snapshotting is ongoing that would cause
11005 * the snapshot to be inconsistent (writes followed by expanding truncates,
 * for example). A typical caller pairing is sketched after
 * btrfs_start_write_no_snapshoting() below.
8257b2dc 11006 */
9ea24bbe 11007void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
8257b2dc
MX
11008{
11009 percpu_counter_dec(&root->subv_writers->counter);
11010 /*
a83342aa 11011 * Make sure counter is updated before we wake up waiters.
8257b2dc
MX
11012 */
11013 smp_mb();
11014 if (waitqueue_active(&root->subv_writers->wait))
11015 wake_up(&root->subv_writers->wait);
11016}
11017
9ea24bbe 11018int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
8257b2dc 11019{
ee39b432 11020 if (atomic_read(&root->will_be_snapshoted))
8257b2dc
MX
11021 return 0;
11022
11023 percpu_counter_inc(&root->subv_writers->counter);
11024 /*
11025 * Make sure counter is updated before we check for snapshot creation.
11026 */
11027 smp_mb();
ee39b432 11028 if (atomic_read(&root->will_be_snapshoted)) {
9ea24bbe 11029 btrfs_end_write_no_snapshoting(root);
8257b2dc
MX
11030 return 0;
11031 }
11032 return 1;
11033}
0bc19f90
ZL
11034
11035static int wait_snapshoting_atomic_t(atomic_t *a)
11036{
11037 schedule();
11038 return 0;
11039}
11040
11041void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11042{
11043 while (true) {
11044 int ret;
11045
11046 ret = btrfs_start_write_no_snapshoting(root);
11047 if (ret)
11048 break;
11049 wait_on_atomic_t(&root->will_be_snapshoted,
11050 wait_snapshoting_atomic_t,
11051 TASK_UNINTERRUPTIBLE);
11052 }
11053}