git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blob - fs/btrfs/extent-tree.c (UBUNTU: Ubuntu-4.15.0-96.97)
1 /*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18 #include <linux/sched.h>
19 #include <linux/sched/signal.h>
20 #include <linux/pagemap.h>
21 #include <linux/writeback.h>
22 #include <linux/blkdev.h>
23 #include <linux/sort.h>
24 #include <linux/rcupdate.h>
25 #include <linux/kthread.h>
26 #include <linux/slab.h>
27 #include <linux/ratelimit.h>
28 #include <linux/percpu_counter.h>
29 #include <linux/lockdep.h>
30 #include "hash.h"
31 #include "tree-log.h"
32 #include "disk-io.h"
33 #include "print-tree.h"
34 #include "volumes.h"
35 #include "raid56.h"
36 #include "locking.h"
37 #include "free-space-cache.h"
38 #include "free-space-tree.h"
39 #include "math.h"
40 #include "sysfs.h"
41 #include "qgroup.h"
42 #include "ref-verify.h"
43
44 #undef SCRAMBLE_DELAYED_REFS
45
46 /*
47 * control flags for do_chunk_alloc's force field
48 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
49 * if we really need one.
50 *
51 * CHUNK_ALLOC_LIMITED means to only try and allocate one
52 * if we have very few chunks already allocated. This is
53 * used as part of the clustering code to help make sure
54 * we have a good pool of storage to cluster in, without
55 * filling the FS with empty chunks
56 *
57 * CHUNK_ALLOC_FORCE means it must try to allocate one
58 *
59 */
60 enum {
61 CHUNK_ALLOC_NO_FORCE = 0,
62 CHUNK_ALLOC_LIMITED = 1,
63 CHUNK_ALLOC_FORCE = 2,
64 };
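/*
 * A minimal illustration of how a caller might map its situation onto the
 * force levels above.  The helper and the "nr_chunks_allocated" threshold
 * are hypothetical; real callers simply pass one of the constants straight
 * to do_chunk_alloc().
 */
static inline int example_pick_chunk_alloc_force(int must_have_chunk,
						 int nr_chunks_allocated)
{
	if (must_have_chunk)
		return CHUNK_ALLOC_FORCE;	/* must try to allocate one */
	if (nr_chunks_allocated < 4)
		return CHUNK_ALLOC_LIMITED;	/* keep a small pool for clustering */
	return CHUNK_ALLOC_NO_FORCE;		/* only allocate if really needed */
}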
65
66 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
67 struct btrfs_fs_info *fs_info,
68 struct btrfs_delayed_ref_node *node, u64 parent,
69 u64 root_objectid, u64 owner_objectid,
70 u64 owner_offset, int refs_to_drop,
71 struct btrfs_delayed_extent_op *extra_op);
72 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
73 struct extent_buffer *leaf,
74 struct btrfs_extent_item *ei);
75 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
76 struct btrfs_fs_info *fs_info,
77 u64 parent, u64 root_objectid,
78 u64 flags, u64 owner, u64 offset,
79 struct btrfs_key *ins, int ref_mod);
80 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
81 struct btrfs_fs_info *fs_info,
82 u64 parent, u64 root_objectid,
83 u64 flags, struct btrfs_disk_key *key,
84 int level, struct btrfs_key *ins);
85 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
86 struct btrfs_fs_info *fs_info, u64 flags,
87 int force);
88 static int find_next_key(struct btrfs_path *path, int level,
89 struct btrfs_key *key);
90 static void dump_space_info(struct btrfs_fs_info *fs_info,
91 struct btrfs_space_info *info, u64 bytes,
92 int dump_block_groups);
93 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
94 u64 num_bytes);
95 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
96 struct btrfs_space_info *space_info,
97 u64 num_bytes);
98 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
99 struct btrfs_space_info *space_info,
100 u64 num_bytes);
101
102 static noinline int
103 block_group_cache_done(struct btrfs_block_group_cache *cache)
104 {
105 smp_mb();
106 return cache->cached == BTRFS_CACHE_FINISHED ||
107 cache->cached == BTRFS_CACHE_ERROR;
108 }
109
110 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
111 {
112 return (cache->flags & bits) == bits;
113 }
114
115 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
116 {
117 atomic_inc(&cache->count);
118 }
119
120 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
121 {
122 if (atomic_dec_and_test(&cache->count)) {
123 WARN_ON(cache->pinned > 0);
124 WARN_ON(cache->reserved > 0);
125
126 /*
127 * If not empty, someone is still holding the mutex of
128 * full_stripe_lock, which can only be released by the holder.
129 * That holder will then hit a use-after-free when it tries to
130 * release the full stripe lock.
131 *
132 * There is no better way to resolve this, so just warn.
133 */
134 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
135 kfree(cache->free_space_ctl);
136 kfree(cache);
137 }
138 }
139
140 /*
141 * this adds the block group to the fs_info rb tree for the block group
142 * cache
143 */
144 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
145 struct btrfs_block_group_cache *block_group)
146 {
147 struct rb_node **p;
148 struct rb_node *parent = NULL;
149 struct btrfs_block_group_cache *cache;
150
151 spin_lock(&info->block_group_cache_lock);
152 p = &info->block_group_cache_tree.rb_node;
153
154 while (*p) {
155 parent = *p;
156 cache = rb_entry(parent, struct btrfs_block_group_cache,
157 cache_node);
158 if (block_group->key.objectid < cache->key.objectid) {
159 p = &(*p)->rb_left;
160 } else if (block_group->key.objectid > cache->key.objectid) {
161 p = &(*p)->rb_right;
162 } else {
163 spin_unlock(&info->block_group_cache_lock);
164 return -EEXIST;
165 }
166 }
167
168 rb_link_node(&block_group->cache_node, parent, p);
169 rb_insert_color(&block_group->cache_node,
170 &info->block_group_cache_tree);
171
172 if (info->first_logical_byte > block_group->key.objectid)
173 info->first_logical_byte = block_group->key.objectid;
174
175 spin_unlock(&info->block_group_cache_lock);
176
177 return 0;
178 }
179
180 /*
181 * This will return the block group at or after bytenr if contains is 0, else
182 * it will return the block group that contains the bytenr
183 */
184 static struct btrfs_block_group_cache *
185 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
186 int contains)
187 {
188 struct btrfs_block_group_cache *cache, *ret = NULL;
189 struct rb_node *n;
190 u64 end, start;
191
192 spin_lock(&info->block_group_cache_lock);
193 n = info->block_group_cache_tree.rb_node;
194
195 while (n) {
196 cache = rb_entry(n, struct btrfs_block_group_cache,
197 cache_node);
198 end = cache->key.objectid + cache->key.offset - 1;
199 start = cache->key.objectid;
200
201 if (bytenr < start) {
202 if (!contains && (!ret || start < ret->key.objectid))
203 ret = cache;
204 n = n->rb_left;
205 } else if (bytenr > start) {
206 if (contains && bytenr <= end) {
207 ret = cache;
208 break;
209 }
210 n = n->rb_right;
211 } else {
212 ret = cache;
213 break;
214 }
215 }
216 if (ret) {
217 btrfs_get_block_group(ret);
218 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
219 info->first_logical_byte = ret->key.objectid;
220 }
221 spin_unlock(&info->block_group_cache_lock);
222
223 return ret;
224 }
225
226 static int add_excluded_extent(struct btrfs_fs_info *fs_info,
227 u64 start, u64 num_bytes)
228 {
229 u64 end = start + num_bytes - 1;
230 set_extent_bits(&fs_info->freed_extents[0],
231 start, end, EXTENT_UPTODATE);
232 set_extent_bits(&fs_info->freed_extents[1],
233 start, end, EXTENT_UPTODATE);
234 return 0;
235 }
236
237 static void free_excluded_extents(struct btrfs_fs_info *fs_info,
238 struct btrfs_block_group_cache *cache)
239 {
240 u64 start, end;
241
242 start = cache->key.objectid;
243 end = start + cache->key.offset - 1;
244
245 clear_extent_bits(&fs_info->freed_extents[0],
246 start, end, EXTENT_UPTODATE);
247 clear_extent_bits(&fs_info->freed_extents[1],
248 start, end, EXTENT_UPTODATE);
249 }
250
251 static int exclude_super_stripes(struct btrfs_fs_info *fs_info,
252 struct btrfs_block_group_cache *cache)
253 {
254 u64 bytenr;
255 u64 *logical;
256 int stripe_len;
257 int i, nr, ret;
258
259 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
260 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
261 cache->bytes_super += stripe_len;
262 ret = add_excluded_extent(fs_info, cache->key.objectid,
263 stripe_len);
264 if (ret)
265 return ret;
266 }
267
268 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
269 bytenr = btrfs_sb_offset(i);
270 ret = btrfs_rmap_block(fs_info, cache->key.objectid,
271 bytenr, 0, &logical, &nr, &stripe_len);
272 if (ret)
273 return ret;
274
275 while (nr--) {
276 u64 start, len;
277
278 if (logical[nr] > cache->key.objectid +
279 cache->key.offset)
280 continue;
281
282 if (logical[nr] + stripe_len <= cache->key.objectid)
283 continue;
284
285 start = logical[nr];
286 if (start < cache->key.objectid) {
287 start = cache->key.objectid;
288 len = (logical[nr] + stripe_len) - start;
289 } else {
290 len = min_t(u64, stripe_len,
291 cache->key.objectid +
292 cache->key.offset - start);
293 }
294
295 cache->bytes_super += len;
296 ret = add_excluded_extent(fs_info, start, len);
297 if (ret) {
298 kfree(logical);
299 return ret;
300 }
301 }
302
303 kfree(logical);
304 }
305 return 0;
306 }
307
308 static struct btrfs_caching_control *
309 get_caching_control(struct btrfs_block_group_cache *cache)
310 {
311 struct btrfs_caching_control *ctl;
312
313 spin_lock(&cache->lock);
314 if (!cache->caching_ctl) {
315 spin_unlock(&cache->lock);
316 return NULL;
317 }
318
319 ctl = cache->caching_ctl;
320 refcount_inc(&ctl->count);
321 spin_unlock(&cache->lock);
322 return ctl;
323 }
324
325 static void put_caching_control(struct btrfs_caching_control *ctl)
326 {
327 if (refcount_dec_and_test(&ctl->count))
328 kfree(ctl);
329 }
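/*
 * A minimal usage sketch (hypothetical helper): the caching control is
 * reference counted, so a waiter takes a reference before sleeping on
 * ctl->wait and drops it afterwards.  A NULL return from
 * get_caching_control() means caching has already finished or never started.
 */
static inline void example_wait_for_block_group_cache(
				struct btrfs_block_group_cache *cache)
{
	struct btrfs_caching_control *ctl;

	ctl = get_caching_control(cache);
	if (!ctl)
		return;
	wait_event(ctl->wait, block_group_cache_done(cache));
	put_caching_control(ctl);
}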
330
331 #ifdef CONFIG_BTRFS_DEBUG
332 static void fragment_free_space(struct btrfs_block_group_cache *block_group)
333 {
334 struct btrfs_fs_info *fs_info = block_group->fs_info;
335 u64 start = block_group->key.objectid;
336 u64 len = block_group->key.offset;
337 u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
338 fs_info->nodesize : fs_info->sectorsize;
339 u64 step = chunk << 1;
340
341 while (len > chunk) {
342 btrfs_remove_free_space(block_group, start, chunk);
343 start += step;
344 if (len < step)
345 len = 0;
346 else
347 len -= step;
348 }
349 }
350 #endif
351
352 /*
353 * This is only called by cache_block_group. Since we could have freed extents,
354 * we need to check the pinned_extents for any extents that can't be used yet,
355 * because their free space will be released as soon as the transaction commits.
356 */
357 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
358 struct btrfs_fs_info *info, u64 start, u64 end)
359 {
360 u64 extent_start, extent_end, size, total_added = 0;
361 int ret;
362
363 while (start < end) {
364 ret = find_first_extent_bit(info->pinned_extents, start,
365 &extent_start, &extent_end,
366 EXTENT_DIRTY | EXTENT_UPTODATE,
367 NULL);
368 if (ret)
369 break;
370
371 if (extent_start <= start) {
372 start = extent_end + 1;
373 } else if (extent_start > start && extent_start < end) {
374 size = extent_start - start;
375 total_added += size;
376 ret = btrfs_add_free_space(block_group, start,
377 size);
378 BUG_ON(ret); /* -ENOMEM or logic error */
379 start = extent_end + 1;
380 } else {
381 break;
382 }
383 }
384
385 if (start < end) {
386 size = end - start;
387 total_added += size;
388 ret = btrfs_add_free_space(block_group, start, size);
389 BUG_ON(ret); /* -ENOMEM or logic error */
390 }
391
392 return total_added;
393 }
394
395 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
396 {
397 struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
398 struct btrfs_fs_info *fs_info = block_group->fs_info;
399 struct btrfs_root *extent_root = fs_info->extent_root;
400 struct btrfs_path *path;
401 struct extent_buffer *leaf;
402 struct btrfs_key key;
403 u64 total_found = 0;
404 u64 last = 0;
405 u32 nritems;
406 int ret;
407 bool wakeup = true;
408
409 path = btrfs_alloc_path();
410 if (!path)
411 return -ENOMEM;
412
413 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
414
415 #ifdef CONFIG_BTRFS_DEBUG
416 /*
417 * If we're fragmenting we don't want to make anybody think we can
418 * allocate from this block group until we've had a chance to fragment
419 * the free space.
420 */
421 if (btrfs_should_fragment_free_space(block_group))
422 wakeup = false;
423 #endif
424 /*
425 * We don't want to deadlock with somebody trying to allocate a new
426 * extent for the extent root while also trying to search the extent
427 * root to add free space. So we skip locking and search the commit
428 * root, since it's read-only.
429 */
430 path->skip_locking = 1;
431 path->search_commit_root = 1;
432 path->reada = READA_FORWARD;
433
434 key.objectid = last;
435 key.offset = 0;
436 key.type = BTRFS_EXTENT_ITEM_KEY;
437
438 next:
439 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
440 if (ret < 0)
441 goto out;
442
443 leaf = path->nodes[0];
444 nritems = btrfs_header_nritems(leaf);
445
446 while (1) {
447 if (btrfs_fs_closing(fs_info) > 1) {
448 last = (u64)-1;
449 break;
450 }
451
452 if (path->slots[0] < nritems) {
453 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
454 } else {
455 ret = find_next_key(path, 0, &key);
456 if (ret)
457 break;
458
459 if (need_resched() ||
460 rwsem_is_contended(&fs_info->commit_root_sem)) {
461 if (wakeup)
462 caching_ctl->progress = last;
463 btrfs_release_path(path);
464 up_read(&fs_info->commit_root_sem);
465 mutex_unlock(&caching_ctl->mutex);
466 cond_resched();
467 mutex_lock(&caching_ctl->mutex);
468 down_read(&fs_info->commit_root_sem);
469 goto next;
470 }
471
472 ret = btrfs_next_leaf(extent_root, path);
473 if (ret < 0)
474 goto out;
475 if (ret)
476 break;
477 leaf = path->nodes[0];
478 nritems = btrfs_header_nritems(leaf);
479 continue;
480 }
481
482 if (key.objectid < last) {
483 key.objectid = last;
484 key.offset = 0;
485 key.type = BTRFS_EXTENT_ITEM_KEY;
486
487 if (wakeup)
488 caching_ctl->progress = last;
489 btrfs_release_path(path);
490 goto next;
491 }
492
493 if (key.objectid < block_group->key.objectid) {
494 path->slots[0]++;
495 continue;
496 }
497
498 if (key.objectid >= block_group->key.objectid +
499 block_group->key.offset)
500 break;
501
502 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
503 key.type == BTRFS_METADATA_ITEM_KEY) {
504 total_found += add_new_free_space(block_group,
505 fs_info, last,
506 key.objectid);
507 if (key.type == BTRFS_METADATA_ITEM_KEY)
508 last = key.objectid +
509 fs_info->nodesize;
510 else
511 last = key.objectid + key.offset;
512
513 if (total_found > CACHING_CTL_WAKE_UP) {
514 total_found = 0;
515 if (wakeup)
516 wake_up(&caching_ctl->wait);
517 }
518 }
519 path->slots[0]++;
520 }
521 ret = 0;
522
523 total_found += add_new_free_space(block_group, fs_info, last,
524 block_group->key.objectid +
525 block_group->key.offset);
526 caching_ctl->progress = (u64)-1;
527
528 out:
529 btrfs_free_path(path);
530 return ret;
531 }
532
533 static noinline void caching_thread(struct btrfs_work *work)
534 {
535 struct btrfs_block_group_cache *block_group;
536 struct btrfs_fs_info *fs_info;
537 struct btrfs_caching_control *caching_ctl;
538 struct btrfs_root *extent_root;
539 int ret;
540
541 caching_ctl = container_of(work, struct btrfs_caching_control, work);
542 block_group = caching_ctl->block_group;
543 fs_info = block_group->fs_info;
544 extent_root = fs_info->extent_root;
545
546 mutex_lock(&caching_ctl->mutex);
547 down_read(&fs_info->commit_root_sem);
548
549 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
550 ret = load_free_space_tree(caching_ctl);
551 else
552 ret = load_extent_tree_free(caching_ctl);
553
554 spin_lock(&block_group->lock);
555 block_group->caching_ctl = NULL;
556 block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
557 spin_unlock(&block_group->lock);
558
559 #ifdef CONFIG_BTRFS_DEBUG
560 if (btrfs_should_fragment_free_space(block_group)) {
561 u64 bytes_used;
562
563 spin_lock(&block_group->space_info->lock);
564 spin_lock(&block_group->lock);
565 bytes_used = block_group->key.offset -
566 btrfs_block_group_used(&block_group->item);
567 block_group->space_info->bytes_used += bytes_used >> 1;
568 spin_unlock(&block_group->lock);
569 spin_unlock(&block_group->space_info->lock);
570 fragment_free_space(block_group);
571 }
572 #endif
573
574 caching_ctl->progress = (u64)-1;
575
576 up_read(&fs_info->commit_root_sem);
577 free_excluded_extents(fs_info, block_group);
578 mutex_unlock(&caching_ctl->mutex);
579
580 wake_up(&caching_ctl->wait);
581
582 put_caching_control(caching_ctl);
583 btrfs_put_block_group(block_group);
584 }
585
586 static int cache_block_group(struct btrfs_block_group_cache *cache,
587 int load_cache_only)
588 {
589 DEFINE_WAIT(wait);
590 struct btrfs_fs_info *fs_info = cache->fs_info;
591 struct btrfs_caching_control *caching_ctl;
592 int ret = 0;
593
594 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
595 if (!caching_ctl)
596 return -ENOMEM;
597
598 INIT_LIST_HEAD(&caching_ctl->list);
599 mutex_init(&caching_ctl->mutex);
600 init_waitqueue_head(&caching_ctl->wait);
601 caching_ctl->block_group = cache;
602 caching_ctl->progress = cache->key.objectid;
603 refcount_set(&caching_ctl->count, 1);
604 btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
605 caching_thread, NULL, NULL);
606
607 spin_lock(&cache->lock);
608 /*
609 * This should be a rare occasion, but this could happen I think in the
610 * case where one thread starts to load the space cache info, and then
611 * some other thread starts a transaction commit which tries to do an
612 * allocation while the other thread is still loading the space cache
613 * info. The previous loop should have kept us from choosing this block
614 * group, but if we've moved to the state where we will wait on caching
615 * block groups we need to first check if we're doing a fast load here,
616 * so we can wait for it to finish, otherwise we could end up allocating
617 * from a block group whose cache gets evicted for one reason or
618 * another.
619 */
620 while (cache->cached == BTRFS_CACHE_FAST) {
621 struct btrfs_caching_control *ctl;
622
623 ctl = cache->caching_ctl;
624 refcount_inc(&ctl->count);
625 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
626 spin_unlock(&cache->lock);
627
628 schedule();
629
630 finish_wait(&ctl->wait, &wait);
631 put_caching_control(ctl);
632 spin_lock(&cache->lock);
633 }
634
635 if (cache->cached != BTRFS_CACHE_NO) {
636 spin_unlock(&cache->lock);
637 kfree(caching_ctl);
638 return 0;
639 }
640 WARN_ON(cache->caching_ctl);
641 cache->caching_ctl = caching_ctl;
642 cache->cached = BTRFS_CACHE_FAST;
643 spin_unlock(&cache->lock);
644
645 if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
646 mutex_lock(&caching_ctl->mutex);
647 ret = load_free_space_cache(fs_info, cache);
648
649 spin_lock(&cache->lock);
650 if (ret == 1) {
651 cache->caching_ctl = NULL;
652 cache->cached = BTRFS_CACHE_FINISHED;
653 cache->last_byte_to_unpin = (u64)-1;
654 caching_ctl->progress = (u64)-1;
655 } else {
656 if (load_cache_only) {
657 cache->caching_ctl = NULL;
658 cache->cached = BTRFS_CACHE_NO;
659 } else {
660 cache->cached = BTRFS_CACHE_STARTED;
661 cache->has_caching_ctl = 1;
662 }
663 }
664 spin_unlock(&cache->lock);
665 #ifdef CONFIG_BTRFS_DEBUG
666 if (ret == 1 &&
667 btrfs_should_fragment_free_space(cache)) {
668 u64 bytes_used;
669
670 spin_lock(&cache->space_info->lock);
671 spin_lock(&cache->lock);
672 bytes_used = cache->key.offset -
673 btrfs_block_group_used(&cache->item);
674 cache->space_info->bytes_used += bytes_used >> 1;
675 spin_unlock(&cache->lock);
676 spin_unlock(&cache->space_info->lock);
677 fragment_free_space(cache);
678 }
679 #endif
680 mutex_unlock(&caching_ctl->mutex);
681
682 wake_up(&caching_ctl->wait);
683 if (ret == 1) {
684 put_caching_control(caching_ctl);
685 free_excluded_extents(fs_info, cache);
686 return 0;
687 }
688 } else {
689 /*
690 * We're either using the free space tree or no caching at all.
691 * Set cached to the appropriate value and wakeup any waiters.
692 */
693 spin_lock(&cache->lock);
694 if (load_cache_only) {
695 cache->caching_ctl = NULL;
696 cache->cached = BTRFS_CACHE_NO;
697 } else {
698 cache->cached = BTRFS_CACHE_STARTED;
699 cache->has_caching_ctl = 1;
700 }
701 spin_unlock(&cache->lock);
702 wake_up(&caching_ctl->wait);
703 }
704
705 if (load_cache_only) {
706 put_caching_control(caching_ctl);
707 return 0;
708 }
709
710 down_write(&fs_info->commit_root_sem);
711 refcount_inc(&caching_ctl->count);
712 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
713 up_write(&fs_info->commit_root_sem);
714
715 btrfs_get_block_group(cache);
716
717 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
718
719 return ret;
720 }
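/*
 * A minimal usage sketch (hypothetical helper): callers that only want the
 * on-disk free space cache loaded pass load_cache_only == 1; callers that
 * actually need the block group fully cached pass 0, which queues
 * caching_thread() when the fast load did not complete.
 */
static inline int example_cache_block_group(struct btrfs_block_group_cache *cache,
					    bool fast_load_only)
{
	return cache_block_group(cache, fast_load_only ? 1 : 0);
}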
721
722 /*
723 * return the block group that starts at or after bytenr
724 */
725 static struct btrfs_block_group_cache *
726 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
727 {
728 return block_group_cache_tree_search(info, bytenr, 0);
729 }
730
731 /*
732 * return the block group that contains the given bytenr
733 */
734 struct btrfs_block_group_cache *btrfs_lookup_block_group(
735 struct btrfs_fs_info *info,
736 u64 bytenr)
737 {
738 return block_group_cache_tree_search(info, bytenr, 1);
739 }
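/*
 * A minimal usage sketch (hypothetical helper): both lookup helpers return
 * the block group with an extra reference taken in
 * block_group_cache_tree_search(), so the caller must pair the lookup with
 * btrfs_put_block_group().
 */
static inline u64 example_block_group_len(struct btrfs_fs_info *fs_info,
					  u64 bytenr)
{
	struct btrfs_block_group_cache *cache;
	u64 len = 0;

	cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (cache) {
		len = cache->key.offset;	/* length of the block group */
		btrfs_put_block_group(cache);
	}
	return len;
}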
740
741 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
742 u64 flags)
743 {
744 struct list_head *head = &info->space_info;
745 struct btrfs_space_info *found;
746
747 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
748
749 rcu_read_lock();
750 list_for_each_entry_rcu(found, head, list) {
751 if (found->flags & flags) {
752 rcu_read_unlock();
753 return found;
754 }
755 }
756 rcu_read_unlock();
757 return NULL;
758 }
759
760 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
761 u64 owner, u64 root_objectid)
762 {
763 struct btrfs_space_info *space_info;
764 u64 flags;
765
766 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
767 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
768 flags = BTRFS_BLOCK_GROUP_SYSTEM;
769 else
770 flags = BTRFS_BLOCK_GROUP_METADATA;
771 } else {
772 flags = BTRFS_BLOCK_GROUP_DATA;
773 }
774
775 space_info = __find_space_info(fs_info, flags);
776 ASSERT(space_info);
777 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
778 }
779
780 /*
781 * after adding space to the filesystem, we need to clear the full flags
782 * on all the space infos.
783 */
784 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
785 {
786 struct list_head *head = &info->space_info;
787 struct btrfs_space_info *found;
788
789 rcu_read_lock();
790 list_for_each_entry_rcu(found, head, list)
791 found->full = 0;
792 rcu_read_unlock();
793 }
794
795 /* simple helper to search for an existing data extent at a given offset */
796 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
797 {
798 int ret;
799 struct btrfs_key key;
800 struct btrfs_path *path;
801
802 path = btrfs_alloc_path();
803 if (!path)
804 return -ENOMEM;
805
806 key.objectid = start;
807 key.offset = len;
808 key.type = BTRFS_EXTENT_ITEM_KEY;
809 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
810 btrfs_free_path(path);
811 return ret;
812 }
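/*
 * A minimal usage sketch (hypothetical helper): the return value comes
 * straight from btrfs_search_slot(), so 0 means an EXTENT_ITEM with this
 * exact (start, len) exists, > 0 means it does not, and < 0 is an error
 * (folded into "not found" here for brevity).
 */
static inline bool example_data_extent_exists(struct btrfs_fs_info *fs_info,
					      u64 start, u64 len)
{
	return btrfs_lookup_data_extent(fs_info, start, len) == 0;
}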
813
814 /*
815 * helper function to lookup reference count and flags of a tree block.
816 *
817 * the head node for a delayed ref is used to store the sum of all the
818 * reference count modifications queued up in the rbtree. the head
819 * node may also store the extent flags to set. This way you can check
820 * what the reference count and extent flags will be once all of the
821 * queued delayed refs have been processed.
822 */
823 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
824 struct btrfs_fs_info *fs_info, u64 bytenr,
825 u64 offset, int metadata, u64 *refs, u64 *flags)
826 {
827 struct btrfs_delayed_ref_head *head;
828 struct btrfs_delayed_ref_root *delayed_refs;
829 struct btrfs_path *path;
830 struct btrfs_extent_item *ei;
831 struct extent_buffer *leaf;
832 struct btrfs_key key;
833 u32 item_size;
834 u64 num_refs;
835 u64 extent_flags;
836 int ret;
837
838 /*
839 * If we don't have skinny metadata, don't bother doing anything
840 * different
841 */
842 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
843 offset = fs_info->nodesize;
844 metadata = 0;
845 }
846
847 path = btrfs_alloc_path();
848 if (!path)
849 return -ENOMEM;
850
851 if (!trans) {
852 path->skip_locking = 1;
853 path->search_commit_root = 1;
854 }
855
856 search_again:
857 key.objectid = bytenr;
858 key.offset = offset;
859 if (metadata)
860 key.type = BTRFS_METADATA_ITEM_KEY;
861 else
862 key.type = BTRFS_EXTENT_ITEM_KEY;
863
864 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
865 if (ret < 0)
866 goto out_free;
867
868 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
869 if (path->slots[0]) {
870 path->slots[0]--;
871 btrfs_item_key_to_cpu(path->nodes[0], &key,
872 path->slots[0]);
873 if (key.objectid == bytenr &&
874 key.type == BTRFS_EXTENT_ITEM_KEY &&
875 key.offset == fs_info->nodesize)
876 ret = 0;
877 }
878 }
879
880 if (ret == 0) {
881 leaf = path->nodes[0];
882 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
883 if (item_size >= sizeof(*ei)) {
884 ei = btrfs_item_ptr(leaf, path->slots[0],
885 struct btrfs_extent_item);
886 num_refs = btrfs_extent_refs(leaf, ei);
887 extent_flags = btrfs_extent_flags(leaf, ei);
888 } else {
889 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
890 struct btrfs_extent_item_v0 *ei0;
891 BUG_ON(item_size != sizeof(*ei0));
892 ei0 = btrfs_item_ptr(leaf, path->slots[0],
893 struct btrfs_extent_item_v0);
894 num_refs = btrfs_extent_refs_v0(leaf, ei0);
895 /* FIXME: this isn't correct for data */
896 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
897 #else
898 BUG();
899 #endif
900 }
901 BUG_ON(num_refs == 0);
902 } else {
903 num_refs = 0;
904 extent_flags = 0;
905 ret = 0;
906 }
907
908 if (!trans)
909 goto out;
910
911 delayed_refs = &trans->transaction->delayed_refs;
912 spin_lock(&delayed_refs->lock);
913 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
914 if (head) {
915 if (!mutex_trylock(&head->mutex)) {
916 refcount_inc(&head->refs);
917 spin_unlock(&delayed_refs->lock);
918
919 btrfs_release_path(path);
920
921 /*
922 * Mutex was contended, block until it's released and try
923 * again
924 */
925 mutex_lock(&head->mutex);
926 mutex_unlock(&head->mutex);
927 btrfs_put_delayed_ref_head(head);
928 goto search_again;
929 }
930 spin_lock(&head->lock);
931 if (head->extent_op && head->extent_op->update_flags)
932 extent_flags |= head->extent_op->flags_to_set;
933 else
934 BUG_ON(num_refs == 0);
935
936 num_refs += head->ref_mod;
937 spin_unlock(&head->lock);
938 mutex_unlock(&head->mutex);
939 }
940 spin_unlock(&delayed_refs->lock);
941 out:
942 WARN_ON(num_refs == 0);
943 if (refs)
944 *refs = num_refs;
945 if (flags)
946 *flags = extent_flags;
947 out_free:
948 btrfs_free_path(path);
949 return ret;
950 }
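/*
 * A minimal usage sketch (hypothetical helper): looking up the reference
 * count of a tree block, including any queued delayed ref modifications.
 * For metadata the offset argument is the level of the block; the function
 * itself falls back to nodesize when skinny metadata is not enabled.
 */
static inline int example_tree_block_refs(struct btrfs_trans_handle *trans,
					  struct btrfs_fs_info *fs_info,
					  u64 bytenr, int level, u64 *refs)
{
	u64 flags;

	return btrfs_lookup_extent_info(trans, fs_info, bytenr, level, 1,
					refs, &flags);
}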
951
952 /*
953 * Back reference rules. Back refs have three main goals:
954 *
955 * 1) differentiate between all holders of references to an extent so that
956 * when a reference is dropped we can make sure it was a valid reference
957 * before freeing the extent.
958 *
959 * 2) Provide enough information to quickly find the holders of an extent
960 * if we notice a given block is corrupted or bad.
961 *
962 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
963 * maintenance. This is actually the same as #2, but with a slightly
964 * different use case.
965 *
966 * There are two kinds of back refs. The implicit back refs is optimized
967 * for pointers in non-shared tree blocks. For a given pointer in a block,
968 * back refs of this kind provide information about the block's owner tree
969 * and the pointer's key. This information allows us to find the block by
970 * b-tree searching. The full back refs is for pointers in tree blocks not
971 * referenced by their owner trees. The location of tree block is recorded
972 * in the back refs. Actually the full back refs is generic, and can be
973 * used in all cases where the implicit back refs is used. The major shortcoming
974 * of the full back refs is its overhead. Every time a tree block gets
975 * COWed, we have to update back refs entry for all pointers in it.
976 *
977 * For a newly allocated tree block, we use implicit back refs for
978 * pointers in it. This means most tree related operations only involve
979 * implicit back refs. For a tree block created in an old transaction, the
980 * only way to drop a reference to it is to COW it. So we can detect the
981 * event that tree block loses its owner tree's reference and do the
982 * back refs conversion.
983 *
984 * When a tree block is COWed through a tree, there are four cases:
985 *
986 * The reference count of the block is one and the tree is the block's
987 * owner tree. Nothing to do in this case.
988 *
989 * The reference count of the block is one and the tree is not the
990 * block's owner tree. In this case, full back refs is used for pointers
991 * in the block. Remove these full back refs, add implicit back refs for
992 * every pointer in the new block.
993 *
994 * The reference count of the block is greater than one and the tree is
995 * the block's owner tree. In this case, implicit back refs is used for
996 * pointers in the block. Add full back refs for every pointer in the
997 * block, increase lower level extents' reference counts. The original
998 * implicit back refs are carried over to the new block.
999 *
1000 * The reference count of the block is greater than one and the tree is
1001 * not the block's owner tree. Add implicit back refs for every pointer in
1002 * the new block, increase lower level extents' reference count.
1003 *
1004 * Back Reference Key composing:
1005 *
1006 * The key objectid corresponds to the first byte in the extent,
1007 * The key type is used to differentiate between types of back refs.
1008 * There are different meanings of the key offset for different types
1009 * of back refs.
1010 *
1011 * File extents can be referenced by:
1012 *
1013 * - multiple snapshots, subvolumes, or different generations in one subvol
1014 * - different files inside a single subvolume
1015 * - different offsets inside a file (bookend extents in file.c)
1016 *
1017 * The extent ref structure for the implicit back refs has fields for:
1018 *
1019 * - Objectid of the subvolume root
1020 * - objectid of the file holding the reference
1021 * - original offset in the file
1022 * - how many bookend extents
1023 *
1024 * The key offset for the implicit back refs is hash of the first
1025 * three fields.
1026 *
1027 * The extent ref structure for the full back refs has field for:
1028 *
1029 * - number of pointers in the tree leaf
1030 *
1031 * The key offset for the full back refs is the first byte of
1032 * the tree leaf
1033 *
1034 * When a file extent is allocated, the implicit back refs is used.
1035 * The fields are filled in:
1036 *
1037 * (root_key.objectid, inode objectid, offset in file, 1)
1038 *
1039 * When a file extent is removed by file truncation, we find the
1040 * corresponding implicit back refs and check the following fields:
1041 *
1042 * (btrfs_header_owner(leaf), inode objectid, offset in file)
1043 *
1044 * Btree extents can be referenced by:
1045 *
1046 * - Different subvolumes
1047 *
1048 * Both the implicit back refs and the full back refs for tree blocks
1049 * only consist of key. The key offset for the implicit back refs is
1050 * objectid of block's owner tree. The key offset for the full back refs
1051 * is the first byte of parent block.
1052 *
1053 * When implicit back refs is used, information about the lowest key and
1054 * level of the tree block is required. This information is stored in
1055 * the tree block info structure.
1056 */
1057
1058 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1059 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
1060 struct btrfs_fs_info *fs_info,
1061 struct btrfs_path *path,
1062 u64 owner, u32 extra_size)
1063 {
1064 struct btrfs_root *root = fs_info->extent_root;
1065 struct btrfs_extent_item *item;
1066 struct btrfs_extent_item_v0 *ei0;
1067 struct btrfs_extent_ref_v0 *ref0;
1068 struct btrfs_tree_block_info *bi;
1069 struct extent_buffer *leaf;
1070 struct btrfs_key key;
1071 struct btrfs_key found_key;
1072 u32 new_size = sizeof(*item);
1073 u64 refs;
1074 int ret;
1075
1076 leaf = path->nodes[0];
1077 BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
1078
1079 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1080 ei0 = btrfs_item_ptr(leaf, path->slots[0],
1081 struct btrfs_extent_item_v0);
1082 refs = btrfs_extent_refs_v0(leaf, ei0);
1083
1084 if (owner == (u64)-1) {
1085 while (1) {
1086 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1087 ret = btrfs_next_leaf(root, path);
1088 if (ret < 0)
1089 return ret;
1090 BUG_ON(ret > 0); /* Corruption */
1091 leaf = path->nodes[0];
1092 }
1093 btrfs_item_key_to_cpu(leaf, &found_key,
1094 path->slots[0]);
1095 BUG_ON(key.objectid != found_key.objectid);
1096 if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
1097 path->slots[0]++;
1098 continue;
1099 }
1100 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1101 struct btrfs_extent_ref_v0);
1102 owner = btrfs_ref_objectid_v0(leaf, ref0);
1103 break;
1104 }
1105 }
1106 btrfs_release_path(path);
1107
1108 if (owner < BTRFS_FIRST_FREE_OBJECTID)
1109 new_size += sizeof(*bi);
1110
1111 new_size -= sizeof(*ei0);
1112 ret = btrfs_search_slot(trans, root, &key, path,
1113 new_size + extra_size, 1);
1114 if (ret < 0)
1115 return ret;
1116 BUG_ON(ret); /* Corruption */
1117
1118 btrfs_extend_item(fs_info, path, new_size);
1119
1120 leaf = path->nodes[0];
1121 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1122 btrfs_set_extent_refs(leaf, item, refs);
1123 /* FIXME: get real generation */
1124 btrfs_set_extent_generation(leaf, item, 0);
1125 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1126 btrfs_set_extent_flags(leaf, item,
1127 BTRFS_EXTENT_FLAG_TREE_BLOCK |
1128 BTRFS_BLOCK_FLAG_FULL_BACKREF);
1129 bi = (struct btrfs_tree_block_info *)(item + 1);
1130 /* FIXME: get first key of the block */
1131 memzero_extent_buffer(leaf, (unsigned long)bi, sizeof(*bi));
1132 btrfs_set_tree_block_level(leaf, bi, (int)owner);
1133 } else {
1134 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1135 }
1136 btrfs_mark_buffer_dirty(leaf);
1137 return 0;
1138 }
1139 #endif
1140
1141 /*
1142 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
1143 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
1144 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
1145 */
1146 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
1147 struct btrfs_extent_inline_ref *iref,
1148 enum btrfs_inline_ref_type is_data)
1149 {
1150 int type = btrfs_extent_inline_ref_type(eb, iref);
1151 u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1152
1153 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1154 type == BTRFS_SHARED_BLOCK_REF_KEY ||
1155 type == BTRFS_SHARED_DATA_REF_KEY ||
1156 type == BTRFS_EXTENT_DATA_REF_KEY) {
1157 if (is_data == BTRFS_REF_TYPE_BLOCK) {
1158 if (type == BTRFS_TREE_BLOCK_REF_KEY)
1159 return type;
1160 if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1161 ASSERT(eb->fs_info);
1162 /*
1163 * Every shared one has parent tree
1164 * block, which must be aligned to
1165 * nodesize.
1166 */
1167 if (offset &&
1168 IS_ALIGNED(offset, eb->fs_info->nodesize))
1169 return type;
1170 }
1171 } else if (is_data == BTRFS_REF_TYPE_DATA) {
1172 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1173 return type;
1174 if (type == BTRFS_SHARED_DATA_REF_KEY) {
1175 ASSERT(eb->fs_info);
1176 /*
1177 * Every shared one has parent tree
1178 * block, which must be aligned to
1179 * nodesize.
1180 */
1181 if (offset &&
1182 IS_ALIGNED(offset, eb->fs_info->nodesize))
1183 return type;
1184 }
1185 } else {
1186 ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1187 return type;
1188 }
1189 }
1190
1191 btrfs_print_leaf((struct extent_buffer *)eb);
1192 btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1193 eb->start, type);
1194 WARN_ON(1);
1195
1196 return BTRFS_REF_TYPE_INVALID;
1197 }
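/*
 * A minimal usage sketch (hypothetical helper): callers state which kind of
 * reference they expect and must treat BTRFS_REF_TYPE_INVALID as corruption,
 * as extent_data_ref_count() below does.
 */
static inline bool example_inline_ref_is_shared_data(
				const struct extent_buffer *eb,
				struct btrfs_extent_inline_ref *iref)
{
	int type;

	type = btrfs_get_extent_inline_ref_type(eb, iref, BTRFS_REF_TYPE_DATA);
	if (type == BTRFS_REF_TYPE_INVALID)
		return false;	/* corrupted or unexpected ref type */
	return type == BTRFS_SHARED_DATA_REF_KEY;
}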
1198
1199 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1200 {
1201 u32 high_crc = ~(u32)0;
1202 u32 low_crc = ~(u32)0;
1203 __le64 lenum;
1204
1205 lenum = cpu_to_le64(root_objectid);
1206 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1207 lenum = cpu_to_le64(owner);
1208 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1209 lenum = cpu_to_le64(offset);
1210 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1211
1212 return ((u64)high_crc << 31) ^ (u64)low_crc;
1213 }
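/*
 * A minimal sketch (hypothetical helper): composing the key of an implicit
 * data back reference as described in the "Back Reference Key composing"
 * section above.  The objectid is the first byte of the extent and the
 * offset is the hash of (root, inode, file offset), exactly as
 * lookup_extent_data_ref() builds it below.
 */
static inline void example_data_ref_key(struct btrfs_key *key, u64 bytenr,
					u64 root_objectid, u64 owner,
					u64 offset)
{
	key->objectid = bytenr;
	key->type = BTRFS_EXTENT_DATA_REF_KEY;
	key->offset = hash_extent_data_ref(root_objectid, owner, offset);
}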
1214
1215 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1216 struct btrfs_extent_data_ref *ref)
1217 {
1218 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1219 btrfs_extent_data_ref_objectid(leaf, ref),
1220 btrfs_extent_data_ref_offset(leaf, ref));
1221 }
1222
1223 static int match_extent_data_ref(struct extent_buffer *leaf,
1224 struct btrfs_extent_data_ref *ref,
1225 u64 root_objectid, u64 owner, u64 offset)
1226 {
1227 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1228 btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1229 btrfs_extent_data_ref_offset(leaf, ref) != offset)
1230 return 0;
1231 return 1;
1232 }
1233
1234 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1235 struct btrfs_fs_info *fs_info,
1236 struct btrfs_path *path,
1237 u64 bytenr, u64 parent,
1238 u64 root_objectid,
1239 u64 owner, u64 offset)
1240 {
1241 struct btrfs_root *root = fs_info->extent_root;
1242 struct btrfs_key key;
1243 struct btrfs_extent_data_ref *ref;
1244 struct extent_buffer *leaf;
1245 u32 nritems;
1246 int ret;
1247 int recow;
1248 int err = -ENOENT;
1249
1250 key.objectid = bytenr;
1251 if (parent) {
1252 key.type = BTRFS_SHARED_DATA_REF_KEY;
1253 key.offset = parent;
1254 } else {
1255 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1256 key.offset = hash_extent_data_ref(root_objectid,
1257 owner, offset);
1258 }
1259 again:
1260 recow = 0;
1261 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1262 if (ret < 0) {
1263 err = ret;
1264 goto fail;
1265 }
1266
1267 if (parent) {
1268 if (!ret)
1269 return 0;
1270 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1271 key.type = BTRFS_EXTENT_REF_V0_KEY;
1272 btrfs_release_path(path);
1273 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1274 if (ret < 0) {
1275 err = ret;
1276 goto fail;
1277 }
1278 if (!ret)
1279 return 0;
1280 #endif
1281 goto fail;
1282 }
1283
1284 leaf = path->nodes[0];
1285 nritems = btrfs_header_nritems(leaf);
1286 while (1) {
1287 if (path->slots[0] >= nritems) {
1288 ret = btrfs_next_leaf(root, path);
1289 if (ret < 0)
1290 err = ret;
1291 if (ret)
1292 goto fail;
1293
1294 leaf = path->nodes[0];
1295 nritems = btrfs_header_nritems(leaf);
1296 recow = 1;
1297 }
1298
1299 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1300 if (key.objectid != bytenr ||
1301 key.type != BTRFS_EXTENT_DATA_REF_KEY)
1302 goto fail;
1303
1304 ref = btrfs_item_ptr(leaf, path->slots[0],
1305 struct btrfs_extent_data_ref);
1306
1307 if (match_extent_data_ref(leaf, ref, root_objectid,
1308 owner, offset)) {
1309 if (recow) {
1310 btrfs_release_path(path);
1311 goto again;
1312 }
1313 err = 0;
1314 break;
1315 }
1316 path->slots[0]++;
1317 }
1318 fail:
1319 return err;
1320 }
1321
1322 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1323 struct btrfs_fs_info *fs_info,
1324 struct btrfs_path *path,
1325 u64 bytenr, u64 parent,
1326 u64 root_objectid, u64 owner,
1327 u64 offset, int refs_to_add)
1328 {
1329 struct btrfs_root *root = fs_info->extent_root;
1330 struct btrfs_key key;
1331 struct extent_buffer *leaf;
1332 u32 size;
1333 u32 num_refs;
1334 int ret;
1335
1336 key.objectid = bytenr;
1337 if (parent) {
1338 key.type = BTRFS_SHARED_DATA_REF_KEY;
1339 key.offset = parent;
1340 size = sizeof(struct btrfs_shared_data_ref);
1341 } else {
1342 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1343 key.offset = hash_extent_data_ref(root_objectid,
1344 owner, offset);
1345 size = sizeof(struct btrfs_extent_data_ref);
1346 }
1347
1348 ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1349 if (ret && ret != -EEXIST)
1350 goto fail;
1351
1352 leaf = path->nodes[0];
1353 if (parent) {
1354 struct btrfs_shared_data_ref *ref;
1355 ref = btrfs_item_ptr(leaf, path->slots[0],
1356 struct btrfs_shared_data_ref);
1357 if (ret == 0) {
1358 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1359 } else {
1360 num_refs = btrfs_shared_data_ref_count(leaf, ref);
1361 num_refs += refs_to_add;
1362 btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1363 }
1364 } else {
1365 struct btrfs_extent_data_ref *ref;
1366 while (ret == -EEXIST) {
1367 ref = btrfs_item_ptr(leaf, path->slots[0],
1368 struct btrfs_extent_data_ref);
1369 if (match_extent_data_ref(leaf, ref, root_objectid,
1370 owner, offset))
1371 break;
1372 btrfs_release_path(path);
1373 key.offset++;
1374 ret = btrfs_insert_empty_item(trans, root, path, &key,
1375 size);
1376 if (ret && ret != -EEXIST)
1377 goto fail;
1378
1379 leaf = path->nodes[0];
1380 }
1381 ref = btrfs_item_ptr(leaf, path->slots[0],
1382 struct btrfs_extent_data_ref);
1383 if (ret == 0) {
1384 btrfs_set_extent_data_ref_root(leaf, ref,
1385 root_objectid);
1386 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1387 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1388 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1389 } else {
1390 num_refs = btrfs_extent_data_ref_count(leaf, ref);
1391 num_refs += refs_to_add;
1392 btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1393 }
1394 }
1395 btrfs_mark_buffer_dirty(leaf);
1396 ret = 0;
1397 fail:
1398 btrfs_release_path(path);
1399 return ret;
1400 }
1401
1402 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1403 struct btrfs_fs_info *fs_info,
1404 struct btrfs_path *path,
1405 int refs_to_drop, int *last_ref)
1406 {
1407 struct btrfs_key key;
1408 struct btrfs_extent_data_ref *ref1 = NULL;
1409 struct btrfs_shared_data_ref *ref2 = NULL;
1410 struct extent_buffer *leaf;
1411 u32 num_refs = 0;
1412 int ret = 0;
1413
1414 leaf = path->nodes[0];
1415 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1416
1417 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1418 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1419 struct btrfs_extent_data_ref);
1420 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1421 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1422 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1423 struct btrfs_shared_data_ref);
1424 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1425 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1426 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1427 struct btrfs_extent_ref_v0 *ref0;
1428 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1429 struct btrfs_extent_ref_v0);
1430 num_refs = btrfs_ref_count_v0(leaf, ref0);
1431 #endif
1432 } else {
1433 BUG();
1434 }
1435
1436 BUG_ON(num_refs < refs_to_drop);
1437 num_refs -= refs_to_drop;
1438
1439 if (num_refs == 0) {
1440 ret = btrfs_del_item(trans, fs_info->extent_root, path);
1441 *last_ref = 1;
1442 } else {
1443 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1444 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1445 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1446 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1447 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1448 else {
1449 struct btrfs_extent_ref_v0 *ref0;
1450 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1451 struct btrfs_extent_ref_v0);
1452 btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1453 }
1454 #endif
1455 btrfs_mark_buffer_dirty(leaf);
1456 }
1457 return ret;
1458 }
1459
1460 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1461 struct btrfs_extent_inline_ref *iref)
1462 {
1463 struct btrfs_key key;
1464 struct extent_buffer *leaf;
1465 struct btrfs_extent_data_ref *ref1;
1466 struct btrfs_shared_data_ref *ref2;
1467 u32 num_refs = 0;
1468 int type;
1469
1470 leaf = path->nodes[0];
1471 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1472 if (iref) {
1473 /*
1474 * If type is invalid, we should have bailed out earlier than
1475 * this call.
1476 */
1477 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1478 ASSERT(type != BTRFS_REF_TYPE_INVALID);
1479 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1480 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1481 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1482 } else {
1483 ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1484 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1485 }
1486 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1487 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1488 struct btrfs_extent_data_ref);
1489 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1490 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1491 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1492 struct btrfs_shared_data_ref);
1493 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1494 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1495 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1496 struct btrfs_extent_ref_v0 *ref0;
1497 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1498 struct btrfs_extent_ref_v0);
1499 num_refs = btrfs_ref_count_v0(leaf, ref0);
1500 #endif
1501 } else {
1502 WARN_ON(1);
1503 }
1504 return num_refs;
1505 }
1506
1507 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1508 struct btrfs_fs_info *fs_info,
1509 struct btrfs_path *path,
1510 u64 bytenr, u64 parent,
1511 u64 root_objectid)
1512 {
1513 struct btrfs_root *root = fs_info->extent_root;
1514 struct btrfs_key key;
1515 int ret;
1516
1517 key.objectid = bytenr;
1518 if (parent) {
1519 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1520 key.offset = parent;
1521 } else {
1522 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1523 key.offset = root_objectid;
1524 }
1525
1526 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1527 if (ret > 0)
1528 ret = -ENOENT;
1529 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1530 if (ret == -ENOENT && parent) {
1531 btrfs_release_path(path);
1532 key.type = BTRFS_EXTENT_REF_V0_KEY;
1533 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1534 if (ret > 0)
1535 ret = -ENOENT;
1536 }
1537 #endif
1538 return ret;
1539 }
1540
1541 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1542 struct btrfs_fs_info *fs_info,
1543 struct btrfs_path *path,
1544 u64 bytenr, u64 parent,
1545 u64 root_objectid)
1546 {
1547 struct btrfs_key key;
1548 int ret;
1549
1550 key.objectid = bytenr;
1551 if (parent) {
1552 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1553 key.offset = parent;
1554 } else {
1555 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1556 key.offset = root_objectid;
1557 }
1558
1559 ret = btrfs_insert_empty_item(trans, fs_info->extent_root,
1560 path, &key, 0);
1561 btrfs_release_path(path);
1562 return ret;
1563 }
1564
1565 static inline int extent_ref_type(u64 parent, u64 owner)
1566 {
1567 int type;
1568 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1569 if (parent > 0)
1570 type = BTRFS_SHARED_BLOCK_REF_KEY;
1571 else
1572 type = BTRFS_TREE_BLOCK_REF_KEY;
1573 } else {
1574 if (parent > 0)
1575 type = BTRFS_SHARED_DATA_REF_KEY;
1576 else
1577 type = BTRFS_EXTENT_DATA_REF_KEY;
1578 }
1579 return type;
1580 }
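/*
 * For reference, the mapping implemented above:
 *
 *   tree block (owner < BTRFS_FIRST_FREE_OBJECTID), parent == 0
 *                                          -> BTRFS_TREE_BLOCK_REF_KEY
 *   tree block,  parent != 0               -> BTRFS_SHARED_BLOCK_REF_KEY
 *   data extent, parent == 0               -> BTRFS_EXTENT_DATA_REF_KEY
 *   data extent, parent != 0               -> BTRFS_SHARED_DATA_REF_KEY
 */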
1581
1582 static int find_next_key(struct btrfs_path *path, int level,
1583 struct btrfs_key *key)
1584
1585 {
1586 for (; level < BTRFS_MAX_LEVEL; level++) {
1587 if (!path->nodes[level])
1588 break;
1589 if (path->slots[level] + 1 >=
1590 btrfs_header_nritems(path->nodes[level]))
1591 continue;
1592 if (level == 0)
1593 btrfs_item_key_to_cpu(path->nodes[level], key,
1594 path->slots[level] + 1);
1595 else
1596 btrfs_node_key_to_cpu(path->nodes[level], key,
1597 path->slots[level] + 1);
1598 return 0;
1599 }
1600 return 1;
1601 }
1602
1603 /*
1604 * look for inline back ref. if back ref is found, *ref_ret is set
1605 * to the address of inline back ref, and 0 is returned.
1606 *
1607 * if back ref isn't found, *ref_ret is set to the address where it
1608 * should be inserted, and -ENOENT is returned.
1609 *
1610 * if insert is true and there are too many inline back refs, the path
1611 * points to the extent item, and -EAGAIN is returned.
1612 *
1613 * NOTE: inline back refs are ordered in the same way that back ref
1614 * items in the tree are ordered.
1615 */
1616 static noinline_for_stack
1617 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1618 struct btrfs_fs_info *fs_info,
1619 struct btrfs_path *path,
1620 struct btrfs_extent_inline_ref **ref_ret,
1621 u64 bytenr, u64 num_bytes,
1622 u64 parent, u64 root_objectid,
1623 u64 owner, u64 offset, int insert)
1624 {
1625 struct btrfs_root *root = fs_info->extent_root;
1626 struct btrfs_key key;
1627 struct extent_buffer *leaf;
1628 struct btrfs_extent_item *ei;
1629 struct btrfs_extent_inline_ref *iref;
1630 u64 flags;
1631 u64 item_size;
1632 unsigned long ptr;
1633 unsigned long end;
1634 int extra_size;
1635 int type;
1636 int want;
1637 int ret;
1638 int err = 0;
1639 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
1640 int needed;
1641
1642 key.objectid = bytenr;
1643 key.type = BTRFS_EXTENT_ITEM_KEY;
1644 key.offset = num_bytes;
1645
1646 want = extent_ref_type(parent, owner);
1647 if (insert) {
1648 extra_size = btrfs_extent_inline_ref_size(want);
1649 path->keep_locks = 1;
1650 } else
1651 extra_size = -1;
1652
1653 /*
1654 * Owner is our parent level, so we can just add one to get the level
1655 * for the block we are interested in.
1656 */
1657 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1658 key.type = BTRFS_METADATA_ITEM_KEY;
1659 key.offset = owner;
1660 }
1661
1662 again:
1663 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1664 if (ret < 0) {
1665 err = ret;
1666 goto out;
1667 }
1668
1669 /*
1670 * We may be a newly converted file system which still has the old fat
1671 * extent entries for metadata, so try and see if we have one of those.
1672 */
1673 if (ret > 0 && skinny_metadata) {
1674 skinny_metadata = false;
1675 if (path->slots[0]) {
1676 path->slots[0]--;
1677 btrfs_item_key_to_cpu(path->nodes[0], &key,
1678 path->slots[0]);
1679 if (key.objectid == bytenr &&
1680 key.type == BTRFS_EXTENT_ITEM_KEY &&
1681 key.offset == num_bytes)
1682 ret = 0;
1683 }
1684 if (ret) {
1685 key.objectid = bytenr;
1686 key.type = BTRFS_EXTENT_ITEM_KEY;
1687 key.offset = num_bytes;
1688 btrfs_release_path(path);
1689 goto again;
1690 }
1691 }
1692
1693 if (ret && !insert) {
1694 err = -ENOENT;
1695 goto out;
1696 } else if (WARN_ON(ret)) {
1697 err = -EIO;
1698 goto out;
1699 }
1700
1701 leaf = path->nodes[0];
1702 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1703 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1704 if (item_size < sizeof(*ei)) {
1705 if (!insert) {
1706 err = -ENOENT;
1707 goto out;
1708 }
1709 ret = convert_extent_item_v0(trans, fs_info, path, owner,
1710 extra_size);
1711 if (ret < 0) {
1712 err = ret;
1713 goto out;
1714 }
1715 leaf = path->nodes[0];
1716 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1717 }
1718 #endif
1719 BUG_ON(item_size < sizeof(*ei));
1720
1721 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1722 flags = btrfs_extent_flags(leaf, ei);
1723
1724 ptr = (unsigned long)(ei + 1);
1725 end = (unsigned long)ei + item_size;
1726
1727 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1728 ptr += sizeof(struct btrfs_tree_block_info);
1729 BUG_ON(ptr > end);
1730 }
1731
1732 if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1733 needed = BTRFS_REF_TYPE_DATA;
1734 else
1735 needed = BTRFS_REF_TYPE_BLOCK;
1736
1737 err = -ENOENT;
1738 while (1) {
1739 if (ptr >= end) {
1740 WARN_ON(ptr > end);
1741 break;
1742 }
1743 iref = (struct btrfs_extent_inline_ref *)ptr;
1744 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1745 if (type == BTRFS_REF_TYPE_INVALID) {
1746 err = -EINVAL;
1747 goto out;
1748 }
1749
1750 if (want < type)
1751 break;
1752 if (want > type) {
1753 ptr += btrfs_extent_inline_ref_size(type);
1754 continue;
1755 }
1756
1757 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1758 struct btrfs_extent_data_ref *dref;
1759 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1760 if (match_extent_data_ref(leaf, dref, root_objectid,
1761 owner, offset)) {
1762 err = 0;
1763 break;
1764 }
1765 if (hash_extent_data_ref_item(leaf, dref) <
1766 hash_extent_data_ref(root_objectid, owner, offset))
1767 break;
1768 } else {
1769 u64 ref_offset;
1770 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1771 if (parent > 0) {
1772 if (parent == ref_offset) {
1773 err = 0;
1774 break;
1775 }
1776 if (ref_offset < parent)
1777 break;
1778 } else {
1779 if (root_objectid == ref_offset) {
1780 err = 0;
1781 break;
1782 }
1783 if (ref_offset < root_objectid)
1784 break;
1785 }
1786 }
1787 ptr += btrfs_extent_inline_ref_size(type);
1788 }
1789 if (err == -ENOENT && insert) {
1790 if (item_size + extra_size >=
1791 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1792 err = -EAGAIN;
1793 goto out;
1794 }
1795 /*
1796 * To add a new inline back ref, we have to make sure
1797 * there is no corresponding back ref item.
1798 * For simplicity, we just do not add a new inline back
1799 * ref if there is any kind of item for this block.
1800 */
1801 if (find_next_key(path, 0, &key) == 0 &&
1802 key.objectid == bytenr &&
1803 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1804 err = -EAGAIN;
1805 goto out;
1806 }
1807 }
1808 *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1809 out:
1810 if (insert) {
1811 path->keep_locks = 0;
1812 btrfs_unlock_up_safe(path, 1);
1813 }
1814 return err;
1815 }
1816
1817 /*
1818 * helper to add new inline back ref
1819 */
1820 static noinline_for_stack
1821 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1822 struct btrfs_path *path,
1823 struct btrfs_extent_inline_ref *iref,
1824 u64 parent, u64 root_objectid,
1825 u64 owner, u64 offset, int refs_to_add,
1826 struct btrfs_delayed_extent_op *extent_op)
1827 {
1828 struct extent_buffer *leaf;
1829 struct btrfs_extent_item *ei;
1830 unsigned long ptr;
1831 unsigned long end;
1832 unsigned long item_offset;
1833 u64 refs;
1834 int size;
1835 int type;
1836
1837 leaf = path->nodes[0];
1838 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1839 item_offset = (unsigned long)iref - (unsigned long)ei;
1840
1841 type = extent_ref_type(parent, owner);
1842 size = btrfs_extent_inline_ref_size(type);
1843
1844 btrfs_extend_item(fs_info, path, size);
1845
1846 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1847 refs = btrfs_extent_refs(leaf, ei);
1848 refs += refs_to_add;
1849 btrfs_set_extent_refs(leaf, ei, refs);
1850 if (extent_op)
1851 __run_delayed_extent_op(extent_op, leaf, ei);
1852
1853 ptr = (unsigned long)ei + item_offset;
1854 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1855 if (ptr < end - size)
1856 memmove_extent_buffer(leaf, ptr + size, ptr,
1857 end - size - ptr);
1858
1859 iref = (struct btrfs_extent_inline_ref *)ptr;
1860 btrfs_set_extent_inline_ref_type(leaf, iref, type);
1861 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1862 struct btrfs_extent_data_ref *dref;
1863 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1864 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1865 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1866 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1867 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1868 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1869 struct btrfs_shared_data_ref *sref;
1870 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1871 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1872 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1873 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1874 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1875 } else {
1876 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1877 }
1878 btrfs_mark_buffer_dirty(leaf);
1879 }
1880
1881 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1882 struct btrfs_fs_info *fs_info,
1883 struct btrfs_path *path,
1884 struct btrfs_extent_inline_ref **ref_ret,
1885 u64 bytenr, u64 num_bytes, u64 parent,
1886 u64 root_objectid, u64 owner, u64 offset)
1887 {
1888 int ret;
1889
1890 ret = lookup_inline_extent_backref(trans, fs_info, path, ref_ret,
1891 bytenr, num_bytes, parent,
1892 root_objectid, owner, offset, 0);
1893 if (ret != -ENOENT)
1894 return ret;
1895
1896 btrfs_release_path(path);
1897 *ref_ret = NULL;
1898
1899 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1900 ret = lookup_tree_block_ref(trans, fs_info, path, bytenr,
1901 parent, root_objectid);
1902 } else {
1903 ret = lookup_extent_data_ref(trans, fs_info, path, bytenr,
1904 parent, root_objectid, owner,
1905 offset);
1906 }
1907 return ret;
1908 }
1909
1910 /*
1911 * helper to update/remove inline back ref
1912 */
1913 static noinline_for_stack
1914 void update_inline_extent_backref(struct btrfs_fs_info *fs_info,
1915 struct btrfs_path *path,
1916 struct btrfs_extent_inline_ref *iref,
1917 int refs_to_mod,
1918 struct btrfs_delayed_extent_op *extent_op,
1919 int *last_ref)
1920 {
1921 struct extent_buffer *leaf;
1922 struct btrfs_extent_item *ei;
1923 struct btrfs_extent_data_ref *dref = NULL;
1924 struct btrfs_shared_data_ref *sref = NULL;
1925 unsigned long ptr;
1926 unsigned long end;
1927 u32 item_size;
1928 int size;
1929 int type;
1930 u64 refs;
1931
1932 leaf = path->nodes[0];
1933 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1934 refs = btrfs_extent_refs(leaf, ei);
1935 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1936 refs += refs_to_mod;
1937 btrfs_set_extent_refs(leaf, ei, refs);
1938 if (extent_op)
1939 __run_delayed_extent_op(extent_op, leaf, ei);
1940
1941 /*
1942 * If type is invalid, we should have bailed out after
1943 * lookup_inline_extent_backref().
1944 */
1945 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1946 ASSERT(type != BTRFS_REF_TYPE_INVALID);
1947
1948 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1949 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1950 refs = btrfs_extent_data_ref_count(leaf, dref);
1951 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1952 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1953 refs = btrfs_shared_data_ref_count(leaf, sref);
1954 } else {
1955 refs = 1;
1956 BUG_ON(refs_to_mod != -1);
1957 }
1958
1959 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1960 refs += refs_to_mod;
1961
1962 if (refs > 0) {
1963 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1964 btrfs_set_extent_data_ref_count(leaf, dref, refs);
1965 else
1966 btrfs_set_shared_data_ref_count(leaf, sref, refs);
1967 } else {
1968 *last_ref = 1;
1969 size = btrfs_extent_inline_ref_size(type);
1970 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1971 ptr = (unsigned long)iref;
1972 end = (unsigned long)ei + item_size;
1973 if (ptr + size < end)
1974 memmove_extent_buffer(leaf, ptr, ptr + size,
1975 end - ptr - size);
1976 item_size -= size;
1977 btrfs_truncate_item(fs_info, path, item_size, 1);
1978 }
1979 btrfs_mark_buffer_dirty(leaf);
1980 }
1981
1982 static noinline_for_stack
1983 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1984 struct btrfs_fs_info *fs_info,
1985 struct btrfs_path *path,
1986 u64 bytenr, u64 num_bytes, u64 parent,
1987 u64 root_objectid, u64 owner,
1988 u64 offset, int refs_to_add,
1989 struct btrfs_delayed_extent_op *extent_op)
1990 {
1991 struct btrfs_extent_inline_ref *iref;
1992 int ret;
1993
1994 ret = lookup_inline_extent_backref(trans, fs_info, path, &iref,
1995 bytenr, num_bytes, parent,
1996 root_objectid, owner, offset, 1);
1997 if (ret == 0) {
1998 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1999 update_inline_extent_backref(fs_info, path, iref,
2000 refs_to_add, extent_op, NULL);
2001 } else if (ret == -ENOENT) {
2002 setup_inline_extent_backref(fs_info, path, iref, parent,
2003 root_objectid, owner, offset,
2004 refs_to_add, extent_op);
2005 ret = 0;
2006 }
2007 return ret;
2008 }
2009
2010 static int insert_extent_backref(struct btrfs_trans_handle *trans,
2011 struct btrfs_fs_info *fs_info,
2012 struct btrfs_path *path,
2013 u64 bytenr, u64 parent, u64 root_objectid,
2014 u64 owner, u64 offset, int refs_to_add)
2015 {
2016 int ret;
2017 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2018 BUG_ON(refs_to_add != 1);
2019 ret = insert_tree_block_ref(trans, fs_info, path, bytenr,
2020 parent, root_objectid);
2021 } else {
2022 ret = insert_extent_data_ref(trans, fs_info, path, bytenr,
2023 parent, root_objectid,
2024 owner, offset, refs_to_add);
2025 }
2026 return ret;
2027 }
2028
2029 static int remove_extent_backref(struct btrfs_trans_handle *trans,
2030 struct btrfs_fs_info *fs_info,
2031 struct btrfs_path *path,
2032 struct btrfs_extent_inline_ref *iref,
2033 int refs_to_drop, int is_data, int *last_ref)
2034 {
2035 int ret = 0;
2036
2037 BUG_ON(!is_data && refs_to_drop != 1);
2038 if (iref) {
2039 update_inline_extent_backref(fs_info, path, iref,
2040 -refs_to_drop, NULL, last_ref);
2041 } else if (is_data) {
2042 ret = remove_extent_data_ref(trans, fs_info, path, refs_to_drop,
2043 last_ref);
2044 } else {
2045 *last_ref = 1;
2046 ret = btrfs_del_item(trans, fs_info->extent_root, path);
2047 }
2048 return ret;
2049 }
2050
2051 #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
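/*
 * Issue a discard for [start, start + len) on @bdev, skipping over any
 * superblock mirrors that fall inside the range so they are never discarded.
 * The range is trimmed to 512 byte boundaries and the number of bytes
 * actually discarded is returned in *discarded_bytes.
 */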
2052 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
2053 u64 *discarded_bytes)
2054 {
2055 int j, ret = 0;
2056 u64 bytes_left, end;
2057 u64 aligned_start = ALIGN(start, 1 << 9);
2058
2059 if (WARN_ON(start != aligned_start)) {
2060 len -= aligned_start - start;
2061 len = round_down(len, 1 << 9);
2062 start = aligned_start;
2063 }
2064
2065 *discarded_bytes = 0;
2066
2067 if (!len)
2068 return 0;
2069
2070 end = start + len;
2071 bytes_left = len;
2072
2073 /* Skip any superblocks on this device. */
2074 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
2075 u64 sb_start = btrfs_sb_offset(j);
2076 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
2077 u64 size = sb_start - start;
2078
2079 if (!in_range(sb_start, start, bytes_left) &&
2080 !in_range(sb_end, start, bytes_left) &&
2081 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
2082 continue;
2083
2084 /*
2085 * Superblock spans beginning of range. Adjust start and
2086 * try again.
2087 */
2088 if (sb_start <= start) {
2089 start += sb_end - start;
2090 if (start > end) {
2091 bytes_left = 0;
2092 break;
2093 }
2094 bytes_left = end - start;
2095 continue;
2096 }
2097
2098 if (size) {
2099 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
2100 GFP_NOFS, 0);
2101 if (!ret)
2102 *discarded_bytes += size;
2103 else if (ret != -EOPNOTSUPP)
2104 return ret;
2105 }
2106
2107 start = sb_end;
2108 if (start > end) {
2109 bytes_left = 0;
2110 break;
2111 }
2112 bytes_left = end - start;
2113 }
2114
2115 if (bytes_left) {
2116 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
2117 GFP_NOFS, 0);
2118 if (!ret)
2119 *discarded_bytes += bytes_left;
2120 }
2121 return ret;
2122 }
2123
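/*
 * Discard the physical sectors backing a logical extent.  The logical range
 * is mapped to its physical stripes with btrfs_map_block() and a discard is
 * issued per stripe; devices that don't support discard are skipped.  If
 * @actual_bytes is not NULL it is set to the number of bytes the devices
 * actually discarded.
 */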
2124 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
2125 u64 num_bytes, u64 *actual_bytes)
2126 {
2127 int ret;
2128 u64 discarded_bytes = 0;
2129 struct btrfs_bio *bbio = NULL;
2130
2131
2132 /*
2133 * Avoid races with device replace and make sure our bbio has devices
2134 * associated with its stripes that don't go away while we are discarding.
2135 */
2136 btrfs_bio_counter_inc_blocked(fs_info);
2137 /* Tell the block device(s) that the sectors can be discarded */
2138 ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
2139 &bbio, 0);
2140 /* Error condition is -ENOMEM */
2141 if (!ret) {
2142 struct btrfs_bio_stripe *stripe = bbio->stripes;
2143 int i;
2144
2145
2146 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
2147 u64 bytes;
2148 if (!stripe->dev->can_discard)
2149 continue;
2150
2151 ret = btrfs_issue_discard(stripe->dev->bdev,
2152 stripe->physical,
2153 stripe->length,
2154 &bytes);
2155 if (!ret)
2156 discarded_bytes += bytes;
2157 else if (ret != -EOPNOTSUPP)
2158 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
2159
2160 /*
2161 * Just in case we get back EOPNOTSUPP for some reason,
2162 * ignore the return value so we don't screw up
2163 * people calling discard_extent.
2164 */
2165 ret = 0;
2166 }
2167 btrfs_put_bbio(bbio);
2168 }
2169 btrfs_bio_counter_dec(fs_info);
2170
2171 if (actual_bytes)
2172 *actual_bytes = discarded_bytes;
2173
2174
2175 if (ret == -EOPNOTSUPP)
2176 ret = 0;
2177 return ret;
2178 }
2179
2180 /* Can return -ENOMEM */
2181 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2182 struct btrfs_root *root,
2183 u64 bytenr, u64 num_bytes, u64 parent,
2184 u64 root_objectid, u64 owner, u64 offset)
2185 {
2186 struct btrfs_fs_info *fs_info = root->fs_info;
2187 int old_ref_mod, new_ref_mod;
2188 int ret;
2189
2190 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2191 root_objectid == BTRFS_TREE_LOG_OBJECTID);
2192
2193 btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
2194 owner, offset, BTRFS_ADD_DELAYED_REF);
2195
2196 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2197 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2198 num_bytes, parent,
2199 root_objectid, (int)owner,
2200 BTRFS_ADD_DELAYED_REF, NULL,
2201 &old_ref_mod, &new_ref_mod);
2202 } else {
2203 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
2204 num_bytes, parent,
2205 root_objectid, owner, offset,
2206 0, BTRFS_ADD_DELAYED_REF,
2207 &old_ref_mod, &new_ref_mod);
2208 }
2209
2210 if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
2211 add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid);
2212
2213 return ret;
2214 }
2215
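/*
 * Add @refs_to_add references to an existing extent item while running a
 * delayed ref.  We first try to add (or bump) an inline backref; if the
 * extent item has no room left (-EAGAIN) we bump the reference count on the
 * extent item and insert a separate keyed backref item instead.
 */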
2216 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2217 struct btrfs_fs_info *fs_info,
2218 struct btrfs_delayed_ref_node *node,
2219 u64 parent, u64 root_objectid,
2220 u64 owner, u64 offset, int refs_to_add,
2221 struct btrfs_delayed_extent_op *extent_op)
2222 {
2223 struct btrfs_path *path;
2224 struct extent_buffer *leaf;
2225 struct btrfs_extent_item *item;
2226 struct btrfs_key key;
2227 u64 bytenr = node->bytenr;
2228 u64 num_bytes = node->num_bytes;
2229 u64 refs;
2230 int ret;
2231
2232 path = btrfs_alloc_path();
2233 if (!path)
2234 return -ENOMEM;
2235
2236 path->reada = READA_FORWARD;
2237 path->leave_spinning = 1;
2238 /* this will set up the path even if it fails to insert the back ref */
2239 ret = insert_inline_extent_backref(trans, fs_info, path, bytenr,
2240 num_bytes, parent, root_objectid,
2241 owner, offset,
2242 refs_to_add, extent_op);
2243 if ((ret < 0 && ret != -EAGAIN) || !ret)
2244 goto out;
2245
2246 /*
2247 * Ok we had -EAGAIN which means we didn't have space to insert an
2248 * inline extent ref, so just update the reference count and add a
2249 * normal backref.
2250 */
2251 leaf = path->nodes[0];
2252 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2253 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2254 refs = btrfs_extent_refs(leaf, item);
2255 btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2256 if (extent_op)
2257 __run_delayed_extent_op(extent_op, leaf, item);
2258
2259 btrfs_mark_buffer_dirty(leaf);
2260 btrfs_release_path(path);
2261
2262 path->reada = READA_FORWARD;
2263 path->leave_spinning = 1;
2264 /* now insert the actual backref */
2265 ret = insert_extent_backref(trans, fs_info, path, bytenr, parent,
2266 root_objectid, owner, offset, refs_to_add);
2267 if (ret)
2268 btrfs_abort_transaction(trans, ret);
2269 out:
2270 btrfs_free_path(path);
2271 return ret;
2272 }
2273
2274 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2275 struct btrfs_fs_info *fs_info,
2276 struct btrfs_delayed_ref_node *node,
2277 struct btrfs_delayed_extent_op *extent_op,
2278 int insert_reserved)
2279 {
2280 int ret = 0;
2281 struct btrfs_delayed_data_ref *ref;
2282 struct btrfs_key ins;
2283 u64 parent = 0;
2284 u64 ref_root = 0;
2285 u64 flags = 0;
2286
2287 ins.objectid = node->bytenr;
2288 ins.offset = node->num_bytes;
2289 ins.type = BTRFS_EXTENT_ITEM_KEY;
2290
2291 ref = btrfs_delayed_node_to_data_ref(node);
2292 trace_run_delayed_data_ref(fs_info, node, ref, node->action);
2293
2294 if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2295 parent = ref->parent;
2296 ref_root = ref->root;
2297
2298 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2299 if (extent_op)
2300 flags |= extent_op->flags_to_set;
2301 ret = alloc_reserved_file_extent(trans, fs_info,
2302 parent, ref_root, flags,
2303 ref->objectid, ref->offset,
2304 &ins, node->ref_mod);
2305 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2306 ret = __btrfs_inc_extent_ref(trans, fs_info, node, parent,
2307 ref_root, ref->objectid,
2308 ref->offset, node->ref_mod,
2309 extent_op);
2310 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2311 ret = __btrfs_free_extent(trans, fs_info, node, parent,
2312 ref_root, ref->objectid,
2313 ref->offset, node->ref_mod,
2314 extent_op);
2315 } else {
2316 BUG();
2317 }
2318 return ret;
2319 }
2320
2321 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2322 struct extent_buffer *leaf,
2323 struct btrfs_extent_item *ei)
2324 {
2325 u64 flags = btrfs_extent_flags(leaf, ei);
2326 if (extent_op->update_flags) {
2327 flags |= extent_op->flags_to_set;
2328 btrfs_set_extent_flags(leaf, ei, flags);
2329 }
2330
2331 if (extent_op->update_key) {
2332 struct btrfs_tree_block_info *bi;
2333 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2334 bi = (struct btrfs_tree_block_info *)(ei + 1);
2335 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2336 }
2337 }
2338
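/*
 * Apply a delayed extent op (flags and/or key update) to the extent item of
 * the given ref head.  With skinny metadata the lookup first tries the
 * METADATA_ITEM key and falls back to the full EXTENT_ITEM key if that is
 * what is actually on disk.
 */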
2339 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2340 struct btrfs_fs_info *fs_info,
2341 struct btrfs_delayed_ref_head *head,
2342 struct btrfs_delayed_extent_op *extent_op)
2343 {
2344 struct btrfs_key key;
2345 struct btrfs_path *path;
2346 struct btrfs_extent_item *ei;
2347 struct extent_buffer *leaf;
2348 u32 item_size;
2349 int ret;
2350 int err = 0;
2351 int metadata = !extent_op->is_data;
2352
2353 if (trans->aborted)
2354 return 0;
2355
2356 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2357 metadata = 0;
2358
2359 path = btrfs_alloc_path();
2360 if (!path)
2361 return -ENOMEM;
2362
2363 key.objectid = head->bytenr;
2364
2365 if (metadata) {
2366 key.type = BTRFS_METADATA_ITEM_KEY;
2367 key.offset = extent_op->level;
2368 } else {
2369 key.type = BTRFS_EXTENT_ITEM_KEY;
2370 key.offset = head->num_bytes;
2371 }
2372
2373 again:
2374 path->reada = READA_FORWARD;
2375 path->leave_spinning = 1;
2376 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
2377 if (ret < 0) {
2378 err = ret;
2379 goto out;
2380 }
2381 if (ret > 0) {
2382 if (metadata) {
2383 if (path->slots[0] > 0) {
2384 path->slots[0]--;
2385 btrfs_item_key_to_cpu(path->nodes[0], &key,
2386 path->slots[0]);
2387 if (key.objectid == head->bytenr &&
2388 key.type == BTRFS_EXTENT_ITEM_KEY &&
2389 key.offset == head->num_bytes)
2390 ret = 0;
2391 }
2392 if (ret > 0) {
2393 btrfs_release_path(path);
2394 metadata = 0;
2395
2396 key.objectid = head->bytenr;
2397 key.offset = head->num_bytes;
2398 key.type = BTRFS_EXTENT_ITEM_KEY;
2399 goto again;
2400 }
2401 } else {
2402 err = -EIO;
2403 goto out;
2404 }
2405 }
2406
2407 leaf = path->nodes[0];
2408 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2409 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2410 if (item_size < sizeof(*ei)) {
2411 ret = convert_extent_item_v0(trans, fs_info, path, (u64)-1, 0);
2412 if (ret < 0) {
2413 err = ret;
2414 goto out;
2415 }
2416 leaf = path->nodes[0];
2417 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2418 }
2419 #endif
2420 BUG_ON(item_size < sizeof(*ei));
2421 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2422 __run_delayed_extent_op(extent_op, leaf, ei);
2423
2424 btrfs_mark_buffer_dirty(leaf);
2425 out:
2426 btrfs_free_path(path);
2427 return err;
2428 }
2429
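/*
 * Run a single delayed tree block ref.  A tree block only ever carries one
 * reference per root/parent, so a ref_mod other than 1 is treated as
 * corruption.
 */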
2430 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2431 struct btrfs_fs_info *fs_info,
2432 struct btrfs_delayed_ref_node *node,
2433 struct btrfs_delayed_extent_op *extent_op,
2434 int insert_reserved)
2435 {
2436 int ret = 0;
2437 struct btrfs_delayed_tree_ref *ref;
2438 struct btrfs_key ins;
2439 u64 parent = 0;
2440 u64 ref_root = 0;
2441 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
2442
2443 ref = btrfs_delayed_node_to_tree_ref(node);
2444 trace_run_delayed_tree_ref(fs_info, node, ref, node->action);
2445
2446 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2447 parent = ref->parent;
2448 ref_root = ref->root;
2449
2450 ins.objectid = node->bytenr;
2451 if (skinny_metadata) {
2452 ins.offset = ref->level;
2453 ins.type = BTRFS_METADATA_ITEM_KEY;
2454 } else {
2455 ins.offset = node->num_bytes;
2456 ins.type = BTRFS_EXTENT_ITEM_KEY;
2457 }
2458
2459 if (node->ref_mod != 1) {
2460 btrfs_err(fs_info,
2461 "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2462 node->bytenr, node->ref_mod, node->action, ref_root,
2463 parent);
2464 return -EIO;
2465 }
2466 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2467 BUG_ON(!extent_op || !extent_op->update_flags);
2468 ret = alloc_reserved_tree_block(trans, fs_info,
2469 parent, ref_root,
2470 extent_op->flags_to_set,
2471 &extent_op->key,
2472 ref->level, &ins);
2473 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2474 ret = __btrfs_inc_extent_ref(trans, fs_info, node,
2475 parent, ref_root,
2476 ref->level, 0, 1,
2477 extent_op);
2478 } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2479 ret = __btrfs_free_extent(trans, fs_info, node,
2480 parent, ref_root,
2481 ref->level, 0, 1, extent_op);
2482 } else {
2483 BUG();
2484 }
2485 return ret;
2486 }
2487
2488 /* helper function to actually process a single delayed ref entry */
2489 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2490 struct btrfs_fs_info *fs_info,
2491 struct btrfs_delayed_ref_node *node,
2492 struct btrfs_delayed_extent_op *extent_op,
2493 int insert_reserved)
2494 {
2495 int ret = 0;
2496
2497 if (trans->aborted) {
2498 if (insert_reserved)
2499 btrfs_pin_extent(fs_info, node->bytenr,
2500 node->num_bytes, 1);
2501 return 0;
2502 }
2503
2504 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2505 node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2506 ret = run_delayed_tree_ref(trans, fs_info, node, extent_op,
2507 insert_reserved);
2508 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2509 node->type == BTRFS_SHARED_DATA_REF_KEY)
2510 ret = run_delayed_data_ref(trans, fs_info, node, extent_op,
2511 insert_reserved);
2512 else
2513 BUG();
2514 if (ret && insert_reserved)
2515 btrfs_pin_extent(trans->fs_info, node->bytenr,
2516 node->num_bytes, 1);
2517 return ret;
2518 }
2519
2520 static inline struct btrfs_delayed_ref_node *
2521 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2522 {
2523 struct btrfs_delayed_ref_node *ref;
2524
2525 if (RB_EMPTY_ROOT(&head->ref_tree))
2526 return NULL;
2527
2528 /*
2529 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2530 * This is to prevent a ref count from going down to zero, which deletes
2531 * the extent item from the extent tree, when there still are references
2532 * to add, which would fail because they would not find the extent item.
2533 */
2534 if (!list_empty(&head->ref_add_list))
2535 return list_first_entry(&head->ref_add_list,
2536 struct btrfs_delayed_ref_node, add_list);
2537
2538 ref = rb_entry(rb_first(&head->ref_tree),
2539 struct btrfs_delayed_ref_node, ref_node);
2540 ASSERT(list_empty(&ref->add_list));
2541 return ref;
2542 }
2543
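/*
 * Give a ref head back to the delayed ref tree after a failed attempt to
 * process it, so a later run can pick it up again.
 */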
2544 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2545 struct btrfs_delayed_ref_head *head)
2546 {
2547 spin_lock(&delayed_refs->lock);
2548 head->processing = 0;
2549 delayed_refs->num_heads_ready++;
2550 spin_unlock(&delayed_refs->lock);
2551 btrfs_delayed_ref_unlock(head);
2552 }
2553
2554 static int cleanup_extent_op(struct btrfs_trans_handle *trans,
2555 struct btrfs_fs_info *fs_info,
2556 struct btrfs_delayed_ref_head *head)
2557 {
2558 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
2559 int ret;
2560
2561 if (!extent_op)
2562 return 0;
2563 head->extent_op = NULL;
2564 if (head->must_insert_reserved) {
2565 btrfs_free_delayed_extent_op(extent_op);
2566 return 0;
2567 }
2568 spin_unlock(&head->lock);
2569 ret = run_delayed_extent_op(trans, fs_info, head, extent_op);
2570 btrfs_free_delayed_extent_op(extent_op);
2571 return ret ? ret : 1;
2572 }
2573
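/*
 * Final accounting for a ref head once all of its refs have been run: remove
 * it from the delayed ref tree and update the pinned byte and pending csum
 * counters.  Returns 1 if there is still work left on this head and the
 * caller must loop, 0 on success and a negative errno on error.
 */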
2574 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
2575 struct btrfs_fs_info *fs_info,
2576 struct btrfs_delayed_ref_head *head)
2577 {
2578 struct btrfs_delayed_ref_root *delayed_refs;
2579 int ret;
2580
2581 delayed_refs = &trans->transaction->delayed_refs;
2582
2583 ret = cleanup_extent_op(trans, fs_info, head);
2584 if (ret < 0) {
2585 unselect_delayed_ref_head(delayed_refs, head);
2586 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2587 return ret;
2588 } else if (ret) {
2589 return ret;
2590 }
2591
2592 /*
2593 * Need to drop our head ref lock and re-acquire the delayed ref lock
2594 * and then re-check to make sure nobody got added.
2595 */
2596 spin_unlock(&head->lock);
2597 spin_lock(&delayed_refs->lock);
2598 spin_lock(&head->lock);
2599 if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
2600 spin_unlock(&head->lock);
2601 spin_unlock(&delayed_refs->lock);
2602 return 1;
2603 }
2604 delayed_refs->num_heads--;
2605 rb_erase(&head->href_node, &delayed_refs->href_root);
2606 RB_CLEAR_NODE(&head->href_node);
2607 spin_unlock(&delayed_refs->lock);
2608 spin_unlock(&head->lock);
2609 atomic_dec(&delayed_refs->num_entries);
2610
2611 trace_run_delayed_ref_head(fs_info, head, 0);
2612
2613 if (head->total_ref_mod < 0) {
2614 struct btrfs_space_info *space_info;
2615 u64 flags;
2616
2617 if (head->is_data)
2618 flags = BTRFS_BLOCK_GROUP_DATA;
2619 else if (head->is_system)
2620 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2621 else
2622 flags = BTRFS_BLOCK_GROUP_METADATA;
2623 space_info = __find_space_info(fs_info, flags);
2624 ASSERT(space_info);
2625 percpu_counter_add(&space_info->total_bytes_pinned,
2626 -head->num_bytes);
2627
2628 if (head->is_data) {
2629 spin_lock(&delayed_refs->lock);
2630 delayed_refs->pending_csums -= head->num_bytes;
2631 spin_unlock(&delayed_refs->lock);
2632 }
2633 }
2634
2635 if (head->must_insert_reserved) {
2636 btrfs_pin_extent(fs_info, head->bytenr,
2637 head->num_bytes, 1);
2638 if (head->is_data) {
2639 ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2640 head->num_bytes);
2641 }
2642 }
2643
2644 /* Also free its reserved qgroup space */
2645 btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
2646 head->qgroup_reserved);
2647 btrfs_delayed_ref_unlock(head);
2648 btrfs_put_delayed_ref_head(head);
2649 return 0;
2650 }
2651
2652 /*
2653 * Returns 0 on success or if called with an already aborted transaction.
2654 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2655 */
2656 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2657 struct btrfs_fs_info *fs_info,
2658 unsigned long nr)
2659 {
2660 struct btrfs_delayed_ref_root *delayed_refs;
2661 struct btrfs_delayed_ref_node *ref;
2662 struct btrfs_delayed_ref_head *locked_ref = NULL;
2663 struct btrfs_delayed_extent_op *extent_op;
2664 ktime_t start = ktime_get();
2665 int ret;
2666 unsigned long count = 0;
2667 unsigned long actual_count = 0;
2668 int must_insert_reserved = 0;
2669
2670 delayed_refs = &trans->transaction->delayed_refs;
2671 while (1) {
2672 if (!locked_ref) {
2673 if (count >= nr)
2674 break;
2675
2676 spin_lock(&delayed_refs->lock);
2677 locked_ref = btrfs_select_ref_head(trans);
2678 if (!locked_ref) {
2679 spin_unlock(&delayed_refs->lock);
2680 break;
2681 }
2682
2683 /* grab the lock that says we are going to process
2684 * all the refs for this head */
2685 ret = btrfs_delayed_ref_lock(trans, locked_ref);
2686 spin_unlock(&delayed_refs->lock);
2687 /*
2688 * we may have dropped the spin lock to get the head
2689 * mutex lock, and that might have given someone else
2690 * time to free the head. If that's true, it has been
2691 * removed from our list and we can move on.
2692 */
2693 if (ret == -EAGAIN) {
2694 locked_ref = NULL;
2695 count++;
2696 continue;
2697 }
2698 }
2699
2700 /*
2701 * We need to try and merge add/drops of the same ref since we
2702 * can run into issues with relocate dropping the implicit ref
2703 * and then it being added back again before the drop can
2704 * finish. If we merged anything we need to re-loop so we can
2705 * get a good ref.
2706 * Or we can get node references of the same type that weren't
2707 * merged when created due to bumps in the tree mod seq, and
2708 * we need to merge them to prevent adding an inline extent
2709 * backref before dropping it (triggering a BUG_ON at
2710 * insert_inline_extent_backref()).
2711 */
2712 spin_lock(&locked_ref->lock);
2713 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2714 locked_ref);
2715
2716 /*
2717 * locked_ref is the head node, so we have to go one
2718 * node back for any delayed ref updates
2719 */
2720 ref = select_delayed_ref(locked_ref);
2721
2722 if (ref && ref->seq &&
2723 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2724 spin_unlock(&locked_ref->lock);
2725 unselect_delayed_ref_head(delayed_refs, locked_ref);
2726 locked_ref = NULL;
2727 cond_resched();
2728 count++;
2729 continue;
2730 }
2731
2732 /*
2733 * We're done processing refs in this ref_head, clean everything
2734 * up and move on to the next ref_head.
2735 */
2736 if (!ref) {
2737 ret = cleanup_ref_head(trans, fs_info, locked_ref);
2738 if (ret > 0) {
2739 /* We dropped our lock, we need to loop. */
2740 ret = 0;
2741 continue;
2742 } else if (ret) {
2743 return ret;
2744 }
2745 locked_ref = NULL;
2746 count++;
2747 continue;
2748 }
2749
2750 actual_count++;
2751 ref->in_tree = 0;
2752 rb_erase(&ref->ref_node, &locked_ref->ref_tree);
2753 RB_CLEAR_NODE(&ref->ref_node);
2754 if (!list_empty(&ref->add_list))
2755 list_del(&ref->add_list);
2756 /*
2757 * When we play the delayed ref, also correct the ref_mod on
2758 * head
2759 */
2760 switch (ref->action) {
2761 case BTRFS_ADD_DELAYED_REF:
2762 case BTRFS_ADD_DELAYED_EXTENT:
2763 locked_ref->ref_mod -= ref->ref_mod;
2764 break;
2765 case BTRFS_DROP_DELAYED_REF:
2766 locked_ref->ref_mod += ref->ref_mod;
2767 break;
2768 default:
2769 WARN_ON(1);
2770 }
2771 atomic_dec(&delayed_refs->num_entries);
2772
2773 /*
2774 * Record the must_insert_reserved flag before we drop the spin
2775 * lock.
2776 */
2777 must_insert_reserved = locked_ref->must_insert_reserved;
2778 locked_ref->must_insert_reserved = 0;
2779
2780 extent_op = locked_ref->extent_op;
2781 locked_ref->extent_op = NULL;
2782 spin_unlock(&locked_ref->lock);
2783
2784 ret = run_one_delayed_ref(trans, fs_info, ref, extent_op,
2785 must_insert_reserved);
2786
2787 btrfs_free_delayed_extent_op(extent_op);
2788 if (ret) {
2789 unselect_delayed_ref_head(delayed_refs, locked_ref);
2790 btrfs_put_delayed_ref(ref);
2791 btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2792 ret);
2793 return ret;
2794 }
2795
2796 btrfs_put_delayed_ref(ref);
2797 count++;
2798 cond_resched();
2799 }
2800
2801 /*
2802 * We don't want to include ref heads since we can have empty ref heads
2803 * and those would drastically skew our runtime down, since for them we
2804 * only do accounting and no actual extent tree updates.
2805 */
2806 if (actual_count > 0) {
2807 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2808 u64 avg;
2809
2810 /*
2811 * We weigh the current average higher than our current runtime
2812 * to avoid large swings in the average.
2813 */
2814 spin_lock(&delayed_refs->lock);
2815 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2816 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
2817 spin_unlock(&delayed_refs->lock);
2818 }
2819 return 0;
2820 }
2821
2822 #ifdef SCRAMBLE_DELAYED_REFS
2823 /*
2824 * Normally delayed refs get processed in ascending bytenr order. This
2825 * correlates in most cases to the order added. To expose dependencies on this
2826 * order, we start to process the tree in the middle instead of the beginning
2827 */
2828 static u64 find_middle(struct rb_root *root)
2829 {
2830 struct rb_node *n = root->rb_node;
2831 struct btrfs_delayed_ref_node *entry;
2832 int alt = 1;
2833 u64 middle;
2834 u64 first = 0, last = 0;
2835
2836 n = rb_first(root);
2837 if (n) {
2838 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2839 first = entry->bytenr;
2840 }
2841 n = rb_last(root);
2842 if (n) {
2843 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2844 last = entry->bytenr;
2845 }
2846 n = root->rb_node;
2847
2848 while (n) {
2849 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2850 WARN_ON(!entry->in_tree);
2851
2852 middle = entry->bytenr;
2853
2854 if (alt)
2855 n = n->rb_left;
2856 else
2857 n = n->rb_right;
2858
2859 alt = 1 - alt;
2860 }
2861 return middle;
2862 }
2863 #endif
2864
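/*
 * Estimate how many extent tree leaves are needed to hold the extent items
 * (plus the tree block info for non-skinny metadata) for @heads delayed ref
 * heads.
 */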
2865 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2866 {
2867 u64 num_bytes;
2868
2869 num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2870 sizeof(struct btrfs_extent_inline_ref));
2871 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2872 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2873
2874 /*
2875 * We don't ever fill up leaves all the way, so the caller doubles this
2876 * estimate to get closer to what we're really going to want to use.
2877 */
2878 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2879 }
2880
2881 /*
2882 * Takes the number of bytes to be checksummed and figures out how many
2883 * leaves it would require to store the csums for that many bytes.
2884 */
2885 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2886 {
2887 u64 csum_size;
2888 u64 num_csums_per_leaf;
2889 u64 num_csums;
2890
2891 csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2892 num_csums_per_leaf = div64_u64(csum_size,
2893 (u64)btrfs_super_csum_size(fs_info->super_copy));
2894 num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2895 num_csums += num_csums_per_leaf - 1;
2896 num_csums = div64_u64(num_csums, num_csums_per_leaf);
2897 return num_csums;
2898 }
2899
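/*
 * Returns 1 if the global block reserve does not look large enough to cover
 * the metadata updates needed to run the delayed refs that are currently
 * ready, including pending csum deletions and dirty block groups, and 0
 * otherwise.
 */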
2900 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2901 struct btrfs_fs_info *fs_info)
2902 {
2903 struct btrfs_block_rsv *global_rsv;
2904 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2905 u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2906 u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
2907 u64 num_bytes, num_dirty_bgs_bytes;
2908 int ret = 0;
2909
2910 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
2911 num_heads = heads_to_leaves(fs_info, num_heads);
2912 if (num_heads > 1)
2913 num_bytes += (num_heads - 1) * fs_info->nodesize;
2914 num_bytes <<= 1;
2915 num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
2916 fs_info->nodesize;
2917 num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
2918 num_dirty_bgs);
2919 global_rsv = &fs_info->global_block_rsv;
2920
2921 /*
2922 * If we can't allocate any more chunks, let's make sure we have _lots_ of
2923 * wiggle room since running delayed refs can create more delayed refs.
2924 */
2925 if (global_rsv->space_info->full) {
2926 num_dirty_bgs_bytes <<= 1;
2927 num_bytes <<= 1;
2928 }
2929
2930 spin_lock(&global_rsv->lock);
2931 if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2932 ret = 1;
2933 spin_unlock(&global_rsv->lock);
2934 return ret;
2935 }
2936
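/*
 * Decide whether the caller should throttle itself and help with delayed
 * refs, based on the recent average time needed to run one ref: returns 1 if
 * the backlog is estimated at a second or more, 2 if at half a second or
 * more, otherwise fall back to the global reserve check above.
 */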
2937 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2938 struct btrfs_fs_info *fs_info)
2939 {
2940 u64 num_entries =
2941 atomic_read(&trans->transaction->delayed_refs.num_entries);
2942 u64 avg_runtime;
2943 u64 val;
2944
2945 smp_mb();
2946 avg_runtime = fs_info->avg_delayed_ref_runtime;
2947 val = num_entries * avg_runtime;
2948 if (val >= NSEC_PER_SEC)
2949 return 1;
2950 if (val >= NSEC_PER_SEC / 2)
2951 return 2;
2952
2953 return btrfs_check_space_for_delayed_refs(trans, fs_info);
2954 }
2955
2956 struct async_delayed_refs {
2957 struct btrfs_root *root;
2958 u64 transid;
2959 int count;
2960 int error;
2961 int sync;
2962 struct completion wait;
2963 struct btrfs_work work;
2964 };
2965
2966 static inline struct async_delayed_refs *
2967 to_async_delayed_refs(struct btrfs_work *work)
2968 {
2969 return container_of(work, struct async_delayed_refs, work);
2970 }
2971
2972 static void delayed_ref_async_start(struct btrfs_work *work)
2973 {
2974 struct async_delayed_refs *async = to_async_delayed_refs(work);
2975 struct btrfs_trans_handle *trans;
2976 struct btrfs_fs_info *fs_info = async->root->fs_info;
2977 int ret;
2978
2979 /* if the commit is already started, we don't need to wait here */
2980 if (btrfs_transaction_blocked(fs_info))
2981 goto done;
2982
2983 trans = btrfs_join_transaction(async->root);
2984 if (IS_ERR(trans)) {
2985 async->error = PTR_ERR(trans);
2986 goto done;
2987 }
2988
2989 /*
2990 * trans->sync means that when we call end_transaction, we won't
2991 * wait on delayed refs
2992 */
2993 trans->sync = true;
2994
2995 /* Don't bother flushing if we got into a different transaction */
2996 if (trans->transid > async->transid)
2997 goto end;
2998
2999 ret = btrfs_run_delayed_refs(trans, fs_info, async->count);
3000 if (ret)
3001 async->error = ret;
3002 end:
3003 ret = btrfs_end_transaction(trans);
3004 if (ret && !async->error)
3005 async->error = ret;
3006 done:
3007 if (async->sync)
3008 complete(&async->wait);
3009 else
3010 kfree(async);
3011 }
3012
3013 int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
3014 unsigned long count, u64 transid, int wait)
3015 {
3016 struct async_delayed_refs *async;
3017 int ret;
3018
3019 async = kmalloc(sizeof(*async), GFP_NOFS);
3020 if (!async)
3021 return -ENOMEM;
3022
3023 async->root = fs_info->tree_root;
3024 async->count = count;
3025 async->error = 0;
3026 async->transid = transid;
3027 if (wait)
3028 async->sync = 1;
3029 else
3030 async->sync = 0;
3031 init_completion(&async->wait);
3032
3033 btrfs_init_work(&async->work, btrfs_extent_refs_helper,
3034 delayed_ref_async_start, NULL, NULL);
3035
3036 btrfs_queue_work(fs_info->extent_workers, &async->work);
3037
3038 if (wait) {
3039 wait_for_completion(&async->wait);
3040 ret = async->error;
3041 kfree(async);
3042 return ret;
3043 }
3044 return 0;
3045 }
3046
3047 /*
3048 * this starts processing the delayed reference count updates and
3049 * extent insertions we have queued up so far. count can be
3050 * 0, which means to process everything in the tree at the start
3051 * of the run (but not newly added entries), or it can be some target
3052 * number you'd like to process.
3053 *
3054 * Returns 0 on success or if called with an aborted transaction
3055 * Returns <0 on error and aborts the transaction
3056 */
3057 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
3058 struct btrfs_fs_info *fs_info, unsigned long count)
3059 {
3060 struct rb_node *node;
3061 struct btrfs_delayed_ref_root *delayed_refs;
3062 struct btrfs_delayed_ref_head *head;
3063 int ret;
3064 int run_all = count == (unsigned long)-1;
3065
3066 /* We'll clean this up in btrfs_cleanup_transaction */
3067 if (trans->aborted)
3068 return 0;
3069
3070 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
3071 return 0;
3072
3073 delayed_refs = &trans->transaction->delayed_refs;
3074 if (count == 0)
3075 count = atomic_read(&delayed_refs->num_entries) * 2;
3076
3077 again:
3078 #ifdef SCRAMBLE_DELAYED_REFS
3079 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
3080 #endif
3081 ret = __btrfs_run_delayed_refs(trans, fs_info, count);
3082 if (ret < 0) {
3083 btrfs_abort_transaction(trans, ret);
3084 return ret;
3085 }
3086
3087 if (run_all) {
3088 if (!list_empty(&trans->new_bgs))
3089 btrfs_create_pending_block_groups(trans, fs_info);
3090
3091 spin_lock(&delayed_refs->lock);
3092 node = rb_first(&delayed_refs->href_root);
3093 if (!node) {
3094 spin_unlock(&delayed_refs->lock);
3095 goto out;
3096 }
3097 head = rb_entry(node, struct btrfs_delayed_ref_head,
3098 href_node);
3099 refcount_inc(&head->refs);
3100 spin_unlock(&delayed_refs->lock);
3101
3102 /* Mutex was contended, block until it's released and retry. */
3103 mutex_lock(&head->mutex);
3104 mutex_unlock(&head->mutex);
3105
3106 btrfs_put_delayed_ref_head(head);
3107 cond_resched();
3108 goto again;
3109 }
3110 out:
3111 return 0;
3112 }
3113
3114 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3115 struct btrfs_fs_info *fs_info,
3116 u64 bytenr, u64 num_bytes, u64 flags,
3117 int level, int is_data)
3118 {
3119 struct btrfs_delayed_extent_op *extent_op;
3120 int ret;
3121
3122 extent_op = btrfs_alloc_delayed_extent_op();
3123 if (!extent_op)
3124 return -ENOMEM;
3125
3126 extent_op->flags_to_set = flags;
3127 extent_op->update_flags = true;
3128 extent_op->update_key = false;
3129 extent_op->is_data = is_data ? true : false;
3130 extent_op->level = level;
3131
3132 ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
3133 num_bytes, extent_op);
3134 if (ret)
3135 btrfs_free_delayed_extent_op(extent_op);
3136 return ret;
3137 }
3138
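/*
 * Check the delayed refs of the running transaction for a reference to
 * @bytenr that does not belong to the given root, inode objectid and file
 * offset.  Returns 1 if such a cross reference exists, 0 if not and -EAGAIN
 * if the ref head mutex was contended and the caller should retry.
 */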
3139 static noinline int check_delayed_ref(struct btrfs_root *root,
3140 struct btrfs_path *path,
3141 u64 objectid, u64 offset, u64 bytenr)
3142 {
3143 struct btrfs_delayed_ref_head *head;
3144 struct btrfs_delayed_ref_node *ref;
3145 struct btrfs_delayed_data_ref *data_ref;
3146 struct btrfs_delayed_ref_root *delayed_refs;
3147 struct btrfs_transaction *cur_trans;
3148 struct rb_node *node;
3149 int ret = 0;
3150
3151 spin_lock(&root->fs_info->trans_lock);
3152 cur_trans = root->fs_info->running_transaction;
3153 if (cur_trans)
3154 refcount_inc(&cur_trans->use_count);
3155 spin_unlock(&root->fs_info->trans_lock);
3156 if (!cur_trans)
3157 return 0;
3158
3159 delayed_refs = &cur_trans->delayed_refs;
3160 spin_lock(&delayed_refs->lock);
3161 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3162 if (!head) {
3163 spin_unlock(&delayed_refs->lock);
3164 btrfs_put_transaction(cur_trans);
3165 return 0;
3166 }
3167
3168 if (!mutex_trylock(&head->mutex)) {
3169 refcount_inc(&head->refs);
3170 spin_unlock(&delayed_refs->lock);
3171
3172 btrfs_release_path(path);
3173
3174 /*
3175 * Mutex was contended, block until it's released and let
3176 * caller try again
3177 */
3178 mutex_lock(&head->mutex);
3179 mutex_unlock(&head->mutex);
3180 btrfs_put_delayed_ref_head(head);
3181 btrfs_put_transaction(cur_trans);
3182 return -EAGAIN;
3183 }
3184 spin_unlock(&delayed_refs->lock);
3185
3186 spin_lock(&head->lock);
3187 /*
3188 * XXX: We should replace this with a proper search function in the
3189 * future.
3190 */
3191 for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
3192 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
3193 /* If it's a shared ref we know a cross reference exists */
3194 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3195 ret = 1;
3196 break;
3197 }
3198
3199 data_ref = btrfs_delayed_node_to_data_ref(ref);
3200
3201 /*
3202 * If our ref doesn't match the one we're currently looking at
3203 * then we have a cross reference.
3204 */
3205 if (data_ref->root != root->root_key.objectid ||
3206 data_ref->objectid != objectid ||
3207 data_ref->offset != offset) {
3208 ret = 1;
3209 break;
3210 }
3211 }
3212 spin_unlock(&head->lock);
3213 mutex_unlock(&head->mutex);
3214 btrfs_put_transaction(cur_trans);
3215 return ret;
3216 }
3217
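/*
 * Check the committed extent tree for references to @bytenr other than a
 * single inline data ref owned by the given root, inode and offset.  Returns
 * 0 if the extent is certainly not shared, 1 if it is or may be shared and
 * -ENOENT if no matching extent item was found.
 */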
3218 static noinline int check_committed_ref(struct btrfs_root *root,
3219 struct btrfs_path *path,
3220 u64 objectid, u64 offset, u64 bytenr)
3221 {
3222 struct btrfs_fs_info *fs_info = root->fs_info;
3223 struct btrfs_root *extent_root = fs_info->extent_root;
3224 struct extent_buffer *leaf;
3225 struct btrfs_extent_data_ref *ref;
3226 struct btrfs_extent_inline_ref *iref;
3227 struct btrfs_extent_item *ei;
3228 struct btrfs_key key;
3229 u32 item_size;
3230 int type;
3231 int ret;
3232
3233 key.objectid = bytenr;
3234 key.offset = (u64)-1;
3235 key.type = BTRFS_EXTENT_ITEM_KEY;
3236
3237 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3238 if (ret < 0)
3239 goto out;
3240 BUG_ON(ret == 0); /* Corruption */
3241
3242 ret = -ENOENT;
3243 if (path->slots[0] == 0)
3244 goto out;
3245
3246 path->slots[0]--;
3247 leaf = path->nodes[0];
3248 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3249
3250 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3251 goto out;
3252
3253 ret = 1;
3254 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3255 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3256 if (item_size < sizeof(*ei)) {
3257 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3258 goto out;
3259 }
3260 #endif
3261 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3262
3263 if (item_size != sizeof(*ei) +
3264 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3265 goto out;
3266
3267 if (btrfs_extent_generation(leaf, ei) <=
3268 btrfs_root_last_snapshot(&root->root_item))
3269 goto out;
3270
3271 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3272
3273 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
3274 if (type != BTRFS_EXTENT_DATA_REF_KEY)
3275 goto out;
3276
3277 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3278 if (btrfs_extent_refs(leaf, ei) !=
3279 btrfs_extent_data_ref_count(leaf, ref) ||
3280 btrfs_extent_data_ref_root(leaf, ref) !=
3281 root->root_key.objectid ||
3282 btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3283 btrfs_extent_data_ref_offset(leaf, ref) != offset)
3284 goto out;
3285
3286 ret = 0;
3287 out:
3288 return ret;
3289 }
3290
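/*
 * Check whether a data extent may be referenced by anything other than the
 * given root, inode and offset, looking at both the committed extent tree
 * and the delayed refs of the running transaction.  Returns 0 if the extent
 * is not shared, 1 if it is or may be shared and a negative errno on error.
 */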
3291 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3292 u64 bytenr)
3293 {
3294 struct btrfs_path *path;
3295 int ret;
3296 int ret2;
3297
3298 path = btrfs_alloc_path();
3299 if (!path)
3300 return -ENOENT;
3301
3302 do {
3303 ret = check_committed_ref(root, path, objectid,
3304 offset, bytenr);
3305 if (ret && ret != -ENOENT)
3306 goto out;
3307
3308 ret2 = check_delayed_ref(root, path, objectid,
3309 offset, bytenr);
3310 } while (ret2 == -EAGAIN);
3311
3312 if (ret2 && ret2 != -ENOENT) {
3313 ret = ret2;
3314 goto out;
3315 }
3316
3317 if (ret != -ENOENT || ret2 != -ENOENT)
3318 ret = 0;
3319 out:
3320 btrfs_free_path(path);
3321 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3322 WARN_ON(ret > 0);
3323 return ret;
3324 }
3325
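/*
 * Walk every pointer in @buf and add or drop one reference for each file
 * extent (leaf) or child tree block (node) it points to, depending on @inc.
 * If @full_backref is set, the references are recorded against the block
 * itself (shared backrefs), otherwise against the owning root.
 */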
3326 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3327 struct btrfs_root *root,
3328 struct extent_buffer *buf,
3329 int full_backref, int inc)
3330 {
3331 struct btrfs_fs_info *fs_info = root->fs_info;
3332 u64 bytenr;
3333 u64 num_bytes;
3334 u64 parent;
3335 u64 ref_root;
3336 u32 nritems;
3337 struct btrfs_key key;
3338 struct btrfs_file_extent_item *fi;
3339 int i;
3340 int level;
3341 int ret = 0;
3342 int (*process_func)(struct btrfs_trans_handle *,
3343 struct btrfs_root *,
3344 u64, u64, u64, u64, u64, u64);
3345
3346
3347 if (btrfs_is_testing(fs_info))
3348 return 0;
3349
3350 ref_root = btrfs_header_owner(buf);
3351 nritems = btrfs_header_nritems(buf);
3352 level = btrfs_header_level(buf);
3353
3354 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3355 return 0;
3356
3357 if (inc)
3358 process_func = btrfs_inc_extent_ref;
3359 else
3360 process_func = btrfs_free_extent;
3361
3362 if (full_backref)
3363 parent = buf->start;
3364 else
3365 parent = 0;
3366
3367 for (i = 0; i < nritems; i++) {
3368 if (level == 0) {
3369 btrfs_item_key_to_cpu(buf, &key, i);
3370 if (key.type != BTRFS_EXTENT_DATA_KEY)
3371 continue;
3372 fi = btrfs_item_ptr(buf, i,
3373 struct btrfs_file_extent_item);
3374 if (btrfs_file_extent_type(buf, fi) ==
3375 BTRFS_FILE_EXTENT_INLINE)
3376 continue;
3377 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3378 if (bytenr == 0)
3379 continue;
3380
3381 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3382 key.offset -= btrfs_file_extent_offset(buf, fi);
3383 ret = process_func(trans, root, bytenr, num_bytes,
3384 parent, ref_root, key.objectid,
3385 key.offset);
3386 if (ret)
3387 goto fail;
3388 } else {
3389 bytenr = btrfs_node_blockptr(buf, i);
3390 num_bytes = fs_info->nodesize;
3391 ret = process_func(trans, root, bytenr, num_bytes,
3392 parent, ref_root, level - 1, 0);
3393 if (ret)
3394 goto fail;
3395 }
3396 }
3397 return 0;
3398 fail:
3399 return ret;
3400 }
3401
3402 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3403 struct extent_buffer *buf, int full_backref)
3404 {
3405 return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3406 }
3407
3408 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3409 struct extent_buffer *buf, int full_backref)
3410 {
3411 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3412 }
3413
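/*
 * Write the in-memory block group item for @cache back to its slot in the
 * extent tree.  Returns -ENOENT if the block group item does not exist yet,
 * e.g. because the block group is still on a transaction's list of new block
 * groups.
 */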
3414 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3415 struct btrfs_fs_info *fs_info,
3416 struct btrfs_path *path,
3417 struct btrfs_block_group_cache *cache)
3418 {
3419 int ret;
3420 struct btrfs_root *extent_root = fs_info->extent_root;
3421 unsigned long bi;
3422 struct extent_buffer *leaf;
3423
3424 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3425 if (ret) {
3426 if (ret > 0)
3427 ret = -ENOENT;
3428 goto fail;
3429 }
3430
3431 leaf = path->nodes[0];
3432 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3433 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3434 btrfs_mark_buffer_dirty(leaf);
3435 fail:
3436 btrfs_release_path(path);
3437 return ret;
3438
3439 }
3440
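/*
 * Return the block group that follows @cache in bytenr order, dropping the
 * reference on @cache and taking one on the group returned.  If @cache was
 * removed in the meantime, do a full lookup of the first block group at or
 * after the end of @cache instead.
 */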
3441 static struct btrfs_block_group_cache *
3442 next_block_group(struct btrfs_fs_info *fs_info,
3443 struct btrfs_block_group_cache *cache)
3444 {
3445 struct rb_node *node;
3446
3447 spin_lock(&fs_info->block_group_cache_lock);
3448
3449 /* If our block group was removed, we need a full search. */
3450 if (RB_EMPTY_NODE(&cache->cache_node)) {
3451 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3452
3453 spin_unlock(&fs_info->block_group_cache_lock);
3454 btrfs_put_block_group(cache);
3455 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
return cache;
3456 }
3457 node = rb_next(&cache->cache_node);
3458 btrfs_put_block_group(cache);
3459 if (node) {
3460 cache = rb_entry(node, struct btrfs_block_group_cache,
3461 cache_node);
3462 btrfs_get_block_group(cache);
3463 } else
3464 cache = NULL;
3465 spin_unlock(&fs_info->block_group_cache_lock);
3466 return cache;
3467 }
3468
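/*
 * Prepare the free space cache inode of a block group for this transaction:
 * create the inode if needed, truncate any stale cache and preallocate room
 * for the new one, recording the resulting disk_cache_state (BTRFS_DC_SETUP,
 * _WRITTEN or _ERROR) on the block group.
 */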
3469 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3470 struct btrfs_trans_handle *trans,
3471 struct btrfs_path *path)
3472 {
3473 struct btrfs_fs_info *fs_info = block_group->fs_info;
3474 struct btrfs_root *root = fs_info->tree_root;
3475 struct inode *inode = NULL;
3476 struct extent_changeset *data_reserved = NULL;
3477 u64 alloc_hint = 0;
3478 int dcs = BTRFS_DC_ERROR;
3479 u64 num_pages = 0;
3480 int retries = 0;
3481 int ret = 0;
3482
3483 /*
3484 * If this block group is smaller than 100 megs don't bother caching the
3485 * block group.
3486 */
3487 if (block_group->key.offset < (100 * SZ_1M)) {
3488 spin_lock(&block_group->lock);
3489 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3490 spin_unlock(&block_group->lock);
3491 return 0;
3492 }
3493
3494 if (trans->aborted)
3495 return 0;
3496 again:
3497 inode = lookup_free_space_inode(fs_info, block_group, path);
3498 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3499 ret = PTR_ERR(inode);
3500 btrfs_release_path(path);
3501 goto out;
3502 }
3503
3504 if (IS_ERR(inode)) {
3505 BUG_ON(retries);
3506 retries++;
3507
3508 if (block_group->ro)
3509 goto out_free;
3510
3511 ret = create_free_space_inode(fs_info, trans, block_group,
3512 path);
3513 if (ret)
3514 goto out_free;
3515 goto again;
3516 }
3517
3518 /*
3519 * We want to set the generation to 0 so that if anything goes wrong
3520 * from here on out we know not to trust this cache when we load it up
3521 * next time.
3522 */
3523 BTRFS_I(inode)->generation = 0;
3524 ret = btrfs_update_inode(trans, root, inode);
3525 if (ret) {
3526 /*
3527 * So theoretically we could recover from this, simply set the
3528 * super cache generation to 0 so we know to invalidate the
3529 * cache, but then we'd have to keep track of the block groups
3530 * that fail this way so we know we _have_ to reset this cache
3531 * before the next commit or risk reading stale cache. So to
3532 * limit our exposure to horrible edge cases, let's just abort the
3533 * transaction; this only happens in really bad situations
3534 * anyway.
3535 */
3536 btrfs_abort_transaction(trans, ret);
3537 goto out_put;
3538 }
3539 WARN_ON(ret);
3540
3541 /* We've already set up this transaction, go ahead and exit */
3542 if (block_group->cache_generation == trans->transid &&
3543 i_size_read(inode)) {
3544 dcs = BTRFS_DC_SETUP;
3545 goto out_put;
3546 }
3547
3548 if (i_size_read(inode) > 0) {
3549 ret = btrfs_check_trunc_cache_free_space(fs_info,
3550 &fs_info->global_block_rsv);
3551 if (ret)
3552 goto out_put;
3553
3554 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3555 if (ret)
3556 goto out_put;
3557 }
3558
3559 spin_lock(&block_group->lock);
3560 if (block_group->cached != BTRFS_CACHE_FINISHED ||
3561 !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3562 /*
3563 * don't bother trying to write stuff out _if_
3564 * a) we're not cached,
3565 * b) we're mounted with the nospace_cache option,
3566 * c) we're using the v2 space_cache (FREE_SPACE_TREE).
3567 */
3568 dcs = BTRFS_DC_WRITTEN;
3569 spin_unlock(&block_group->lock);
3570 goto out_put;
3571 }
3572 spin_unlock(&block_group->lock);
3573
3574 /*
3575 * We hit an ENOSPC when setting up the cache in this transaction, just
3576 * skip doing the setup, we've already cleared the cache so we're safe.
3577 */
3578 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3579 ret = -ENOSPC;
3580 goto out_put;
3581 }
3582
3583 /*
3584 * Try to preallocate enough space based on how big the block group is.
3585 * Keep in mind this has to include any pinned space which could end up
3586 * taking up quite a bit since it's not folded into the other space
3587 * cache.
3588 */
3589 num_pages = div_u64(block_group->key.offset, SZ_256M);
3590 if (!num_pages)
3591 num_pages = 1;
3592
3593 num_pages *= 16;
3594 num_pages *= PAGE_SIZE;
3595
3596 ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3597 if (ret)
3598 goto out_put;
3599
3600 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3601 num_pages, num_pages,
3602 &alloc_hint);
3603 /*
3604 * Our cache requires contiguous chunks so that we don't modify a bunch
3605 * of metadata or split extents when writing the cache out, which means
3606 * we can hit ENOSPC if we are heavily fragmented in addition to just
3607 * normal out of space conditions. So if we hit this, just skip setting
3608 * up any other block groups for this transaction; maybe we'll unpin
3609 * enough space the next time around.
3610 */
3611 if (!ret)
3612 dcs = BTRFS_DC_SETUP;
3613 else if (ret == -ENOSPC)
3614 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3615
3616 out_put:
3617 iput(inode);
3618 out_free:
3619 btrfs_release_path(path);
3620 out:
3621 spin_lock(&block_group->lock);
3622 if (!ret && dcs == BTRFS_DC_SETUP)
3623 block_group->cache_generation = trans->transid;
3624 block_group->disk_cache_state = dcs;
3625 spin_unlock(&block_group->lock);
3626
3627 extent_changeset_free(data_reserved);
3628 return ret;
3629 }
3630
3631 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3632 struct btrfs_fs_info *fs_info)
3633 {
3634 struct btrfs_block_group_cache *cache, *tmp;
3635 struct btrfs_transaction *cur_trans = trans->transaction;
3636 struct btrfs_path *path;
3637
3638 if (list_empty(&cur_trans->dirty_bgs) ||
3639 !btrfs_test_opt(fs_info, SPACE_CACHE))
3640 return 0;
3641
3642 path = btrfs_alloc_path();
3643 if (!path)
3644 return -ENOMEM;
3645
3646 /* Could add new block groups, use _safe just in case */
3647 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3648 dirty_list) {
3649 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3650 cache_save_setup(cache, trans, path);
3651 }
3652
3653 btrfs_free_path(path);
3654 return 0;
3655 }
3656
3657 /*
3658 * transaction commit does final block group cache writeback during a
3659 * critical section where nothing is allowed to change the FS. This is
3660 * required in order for the cache to actually match the block group,
3661 * but can introduce a lot of latency into the commit.
3662 *
3663 * So, btrfs_start_dirty_block_groups is here to kick off block group
3664 * cache IO. There's a chance we'll have to redo some of it if the
3665 * block group changes again during the commit, but it greatly reduces
3666 * the commit latency by getting rid of the easy block groups while
3667 * we're still allowing others to join the commit.
3668 */
3669 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
3670 struct btrfs_fs_info *fs_info)
3671 {
3672 struct btrfs_block_group_cache *cache;
3673 struct btrfs_transaction *cur_trans = trans->transaction;
3674 int ret = 0;
3675 int should_put;
3676 struct btrfs_path *path = NULL;
3677 LIST_HEAD(dirty);
3678 struct list_head *io = &cur_trans->io_bgs;
3679 int num_started = 0;
3680 int loops = 0;
3681
3682 spin_lock(&cur_trans->dirty_bgs_lock);
3683 if (list_empty(&cur_trans->dirty_bgs)) {
3684 spin_unlock(&cur_trans->dirty_bgs_lock);
3685 return 0;
3686 }
3687 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3688 spin_unlock(&cur_trans->dirty_bgs_lock);
3689
3690 again:
3691 /*
3692 * make sure all the block groups on our dirty list actually
3693 * exist
3694 */
3695 btrfs_create_pending_block_groups(trans, fs_info);
3696
3697 if (!path) {
3698 path = btrfs_alloc_path();
3699 if (!path)
3700 return -ENOMEM;
3701 }
3702
3703 /*
3704 * cache_write_mutex is here only to save us from balance or automatic
3705 * removal of empty block groups deleting this block group while we are
3706 * writing out the cache
3707 */
3708 mutex_lock(&trans->transaction->cache_write_mutex);
3709 while (!list_empty(&dirty)) {
3710 cache = list_first_entry(&dirty,
3711 struct btrfs_block_group_cache,
3712 dirty_list);
3713 /*
3714 * this can happen if something re-dirties a block
3715 * group that is already under IO. Just wait for it to
3716 * finish and then do it all again
3717 */
3718 if (!list_empty(&cache->io_list)) {
3719 list_del_init(&cache->io_list);
3720 btrfs_wait_cache_io(trans, cache, path);
3721 btrfs_put_block_group(cache);
3722 }
3723
3724
3725 /*
3726 * btrfs_wait_cache_io uses the cache->dirty_list to decide
3727 * if it should update the cache_state. Don't delete
3728 * until after we wait.
3729 *
3730 * Since we're not running in the commit critical section
3731 * we need the dirty_bgs_lock to protect from update_block_group
3732 */
3733 spin_lock(&cur_trans->dirty_bgs_lock);
3734 list_del_init(&cache->dirty_list);
3735 spin_unlock(&cur_trans->dirty_bgs_lock);
3736
3737 should_put = 1;
3738
3739 cache_save_setup(cache, trans, path);
3740
3741 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3742 cache->io_ctl.inode = NULL;
3743 ret = btrfs_write_out_cache(fs_info, trans,
3744 cache, path);
3745 if (ret == 0 && cache->io_ctl.inode) {
3746 num_started++;
3747 should_put = 0;
3748
3749 /*
3750 * the cache_write_mutex is protecting
3751 * the io_list
3752 */
3753 list_add_tail(&cache->io_list, io);
3754 } else {
3755 /*
3756 * if we failed to write the cache, the
3757 * generation will be bad and life goes on
3758 */
3759 ret = 0;
3760 }
3761 }
3762 if (!ret) {
3763 ret = write_one_cache_group(trans, fs_info,
3764 path, cache);
3765 /*
3766 * Our block group might still be attached to the list
3767 * of new block groups in the transaction handle of some
3768 * other task (struct btrfs_trans_handle->new_bgs). This
3769 * means its block group item isn't yet in the extent
3770 * tree. If this happens ignore the error, as we will
3771 * try again later in the critical section of the
3772 * transaction commit.
3773 */
3774 if (ret == -ENOENT) {
3775 ret = 0;
3776 spin_lock(&cur_trans->dirty_bgs_lock);
3777 if (list_empty(&cache->dirty_list)) {
3778 list_add_tail(&cache->dirty_list,
3779 &cur_trans->dirty_bgs);
3780 btrfs_get_block_group(cache);
3781 }
3782 spin_unlock(&cur_trans->dirty_bgs_lock);
3783 } else if (ret) {
3784 btrfs_abort_transaction(trans, ret);
3785 }
3786 }
3787
3788 /* if it's not on the io list, we need to put the block group */
3789 if (should_put)
3790 btrfs_put_block_group(cache);
3791
3792 if (ret)
3793 break;
3794
3795 /*
3796 * Avoid blocking other tasks for too long. It might even save
3797 * us from writing caches for block groups that are going to be
3798 * removed.
3799 */
3800 mutex_unlock(&trans->transaction->cache_write_mutex);
3801 mutex_lock(&trans->transaction->cache_write_mutex);
3802 }
3803 mutex_unlock(&trans->transaction->cache_write_mutex);
3804
3805 /*
3806 * go through delayed refs for all the stuff we've just kicked off
3807 * and then loop back (just once)
3808 */
3809 ret = btrfs_run_delayed_refs(trans, fs_info, 0);
3810 if (!ret && loops == 0) {
3811 loops++;
3812 spin_lock(&cur_trans->dirty_bgs_lock);
3813 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3814 /*
3815 * dirty_bgs_lock protects us from concurrent block group
3816 * deletes too (not just cache_write_mutex).
3817 */
3818 if (!list_empty(&dirty)) {
3819 spin_unlock(&cur_trans->dirty_bgs_lock);
3820 goto again;
3821 }
3822 spin_unlock(&cur_trans->dirty_bgs_lock);
3823 } else if (ret < 0) {
3824 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3825 }
3826
3827 btrfs_free_path(path);
3828 return ret;
3829 }
3830
3831 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3832 struct btrfs_fs_info *fs_info)
3833 {
3834 struct btrfs_block_group_cache *cache;
3835 struct btrfs_transaction *cur_trans = trans->transaction;
3836 int ret = 0;
3837 int should_put;
3838 struct btrfs_path *path;
3839 struct list_head *io = &cur_trans->io_bgs;
3840 int num_started = 0;
3841
3842 path = btrfs_alloc_path();
3843 if (!path)
3844 return -ENOMEM;
3845
3846 /*
3847 * Even though we are in the critical section of the transaction commit,
3848 * we can still have concurrent tasks adding elements to this
3849 * transaction's list of dirty block groups. These tasks correspond to
3850 * endio free space workers started when writeback finishes for a
3851 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3852 * allocate new block groups as a result of COWing nodes of the root
3853 * tree when updating the free space inode. The writeback for the space
3854 * caches is triggered by an earlier call to
3855 * btrfs_start_dirty_block_groups() and iterations of the following
3856 * loop.
3857 * Also we want to do the cache_save_setup first and then run the
3858 * delayed refs to make sure we have the best chance at doing this all
3859 * in one shot.
3860 */
3861 spin_lock(&cur_trans->dirty_bgs_lock);
3862 while (!list_empty(&cur_trans->dirty_bgs)) {
3863 cache = list_first_entry(&cur_trans->dirty_bgs,
3864 struct btrfs_block_group_cache,
3865 dirty_list);
3866
3867 /*
3868 * this can happen if cache_save_setup re-dirties a block
3869 * group that is already under IO. Just wait for it to
3870 * finish and then do it all again
3871 */
3872 if (!list_empty(&cache->io_list)) {
3873 spin_unlock(&cur_trans->dirty_bgs_lock);
3874 list_del_init(&cache->io_list);
3875 btrfs_wait_cache_io(trans, cache, path);
3876 btrfs_put_block_group(cache);
3877 spin_lock(&cur_trans->dirty_bgs_lock);
3878 }
3879
3880 /*
3881 * don't remove from the dirty list until after we've waited
3882 * on any pending IO
3883 */
3884 list_del_init(&cache->dirty_list);
3885 spin_unlock(&cur_trans->dirty_bgs_lock);
3886 should_put = 1;
3887
3888 cache_save_setup(cache, trans, path);
3889
3890 if (!ret)
3891 ret = btrfs_run_delayed_refs(trans, fs_info,
3892 (unsigned long) -1);
3893
3894 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3895 cache->io_ctl.inode = NULL;
3896 ret = btrfs_write_out_cache(fs_info, trans,
3897 cache, path);
3898 if (ret == 0 && cache->io_ctl.inode) {
3899 num_started++;
3900 should_put = 0;
3901 list_add_tail(&cache->io_list, io);
3902 } else {
3903 /*
3904 * if we failed to write the cache, the
3905 * generation will be bad and life goes on
3906 */
3907 ret = 0;
3908 }
3909 }
3910 if (!ret) {
3911 ret = write_one_cache_group(trans, fs_info,
3912 path, cache);
3913 /*
3914 * One of the free space endio workers might have
3915 * created a new block group while updating a free space
3916 * cache's inode (at inode.c:btrfs_finish_ordered_io())
3917 * and hasn't released its transaction handle yet, in
3918 * which case the new block group is still attached to
3919 * its transaction handle and its creation has not
3920 * finished yet (no block group item in the extent tree
3921 * yet, etc). If this is the case, wait for all free
3922 * space endio workers to finish and retry. This is a
3923 * very rare case so no need for a more efficient and
3924 * complex approach.
3925 */
3926 if (ret == -ENOENT) {
3927 wait_event(cur_trans->writer_wait,
3928 atomic_read(&cur_trans->num_writers) == 1);
3929 ret = write_one_cache_group(trans, fs_info,
3930 path, cache);
3931 }
3932 if (ret)
3933 btrfs_abort_transaction(trans, ret);
3934 }
3935
3936 /* if it's not on the io list, we need to put the block group */
3937 if (should_put)
3938 btrfs_put_block_group(cache);
3939 spin_lock(&cur_trans->dirty_bgs_lock);
3940 }
3941 spin_unlock(&cur_trans->dirty_bgs_lock);
3942
3943 while (!list_empty(io)) {
3944 cache = list_first_entry(io, struct btrfs_block_group_cache,
3945 io_list);
3946 list_del_init(&cache->io_list);
3947 btrfs_wait_cache_io(trans, cache, path);
3948 btrfs_put_block_group(cache);
3949 }
3950
3951 btrfs_free_path(path);
3952 return ret;
3953 }
3954
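/*
 * Return 1 if the extent at @bytenr belongs to a read-only block group, or to
 * no block group at all, and 0 otherwise.
 */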
3955 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
3956 {
3957 struct btrfs_block_group_cache *block_group;
3958 int readonly = 0;
3959
3960 block_group = btrfs_lookup_block_group(fs_info, bytenr);
3961 if (!block_group || block_group->ro)
3962 readonly = 1;
3963 if (block_group)
3964 btrfs_put_block_group(block_group);
3965 return readonly;
3966 }
3967
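/*
 * Take a nocow writer reference on the block group containing @bytenr.
 * Returns false if the block group doesn't exist or is read-only.  A
 * successful call must be paired with btrfs_dec_nocow_writers(), which also
 * drops the block group reference taken by the lookup here.
 */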
3968 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3969 {
3970 struct btrfs_block_group_cache *bg;
3971 bool ret = true;
3972
3973 bg = btrfs_lookup_block_group(fs_info, bytenr);
3974 if (!bg)
3975 return false;
3976
3977 spin_lock(&bg->lock);
3978 if (bg->ro)
3979 ret = false;
3980 else
3981 atomic_inc(&bg->nocow_writers);
3982 spin_unlock(&bg->lock);
3983
3984 /* no put on block group, done by btrfs_dec_nocow_writers */
3985 if (!ret)
3986 btrfs_put_block_group(bg);
3987
3988 return ret;
3990 }
3991
3992 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3993 {
3994 struct btrfs_block_group_cache *bg;
3995
3996 bg = btrfs_lookup_block_group(fs_info, bytenr);
3997 ASSERT(bg);
3998 if (atomic_dec_and_test(&bg->nocow_writers))
3999 wake_up_atomic_t(&bg->nocow_writers);
4000 /*
4001 * Once for our lookup and once for the lookup done by a previous call
4002 * to btrfs_inc_nocow_writers()
4003 */
4004 btrfs_put_block_group(bg);
4005 btrfs_put_block_group(bg);
4006 }
4007
4008 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
4009 {
4010 wait_on_atomic_t(&bg->nocow_writers, atomic_t_wait,
4011 TASK_UNINTERRUPTIBLE);
4012 }
4013
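/*
 * Name used for a space_info's sysfs directory, derived from its block group
 * type flags.
 */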
4014 static const char *alloc_name(u64 flags)
4015 {
4016 switch (flags) {
4017 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
4018 return "mixed";
4019 case BTRFS_BLOCK_GROUP_METADATA:
4020 return "metadata";
4021 case BTRFS_BLOCK_GROUP_DATA:
4022 return "data";
4023 case BTRFS_BLOCK_GROUP_SYSTEM:
4024 return "system";
4025 default:
4026 WARN_ON(1);
4027 return "invalid-combination";
4028 }
4029 }
4030
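/*
 * Allocate and initialize a new space_info for the given block group type
 * @flags, hook it into fs_info->space_info and create its sysfs object.
 */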
4031 static int create_space_info(struct btrfs_fs_info *info, u64 flags,
4032 struct btrfs_space_info **new)
4033 {
4035 struct btrfs_space_info *space_info;
4036 int i;
4037 int ret;
4038
4039 space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
4040 if (!space_info)
4041 return -ENOMEM;
4042
4043 ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
4044 GFP_KERNEL);
4045 if (ret) {
4046 kfree(space_info);
4047 return ret;
4048 }
4049
4050 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
4051 INIT_LIST_HEAD(&space_info->block_groups[i]);
4052 init_rwsem(&space_info->groups_sem);
4053 spin_lock_init(&space_info->lock);
4054 space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
4055 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4056 init_waitqueue_head(&space_info->wait);
4057 INIT_LIST_HEAD(&space_info->ro_bgs);
4058 INIT_LIST_HEAD(&space_info->tickets);
4059 INIT_LIST_HEAD(&space_info->priority_tickets);
4060
4061 ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
4062 info->space_info_kobj, "%s",
4063 alloc_name(space_info->flags));
4064 if (ret) {
4065 kobject_put(&space_info->kobj);
4066 return ret;
4067 }
4068
4069 *new = space_info;
4070 list_add_rcu(&space_info->list, &info->space_info);
4071 if (flags & BTRFS_BLOCK_GROUP_DATA)
4072 info->data_sinfo = space_info;
4073
4074 return ret;
4075 }
4076
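/*
 * Fold a block group's space into its (already existing) space_info and hand
 * any newly available bytes to waiting reservation tickets.
 */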
4077 static void update_space_info(struct btrfs_fs_info *info, u64 flags,
4078 u64 total_bytes, u64 bytes_used,
4079 u64 bytes_readonly,
4080 struct btrfs_space_info **space_info)
4081 {
4082 struct btrfs_space_info *found;
4083 int factor;
4084
4085 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
4086 BTRFS_BLOCK_GROUP_RAID10))
4087 factor = 2;
4088 else
4089 factor = 1;
4090
4091 found = __find_space_info(info, flags);
4092 ASSERT(found);
4093 spin_lock(&found->lock);
4094 found->total_bytes += total_bytes;
4095 found->disk_total += total_bytes * factor;
4096 found->bytes_used += bytes_used;
4097 found->disk_used += bytes_used * factor;
4098 found->bytes_readonly += bytes_readonly;
4099 if (total_bytes > 0)
4100 found->full = 0;
4101 space_info_add_new_bytes(info, found, total_bytes -
4102 bytes_used - bytes_readonly);
4103 spin_unlock(&found->lock);
4104 *space_info = found;
4105 }
4106
4107 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
4108 {
4109 u64 extra_flags = chunk_to_extended(flags) &
4110 BTRFS_EXTENDED_PROFILE_MASK;
4111
4112 write_seqlock(&fs_info->profiles_lock);
4113 if (flags & BTRFS_BLOCK_GROUP_DATA)
4114 fs_info->avail_data_alloc_bits |= extra_flags;
4115 if (flags & BTRFS_BLOCK_GROUP_METADATA)
4116 fs_info->avail_metadata_alloc_bits |= extra_flags;
4117 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4118 fs_info->avail_system_alloc_bits |= extra_flags;
4119 write_sequnlock(&fs_info->profiles_lock);
4120 }
4121
4122 /*
4123 * returns target flags in extended format or 0 if restripe for this
4124 * chunk_type is not in progress
4125 *
4126 * should be called with either volume_mutex or balance_lock held
4127 */
4128 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
4129 {
4130 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4131 u64 target = 0;
4132
4133 if (!bctl)
4134 return 0;
4135
4136 if (flags & BTRFS_BLOCK_GROUP_DATA &&
4137 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4138 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
4139 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
4140 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4141 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
4142 } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
4143 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4144 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
4145 }
4146
4147 return target;
4148 }
4149
4150 /*
4151 * @flags: available profiles in extended format (see ctree.h)
4152 *
4153 * Returns reduced profile in chunk format. If profile changing is in
4154 * progress (either running or paused) picks the target profile (if it's
4155 * already available), otherwise falls back to plain reducing.
4156 */
4157 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
4158 {
4159 u64 num_devices = fs_info->fs_devices->rw_devices;
4160 u64 target;
4161 u64 raid_type;
4162 u64 allowed = 0;
4163
4164 /*
4165 * see if restripe for this chunk_type is in progress, if so
4166 * try to reduce to the target profile
4167 */
4168 spin_lock(&fs_info->balance_lock);
4169 target = get_restripe_target(fs_info, flags);
4170 if (target) {
4171 /* pick target profile only if it's already available */
4172 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
4173 spin_unlock(&fs_info->balance_lock);
4174 return extended_to_chunk(target);
4175 }
4176 }
4177 spin_unlock(&fs_info->balance_lock);
4178
4179 /* First, mask out the RAID levels which aren't possible */
4180 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4181 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
4182 allowed |= btrfs_raid_group[raid_type];
4183 }
4184 allowed &= flags;
4185
4186 if (allowed & BTRFS_BLOCK_GROUP_RAID6)
4187 allowed = BTRFS_BLOCK_GROUP_RAID6;
4188 else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
4189 allowed = BTRFS_BLOCK_GROUP_RAID5;
4190 else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
4191 allowed = BTRFS_BLOCK_GROUP_RAID10;
4192 else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
4193 allowed = BTRFS_BLOCK_GROUP_RAID1;
4194 else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
4195 allowed = BTRFS_BLOCK_GROUP_RAID0;
4196
4197 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
4198
4199 return extended_to_chunk(flags | allowed);
4200 }
4201
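/*
 * Extend @orig_flags with the allocation profiles currently available for
 * that block group type (data/metadata/system) and reduce the result to a
 * single profile in chunk format.
 */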
4202 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
4203 {
4204 unsigned seq;
4205 u64 flags;
4206
4207 do {
4208 flags = orig_flags;
4209 seq = read_seqbegin(&fs_info->profiles_lock);
4210
4211 if (flags & BTRFS_BLOCK_GROUP_DATA)
4212 flags |= fs_info->avail_data_alloc_bits;
4213 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4214 flags |= fs_info->avail_system_alloc_bits;
4215 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
4216 flags |= fs_info->avail_metadata_alloc_bits;
4217 } while (read_seqretry(&fs_info->profiles_lock, seq));
4218
4219 return btrfs_reduce_alloc_profile(fs_info, flags);
4220 }
4221
4222 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
4223 {
4224 struct btrfs_fs_info *fs_info = root->fs_info;
4225 u64 flags;
4226 u64 ret;
4227
4228 if (data)
4229 flags = BTRFS_BLOCK_GROUP_DATA;
4230 else if (root == fs_info->chunk_root)
4231 flags = BTRFS_BLOCK_GROUP_SYSTEM;
4232 else
4233 flags = BTRFS_BLOCK_GROUP_METADATA;
4234
4235 ret = get_alloc_profile(fs_info, flags);
4236 return ret;
4237 }
4238
4239 u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
4240 {
4241 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
4242 }
4243
4244 u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
4245 {
4246 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4247 }
4248
4249 u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
4250 {
4251 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4252 }
4253
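/*
 * Total number of bytes in a space_info that are in some state of use:
 * used, reserved, pinned, read-only and, if @may_use_included, may_use.
 */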
4254 static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
4255 bool may_use_included)
4256 {
4257 ASSERT(s_info);
4258 return s_info->bytes_used + s_info->bytes_reserved +
4259 s_info->bytes_pinned + s_info->bytes_readonly +
4260 (may_use_included ? s_info->bytes_may_use : 0);
4261 }
4262
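/*
 * Make sure @bytes of data space are available for @inode, allocating a new
 * data chunk and/or committing the transaction to reclaim pinned space when
 * the existing data space_info is too full.  On success the bytes are added
 * to the space_info's bytes_may_use.
 */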
4263 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
4264 {
4265 struct btrfs_root *root = inode->root;
4266 struct btrfs_fs_info *fs_info = root->fs_info;
4267 struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
4268 u64 used;
4269 int ret = 0;
4270 int need_commit = 2;
4271 int have_pinned_space;
4272
4273 /* make sure bytes are sectorsize aligned */
4274 bytes = ALIGN(bytes, fs_info->sectorsize);
4275
4276 if (btrfs_is_free_space_inode(inode)) {
4277 need_commit = 0;
4278 ASSERT(current->journal_info);
4279 }
4280
4281 again:
4282 /* make sure we have enough space to handle the data first */
4283 spin_lock(&data_sinfo->lock);
4284 used = btrfs_space_info_used(data_sinfo, true);
4285
4286 if (used + bytes > data_sinfo->total_bytes) {
4287 struct btrfs_trans_handle *trans;
4288
4289 /*
4290 * if we don't have enough free bytes in this space then we need
4291 * to alloc a new chunk.
4292 */
4293 if (!data_sinfo->full) {
4294 u64 alloc_target;
4295
4296 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4297 spin_unlock(&data_sinfo->lock);
4298
4299 alloc_target = btrfs_data_alloc_profile(fs_info);
4300 /*
4301 * It is ugly that we don't call a nolock join
4302 * transaction for the free space inode case here.
4303 * But it is safe because we only do the data space
4304 * reservation for the free space cache in the
4305 * transaction context; the common join transaction
4306 * just increases the counter of the current transaction
4307 * handle and doesn't try to acquire the trans_lock of
4308 * the fs.
4309 */
4310 trans = btrfs_join_transaction(root);
4311 if (IS_ERR(trans))
4312 return PTR_ERR(trans);
4313
4314 ret = do_chunk_alloc(trans, fs_info, alloc_target,
4315 CHUNK_ALLOC_NO_FORCE);
4316 btrfs_end_transaction(trans);
4317 if (ret < 0) {
4318 if (ret != -ENOSPC)
4319 return ret;
4320 else {
4321 have_pinned_space = 1;
4322 goto commit_trans;
4323 }
4324 }
4325
4326 goto again;
4327 }
4328
4329 /*
4330 * If we don't have enough pinned space to deal with this
4331 * allocation, and no removed chunk in current transaction,
4332 * don't bother committing the transaction.
4333 */
4334 have_pinned_space = percpu_counter_compare(
4335 &data_sinfo->total_bytes_pinned,
4336 used + bytes - data_sinfo->total_bytes);
4337 spin_unlock(&data_sinfo->lock);
4338
4339 /* commit the current transaction and try again */
4340 commit_trans:
4341 if (need_commit &&
4342 !atomic_read(&fs_info->open_ioctl_trans)) {
4343 need_commit--;
4344
4345 if (need_commit > 0) {
4346 btrfs_start_delalloc_roots(fs_info, 0, -1);
4347 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
4348 (u64)-1);
4349 }
4350
4351 trans = btrfs_join_transaction(root);
4352 if (IS_ERR(trans))
4353 return PTR_ERR(trans);
4354 if (have_pinned_space >= 0 ||
4355 test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4356 &trans->transaction->flags) ||
4357 need_commit > 0) {
4358 ret = btrfs_commit_transaction(trans);
4359 if (ret)
4360 return ret;
4361 /*
4362 * The cleaner kthread might still be doing iput
4363 * operations. Wait for it to finish so that
4364 * more space is released.
4365 */
4366 mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
4367 mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
4368 goto again;
4369 } else {
4370 btrfs_end_transaction(trans);
4371 }
4372 }
4373
4374 trace_btrfs_space_reservation(fs_info,
4375 "space_info:enospc",
4376 data_sinfo->flags, bytes, 1);
4377 return -ENOSPC;
4378 }
4379 data_sinfo->bytes_may_use += bytes;
4380 trace_btrfs_space_reservation(fs_info, "space_info",
4381 data_sinfo->flags, bytes, 1);
4382 spin_unlock(&data_sinfo->lock);
4383
4384 return 0;
4385 }
4386
4387 int btrfs_check_data_free_space(struct inode *inode,
4388 struct extent_changeset **reserved, u64 start, u64 len)
4389 {
4390 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4391 int ret;
4392
4393 /* align the range */
4394 len = round_up(start + len, fs_info->sectorsize) -
4395 round_down(start, fs_info->sectorsize);
4396 start = round_down(start, fs_info->sectorsize);
4397
4398 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4399 if (ret < 0)
4400 return ret;
4401
4402 /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4403 ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4404 if (ret < 0)
4405 btrfs_free_reserved_data_space_noquota(inode, start, len);
4406 else
4407 ret = 0;
4408 return ret;
4409 }
4410
4411 /*
4412 * Called if we need to clear a data reservation for this inode
4413 * Normally in an error case.
4414 *
4415 * This one will *NOT* use the accurate qgroup reserved space API, only for
4416 * cases where we can't sleep and are sure it won't affect the qgroup
4417 * reserved space, like clear_bit_hook().
4418 */
4419 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4420 u64 len)
4421 {
4422 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4423 struct btrfs_space_info *data_sinfo;
4424
4425 /* Make sure the range is aligned to sectorsize */
4426 len = round_up(start + len, fs_info->sectorsize) -
4427 round_down(start, fs_info->sectorsize);
4428 start = round_down(start, fs_info->sectorsize);
4429
4430 data_sinfo = fs_info->data_sinfo;
4431 spin_lock(&data_sinfo->lock);
4432 if (WARN_ON(data_sinfo->bytes_may_use < len))
4433 data_sinfo->bytes_may_use = 0;
4434 else
4435 data_sinfo->bytes_may_use -= len;
4436 trace_btrfs_space_reservation(fs_info, "space_info",
4437 data_sinfo->flags, len, 0);
4438 spin_unlock(&data_sinfo->lock);
4439 }
4440
4441 /*
4442 * Called if we need to clear a data reservation for this inode
4443 * Normally in an error case.
4444 *
4445 * This one will handle the per-inode data rsv map for accurate reserved
4446 * space framework.
4447 */
4448 void btrfs_free_reserved_data_space(struct inode *inode,
4449 struct extent_changeset *reserved, u64 start, u64 len)
4450 {
4451 struct btrfs_root *root = BTRFS_I(inode)->root;
4452
4453 /* Make sure the range is aligned to sectorsize */
4454 len = round_up(start + len, root->fs_info->sectorsize) -
4455 round_down(start, root->fs_info->sectorsize);
4456 start = round_down(start, root->fs_info->sectorsize);
4457
4458 btrfs_free_reserved_data_space_noquota(inode, start, len);
4459 btrfs_qgroup_free_data(inode, reserved, start, len);
4460 }
4461
4462 static void force_metadata_allocation(struct btrfs_fs_info *info)
4463 {
4464 struct list_head *head = &info->space_info;
4465 struct btrfs_space_info *found;
4466
4467 rcu_read_lock();
4468 list_for_each_entry_rcu(found, head, list) {
4469 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4470 found->force_alloc = CHUNK_ALLOC_FORCE;
4471 }
4472 rcu_read_unlock();
4473 }
4474
4475 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4476 {
4477 return (global->size << 1);
4478 }
4479
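/*
 * Decide whether a new chunk should be allocated for @sinfo, based on the
 * force level and on how much of the existing space is already in use.
 */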
4480 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4481 struct btrfs_space_info *sinfo, int force)
4482 {
4483 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4484 u64 bytes_used = btrfs_space_info_used(sinfo, false);
4485 u64 thresh;
4486
4487 if (force == CHUNK_ALLOC_FORCE)
4488 return 1;
4489
4490 /*
4491 * We need to take into account the global rsv because for all intents
4492 * and purposes it's used space. Don't worry about locking the
4493 * global_rsv, it doesn't change except when the transaction commits.
4494 */
4495 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4496 bytes_used += calc_global_rsv_need_space(global_rsv);
4497
4498 /*
4499 * in limited mode, we want to have some free space up to
4500 * about 1% of the FS size.
4501 */
4502 if (force == CHUNK_ALLOC_LIMITED) {
4503 thresh = btrfs_super_total_bytes(fs_info->super_copy);
4504 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4505
4506 if (sinfo->total_bytes - bytes_used < thresh)
4507 return 1;
4508 }
4509
4510 if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
4511 return 0;
4512 return 1;
4513 }
4514
4515 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4516 {
4517 u64 num_dev;
4518
4519 if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4520 BTRFS_BLOCK_GROUP_RAID0 |
4521 BTRFS_BLOCK_GROUP_RAID5 |
4522 BTRFS_BLOCK_GROUP_RAID6))
4523 num_dev = fs_info->fs_devices->rw_devices;
4524 else if (type & BTRFS_BLOCK_GROUP_RAID1)
4525 num_dev = 2;
4526 else
4527 num_dev = 1; /* DUP or single */
4528
4529 return num_dev;
4530 }
4531
4532 /*
4533 * Reserve space in the SYSTEM space info necessary for updating the device
4534 * items and adding or removing the chunk item when allocating or removing
4535 * a chunk.
4536 */
4537 void check_system_chunk(struct btrfs_trans_handle *trans,
4538 struct btrfs_fs_info *fs_info, u64 type)
4539 {
4540 struct btrfs_space_info *info;
4541 u64 left;
4542 u64 thresh;
4543 int ret = 0;
4544 u64 num_devs;
4545
4546 /*
4547 * Needed because we can end up allocating a system chunk, and we need an
4548 * atomic, race-free space reservation in the chunk block reserve.
4549 */
4550 ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
4551
4552 info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4553 spin_lock(&info->lock);
4554 left = info->total_bytes - btrfs_space_info_used(info, true);
4555 spin_unlock(&info->lock);
4556
4557 num_devs = get_profile_num_devs(fs_info, type);
4558
4559 /* num_devs device items to update and 1 chunk item to add or remove */
4560 thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4561 btrfs_calc_trans_metadata_size(fs_info, 1);
4562
4563 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4564 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4565 left, thresh, type);
4566 dump_space_info(fs_info, info, 0, 0);
4567 }
4568
4569 if (left < thresh) {
4570 u64 flags = btrfs_system_alloc_profile(fs_info);
4571
4572 /*
4573 * Ignore failure to create system chunk. We might end up not
4574 * needing it, as we might not need to COW all nodes/leafs from
4575 * the paths we visit in the chunk tree (they were already COWed
4576 * or created in the current transaction for example).
4577 */
4578 ret = btrfs_alloc_chunk(trans, fs_info, flags);
4579 }
4580
4581 if (!ret) {
4582 ret = btrfs_block_rsv_add(fs_info->chunk_root,
4583 &fs_info->chunk_block_rsv,
4584 thresh, BTRFS_RESERVE_NO_FLUSH);
4585 if (!ret)
4586 trans->chunk_bytes_reserved += thresh;
4587 }
4588 }
4589
4590 /*
4591 * If force is CHUNK_ALLOC_FORCE:
4592 * - return 1 if it successfully allocates a chunk,
4593 * - return errors including -ENOSPC otherwise.
4594 * If force is NOT CHUNK_ALLOC_FORCE:
4595 * - return 0 if it doesn't need to allocate a new chunk,
4596 * - return 1 if it successfully allocates a chunk,
4597 * - return errors including -ENOSPC otherwise.
4598 */
4599 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
4600 struct btrfs_fs_info *fs_info, u64 flags, int force)
4601 {
4602 struct btrfs_space_info *space_info;
4603 int wait_for_alloc = 0;
4604 int ret = 0;
4605
4606 /* Don't re-enter if we're already allocating a chunk */
4607 if (trans->allocating_chunk)
4608 return -ENOSPC;
4609
4610 space_info = __find_space_info(fs_info, flags);
4611 if (!space_info) {
4612 ret = create_space_info(fs_info, flags, &space_info);
4613 if (ret)
4614 return ret;
4615 }
4616
4617 again:
4618 spin_lock(&space_info->lock);
4619 if (force < space_info->force_alloc)
4620 force = space_info->force_alloc;
4621 if (space_info->full) {
4622 if (should_alloc_chunk(fs_info, space_info, force))
4623 ret = -ENOSPC;
4624 else
4625 ret = 0;
4626 spin_unlock(&space_info->lock);
4627 return ret;
4628 }
4629
4630 if (!should_alloc_chunk(fs_info, space_info, force)) {
4631 spin_unlock(&space_info->lock);
4632 return 0;
4633 } else if (space_info->chunk_alloc) {
4634 wait_for_alloc = 1;
4635 } else {
4636 space_info->chunk_alloc = 1;
4637 }
4638
4639 spin_unlock(&space_info->lock);
4640
4641 mutex_lock(&fs_info->chunk_mutex);
4642
4643 /*
4644 * The chunk_mutex is held throughout the entirety of a chunk
4645 * allocation, so once we've acquired the chunk_mutex we know that the
4646 * other guy is done and we need to recheck and see if we should
4647 * allocate.
4648 */
4649 if (wait_for_alloc) {
4650 mutex_unlock(&fs_info->chunk_mutex);
4651 wait_for_alloc = 0;
4652 cond_resched();
4653 goto again;
4654 }
4655
4656 trans->allocating_chunk = true;
4657
4658 /*
4659 * If we have mixed data/metadata chunks we want to make sure we keep
4660 * allocating mixed chunks instead of individual chunks.
4661 */
4662 if (btrfs_mixed_space_info(space_info))
4663 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4664
4665 /*
4666 * if we're doing a data chunk, go ahead and make sure that
4667 * we keep a reasonable number of metadata chunks allocated in the
4668 * FS as well.
4669 */
4670 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4671 fs_info->data_chunk_allocations++;
4672 if (!(fs_info->data_chunk_allocations %
4673 fs_info->metadata_ratio))
4674 force_metadata_allocation(fs_info);
4675 }
4676
4677 /*
4678 * Check if we have enough space in SYSTEM chunk because we may need
4679 * to update devices.
4680 */
4681 check_system_chunk(trans, fs_info, flags);
4682
4683 ret = btrfs_alloc_chunk(trans, fs_info, flags);
4684 trans->allocating_chunk = false;
4685
4686 spin_lock(&space_info->lock);
4687 if (ret < 0 && ret != -ENOSPC)
4688 goto out;
4689 if (ret) {
4690 space_info->full = 1;
4691 } else {
4692 ret = 1;
4693 space_info->max_extent_size = 0;
4694 }
4695
4696 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4697 out:
4698 space_info->chunk_alloc = 0;
4699 spin_unlock(&space_info->lock);
4700 mutex_unlock(&fs_info->chunk_mutex);
4701 /*
4702 * When we allocate a new chunk we reserve space in the chunk block
4703 * reserve to make sure we can COW nodes/leafs in the chunk tree or
4704 * add new nodes/leafs to it if we end up needing to do it when
4705 * inserting the chunk item and updating device items as part of the
4706 * second phase of chunk allocation, performed by
4707 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4708 * large number of new block groups to create in our transaction
4709 * handle's new_bgs list to avoid exhausting the chunk block reserve
4710 * in extreme cases - like having a single transaction create many new
4711 * block groups when starting to write out the free space caches of all
4712 * the block groups that were made dirty during the lifetime of the
4713 * transaction.
4714 */
4715 if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
4716 btrfs_create_pending_block_groups(trans, fs_info);
4717
4718 return ret;
4719 }
4720
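/*
 * Return 1 if a metadata reservation of @bytes may overcommit @space_info,
 * judging by the unallocated device space, the allocation profile and how
 * aggressively the caller is allowed to flush.
 */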
4721 static int can_overcommit(struct btrfs_fs_info *fs_info,
4722 struct btrfs_space_info *space_info, u64 bytes,
4723 enum btrfs_reserve_flush_enum flush,
4724 bool system_chunk)
4725 {
4726 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4727 u64 profile;
4728 u64 space_size;
4729 u64 avail;
4730 u64 used;
4731
4732 /* Don't overcommit when in mixed mode. */
4733 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4734 return 0;
4735
4736 if (system_chunk)
4737 profile = btrfs_system_alloc_profile(fs_info);
4738 else
4739 profile = btrfs_metadata_alloc_profile(fs_info);
4740
4741 used = btrfs_space_info_used(space_info, false);
4742
4743 /*
4744 * We only want to allow over committing if we have lots of actual space
4745 * free, but if we don't have enough space to handle the global reserve
4746 * space then we could end up having a real enospc problem when trying
4747 * to allocate a chunk or some other such important allocation.
4748 */
4749 spin_lock(&global_rsv->lock);
4750 space_size = calc_global_rsv_need_space(global_rsv);
4751 spin_unlock(&global_rsv->lock);
4752 if (used + space_size >= space_info->total_bytes)
4753 return 0;
4754
4755 used += space_info->bytes_may_use;
4756
4757 avail = atomic64_read(&fs_info->free_chunk_space);
4758
4759 /*
4760 * If we have dup, raid1 or raid10 then only half of the free
4761 * space is actually useable. For raid56, the space info used
4762 * doesn't include the parity drive, so we don't have to
4763 * change the math
4764 */
4765 if (profile & (BTRFS_BLOCK_GROUP_DUP |
4766 BTRFS_BLOCK_GROUP_RAID1 |
4767 BTRFS_BLOCK_GROUP_RAID10))
4768 avail >>= 1;
4769
4770 /*
4771 * If we aren't flushing all things, let us overcommit up to
4772 * 1/2 of the space. If we can flush, don't let us overcommit
4773 * too much; let it overcommit up to 1/8 of the space.
4774 */
4775 if (flush == BTRFS_RESERVE_FLUSH_ALL)
4776 avail >>= 3;
4777 else
4778 avail >>= 1;
4779
4780 if (used + bytes < space_info->total_bytes + avail)
4781 return 1;
4782 return 0;
4783 }
4784
4785 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
4786 unsigned long nr_pages, int nr_items)
4787 {
4788 struct super_block *sb = fs_info->sb;
4789
4790 if (down_read_trylock(&sb->s_umount)) {
4791 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4792 up_read(&sb->s_umount);
4793 } else {
4794 /*
4795 * We needn't worry about the filesystem going from r/w to r/o even
4796 * though we don't acquire the ->s_umount mutex, because the filesystem
4797 * should guarantee that the delalloc inode list is empty after the
4798 * filesystem becomes read-only (all dirty pages have been written to
4799 * the disk).
4800 */
4801 btrfs_start_delalloc_roots(fs_info, 0, nr_items);
4802 if (!current->journal_info)
4803 btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
4804 }
4805 }
4806
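/*
 * Number of metadata items we would need to reclaim in order to free
 * @to_reclaim bytes, at one transaction metadata unit per item.
 */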
4807 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4808 u64 to_reclaim)
4809 {
4810 u64 bytes;
4811 u64 nr;
4812
4813 bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
4814 nr = div64_u64(to_reclaim, bytes);
4815 if (!nr)
4816 nr = 1;
4817 return nr;
4818 }
4819
4820 #define EXTENT_SIZE_PER_ITEM SZ_256K
4821
4822 /*
4823 * shrink metadata reservation for delalloc
4824 */
4825 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4826 u64 orig, bool wait_ordered)
4827 {
4828 struct btrfs_space_info *space_info;
4829 struct btrfs_trans_handle *trans;
4830 u64 delalloc_bytes;
4831 u64 max_reclaim;
4832 u64 items;
4833 long time_left;
4834 unsigned long nr_pages;
4835 int loops;
4836 enum btrfs_reserve_flush_enum flush;
4837
4838 /* Calculate the number of pages we need to flush for this space reservation */
4839 items = calc_reclaim_items_nr(fs_info, to_reclaim);
4840 to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4841
4842 trans = (struct btrfs_trans_handle *)current->journal_info;
4843 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4844
4845 delalloc_bytes = percpu_counter_sum_positive(
4846 &fs_info->delalloc_bytes);
4847 if (delalloc_bytes == 0) {
4848 if (trans)
4849 return;
4850 if (wait_ordered)
4851 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4852 return;
4853 }
4854
4855 loops = 0;
4856 while (delalloc_bytes && loops < 3) {
4857 max_reclaim = min(delalloc_bytes, to_reclaim);
4858 nr_pages = max_reclaim >> PAGE_SHIFT;
4859 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
4860 /*
4861 * We need to wait for the async pages to actually start before
4862 * we do anything.
4863 */
4864 max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
4865 if (!max_reclaim)
4866 goto skip_async;
4867
4868 if (max_reclaim <= nr_pages)
4869 max_reclaim = 0;
4870 else
4871 max_reclaim -= nr_pages;
4872
4873 wait_event(fs_info->async_submit_wait,
4874 atomic_read(&fs_info->async_delalloc_pages) <=
4875 (int)max_reclaim);
4876 skip_async:
4877 if (!trans)
4878 flush = BTRFS_RESERVE_FLUSH_ALL;
4879 else
4880 flush = BTRFS_RESERVE_NO_FLUSH;
4881 spin_lock(&space_info->lock);
4882 if (list_empty(&space_info->tickets) &&
4883 list_empty(&space_info->priority_tickets)) {
4884 spin_unlock(&space_info->lock);
4885 break;
4886 }
4887 spin_unlock(&space_info->lock);
4888
4889 loops++;
4890 if (wait_ordered && !trans) {
4891 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4892 } else {
4893 time_left = schedule_timeout_killable(1);
4894 if (time_left)
4895 break;
4896 }
4897 delalloc_bytes = percpu_counter_sum_positive(
4898 &fs_info->delalloc_bytes);
4899 }
4900 }
4901
4902 struct reserve_ticket {
4903 u64 bytes;
4904 int error;
4905 struct list_head list;
4906 wait_queue_head_t wait;
4907 };
4908
4909 /**
4910 * may_commit_transaction - possibly commit the transaction if it's OK to
4911 * @fs_info - the filesystem we are trying to reserve space for
4912 * @space_info - the space_info we are trying to satisfy a reservation in
4914 *
4915 * This will check to make sure that committing the transaction will actually
4916 * get us somewhere and then commit the transaction if it does. Otherwise it
4917 * will return -ENOSPC.
4918 */
4919 static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4920 struct btrfs_space_info *space_info)
4921 {
4922 struct reserve_ticket *ticket = NULL;
4923 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
4924 struct btrfs_trans_handle *trans;
4925 u64 bytes;
4926
4927 trans = (struct btrfs_trans_handle *)current->journal_info;
4928 if (trans)
4929 return -EAGAIN;
4930
4931 spin_lock(&space_info->lock);
4932 if (!list_empty(&space_info->priority_tickets))
4933 ticket = list_first_entry(&space_info->priority_tickets,
4934 struct reserve_ticket, list);
4935 else if (!list_empty(&space_info->tickets))
4936 ticket = list_first_entry(&space_info->tickets,
4937 struct reserve_ticket, list);
4938 bytes = (ticket) ? ticket->bytes : 0;
4939 spin_unlock(&space_info->lock);
4940
4941 if (!bytes)
4942 return 0;
4943
4944 /* See if there is enough pinned space to make this reservation */
4945 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4946 bytes) >= 0)
4947 goto commit;
4948
4949 /*
4950 * See if there is some space in the delayed insertion reservation for
4951 * this reservation.
4952 */
4953 if (space_info != delayed_rsv->space_info)
4954 return -ENOSPC;
4955
4956 spin_lock(&delayed_rsv->lock);
4957 if (delayed_rsv->size > bytes)
4958 bytes = 0;
4959 else
4960 bytes -= delayed_rsv->size;
4961 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4962 bytes) < 0) {
4963 spin_unlock(&delayed_rsv->lock);
4964 return -ENOSPC;
4965 }
4966 spin_unlock(&delayed_rsv->lock);
4967
4968 commit:
4969 trans = btrfs_join_transaction(fs_info->extent_root);
4970 if (IS_ERR(trans))
4971 return -ENOSPC;
4972
4973 return btrfs_commit_transaction(trans);
4974 }
4975
4976 /*
4977 * Try to flush some data based on policy set by @state. This is only advisory
4978 * and may fail for various reasons. The caller is supposed to examine the
4979 * state of @space_info to detect the outcome.
4980 */
4981 static void flush_space(struct btrfs_fs_info *fs_info,
4982 struct btrfs_space_info *space_info, u64 num_bytes,
4983 int state)
4984 {
4985 struct btrfs_root *root = fs_info->extent_root;
4986 struct btrfs_trans_handle *trans;
4987 int nr;
4988 int ret = 0;
4989
4990 switch (state) {
4991 case FLUSH_DELAYED_ITEMS_NR:
4992 case FLUSH_DELAYED_ITEMS:
4993 if (state == FLUSH_DELAYED_ITEMS_NR)
4994 nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
4995 else
4996 nr = -1;
4997
4998 trans = btrfs_join_transaction(root);
4999 if (IS_ERR(trans)) {
5000 ret = PTR_ERR(trans);
5001 break;
5002 }
5003 ret = btrfs_run_delayed_items_nr(trans, fs_info, nr);
5004 btrfs_end_transaction(trans);
5005 break;
5006 case FLUSH_DELALLOC:
5007 case FLUSH_DELALLOC_WAIT:
5008 shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
5009 state == FLUSH_DELALLOC_WAIT);
5010 break;
5011 case ALLOC_CHUNK:
5012 trans = btrfs_join_transaction(root);
5013 if (IS_ERR(trans)) {
5014 ret = PTR_ERR(trans);
5015 break;
5016 }
5017 ret = do_chunk_alloc(trans, fs_info,
5018 btrfs_metadata_alloc_profile(fs_info),
5019 CHUNK_ALLOC_NO_FORCE);
5020 btrfs_end_transaction(trans);
5021 if (ret > 0 || ret == -ENOSPC)
5022 ret = 0;
5023 break;
5024 case COMMIT_TRANS:
5025 ret = may_commit_transaction(fs_info, space_info);
5026 break;
5027 default:
5028 ret = -ENOSPC;
5029 break;
5030 }
5031
5032 trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
5033 ret);
5034 return;
5035 }
5036
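/*
 * Estimate how many metadata bytes need to be reclaimed: the sum of all
 * pending reservation tickets or, if there are none, enough to bring usage
 * back under roughly 90-95% of the space_info (capped by what is actually
 * reserved or may_use).
 */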
5037 static inline u64
5038 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
5039 struct btrfs_space_info *space_info,
5040 bool system_chunk)
5041 {
5042 struct reserve_ticket *ticket;
5043 u64 used;
5044 u64 expected;
5045 u64 to_reclaim = 0;
5046
5047 list_for_each_entry(ticket, &space_info->tickets, list)
5048 to_reclaim += ticket->bytes;
5049 list_for_each_entry(ticket, &space_info->priority_tickets, list)
5050 to_reclaim += ticket->bytes;
5051 if (to_reclaim)
5052 return to_reclaim;
5053
5054 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
5055 if (can_overcommit(fs_info, space_info, to_reclaim,
5056 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
5057 return 0;
5058
5059 used = btrfs_space_info_used(space_info, true);
5060
5061 if (can_overcommit(fs_info, space_info, SZ_1M,
5062 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
5063 expected = div_factor_fine(space_info->total_bytes, 95);
5064 else
5065 expected = div_factor_fine(space_info->total_bytes, 90);
5066
5067 if (used > expected)
5068 to_reclaim = used - expected;
5069 else
5070 to_reclaim = 0;
5071 to_reclaim = min(to_reclaim, space_info->bytes_may_use +
5072 space_info->bytes_reserved);
5073 return to_reclaim;
5074 }
5075
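/*
 * Should the async reclaim worker be kicked?  Only when the space_info is
 * nearly full (but not already so full that flushing can't help) and the
 * filesystem isn't being closed or remounted.
 */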
5076 static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
5077 struct btrfs_space_info *space_info,
5078 u64 used, bool system_chunk)
5079 {
5080 u64 thresh = div_factor_fine(space_info->total_bytes, 98);
5081
5082 /* If we're just plain full then async reclaim just slows us down. */
5083 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
5084 return 0;
5085
5086 if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5087 system_chunk))
5088 return 0;
5089
5090 return (used >= thresh && !btrfs_fs_closing(fs_info) &&
5091 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
5092 }
5093
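/* Fail every reservation ticket on @head with -ENOSPC and wake its waiter. */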
5094 static void wake_all_tickets(struct list_head *head)
5095 {
5096 struct reserve_ticket *ticket;
5097
5098 while (!list_empty(head)) {
5099 ticket = list_first_entry(head, struct reserve_ticket, list);
5100 list_del_init(&ticket->list);
5101 ticket->error = -ENOSPC;
5102 wake_up(&ticket->wait);
5103 }
5104 }
5105
5106 /*
5107 * This is for normal flushers, we can wait all goddamned day if we want to. We
5108 * will loop and continuously try to flush as long as we are making progress.
5109 * We count progress as clearing off tickets each time we have to loop.
5110 */
5111 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
5112 {
5113 struct btrfs_fs_info *fs_info;
5114 struct btrfs_space_info *space_info;
5115 u64 to_reclaim;
5116 int flush_state;
5117 int commit_cycles = 0;
5118 u64 last_tickets_id;
5119
5120 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
5121 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5122
5123 spin_lock(&space_info->lock);
5124 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5125 false);
5126 if (!to_reclaim) {
5127 space_info->flush = 0;
5128 spin_unlock(&space_info->lock);
5129 return;
5130 }
5131 last_tickets_id = space_info->tickets_id;
5132 spin_unlock(&space_info->lock);
5133
5134 flush_state = FLUSH_DELAYED_ITEMS_NR;
5135 do {
5136 flush_space(fs_info, space_info, to_reclaim, flush_state);
5137 spin_lock(&space_info->lock);
5138 if (list_empty(&space_info->tickets)) {
5139 space_info->flush = 0;
5140 spin_unlock(&space_info->lock);
5141 return;
5142 }
5143 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
5144 space_info,
5145 false);
5146 if (last_tickets_id == space_info->tickets_id) {
5147 flush_state++;
5148 } else {
5149 last_tickets_id = space_info->tickets_id;
5150 flush_state = FLUSH_DELAYED_ITEMS_NR;
5151 if (commit_cycles)
5152 commit_cycles--;
5153 }
5154
5155 if (flush_state > COMMIT_TRANS) {
5156 commit_cycles++;
5157 if (commit_cycles > 2) {
5158 wake_all_tickets(&space_info->tickets);
5159 space_info->flush = 0;
5160 } else {
5161 flush_state = FLUSH_DELAYED_ITEMS_NR;
5162 }
5163 }
5164 spin_unlock(&space_info->lock);
5165 } while (flush_state <= COMMIT_TRANS);
5166 }
5167
5168 void btrfs_init_async_reclaim_work(struct work_struct *work)
5169 {
5170 INIT_WORK(work, btrfs_async_reclaim_metadata_space);
5171 }
5172
5173 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5174 struct btrfs_space_info *space_info,
5175 struct reserve_ticket *ticket)
5176 {
5177 u64 to_reclaim;
5178 int flush_state = FLUSH_DELAYED_ITEMS_NR;
5179
5180 spin_lock(&space_info->lock);
5181 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5182 false);
5183 if (!to_reclaim) {
5184 spin_unlock(&space_info->lock);
5185 return;
5186 }
5187 spin_unlock(&space_info->lock);
5188
5189 do {
5190 flush_space(fs_info, space_info, to_reclaim, flush_state);
5191 flush_state++;
5192 spin_lock(&space_info->lock);
5193 if (ticket->bytes == 0) {
5194 spin_unlock(&space_info->lock);
5195 return;
5196 }
5197 spin_unlock(&space_info->lock);
5198
5199 /*
5200 * Priority flushers can't wait on delalloc without
5201 * deadlocking.
5202 */
5203 if (flush_state == FLUSH_DELALLOC ||
5204 flush_state == FLUSH_DELALLOC_WAIT)
5205 flush_state = ALLOC_CHUNK;
5206 } while (flush_state < COMMIT_TRANS);
5207 }
5208
5209 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5210 struct btrfs_space_info *space_info,
5211 struct reserve_ticket *ticket, u64 orig_bytes)
5213 {
5214 DEFINE_WAIT(wait);
5215 int ret = 0;
5216
5217 spin_lock(&space_info->lock);
5218 while (ticket->bytes > 0 && ticket->error == 0) {
5219 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5220 if (ret) {
5221 ret = -EINTR;
5222 break;
5223 }
5224 spin_unlock(&space_info->lock);
5225
5226 schedule();
5227
5228 finish_wait(&ticket->wait, &wait);
5229 spin_lock(&space_info->lock);
5230 }
5231 if (!ret)
5232 ret = ticket->error;
5233 if (!list_empty(&ticket->list))
5234 list_del_init(&ticket->list);
5235 if (ticket->bytes && ticket->bytes < orig_bytes) {
5236 u64 num_bytes = orig_bytes - ticket->bytes;
5237 space_info->bytes_may_use -= num_bytes;
5238 trace_btrfs_space_reservation(fs_info, "space_info",
5239 space_info->flags, num_bytes, 0);
5240 }
5241 spin_unlock(&space_info->lock);
5242
5243 return ret;
5244 }
5245
5246 /**
5247 * __reserve_metadata_bytes - try to reserve bytes from a space_info
5248 * @fs_info - the filesystem we are reserving space on
5249 * @space_info - the space info we want to allocate from
5250 * @orig_bytes - the number of bytes we want
5251 * @flush - whether or not we can flush to make our reservation
5252 *
5253 * This will reserve orig_bytes number of bytes from the given space info.
5254 * If there is not enough space it will make an attempt to
5255 * flush out space to make room. It will do this by flushing delalloc if
5256 * possible or committing the transaction. If flush is 0 then no attempts to
5257 * regain reservations will be made and this will fail if there is not enough
5258 * space already.
5259 */
5260 static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
5261 struct btrfs_space_info *space_info,
5262 u64 orig_bytes,
5263 enum btrfs_reserve_flush_enum flush,
5264 bool system_chunk)
5265 {
5266 struct reserve_ticket ticket;
5267 u64 used;
5268 int ret = 0;
5269
5270 ASSERT(orig_bytes);
5271 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5272
5273 spin_lock(&space_info->lock);
5274 ret = -ENOSPC;
5275 used = btrfs_space_info_used(space_info, true);
5276
5277 /*
5278 * If we have enough space then hooray, make our reservation and carry
5279 * on. If not see if we can overcommit, and if we can, hooray carry on.
5280 * If not things get more complicated.
5281 */
5282 if (used + orig_bytes <= space_info->total_bytes) {
5283 space_info->bytes_may_use += orig_bytes;
5284 trace_btrfs_space_reservation(fs_info, "space_info",
5285 space_info->flags, orig_bytes, 1);
5286 ret = 0;
5287 } else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
5288 system_chunk)) {
5289 space_info->bytes_may_use += orig_bytes;
5290 trace_btrfs_space_reservation(fs_info, "space_info",
5291 space_info->flags, orig_bytes, 1);
5292 ret = 0;
5293 }
5294
5295 /*
5296 * If we couldn't make a reservation then set up our reservation ticket
5297 * and kick the async worker if it's not already running.
5298 *
5299 * If we are a priority flusher then we just need to add our ticket to
5300 * the list and we will do our own flushing further down.
5301 */
5302 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5303 ticket.bytes = orig_bytes;
5304 ticket.error = 0;
5305 init_waitqueue_head(&ticket.wait);
5306 if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5307 list_add_tail(&ticket.list, &space_info->tickets);
5308 if (!space_info->flush) {
5309 space_info->flush = 1;
5310 trace_btrfs_trigger_flush(fs_info,
5311 space_info->flags,
5312 orig_bytes, flush,
5313 "enospc");
5314 queue_work(system_unbound_wq,
5315 &fs_info->async_reclaim_work);
5316 }
5317 } else {
5318 list_add_tail(&ticket.list,
5319 &space_info->priority_tickets);
5320 }
5321 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5322 used += orig_bytes;
5323 /*
5324 * We will do the space reservation dance during log replay,
5325 * which means we won't have fs_info->fs_root set, so don't do
5326 * the async reclaim as we will panic.
5327 */
5328 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
5329 need_do_async_reclaim(fs_info, space_info,
5330 used, system_chunk) &&
5331 !work_busy(&fs_info->async_reclaim_work)) {
5332 trace_btrfs_trigger_flush(fs_info, space_info->flags,
5333 orig_bytes, flush, "preempt");
5334 queue_work(system_unbound_wq,
5335 &fs_info->async_reclaim_work);
5336 }
5337 }
5338 spin_unlock(&space_info->lock);
5339 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5340 return ret;
5341
5342 if (flush == BTRFS_RESERVE_FLUSH_ALL)
5343 return wait_reserve_ticket(fs_info, space_info, &ticket,
5344 orig_bytes);
5345
5346 ret = 0;
5347 priority_reclaim_metadata_space(fs_info, space_info, &ticket);
5348 spin_lock(&space_info->lock);
5349 if (ticket.bytes) {
5350 if (ticket.bytes < orig_bytes) {
5351 u64 num_bytes = orig_bytes - ticket.bytes;
5352 space_info->bytes_may_use -= num_bytes;
5353 trace_btrfs_space_reservation(fs_info, "space_info",
5354 space_info->flags,
5355 num_bytes, 0);
5356
5357 }
5358 list_del_init(&ticket.list);
5359 ret = -ENOSPC;
5360 }
5361 spin_unlock(&space_info->lock);
5362 ASSERT(list_empty(&ticket.list));
5363 return ret;
5364 }
5365
5366 /**
5367 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5368 * @root - the root we're allocating for
5369 * @block_rsv - the block_rsv we're allocating for
5370 * @orig_bytes - the number of bytes we want
5371 * @flush - whether or not we can flush to make our reservation
5372 *
5373 * This will reserve orig_bytes number of bytes from the space info associated
5374 * with the block_rsv. If there is not enough space it will make an attempt to
5375 * flush out space to make room. It will do this by flushing delalloc if
5376 * possible or committing the transaction. If flush is 0 then no attempts to
5377 * regain reservations will be made and this will fail if there is not enough
5378 * space already.
5379 */
5380 static int reserve_metadata_bytes(struct btrfs_root *root,
5381 struct btrfs_block_rsv *block_rsv,
5382 u64 orig_bytes,
5383 enum btrfs_reserve_flush_enum flush)
5384 {
5385 struct btrfs_fs_info *fs_info = root->fs_info;
5386 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5387 int ret;
5388 bool system_chunk = (root == fs_info->chunk_root);
5389
5390 ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5391 orig_bytes, flush, system_chunk);
5392 if (ret == -ENOSPC &&
5393 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5394 if (block_rsv != global_rsv &&
5395 !block_rsv_use_bytes(global_rsv, orig_bytes))
5396 ret = 0;
5397 }
5398 if (ret == -ENOSPC)
5399 trace_btrfs_space_reservation(fs_info, "space_info:enospc",
5400 block_rsv->space_info->flags,
5401 orig_bytes, 1);
5402 return ret;
5403 }
5404
5405 static struct btrfs_block_rsv *get_block_rsv(
5406 const struct btrfs_trans_handle *trans,
5407 const struct btrfs_root *root)
5408 {
5409 struct btrfs_fs_info *fs_info = root->fs_info;
5410 struct btrfs_block_rsv *block_rsv = NULL;
5411
5412 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5413 (root == fs_info->csum_root && trans->adding_csums) ||
5414 (root == fs_info->uuid_root))
5415 block_rsv = trans->block_rsv;
5416
5417 if (!block_rsv)
5418 block_rsv = root->block_rsv;
5419
5420 if (!block_rsv)
5421 block_rsv = &fs_info->empty_block_rsv;
5422
5423 return block_rsv;
5424 }
5425
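/*
 * Consume @num_bytes of a block_rsv's reserved space, or return -ENOSPC if
 * it doesn't have that much reserved.
 */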
5426 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5427 u64 num_bytes)
5428 {
5429 int ret = -ENOSPC;
5430 spin_lock(&block_rsv->lock);
5431 if (block_rsv->reserved >= num_bytes) {
5432 block_rsv->reserved -= num_bytes;
5433 if (block_rsv->reserved < block_rsv->size)
5434 block_rsv->full = 0;
5435 ret = 0;
5436 }
5437 spin_unlock(&block_rsv->lock);
5438 return ret;
5439 }
5440
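/*
 * Add @num_bytes to a block_rsv's reserved space, growing its size as well
 * when @update_size is set.
 */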
5441 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5442 u64 num_bytes, int update_size)
5443 {
5444 spin_lock(&block_rsv->lock);
5445 block_rsv->reserved += num_bytes;
5446 if (update_size)
5447 block_rsv->size += num_bytes;
5448 else if (block_rsv->reserved >= block_rsv->size)
5449 block_rsv->full = 1;
5450 spin_unlock(&block_rsv->lock);
5451 }
5452
5453 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5454 struct btrfs_block_rsv *dest, u64 num_bytes,
5455 int min_factor)
5456 {
5457 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5458 u64 min_bytes;
5459
5460 if (global_rsv->space_info != dest->space_info)
5461 return -ENOSPC;
5462
5463 spin_lock(&global_rsv->lock);
5464 min_bytes = div_factor(global_rsv->size, min_factor);
5465 if (global_rsv->reserved < min_bytes + num_bytes) {
5466 spin_unlock(&global_rsv->lock);
5467 return -ENOSPC;
5468 }
5469 global_rsv->reserved -= num_bytes;
5470 if (global_rsv->reserved < global_rsv->size)
5471 global_rsv->full = 0;
5472 spin_unlock(&global_rsv->lock);
5473
5474 block_rsv_add_bytes(dest, num_bytes, 1);
5475 return 0;
5476 }
5477
5478 /*
5479 * This is for space we already have accounted in space_info->bytes_may_use, so
5480 * basically when we're returning space from block_rsv's.
5481 */
5482 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5483 struct btrfs_space_info *space_info,
5484 u64 num_bytes)
5485 {
5486 struct reserve_ticket *ticket;
5487 struct list_head *head;
5488 u64 used;
5489 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5490 bool check_overcommit = false;
5491
5492 spin_lock(&space_info->lock);
5493 head = &space_info->priority_tickets;
5494
5495 /*
5496 * If we are over our limit then we need to check and see if we can
5497 * overcommit, and if we can't then we just need to free up our space
5498 * and not satisfy any requests.
5499 */
5500 used = btrfs_space_info_used(space_info, true);
5501 if (used - num_bytes >= space_info->total_bytes)
5502 check_overcommit = true;
5503 again:
5504 while (!list_empty(head) && num_bytes) {
5505 ticket = list_first_entry(head, struct reserve_ticket,
5506 list);
5507 /*
5508 * We use 0 bytes because this space is already reserved, so
5509 * adding the ticket space would be a double count.
5510 */
5511 if (check_overcommit &&
5512 !can_overcommit(fs_info, space_info, 0, flush, false))
5513 break;
5514 if (num_bytes >= ticket->bytes) {
5515 list_del_init(&ticket->list);
5516 num_bytes -= ticket->bytes;
5517 ticket->bytes = 0;
5518 space_info->tickets_id++;
5519 wake_up(&ticket->wait);
5520 } else {
5521 ticket->bytes -= num_bytes;
5522 num_bytes = 0;
5523 }
5524 }
5525
5526 if (num_bytes && head == &space_info->priority_tickets) {
5527 head = &space_info->tickets;
5528 flush = BTRFS_RESERVE_FLUSH_ALL;
5529 goto again;
5530 }
5531 space_info->bytes_may_use -= num_bytes;
5532 trace_btrfs_space_reservation(fs_info, "space_info",
5533 space_info->flags, num_bytes, 0);
5534 spin_unlock(&space_info->lock);
5535 }
5536
5537 /*
5538 * This is for newly allocated space that isn't accounted in
5539 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
5540 * we use this helper.
5541 */
5542 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5543 struct btrfs_space_info *space_info,
5544 u64 num_bytes)
5545 {
5546 struct reserve_ticket *ticket;
5547 struct list_head *head = &space_info->priority_tickets;
5548
5549 again:
5550 while (!list_empty(head) && num_bytes) {
5551 ticket = list_first_entry(head, struct reserve_ticket,
5552 list);
5553 if (num_bytes >= ticket->bytes) {
5554 trace_btrfs_space_reservation(fs_info, "space_info",
5555 space_info->flags,
5556 ticket->bytes, 1);
5557 list_del_init(&ticket->list);
5558 num_bytes -= ticket->bytes;
5559 space_info->bytes_may_use += ticket->bytes;
5560 ticket->bytes = 0;
5561 space_info->tickets_id++;
5562 wake_up(&ticket->wait);
5563 } else {
5564 trace_btrfs_space_reservation(fs_info, "space_info",
5565 space_info->flags,
5566 num_bytes, 1);
5567 space_info->bytes_may_use += num_bytes;
5568 ticket->bytes -= num_bytes;
5569 num_bytes = 0;
5570 }
5571 }
5572
5573 if (num_bytes && head == &space_info->priority_tickets) {
5574 head = &space_info->tickets;
5575 goto again;
5576 }
5577 }
5578
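/*
 * Shrink @block_rsv's size by @num_bytes and release any excess reserved
 * space, first into @dest (up to its size) and then back to the space_info.
 * Returns the number of reserved bytes that were released.
 */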
5579 static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5580 struct btrfs_block_rsv *block_rsv,
5581 struct btrfs_block_rsv *dest, u64 num_bytes)
5582 {
5583 struct btrfs_space_info *space_info = block_rsv->space_info;
5584 u64 ret;
5585
5586 spin_lock(&block_rsv->lock);
5587 if (num_bytes == (u64)-1)
5588 num_bytes = block_rsv->size;
5589 block_rsv->size -= num_bytes;
5590 if (block_rsv->reserved >= block_rsv->size) {
5591 num_bytes = block_rsv->reserved - block_rsv->size;
5592 block_rsv->reserved = block_rsv->size;
5593 block_rsv->full = 1;
5594 } else {
5595 num_bytes = 0;
5596 }
5597 spin_unlock(&block_rsv->lock);
5598
5599 ret = num_bytes;
5600 if (num_bytes > 0) {
5601 if (dest) {
5602 spin_lock(&dest->lock);
5603 if (!dest->full) {
5604 u64 bytes_to_add;
5605
5606 bytes_to_add = dest->size - dest->reserved;
5607 bytes_to_add = min(num_bytes, bytes_to_add);
5608 dest->reserved += bytes_to_add;
5609 if (dest->reserved >= dest->size)
5610 dest->full = 1;
5611 num_bytes -= bytes_to_add;
5612 }
5613 spin_unlock(&dest->lock);
5614 }
5615 if (num_bytes)
5616 space_info_add_old_bytes(fs_info, space_info,
5617 num_bytes);
5618 }
5619 return ret;
5620 }
5621
5622 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5623 struct btrfs_block_rsv *dst, u64 num_bytes,
5624 int update_size)
5625 {
5626 int ret;
5627
5628 ret = block_rsv_use_bytes(src, num_bytes);
5629 if (ret)
5630 return ret;
5631
5632 block_rsv_add_bytes(dst, num_bytes, update_size);
5633 return 0;
5634 }
5635
5636 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5637 {
5638 memset(rsv, 0, sizeof(*rsv));
5639 spin_lock_init(&rsv->lock);
5640 rsv->type = type;
5641 }
5642
5643 void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
5644 struct btrfs_block_rsv *rsv,
5645 unsigned short type)
5646 {
5647 btrfs_init_block_rsv(rsv, type);
5648 rsv->space_info = __find_space_info(fs_info,
5649 BTRFS_BLOCK_GROUP_METADATA);
5650 }
5651
5652 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5653 unsigned short type)
5654 {
5655 struct btrfs_block_rsv *block_rsv;
5656
5657 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5658 if (!block_rsv)
5659 return NULL;
5660
5661 btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
5662 return block_rsv;
5663 }
5664
5665 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
5666 struct btrfs_block_rsv *rsv)
5667 {
5668 if (!rsv)
5669 return;
5670 btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5671 kfree(rsv);
5672 }
5673
5674 void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
5675 {
5676 kfree(rsv);
5677 }
5678
5679 int btrfs_block_rsv_add(struct btrfs_root *root,
5680 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5681 enum btrfs_reserve_flush_enum flush)
5682 {
5683 int ret;
5684
5685 if (num_bytes == 0)
5686 return 0;
5687
5688 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5689 if (!ret) {
5690 block_rsv_add_bytes(block_rsv, num_bytes, 1);
5691 return 0;
5692 }
5693
5694 return ret;
5695 }
5696
5697 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
5698 {
5699 u64 num_bytes = 0;
5700 int ret = -ENOSPC;
5701
5702 if (!block_rsv)
5703 return 0;
5704
5705 spin_lock(&block_rsv->lock);
5706 num_bytes = div_factor(block_rsv->size, min_factor);
5707 if (block_rsv->reserved >= num_bytes)
5708 ret = 0;
5709 spin_unlock(&block_rsv->lock);
5710
5711 return ret;
5712 }
5713
5714 int btrfs_block_rsv_refill(struct btrfs_root *root,
5715 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5716 enum btrfs_reserve_flush_enum flush)
5717 {
5718 u64 num_bytes = 0;
5719 int ret = -ENOSPC;
5720
5721 if (!block_rsv)
5722 return 0;
5723
5724 spin_lock(&block_rsv->lock);
5725 num_bytes = min_reserved;
5726 if (block_rsv->reserved >= num_bytes)
5727 ret = 0;
5728 else
5729 num_bytes -= block_rsv->reserved;
5730 spin_unlock(&block_rsv->lock);
5731
5732 if (!ret)
5733 return 0;
5734
5735 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5736 if (!ret) {
5737 block_rsv_add_bytes(block_rsv, num_bytes, 0);
5738 return 0;
5739 }
5740
5741 return ret;
5742 }
5743
5744 /**
5745 * btrfs_inode_rsv_refill - refill the inode block rsv.
5746 * @inode - the inode we are refilling.
5747 * @flush - the flushing restriction.
5748 *
5749 * Essentially the same as btrfs_block_rsv_refill, except it uses the
5750 * block_rsv->size as the minimum size. We'll either refill the missing amount
5751 * or return if we already have enough space. This will also handle the reserve
5752 * tracepoint for the reserved amount.
5753 */
5754 int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
5755 enum btrfs_reserve_flush_enum flush)
5756 {
5757 struct btrfs_root *root = inode->root;
5758 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5759 u64 num_bytes = 0;
5760 int ret = -ENOSPC;
5761
5762 spin_lock(&block_rsv->lock);
5763 if (block_rsv->reserved < block_rsv->size)
5764 num_bytes = block_rsv->size - block_rsv->reserved;
5765 spin_unlock(&block_rsv->lock);
5766
5767 if (num_bytes == 0)
5768 return 0;
5769
5770 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5771 if (!ret) {
5772 block_rsv_add_bytes(block_rsv, num_bytes, 0);
5773 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5774 btrfs_ino(inode), num_bytes, 1);
5775 }
5776 return ret;
5777 }
5778
5779 /**
5780 * btrfs_inode_rsv_release - release any excessive reservation.
5781 * @inode - the inode we need to release from.
5782 *
5783 * This is the same as btrfs_block_rsv_release, except that it handles the
5784 * tracepoint for the reservation.
5785 */
5786 void btrfs_inode_rsv_release(struct btrfs_inode *inode)
5787 {
5788 struct btrfs_fs_info *fs_info = inode->root->fs_info;
5789 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5790 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5791 u64 released = 0;
5792
5793 /*
5794 * Since we statically set the block_rsv->size we just want to say we
5795 * are releasing 0 bytes, and then we'll just get the reservation over
5796 * the size freed.
5797 */
5798 released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0);
5799 if (released > 0)
5800 trace_btrfs_space_reservation(fs_info, "delalloc",
5801 btrfs_ino(inode), released, 0);
5802 }
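/*
 * Illustrative sketch (hypothetical caller, not a call site from this file)
 * of how the two inode rsv helpers above pair up: the rsv size is
 * recalculated elsewhere under inode->lock, the refill tops the reservation
 * up to that size, and the release hands back whatever ends up above it.
 */
static inline int example_inode_rsv_cycle(struct btrfs_inode *inode)
{
	int ret;

	ret = btrfs_inode_rsv_refill(inode, BTRFS_RESERVE_FLUSH_ALL);
	if (ret)
		return ret;

	/* ... the delalloc work this reservation backs ... */

	btrfs_inode_rsv_release(inode);
	return 0;
}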
5803
5804 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5805 struct btrfs_block_rsv *block_rsv,
5806 u64 num_bytes)
5807 {
5808 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5809
5810 if (global_rsv == block_rsv ||
5811 block_rsv->space_info != global_rsv->space_info)
5812 global_rsv = NULL;
5813 block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes);
5814 }
5815
5816 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5817 {
5818 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5819 struct btrfs_space_info *sinfo = block_rsv->space_info;
5820 u64 num_bytes;
5821
5822 /*
5823 * The global block rsv is based on the size of the extent tree, the
5824 * checksum tree and the root tree. If the fs is empty we want to set
5825 * it to a minimal amount for safety.
5826 */
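	/*
	 * Worked example (hypothetical sizes): if the three roots use 5M in
	 * total, num_bytes below becomes max(5M, 16M) = 16M; on a filesystem
	 * where they use 2G, the rsv size is clamped to min(2G, 512M) = 512M.
	 */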
5827 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5828 btrfs_root_used(&fs_info->csum_root->root_item) +
5829 btrfs_root_used(&fs_info->tree_root->root_item);
5830 num_bytes = max_t(u64, num_bytes, SZ_16M);
5831
5832 spin_lock(&sinfo->lock);
5833 spin_lock(&block_rsv->lock);
5834
5835 block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5836
5837 if (block_rsv->reserved < block_rsv->size) {
5838 num_bytes = btrfs_space_info_used(sinfo, true);
5839 if (sinfo->total_bytes > num_bytes) {
5840 num_bytes = sinfo->total_bytes - num_bytes;
5841 num_bytes = min(num_bytes,
5842 block_rsv->size - block_rsv->reserved);
5843 block_rsv->reserved += num_bytes;
5844 sinfo->bytes_may_use += num_bytes;
5845 trace_btrfs_space_reservation(fs_info, "space_info",
5846 sinfo->flags, num_bytes,
5847 1);
5848 }
5849 } else if (block_rsv->reserved > block_rsv->size) {
5850 num_bytes = block_rsv->reserved - block_rsv->size;
5851 sinfo->bytes_may_use -= num_bytes;
5852 trace_btrfs_space_reservation(fs_info, "space_info",
5853 sinfo->flags, num_bytes, 0);
5854 block_rsv->reserved = block_rsv->size;
5855 }
5856
5857 if (block_rsv->reserved == block_rsv->size)
5858 block_rsv->full = 1;
5859 else
5860 block_rsv->full = 0;
5861
5862 spin_unlock(&block_rsv->lock);
5863 spin_unlock(&sinfo->lock);
5864 }
5865
5866 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5867 {
5868 struct btrfs_space_info *space_info;
5869
5870 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5871 fs_info->chunk_block_rsv.space_info = space_info;
5872
5873 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5874 fs_info->global_block_rsv.space_info = space_info;
5875 fs_info->trans_block_rsv.space_info = space_info;
5876 fs_info->empty_block_rsv.space_info = space_info;
5877 fs_info->delayed_block_rsv.space_info = space_info;
5878
5879 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5880 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5881 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5882 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5883 if (fs_info->quota_root)
5884 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5885 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5886
5887 update_global_block_rsv(fs_info);
5888 }
5889
5890 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5891 {
5892 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5893 (u64)-1);
5894 WARN_ON(fs_info->trans_block_rsv.size > 0);
5895 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5896 WARN_ON(fs_info->chunk_block_rsv.size > 0);
5897 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5898 WARN_ON(fs_info->delayed_block_rsv.size > 0);
5899 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5900 }
5901
5902 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
5903 struct btrfs_fs_info *fs_info)
5904 {
5905 if (!trans->block_rsv) {
5906 ASSERT(!trans->bytes_reserved);
5907 return;
5908 }
5909
5910 if (!trans->bytes_reserved)
5911 return;
5912
5913 ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
5914 trace_btrfs_space_reservation(fs_info, "transaction",
5915 trans->transid, trans->bytes_reserved, 0);
5916 btrfs_block_rsv_release(fs_info, trans->block_rsv,
5917 trans->bytes_reserved);
5918 trans->bytes_reserved = 0;
5919 }
5920
5921 /*
5922 * To be called after all the new block groups attached to the transaction
5923 * handle have been created (btrfs_create_pending_block_groups()).
5924 */
5925 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5926 {
5927 struct btrfs_fs_info *fs_info = trans->fs_info;
5928
5929 if (!trans->chunk_bytes_reserved)
5930 return;
5931
5932 WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5933
5934 block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5935 trans->chunk_bytes_reserved);
5936 trans->chunk_bytes_reserved = 0;
5937 }
5938
5939 /* Can only return 0 or -ENOSPC */
5940 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
5941 struct btrfs_inode *inode)
5942 {
5943 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
5944 struct btrfs_root *root = inode->root;
5945 /*
5946 * We always use trans->block_rsv here as we will have reserved space
5947 * for our orphan when starting the transaction, using get_block_rsv()
5948 * here will sometimes make us choose the wrong block rsv as we could be
5949 * doing a reloc inode for a non-refcounted root.
5950 */
5951 struct btrfs_block_rsv *src_rsv = trans->block_rsv;
5952 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
5953
5954 /*
5955 * We need to hold space in order to delete our orphan item once we've
5956 * added it, so this takes the reservation so we can release it later
5957 * when we are truly done with the orphan item.
5958 */
5959 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
5960
5961 trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
5962 num_bytes, 1);
5963 return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
5964 }
5965
5966 void btrfs_orphan_release_metadata(struct btrfs_inode *inode)
5967 {
5968 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
5969 struct btrfs_root *root = inode->root;
5970 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
5971
5972 trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
5973 num_bytes, 0);
5974 btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes);
5975 }
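/*
 * Illustrative pairing of the two orphan helpers above (hypothetical caller,
 * not taken from this file): space is migrated out of the running
 * transaction's rsv when the orphan item is added and handed back once the
 * orphan item has been deleted.
 */
static inline int example_orphan_rsv(struct btrfs_trans_handle *trans,
				     struct btrfs_inode *inode)
{
	int ret;

	ret = btrfs_orphan_reserve_metadata(trans, inode); /* 0 or -ENOSPC */
	if (ret)
		return ret;

	/* ... add the orphan item, do the truncate, delete the orphan ... */

	btrfs_orphan_release_metadata(inode);
	return 0;
}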
5976
5977 /*
5978 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5979 * root: the root of the parent directory
5980 * rsv: block reservation
5981 * items: the number of items that we need to reserve
5982 * qgroup_reserved: used to return the reserved size in qgroup
5983 *
5984 * This function is used to reserve the space for snapshot/subvolume
5985 * creation and deletion. Those operations are different with the
5986 * common file/directory operations, they change two fs/file trees
5987 * and root tree, the number of items that the qgroup reserves is
5988 * different with the free space reservation. So we can not use
5989 * the space reservation mechanism in start_transaction().
5990 */
5991 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5992 struct btrfs_block_rsv *rsv,
5993 int items,
5994 u64 *qgroup_reserved,
5995 bool use_global_rsv)
5996 {
5997 u64 num_bytes;
5998 int ret;
5999 struct btrfs_fs_info *fs_info = root->fs_info;
6000 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6001
6002 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
6003 /* One for parent inode, two for dir entries */
6004 num_bytes = 3 * fs_info->nodesize;
6005 ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
6006 if (ret)
6007 return ret;
6008 } else {
6009 num_bytes = 0;
6010 }
6011
6012 *qgroup_reserved = num_bytes;
6013
6014 num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
6015 rsv->space_info = __find_space_info(fs_info,
6016 BTRFS_BLOCK_GROUP_METADATA);
6017 ret = btrfs_block_rsv_add(root, rsv, num_bytes,
6018 BTRFS_RESERVE_FLUSH_ALL);
6019
6020 if (ret == -ENOSPC && use_global_rsv)
6021 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
6022
6023 if (ret && *qgroup_reserved)
6024 btrfs_qgroup_free_meta(root, *qgroup_reserved);
6025
6026 return ret;
6027 }
6028
6029 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
6030 struct btrfs_block_rsv *rsv)
6031 {
6032 btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
6033 }
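/*
 * Illustrative sketch of the subvolume reservation helpers above
 * (hypothetical caller; the item count and the rsv type are assumptions for
 * the example, not taken from the real ioctl paths).
 */
static inline int example_subvolume_rsv(struct btrfs_root *parent_root)
{
	struct btrfs_block_rsv rsv;
	u64 qgroup_reserved = 0;
	int ret;

	btrfs_init_block_rsv(&rsv, BTRFS_BLOCK_RSV_TEMP);
	/* assume roughly 8 tree items are touched by the operation */
	ret = btrfs_subvolume_reserve_metadata(parent_root, &rsv, 8,
					       &qgroup_reserved, false);
	if (ret)
		return ret;

	/* ... start the transaction and create/delete the subvolume ... */

	btrfs_subvolume_release_metadata(parent_root->fs_info, &rsv);
	return 0;
}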
6034
6035 static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
6036 struct btrfs_inode *inode)
6037 {
6038 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
6039 u64 reserve_size = 0;
6040 u64 csum_leaves;
6041 unsigned outstanding_extents;
6042
6043 lockdep_assert_held(&inode->lock);
6044 outstanding_extents = inode->outstanding_extents;
6045 if (outstanding_extents)
6046 reserve_size = btrfs_calc_trans_metadata_size(fs_info,
6047 outstanding_extents + 1);
6048 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
6049 inode->csum_bytes);
6050 reserve_size += btrfs_calc_trans_metadata_size(fs_info,
6051 csum_leaves);
6052
6053 spin_lock(&block_rsv->lock);
6054 block_rsv->size = reserve_size;
6055 spin_unlock(&block_rsv->lock);
6056 }
6057
6058 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
6059 {
6060 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6061 struct btrfs_root *root = inode->root;
6062 unsigned nr_extents;
6063 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
6064 int ret = 0;
6065 bool delalloc_lock = true;
6066
6067 /* If we are a free space inode we need to not flush since we will be in
6068 * the middle of a transaction commit. We also don't need the delalloc
6069 * mutex since we won't race with anybody. We need this mostly to make
6070 * lockdep shut its filthy mouth.
6071 *
6072 * If we have a transaction open (can happen if we call truncate_block
6073 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
6074 */
6075 if (btrfs_is_free_space_inode(inode)) {
6076 flush = BTRFS_RESERVE_NO_FLUSH;
6077 delalloc_lock = false;
6078 } else if (current->journal_info) {
6079 flush = BTRFS_RESERVE_FLUSH_LIMIT;
6080 }
6081
6082 if (flush != BTRFS_RESERVE_NO_FLUSH &&
6083 btrfs_transaction_in_commit(fs_info))
6084 schedule_timeout(1);
6085
6086 if (delalloc_lock)
6087 mutex_lock(&inode->delalloc_mutex);
6088
6089 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6090
6091 /* Add our new extents and calculate the new rsv size. */
6092 spin_lock(&inode->lock);
6093 nr_extents = count_max_extents(num_bytes);
6094 btrfs_mod_outstanding_extents(inode, nr_extents);
6095 inode->csum_bytes += num_bytes;
6096 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6097 spin_unlock(&inode->lock);
6098
6099 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
6100 ret = btrfs_qgroup_reserve_meta(root,
6101 nr_extents * fs_info->nodesize, true);
6102 if (ret)
6103 goto out_fail;
6104 }
6105
6106 ret = btrfs_inode_rsv_refill(inode, flush);
6107 if (unlikely(ret)) {
6108 btrfs_qgroup_free_meta(root,
6109 nr_extents * fs_info->nodesize);
6110 goto out_fail;
6111 }
6112
6113 if (delalloc_lock)
6114 mutex_unlock(&inode->delalloc_mutex);
6115 return 0;
6116
6117 out_fail:
6118 spin_lock(&inode->lock);
6119 nr_extents = count_max_extents(num_bytes);
6120 btrfs_mod_outstanding_extents(inode, -nr_extents);
6121 inode->csum_bytes -= num_bytes;
6122 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6123 spin_unlock(&inode->lock);
6124
6125 btrfs_inode_rsv_release(inode);
6126 if (delalloc_lock)
6127 mutex_unlock(&inode->delalloc_mutex);
6128 return ret;
6129 }
6130
6131 /**
6132 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
6133 * @inode: the inode to release the reservation for.
6134 * @num_bytes: the number of bytes we are releasing.
6135 *
6136 * This will release the metadata reservation for an inode. This can be called
6137 * once we complete IO for a given set of bytes to release their metadata
6138 * reservations, or on error for the same reason.
6139 */
6140 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
6141 {
6142 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6143
6144 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6145 spin_lock(&inode->lock);
6146 inode->csum_bytes -= num_bytes;
6147 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6148 spin_unlock(&inode->lock);
6149
6150 if (btrfs_is_testing(fs_info))
6151 return;
6152
6153 btrfs_inode_rsv_release(inode);
6154 }
6155
6156 /**
6157 * btrfs_delalloc_release_extents - release our outstanding_extents
6158 * @inode: the inode to balance the reservation for.
6159 * @num_bytes: the number of bytes originally reserved with btrfs_delalloc_reserve_metadata
6160 *
6161 * When we reserve space we increase outstanding_extents for the extents we may
6162 * add. Once we've set the range as delalloc or created our ordered extents we
6163 * have outstanding_extents to track the real usage, so we use this to free our
6164 * temporarily tracked outstanding_extents. This _must_ be used in conjunction
6165 * with btrfs_delalloc_reserve_metadata.
6166 */
6167 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
6168 {
6169 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6170 unsigned num_extents;
6171
6172 spin_lock(&inode->lock);
6173 num_extents = count_max_extents(num_bytes);
6174 btrfs_mod_outstanding_extents(inode, -num_extents);
6175 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6176 spin_unlock(&inode->lock);
6177
6178 if (btrfs_is_testing(fs_info))
6179 return;
6180
6181 btrfs_inode_rsv_release(inode);
6182 }
6183
6184 /**
6185 * btrfs_delalloc_reserve_space - reserve data and metadata space for
6186 * delalloc
6187 * @inode: inode we're writing to
6188 * @start: the start of the range we are writing to
6189 * @len: the length of the range we are writing
6190 * @reserved: mandatory parameter, record actually reserved qgroup ranges of
6191 * current reservation.
6192 *
6193 * This will do the following things
6194 *
6195 * o reserve space in data space info for num bytes
6196 * and reserve the corresponding qgroup space
6197 * (Done in check_data_free_space)
6198 *
6199 * o reserve space for metadata space, based on the number of outstanding
6200 * extents and how much csums will be needed
6201 * also reserve metadata space in a per root over-reserve method.
6202 * o add to the inodes->delalloc_bytes
6203 * o add it to the fs_info's delalloc inodes list.
6204 * (Above 3 all done in delalloc_reserve_metadata)
6205 *
6206 * Return 0 for success
6207 * Return <0 for error (-ENOSPC or -EDQUOT)
6208 */
6209 int btrfs_delalloc_reserve_space(struct inode *inode,
6210 struct extent_changeset **reserved, u64 start, u64 len)
6211 {
6212 int ret;
6213
6214 ret = btrfs_check_data_free_space(inode, reserved, start, len);
6215 if (ret < 0)
6216 return ret;
6217 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
6218 if (ret < 0)
6219 btrfs_free_reserved_data_space(inode, *reserved, start, len);
6220 return ret;
6221 }
6222
6223 /**
6224 * btrfs_delalloc_release_space - release data and metadata space for delalloc
6225 * @inode: inode we're releasing space for
6226 * @reserved: record of the qgroup ranges actually reserved for this range
6227 * @start: start position of the space already reserved
6228 * @len: the length of the space already reserved
6229 *
6230 * This function will release the metadata space that was not used and will
6231 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
6232 * list if there are no delalloc bytes left.
6233 * Also it will handle the qgroup reserved space.
6234 */
6235 void btrfs_delalloc_release_space(struct inode *inode,
6236 struct extent_changeset *reserved,
6237 u64 start, u64 len)
6238 {
6239 btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
6240 btrfs_free_reserved_data_space(inode, reserved, start, len);
6241 }
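/*
 * Illustrative sketch of the reserve/release pairing above (hypothetical
 * caller, not a real call site).  On success the reservation is consumed by
 * the ordered extent machinery and released piecemeal via
 * btrfs_delalloc_release_metadata(); on failure the whole range is handed
 * back at once.  Either way the temporary outstanding_extents bump is
 * dropped with btrfs_delalloc_release_extents().  @reserved is freed later
 * by the caller's normal cleanup (not shown).
 */
static inline int example_buffered_write_rsv(struct inode *inode,
					     u64 start, u64 len)
{
	struct extent_changeset *reserved = NULL;
	int ret;

	ret = btrfs_delalloc_reserve_space(inode, &reserved, start, len);
	if (ret)
		return ret;

	ret = 0; /* ... copy pages in and mark the range delalloc ... */
	if (ret)
		btrfs_delalloc_release_space(inode, reserved, start, len);

	btrfs_delalloc_release_extents(BTRFS_I(inode), len);
	return ret;
}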
6242
6243 static int update_block_group(struct btrfs_trans_handle *trans,
6244 struct btrfs_fs_info *info, u64 bytenr,
6245 u64 num_bytes, int alloc)
6246 {
6247 struct btrfs_block_group_cache *cache = NULL;
6248 u64 total = num_bytes;
6249 u64 old_val;
6250 u64 byte_in_group;
6251 int factor;
6252
6253 /* block accounting for super block */
6254 spin_lock(&info->delalloc_root_lock);
6255 old_val = btrfs_super_bytes_used(info->super_copy);
6256 if (alloc)
6257 old_val += num_bytes;
6258 else
6259 old_val -= num_bytes;
6260 btrfs_set_super_bytes_used(info->super_copy, old_val);
6261 spin_unlock(&info->delalloc_root_lock);
6262
6263 while (total) {
6264 cache = btrfs_lookup_block_group(info, bytenr);
6265 if (!cache)
6266 return -ENOENT;
6267 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
6268 BTRFS_BLOCK_GROUP_RAID1 |
6269 BTRFS_BLOCK_GROUP_RAID10))
6270 factor = 2;
6271 else
6272 factor = 1;
6273 /*
6274 * If this block group has free space cache written out, we
6275 * need to make sure to load it if we are removing space. This
6276 * is because we need the unpinning stage to actually add the
6277 * space back to the block group, otherwise we will leak space.
6278 */
6279 if (!alloc && cache->cached == BTRFS_CACHE_NO)
6280 cache_block_group(cache, 1);
6281
6282 byte_in_group = bytenr - cache->key.objectid;
6283 WARN_ON(byte_in_group > cache->key.offset);
6284
6285 spin_lock(&cache->space_info->lock);
6286 spin_lock(&cache->lock);
6287
6288 if (btrfs_test_opt(info, SPACE_CACHE) &&
6289 cache->disk_cache_state < BTRFS_DC_CLEAR)
6290 cache->disk_cache_state = BTRFS_DC_CLEAR;
6291
6292 old_val = btrfs_block_group_used(&cache->item);
6293 num_bytes = min(total, cache->key.offset - byte_in_group);
6294 if (alloc) {
6295 old_val += num_bytes;
6296 btrfs_set_block_group_used(&cache->item, old_val);
6297 cache->reserved -= num_bytes;
6298 cache->space_info->bytes_reserved -= num_bytes;
6299 cache->space_info->bytes_used += num_bytes;
6300 cache->space_info->disk_used += num_bytes * factor;
6301 spin_unlock(&cache->lock);
6302 spin_unlock(&cache->space_info->lock);
6303 } else {
6304 old_val -= num_bytes;
6305 btrfs_set_block_group_used(&cache->item, old_val);
6306 cache->pinned += num_bytes;
6307 cache->space_info->bytes_pinned += num_bytes;
6308 cache->space_info->bytes_used -= num_bytes;
6309 cache->space_info->disk_used -= num_bytes * factor;
6310 spin_unlock(&cache->lock);
6311 spin_unlock(&cache->space_info->lock);
6312
6313 trace_btrfs_space_reservation(info, "pinned",
6314 cache->space_info->flags,
6315 num_bytes, 1);
6316 percpu_counter_add(&cache->space_info->total_bytes_pinned,
6317 num_bytes);
6318 set_extent_dirty(info->pinned_extents,
6319 bytenr, bytenr + num_bytes - 1,
6320 GFP_NOFS | __GFP_NOFAIL);
6321 }
6322
6323 spin_lock(&trans->transaction->dirty_bgs_lock);
6324 if (list_empty(&cache->dirty_list)) {
6325 list_add_tail(&cache->dirty_list,
6326 &trans->transaction->dirty_bgs);
6327 trans->transaction->num_dirty_bgs++;
6328 btrfs_get_block_group(cache);
6329 }
6330 spin_unlock(&trans->transaction->dirty_bgs_lock);
6331
6332 /*
6333 * No longer have used bytes in this block group, queue it for
6334 * deletion. We do this after adding the block group to the
6335 * dirty list to avoid races between cleaner kthread and space
6336 * cache writeout.
6337 */
6338 if (!alloc && old_val == 0) {
6339 spin_lock(&info->unused_bgs_lock);
6340 if (list_empty(&cache->bg_list)) {
6341 btrfs_get_block_group(cache);
6342 list_add_tail(&cache->bg_list,
6343 &info->unused_bgs);
6344 }
6345 spin_unlock(&info->unused_bgs_lock);
6346 }
6347
6348 btrfs_put_block_group(cache);
6349 total -= num_bytes;
6350 bytenr += num_bytes;
6351 }
6352 return 0;
6353 }
6354
6355 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
6356 {
6357 struct btrfs_block_group_cache *cache;
6358 u64 bytenr;
6359
6360 spin_lock(&fs_info->block_group_cache_lock);
6361 bytenr = fs_info->first_logical_byte;
6362 spin_unlock(&fs_info->block_group_cache_lock);
6363
6364 if (bytenr < (u64)-1)
6365 return bytenr;
6366
6367 cache = btrfs_lookup_first_block_group(fs_info, search_start);
6368 if (!cache)
6369 return 0;
6370
6371 bytenr = cache->key.objectid;
6372 btrfs_put_block_group(cache);
6373
6374 return bytenr;
6375 }
6376
6377 static int pin_down_extent(struct btrfs_fs_info *fs_info,
6378 struct btrfs_block_group_cache *cache,
6379 u64 bytenr, u64 num_bytes, int reserved)
6380 {
6381 spin_lock(&cache->space_info->lock);
6382 spin_lock(&cache->lock);
6383 cache->pinned += num_bytes;
6384 cache->space_info->bytes_pinned += num_bytes;
6385 if (reserved) {
6386 cache->reserved -= num_bytes;
6387 cache->space_info->bytes_reserved -= num_bytes;
6388 }
6389 spin_unlock(&cache->lock);
6390 spin_unlock(&cache->space_info->lock);
6391
6392 trace_btrfs_space_reservation(fs_info, "pinned",
6393 cache->space_info->flags, num_bytes, 1);
6394 percpu_counter_add(&cache->space_info->total_bytes_pinned, num_bytes);
6395 set_extent_dirty(fs_info->pinned_extents, bytenr,
6396 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6397 return 0;
6398 }
6399
6400 /*
6401 * this function must be called within a transaction
6402 */
6403 int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
6404 u64 bytenr, u64 num_bytes, int reserved)
6405 {
6406 struct btrfs_block_group_cache *cache;
6407
6408 cache = btrfs_lookup_block_group(fs_info, bytenr);
6409 BUG_ON(!cache); /* Logic error */
6410
6411 pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
6412
6413 btrfs_put_block_group(cache);
6414 return 0;
6415 }
6416
6417 /*
6418 * this function must be called within transaction
6419 */
6420 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
6421 u64 bytenr, u64 num_bytes)
6422 {
6423 struct btrfs_block_group_cache *cache;
6424 int ret;
6425
6426 cache = btrfs_lookup_block_group(fs_info, bytenr);
6427 if (!cache)
6428 return -EINVAL;
6429
6430 /*
6431 * pull in the free space cache (if any) so that our pin
6432 * removes the free space from the cache. We have load_only set
6433 * to one because the slow code to read in the free extents does check
6434 * the pinned extents.
6435 */
6436 cache_block_group(cache, 1);
6437
6438 pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
6439
6440 /* remove us from the free space cache (if we're there at all) */
6441 ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6442 btrfs_put_block_group(cache);
6443 return ret;
6444 }
6445
6446 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
6447 u64 start, u64 num_bytes)
6448 {
6449 int ret;
6450 struct btrfs_block_group_cache *block_group;
6451 struct btrfs_caching_control *caching_ctl;
6452
6453 block_group = btrfs_lookup_block_group(fs_info, start);
6454 if (!block_group)
6455 return -EINVAL;
6456
6457 cache_block_group(block_group, 0);
6458 caching_ctl = get_caching_control(block_group);
6459
6460 if (!caching_ctl) {
6461 /* Logic error */
6462 BUG_ON(!block_group_cache_done(block_group));
6463 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6464 } else {
6465 mutex_lock(&caching_ctl->mutex);
6466
6467 if (start >= caching_ctl->progress) {
6468 ret = add_excluded_extent(fs_info, start, num_bytes);
6469 } else if (start + num_bytes <= caching_ctl->progress) {
6470 ret = btrfs_remove_free_space(block_group,
6471 start, num_bytes);
6472 } else {
6473 num_bytes = caching_ctl->progress - start;
6474 ret = btrfs_remove_free_space(block_group,
6475 start, num_bytes);
6476 if (ret)
6477 goto out_lock;
6478
6479 num_bytes = (start + num_bytes) -
6480 caching_ctl->progress;
6481 start = caching_ctl->progress;
6482 ret = add_excluded_extent(fs_info, start, num_bytes);
6483 }
6484 out_lock:
6485 mutex_unlock(&caching_ctl->mutex);
6486 put_caching_control(caching_ctl);
6487 }
6488 btrfs_put_block_group(block_group);
6489 return ret;
6490 }
6491
6492 int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
6493 struct extent_buffer *eb)
6494 {
6495 struct btrfs_file_extent_item *item;
6496 struct btrfs_key key;
6497 int found_type;
6498 int i;
6499
6500 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
6501 return 0;
6502
6503 for (i = 0; i < btrfs_header_nritems(eb); i++) {
6504 btrfs_item_key_to_cpu(eb, &key, i);
6505 if (key.type != BTRFS_EXTENT_DATA_KEY)
6506 continue;
6507 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6508 found_type = btrfs_file_extent_type(eb, item);
6509 if (found_type == BTRFS_FILE_EXTENT_INLINE)
6510 continue;
6511 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6512 continue;
6513 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6514 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6515 __exclude_logged_extent(fs_info, key.objectid, key.offset);
6516 }
6517
6518 return 0;
6519 }
6520
6521 static void
6522 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6523 {
6524 atomic_inc(&bg->reservations);
6525 }
6526
6527 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6528 const u64 start)
6529 {
6530 struct btrfs_block_group_cache *bg;
6531
6532 bg = btrfs_lookup_block_group(fs_info, start);
6533 ASSERT(bg);
6534 if (atomic_dec_and_test(&bg->reservations))
6535 wake_up_atomic_t(&bg->reservations);
6536 btrfs_put_block_group(bg);
6537 }
6538
6539 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6540 {
6541 struct btrfs_space_info *space_info = bg->space_info;
6542
6543 ASSERT(bg->ro);
6544
6545 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6546 return;
6547
6548 /*
6549 * Our block group is read only but before we set it to read only,
6550 * some task might have allocated an extent from it already, but it
6551 * has not yet created a respective ordered extent (and added it to a
6552 * root's list of ordered extents).
6553 * Therefore wait for any task currently allocating extents, since the
6554 * block group's reservations counter is incremented while a read lock
6555 * on the groups' semaphore is held and decremented after releasing
6556 * the read access on that semaphore and creating the ordered extent.
6557 */
6558 down_write(&space_info->groups_sem);
6559 up_write(&space_info->groups_sem);
6560
6561 wait_on_atomic_t(&bg->reservations, atomic_t_wait,
6562 TASK_UNINTERRUPTIBLE);
6563 }
6564
6565 /**
6566 * btrfs_add_reserved_bytes - update the block_group and space info counters
6567 * @cache: The cache we are manipulating
6568 * @ram_bytes: The number of bytes of file content; this will be the same as
6569 * @num_bytes except on the compression path.
6570 * @num_bytes: The number of bytes in question
6571 * @delalloc: The blocks are allocated for the delalloc write
6572 *
6573 * This is called by the allocator when it reserves space. If this is a
6574 * reservation and the block group has become read only we cannot make the
6575 * reservation and return -EAGAIN, otherwise this function always succeeds.
6576 */
6577 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6578 u64 ram_bytes, u64 num_bytes, int delalloc)
6579 {
6580 struct btrfs_space_info *space_info = cache->space_info;
6581 int ret = 0;
6582
6583 spin_lock(&space_info->lock);
6584 spin_lock(&cache->lock);
6585 if (cache->ro) {
6586 ret = -EAGAIN;
6587 } else {
6588 cache->reserved += num_bytes;
6589 space_info->bytes_reserved += num_bytes;
6590
6591 trace_btrfs_space_reservation(cache->fs_info,
6592 "space_info", space_info->flags,
6593 ram_bytes, 0);
6594 space_info->bytes_may_use -= ram_bytes;
6595 if (delalloc)
6596 cache->delalloc_bytes += num_bytes;
6597 }
6598 spin_unlock(&cache->lock);
6599 spin_unlock(&space_info->lock);
6600 return ret;
6601 }
6602
6603 /**
6604 * btrfs_free_reserved_bytes - update the block_group and space info counters
6605 * @cache: The cache we are manipulating
6606 * @num_bytes: The number of bytes in question
6607 * @delalloc: The blocks are allocated for the delalloc write
6608 *
6609 * This is called by somebody who is freeing space that was never actually used
6610 * on disk. For example, if you reserve some space for a new leaf in
6611 * transaction A and before transaction A commits you free that leaf, you
6612 * call this in order to clear the reservation.
6613 */
6614
6615 static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6616 u64 num_bytes, int delalloc)
6617 {
6618 struct btrfs_space_info *space_info = cache->space_info;
6619 int ret = 0;
6620
6621 spin_lock(&space_info->lock);
6622 spin_lock(&cache->lock);
6623 if (cache->ro)
6624 space_info->bytes_readonly += num_bytes;
6625 cache->reserved -= num_bytes;
6626 space_info->bytes_reserved -= num_bytes;
6627 space_info->max_extent_size = 0;
6628
6629 if (delalloc)
6630 cache->delalloc_bytes -= num_bytes;
6631 spin_unlock(&cache->lock);
6632 spin_unlock(&space_info->lock);
6633 return ret;
6634 }
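/*
 * Worked example for the two helpers above (hypothetical numbers): reserving
 * a 128K compressed extent that represents 512K of file data calls
 * btrfs_add_reserved_bytes(cache, 512K, 128K, 0), so bytes_may_use drops by
 * 512K while cache->reserved and bytes_reserved grow by 128K.  If the
 * allocation is later abandoned, btrfs_free_reserved_bytes(cache, 128K, 0)
 * takes the 128K back out of bytes_reserved (and adds it to bytes_readonly
 * if the block group has gone read only in the meantime).
 */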
6635 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
6636 {
6637 struct btrfs_caching_control *next;
6638 struct btrfs_caching_control *caching_ctl;
6639 struct btrfs_block_group_cache *cache;
6640
6641 down_write(&fs_info->commit_root_sem);
6642
6643 list_for_each_entry_safe(caching_ctl, next,
6644 &fs_info->caching_block_groups, list) {
6645 cache = caching_ctl->block_group;
6646 if (block_group_cache_done(cache)) {
6647 cache->last_byte_to_unpin = (u64)-1;
6648 list_del_init(&caching_ctl->list);
6649 put_caching_control(caching_ctl);
6650 } else {
6651 cache->last_byte_to_unpin = caching_ctl->progress;
6652 }
6653 }
6654
6655 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6656 fs_info->pinned_extents = &fs_info->freed_extents[1];
6657 else
6658 fs_info->pinned_extents = &fs_info->freed_extents[0];
6659
6660 up_write(&fs_info->commit_root_sem);
6661
6662 update_global_block_rsv(fs_info);
6663 }
6664
6665 /*
6666 * Returns the free cluster for the given space info and sets empty_cluster to
6667 * what it should be based on the mount options.
6668 */
6669 static struct btrfs_free_cluster *
6670 fetch_cluster_info(struct btrfs_fs_info *fs_info,
6671 struct btrfs_space_info *space_info, u64 *empty_cluster)
6672 {
6673 struct btrfs_free_cluster *ret = NULL;
6674
6675 *empty_cluster = 0;
6676 if (btrfs_mixed_space_info(space_info))
6677 return ret;
6678
6679 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6680 ret = &fs_info->meta_alloc_cluster;
6681 if (btrfs_test_opt(fs_info, SSD))
6682 *empty_cluster = SZ_2M;
6683 else
6684 *empty_cluster = SZ_64K;
6685 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
6686 btrfs_test_opt(fs_info, SSD_SPREAD)) {
6687 *empty_cluster = SZ_2M;
6688 ret = &fs_info->data_alloc_cluster;
6689 }
6690
6691 return ret;
6692 }
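/*
 * For example: a metadata space_info on a filesystem mounted with "ssd" gets
 * the metadata cluster with a 2M empty_cluster, 64K without "ssd"; a data
 * space_info only gets a cluster (2M) under "ssd_spread"; mixed block groups
 * never use a cluster here.
 */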
6693
6694 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
6695 u64 start, u64 end,
6696 const bool return_free_space)
6697 {
6698 struct btrfs_block_group_cache *cache = NULL;
6699 struct btrfs_space_info *space_info;
6700 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6701 struct btrfs_free_cluster *cluster = NULL;
6702 u64 len;
6703 u64 total_unpinned = 0;
6704 u64 empty_cluster = 0;
6705 bool readonly;
6706
6707 while (start <= end) {
6708 readonly = false;
6709 if (!cache ||
6710 start >= cache->key.objectid + cache->key.offset) {
6711 if (cache)
6712 btrfs_put_block_group(cache);
6713 total_unpinned = 0;
6714 cache = btrfs_lookup_block_group(fs_info, start);
6715 BUG_ON(!cache); /* Logic error */
6716
6717 cluster = fetch_cluster_info(fs_info,
6718 cache->space_info,
6719 &empty_cluster);
6720 empty_cluster <<= 1;
6721 }
6722
6723 len = cache->key.objectid + cache->key.offset - start;
6724 len = min(len, end + 1 - start);
6725
6726 if (start < cache->last_byte_to_unpin) {
6727 len = min(len, cache->last_byte_to_unpin - start);
6728 if (return_free_space)
6729 btrfs_add_free_space(cache, start, len);
6730 }
6731
6732 start += len;
6733 total_unpinned += len;
6734 space_info = cache->space_info;
6735
6736 /*
6737 * If this space cluster has been marked as fragmented and we've
6738 * unpinned enough in this block group to potentially allow a
6739 * cluster to be created inside of it, go ahead and clear the
6740 * fragmented check.
6741 */
6742 if (cluster && cluster->fragmented &&
6743 total_unpinned > empty_cluster) {
6744 spin_lock(&cluster->lock);
6745 cluster->fragmented = 0;
6746 spin_unlock(&cluster->lock);
6747 }
6748
6749 spin_lock(&space_info->lock);
6750 spin_lock(&cache->lock);
6751 cache->pinned -= len;
6752 space_info->bytes_pinned -= len;
6753
6754 trace_btrfs_space_reservation(fs_info, "pinned",
6755 space_info->flags, len, 0);
6756 space_info->max_extent_size = 0;
6757 percpu_counter_add(&space_info->total_bytes_pinned, -len);
6758 if (cache->ro) {
6759 space_info->bytes_readonly += len;
6760 readonly = true;
6761 }
6762 spin_unlock(&cache->lock);
6763 if (!readonly && return_free_space &&
6764 global_rsv->space_info == space_info) {
6765 u64 to_add = len;
6766
6767 spin_lock(&global_rsv->lock);
6768 if (!global_rsv->full) {
6769 to_add = min(len, global_rsv->size -
6770 global_rsv->reserved);
6771 global_rsv->reserved += to_add;
6772 space_info->bytes_may_use += to_add;
6773 if (global_rsv->reserved >= global_rsv->size)
6774 global_rsv->full = 1;
6775 trace_btrfs_space_reservation(fs_info,
6776 "space_info",
6777 space_info->flags,
6778 to_add, 1);
6779 len -= to_add;
6780 }
6781 spin_unlock(&global_rsv->lock);
6782 /* Add to any tickets we may have */
6783 if (len)
6784 space_info_add_new_bytes(fs_info, space_info,
6785 len);
6786 }
6787 spin_unlock(&space_info->lock);
6788 }
6789
6790 if (cache)
6791 btrfs_put_block_group(cache);
6792 return 0;
6793 }
6794
6795 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
6796 struct btrfs_fs_info *fs_info)
6797 {
6798 struct btrfs_block_group_cache *block_group, *tmp;
6799 struct list_head *deleted_bgs;
6800 struct extent_io_tree *unpin;
6801 u64 start;
6802 u64 end;
6803 int ret;
6804
6805 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6806 unpin = &fs_info->freed_extents[1];
6807 else
6808 unpin = &fs_info->freed_extents[0];
6809
6810 while (!trans->aborted) {
6811 struct extent_state *cached_state = NULL;
6812
6813 mutex_lock(&fs_info->unused_bg_unpin_mutex);
6814 ret = find_first_extent_bit(unpin, 0, &start, &end,
6815 EXTENT_DIRTY, &cached_state);
6816 if (ret) {
6817 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6818 break;
6819 }
6820
6821 if (btrfs_test_opt(fs_info, DISCARD))
6822 ret = btrfs_discard_extent(fs_info, start,
6823 end + 1 - start, NULL);
6824
6825 clear_extent_dirty(unpin, start, end, &cached_state);
6826 unpin_extent_range(fs_info, start, end, true);
6827 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6828 free_extent_state(cached_state);
6829 cond_resched();
6830 }
6831
6832 /*
6833 * Transaction is finished. We don't need the lock anymore. We
6834 * do need to clean up the block groups in case of a transaction
6835 * abort.
6836 */
6837 deleted_bgs = &trans->transaction->deleted_bgs;
6838 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6839 u64 trimmed = 0;
6840
6841 ret = -EROFS;
6842 if (!trans->aborted)
6843 ret = btrfs_discard_extent(fs_info,
6844 block_group->key.objectid,
6845 block_group->key.offset,
6846 &trimmed);
6847
6848 list_del_init(&block_group->bg_list);
6849 btrfs_put_block_group_trimming(block_group);
6850 btrfs_put_block_group(block_group);
6851
6852 if (ret) {
6853 const char *errstr = btrfs_decode_error(ret);
6854 btrfs_warn(fs_info,
6855 "discard failed while removing blockgroup: errno=%d %s",
6856 ret, errstr);
6857 }
6858 }
6859
6860 return 0;
6861 }
6862
6863 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6864 struct btrfs_fs_info *info,
6865 struct btrfs_delayed_ref_node *node, u64 parent,
6866 u64 root_objectid, u64 owner_objectid,
6867 u64 owner_offset, int refs_to_drop,
6868 struct btrfs_delayed_extent_op *extent_op)
6869 {
6870 struct btrfs_key key;
6871 struct btrfs_path *path;
6872 struct btrfs_root *extent_root = info->extent_root;
6873 struct extent_buffer *leaf;
6874 struct btrfs_extent_item *ei;
6875 struct btrfs_extent_inline_ref *iref;
6876 int ret;
6877 int is_data;
6878 int extent_slot = 0;
6879 int found_extent = 0;
6880 int num_to_del = 1;
6881 u32 item_size;
6882 u64 refs;
6883 u64 bytenr = node->bytenr;
6884 u64 num_bytes = node->num_bytes;
6885 int last_ref = 0;
6886 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
6887
6888 path = btrfs_alloc_path();
6889 if (!path)
6890 return -ENOMEM;
6891
6892 path->reada = READA_FORWARD;
6893 path->leave_spinning = 1;
6894
6895 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6896 BUG_ON(!is_data && refs_to_drop != 1);
6897
6898 if (is_data)
6899 skinny_metadata = false;
6900
6901 ret = lookup_extent_backref(trans, info, path, &iref,
6902 bytenr, num_bytes, parent,
6903 root_objectid, owner_objectid,
6904 owner_offset);
6905 if (ret == 0) {
6906 extent_slot = path->slots[0];
6907 while (extent_slot >= 0) {
6908 btrfs_item_key_to_cpu(path->nodes[0], &key,
6909 extent_slot);
6910 if (key.objectid != bytenr)
6911 break;
6912 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6913 key.offset == num_bytes) {
6914 found_extent = 1;
6915 break;
6916 }
6917 if (key.type == BTRFS_METADATA_ITEM_KEY &&
6918 key.offset == owner_objectid) {
6919 found_extent = 1;
6920 break;
6921 }
6922 if (path->slots[0] - extent_slot > 5)
6923 break;
6924 extent_slot--;
6925 }
6926 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6927 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
6928 if (found_extent && item_size < sizeof(*ei))
6929 found_extent = 0;
6930 #endif
6931 if (!found_extent) {
6932 BUG_ON(iref);
6933 ret = remove_extent_backref(trans, info, path, NULL,
6934 refs_to_drop,
6935 is_data, &last_ref);
6936 if (ret) {
6937 btrfs_abort_transaction(trans, ret);
6938 goto out;
6939 }
6940 btrfs_release_path(path);
6941 path->leave_spinning = 1;
6942
6943 key.objectid = bytenr;
6944 key.type = BTRFS_EXTENT_ITEM_KEY;
6945 key.offset = num_bytes;
6946
6947 if (!is_data && skinny_metadata) {
6948 key.type = BTRFS_METADATA_ITEM_KEY;
6949 key.offset = owner_objectid;
6950 }
6951
6952 ret = btrfs_search_slot(trans, extent_root,
6953 &key, path, -1, 1);
6954 if (ret > 0 && skinny_metadata && path->slots[0]) {
6955 /*
6956 * Couldn't find our skinny metadata item,
6957 * see if we have ye olde extent item.
6958 */
6959 path->slots[0]--;
6960 btrfs_item_key_to_cpu(path->nodes[0], &key,
6961 path->slots[0]);
6962 if (key.objectid == bytenr &&
6963 key.type == BTRFS_EXTENT_ITEM_KEY &&
6964 key.offset == num_bytes)
6965 ret = 0;
6966 }
6967
6968 if (ret > 0 && skinny_metadata) {
6969 skinny_metadata = false;
6970 key.objectid = bytenr;
6971 key.type = BTRFS_EXTENT_ITEM_KEY;
6972 key.offset = num_bytes;
6973 btrfs_release_path(path);
6974 ret = btrfs_search_slot(trans, extent_root,
6975 &key, path, -1, 1);
6976 }
6977
6978 if (ret) {
6979 btrfs_err(info,
6980 "umm, got %d back from search, was looking for %llu",
6981 ret, bytenr);
6982 if (ret > 0)
6983 btrfs_print_leaf(path->nodes[0]);
6984 }
6985 if (ret < 0) {
6986 btrfs_abort_transaction(trans, ret);
6987 goto out;
6988 }
6989 extent_slot = path->slots[0];
6990 }
6991 } else if (WARN_ON(ret == -ENOENT)) {
6992 btrfs_print_leaf(path->nodes[0]);
6993 btrfs_err(info,
6994 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
6995 bytenr, parent, root_objectid, owner_objectid,
6996 owner_offset);
6997 btrfs_abort_transaction(trans, ret);
6998 goto out;
6999 } else {
7000 btrfs_abort_transaction(trans, ret);
7001 goto out;
7002 }
7003
7004 leaf = path->nodes[0];
7005 item_size = btrfs_item_size_nr(leaf, extent_slot);
7006 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
7007 if (item_size < sizeof(*ei)) {
7008 BUG_ON(found_extent || extent_slot != path->slots[0]);
7009 ret = convert_extent_item_v0(trans, info, path, owner_objectid,
7010 0);
7011 if (ret < 0) {
7012 btrfs_abort_transaction(trans, ret);
7013 goto out;
7014 }
7015
7016 btrfs_release_path(path);
7017 path->leave_spinning = 1;
7018
7019 key.objectid = bytenr;
7020 key.type = BTRFS_EXTENT_ITEM_KEY;
7021 key.offset = num_bytes;
7022
7023 ret = btrfs_search_slot(trans, extent_root, &key, path,
7024 -1, 1);
7025 if (ret) {
7026 btrfs_err(info,
7027 "umm, got %d back from search, was looking for %llu",
7028 ret, bytenr);
7029 btrfs_print_leaf(path->nodes[0]);
7030 }
7031 if (ret < 0) {
7032 btrfs_abort_transaction(trans, ret);
7033 goto out;
7034 }
7035
7036 extent_slot = path->slots[0];
7037 leaf = path->nodes[0];
7038 item_size = btrfs_item_size_nr(leaf, extent_slot);
7039 }
7040 #endif
7041 BUG_ON(item_size < sizeof(*ei));
7042 ei = btrfs_item_ptr(leaf, extent_slot,
7043 struct btrfs_extent_item);
7044 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
7045 key.type == BTRFS_EXTENT_ITEM_KEY) {
7046 struct btrfs_tree_block_info *bi;
7047 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
7048 bi = (struct btrfs_tree_block_info *)(ei + 1);
7049 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
7050 }
7051
7052 refs = btrfs_extent_refs(leaf, ei);
7053 if (refs < refs_to_drop) {
7054 btrfs_err(info,
7055 "trying to drop %d refs but we only have %Lu for bytenr %Lu",
7056 refs_to_drop, refs, bytenr);
7057 ret = -EINVAL;
7058 btrfs_abort_transaction(trans, ret);
7059 goto out;
7060 }
7061 refs -= refs_to_drop;
7062
7063 if (refs > 0) {
7064 if (extent_op)
7065 __run_delayed_extent_op(extent_op, leaf, ei);
7066 /*
7067 * In the case of inline back ref, reference count will
7068 * be updated by remove_extent_backref
7069 */
7070 if (iref) {
7071 BUG_ON(!found_extent);
7072 } else {
7073 btrfs_set_extent_refs(leaf, ei, refs);
7074 btrfs_mark_buffer_dirty(leaf);
7075 }
7076 if (found_extent) {
7077 ret = remove_extent_backref(trans, info, path,
7078 iref, refs_to_drop,
7079 is_data, &last_ref);
7080 if (ret) {
7081 btrfs_abort_transaction(trans, ret);
7082 goto out;
7083 }
7084 }
7085 } else {
7086 if (found_extent) {
7087 BUG_ON(is_data && refs_to_drop !=
7088 extent_data_ref_count(path, iref));
7089 if (iref) {
7090 BUG_ON(path->slots[0] != extent_slot);
7091 } else {
7092 BUG_ON(path->slots[0] != extent_slot + 1);
7093 path->slots[0] = extent_slot;
7094 num_to_del = 2;
7095 }
7096 }
7097
7098 last_ref = 1;
7099 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
7100 num_to_del);
7101 if (ret) {
7102 btrfs_abort_transaction(trans, ret);
7103 goto out;
7104 }
7105 btrfs_release_path(path);
7106
7107 if (is_data) {
7108 ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
7109 if (ret) {
7110 btrfs_abort_transaction(trans, ret);
7111 goto out;
7112 }
7113 }
7114
7115 ret = add_to_free_space_tree(trans, info, bytenr, num_bytes);
7116 if (ret) {
7117 btrfs_abort_transaction(trans, ret);
7118 goto out;
7119 }
7120
7121 ret = update_block_group(trans, info, bytenr, num_bytes, 0);
7122 if (ret) {
7123 btrfs_abort_transaction(trans, ret);
7124 goto out;
7125 }
7126 }
7127 btrfs_release_path(path);
7128
7129 out:
7130 btrfs_free_path(path);
7131 return ret;
7132 }
7133
7134 /*
7135 * when we free a block, it is possible (and likely) that we free the last
7136 * delayed ref for that extent as well. This searches the delayed ref tree for
7137 * a given extent, and if there are no other delayed refs to be processed, it
7138 * removes it from the tree.
7139 */
7140 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
7141 u64 bytenr)
7142 {
7143 struct btrfs_delayed_ref_head *head;
7144 struct btrfs_delayed_ref_root *delayed_refs;
7145 int ret = 0;
7146
7147 delayed_refs = &trans->transaction->delayed_refs;
7148 spin_lock(&delayed_refs->lock);
7149 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
7150 if (!head)
7151 goto out_delayed_unlock;
7152
7153 spin_lock(&head->lock);
7154 if (!RB_EMPTY_ROOT(&head->ref_tree))
7155 goto out;
7156
7157 if (head->extent_op) {
7158 if (!head->must_insert_reserved)
7159 goto out;
7160 btrfs_free_delayed_extent_op(head->extent_op);
7161 head->extent_op = NULL;
7162 }
7163
7164 /*
7165 * waiting for the lock here would deadlock. If someone else has it
7166 * locked they are already in the process of dropping it anyway
7167 */
7168 if (!mutex_trylock(&head->mutex))
7169 goto out;
7170
7171 /*
7172 * at this point we have a head with no other entries. Go
7173 * ahead and process it.
7174 */
7175 rb_erase(&head->href_node, &delayed_refs->href_root);
7176 RB_CLEAR_NODE(&head->href_node);
7177 atomic_dec(&delayed_refs->num_entries);
7178
7179 /*
7180 * we don't take a ref on the node because we're removing it from the
7181 * tree, so we just steal the ref the tree was holding.
7182 */
7183 delayed_refs->num_heads--;
7184 if (head->processing == 0)
7185 delayed_refs->num_heads_ready--;
7186 head->processing = 0;
7187 spin_unlock(&head->lock);
7188 spin_unlock(&delayed_refs->lock);
7189
7190 BUG_ON(head->extent_op);
7191 if (head->must_insert_reserved)
7192 ret = 1;
7193
7194 mutex_unlock(&head->mutex);
7195 btrfs_put_delayed_ref_head(head);
7196 return ret;
7197 out:
7198 spin_unlock(&head->lock);
7199
7200 out_delayed_unlock:
7201 spin_unlock(&delayed_refs->lock);
7202 return 0;
7203 }
7204
7205 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7206 struct btrfs_root *root,
7207 struct extent_buffer *buf,
7208 u64 parent, int last_ref)
7209 {
7210 struct btrfs_fs_info *fs_info = root->fs_info;
7211 int pin = 1;
7212 int ret;
7213
7214 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7215 int old_ref_mod, new_ref_mod;
7216
7217 btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
7218 root->root_key.objectid,
7219 btrfs_header_level(buf), 0,
7220 BTRFS_DROP_DELAYED_REF);
7221 ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start,
7222 buf->len, parent,
7223 root->root_key.objectid,
7224 btrfs_header_level(buf),
7225 BTRFS_DROP_DELAYED_REF, NULL,
7226 &old_ref_mod, &new_ref_mod);
7227 BUG_ON(ret); /* -ENOMEM */
7228 pin = old_ref_mod >= 0 && new_ref_mod < 0;
7229 }
7230
7231 if (last_ref && btrfs_header_generation(buf) == trans->transid) {
7232 struct btrfs_block_group_cache *cache;
7233
7234 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7235 ret = check_ref_cleanup(trans, buf->start);
7236 if (!ret)
7237 goto out;
7238 }
7239
7240 pin = 0;
7241 cache = btrfs_lookup_block_group(fs_info, buf->start);
7242
7243 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
7244 pin_down_extent(fs_info, cache, buf->start,
7245 buf->len, 1);
7246 btrfs_put_block_group(cache);
7247 goto out;
7248 }
7249
7250 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
7251
7252 btrfs_add_free_space(cache, buf->start, buf->len);
7253 btrfs_free_reserved_bytes(cache, buf->len, 0);
7254 btrfs_put_block_group(cache);
7255 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
7256 }
7257 out:
7258 if (pin)
7259 add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf),
7260 root->root_key.objectid);
7261
7262 if (last_ref) {
7263 /*
7264 * Deleting the buffer, clear the corrupt flag since it doesn't
7265 * matter anymore.
7266 */
7267 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
7268 }
7269 }
7270
7271 /* Can return -ENOMEM */
7272 int btrfs_free_extent(struct btrfs_trans_handle *trans,
7273 struct btrfs_root *root,
7274 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7275 u64 owner, u64 offset)
7276 {
7277 struct btrfs_fs_info *fs_info = root->fs_info;
7278 int old_ref_mod, new_ref_mod;
7279 int ret;
7280
7281 if (btrfs_is_testing(fs_info))
7282 return 0;
7283
7284 if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
7285 btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
7286 root_objectid, owner, offset,
7287 BTRFS_DROP_DELAYED_REF);
7288
7289 /*
7290 * tree log blocks never actually go into the extent allocation
7291 * tree, just update pinning info and exit early.
7292 */
7293 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
7294 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
7295 /* unlocks the pinned mutex */
7296 btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
7297 old_ref_mod = new_ref_mod = 0;
7298 ret = 0;
7299 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
7300 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
7301 num_bytes, parent,
7302 root_objectid, (int)owner,
7303 BTRFS_DROP_DELAYED_REF, NULL,
7304 &old_ref_mod, &new_ref_mod);
7305 } else {
7306 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
7307 num_bytes, parent,
7308 root_objectid, owner, offset,
7309 0, BTRFS_DROP_DELAYED_REF,
7310 &old_ref_mod, &new_ref_mod);
7311 }
7312
7313 if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
7314 add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
7315
7316 return ret;
7317 }
7318
7319 /*
7320 * when we wait for progress in the block group caching, it's because
7321 * our allocation attempt failed at least once. So, we must sleep
7322 * and let some progress happen before we try again.
7323 *
7324 * This function will sleep at least once waiting for new free space to
7325 * show up, and then it will check the block group free space numbers
7326 * for our min num_bytes. Another option is to have it go ahead
7327 * and look in the rbtree for a free extent of a given size, but this
7328 * is a good start.
7329 *
7330 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7331 * any of the information in this block group.
7332 */
7333 static noinline void
7334 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7335 u64 num_bytes)
7336 {
7337 struct btrfs_caching_control *caching_ctl;
7338
7339 caching_ctl = get_caching_control(cache);
7340 if (!caching_ctl)
7341 return;
7342
7343 wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7344 (cache->free_space_ctl->free_space >= num_bytes));
7345
7346 put_caching_control(caching_ctl);
7347 }
7348
7349 static noinline int
7350 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7351 {
7352 struct btrfs_caching_control *caching_ctl;
7353 int ret = 0;
7354
7355 caching_ctl = get_caching_control(cache);
7356 if (!caching_ctl)
7357 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7358
7359 wait_event(caching_ctl->wait, block_group_cache_done(cache));
7360 if (cache->cached == BTRFS_CACHE_ERROR)
7361 ret = -EIO;
7362 put_caching_control(caching_ctl);
7363 return ret;
7364 }
7365
7366 int __get_raid_index(u64 flags)
7367 {
7368 if (flags & BTRFS_BLOCK_GROUP_RAID10)
7369 return BTRFS_RAID_RAID10;
7370 else if (flags & BTRFS_BLOCK_GROUP_RAID1)
7371 return BTRFS_RAID_RAID1;
7372 else if (flags & BTRFS_BLOCK_GROUP_DUP)
7373 return BTRFS_RAID_DUP;
7374 else if (flags & BTRFS_BLOCK_GROUP_RAID0)
7375 return BTRFS_RAID_RAID0;
7376 else if (flags & BTRFS_BLOCK_GROUP_RAID5)
7377 return BTRFS_RAID_RAID5;
7378 else if (flags & BTRFS_BLOCK_GROUP_RAID6)
7379 return BTRFS_RAID_RAID6;
7380
7381 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
7382 }
7383
7384 int get_block_group_index(struct btrfs_block_group_cache *cache)
7385 {
7386 return __get_raid_index(cache->flags);
7387 }
7388
7389 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
7390 [BTRFS_RAID_RAID10] = "raid10",
7391 [BTRFS_RAID_RAID1] = "raid1",
7392 [BTRFS_RAID_DUP] = "dup",
7393 [BTRFS_RAID_RAID0] = "raid0",
7394 [BTRFS_RAID_SINGLE] = "single",
7395 [BTRFS_RAID_RAID5] = "raid5",
7396 [BTRFS_RAID_RAID6] = "raid6",
7397 };
7398
7399 static const char *get_raid_name(enum btrfs_raid_types type)
7400 {
7401 if (type >= BTRFS_NR_RAID_TYPES)
7402 return NULL;
7403
7404 return btrfs_raid_type_names[type];
7405 }
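
/*
 * Worked example for the mapping above: a block group allocated with
 * BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_RAID1 has the RAID1 bit
 * set, so __get_raid_index() returns BTRFS_RAID_RAID1 and
 * get_raid_name(BTRFS_RAID_RAID1) is "raid1".  A group with none of the
 * RAID/DUP bits falls through to BTRFS_RAID_SINGLE, reported as "single".
 */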
7406
7407 enum btrfs_loop_type {
7408 LOOP_CACHING_NOWAIT = 0,
7409 LOOP_CACHING_WAIT = 1,
7410 LOOP_ALLOC_CHUNK = 2,
7411 LOOP_NO_EMPTY_SIZE = 3,
7412 };
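
/*
 * These loop types form the escalation ladder used by find_free_extent()
 * below: start with LOOP_CACHING_NOWAIT (only consider block groups whose
 * free space is already cached), then LOOP_CACHING_WAIT (wait on the
 * caching kthreads), then LOOP_ALLOC_CHUNK (force a chunk allocation
 * through do_chunk_alloc()), and finally LOOP_NO_EMPTY_SIZE (retry with
 * empty_size and empty_cluster forced to 0) before returning -ENOSPC.
 */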
7413
7414 static inline void
7415 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
7416 int delalloc)
7417 {
7418 if (delalloc)
7419 down_read(&cache->data_rwsem);
7420 }
7421
7422 static inline void
7423 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
7424 int delalloc)
7425 {
7426 btrfs_get_block_group(cache);
7427 if (delalloc)
7428 down_read(&cache->data_rwsem);
7429 }
7430
7431 static struct btrfs_block_group_cache *
7432 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7433 struct btrfs_free_cluster *cluster,
7434 int delalloc)
7435 {
7436 struct btrfs_block_group_cache *used_bg = NULL;
7437
7438 spin_lock(&cluster->refill_lock);
7439 while (1) {
7440 used_bg = cluster->block_group;
7441 if (!used_bg)
7442 return NULL;
7443
7444 if (used_bg == block_group)
7445 return used_bg;
7446
7447 btrfs_get_block_group(used_bg);
7448
7449 if (!delalloc)
7450 return used_bg;
7451
7452 if (down_read_trylock(&used_bg->data_rwsem))
7453 return used_bg;
7454
7455 spin_unlock(&cluster->refill_lock);
7456
7457 /* We should only have one-level nested. */
7458 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
7459
7460 spin_lock(&cluster->refill_lock);
7461 if (used_bg == cluster->block_group)
7462 return used_bg;
7463
7464 up_read(&used_bg->data_rwsem);
7465 btrfs_put_block_group(used_bg);
7466 }
7467 }
7468
7469 static inline void
7470 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7471 int delalloc)
7472 {
7473 if (delalloc)
7474 up_read(&cache->data_rwsem);
7475 btrfs_put_block_group(cache);
7476 }
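
/*
 * Note on the delalloc argument threaded through the helpers above: when it
 * is non-zero, btrfs_lock_block_group() and btrfs_grab_block_group() take
 * cache->data_rwsem for reading and btrfs_release_block_group() drops it,
 * so every grab must be paired with exactly one release on the same block
 * group.  A hypothetical caller follows the same shape as the search loop
 * below:
 *
 *	btrfs_grab_block_group(bg, delalloc);
 *	... try to allocate from bg ...
 *	btrfs_release_block_group(bg, delalloc);
 */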
7477
7478 /*
7479 * walks the btree of allocated extents and finds a hole of a given size.
7480 * The key ins is changed to record the hole:
7481 * ins->objectid == start position
7482 * ins->flags = BTRFS_EXTENT_ITEM_KEY
7483 * ins->offset == the size of the hole.
7484 * Any available blocks before search_start are skipped.
7485 *
7486 * If there is no suitable free space, we will record the max size of
7487 * the free space extent currently.
7488 */
7489 static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
7490 u64 ram_bytes, u64 num_bytes, u64 empty_size,
7491 u64 hint_byte, struct btrfs_key *ins,
7492 u64 flags, int delalloc)
7493 {
7494 int ret = 0;
7495 struct btrfs_root *root = fs_info->extent_root;
7496 struct btrfs_free_cluster *last_ptr = NULL;
7497 struct btrfs_block_group_cache *block_group = NULL;
7498 u64 search_start = 0;
7499 u64 max_extent_size = 0;
7500 u64 max_free_space = 0;
7501 u64 empty_cluster = 0;
7502 struct btrfs_space_info *space_info;
7503 int loop = 0;
7504 int index = __get_raid_index(flags);
7505 bool failed_cluster_refill = false;
7506 bool failed_alloc = false;
7507 bool use_cluster = true;
7508 bool have_caching_bg = false;
7509 bool orig_have_caching_bg = false;
7510 bool full_search = false;
7511
7512 WARN_ON(num_bytes < fs_info->sectorsize);
7513 ins->type = BTRFS_EXTENT_ITEM_KEY;
7514 ins->objectid = 0;
7515 ins->offset = 0;
7516
7517 trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
7518
7519 space_info = __find_space_info(fs_info, flags);
7520 if (!space_info) {
7521 btrfs_err(fs_info, "No space info for %llu", flags);
7522 return -ENOSPC;
7523 }
7524
7525 /*
7526 * If our free space is heavily fragmented we may not be able to make
7527 * big contiguous allocations, so instead of doing the expensive search
7528 * for free space, simply return ENOSPC with our max_extent_size so we
7529 * can go ahead and search for a more manageable chunk.
7530 *
7531 * If our max_extent_size is large enough for our allocation simply
7532 * disable clustering since we will likely not be able to find enough
7533 * space to create a cluster and induce latency trying.
7534 */
7535 if (unlikely(space_info->max_extent_size)) {
7536 spin_lock(&space_info->lock);
7537 if (space_info->max_extent_size &&
7538 num_bytes > space_info->max_extent_size) {
7539 ins->offset = space_info->max_extent_size;
7540 spin_unlock(&space_info->lock);
7541 return -ENOSPC;
7542 } else if (space_info->max_extent_size) {
7543 use_cluster = false;
7544 }
7545 spin_unlock(&space_info->lock);
7546 }
7547
7548 last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
7549 if (last_ptr) {
7550 spin_lock(&last_ptr->lock);
7551 if (last_ptr->block_group)
7552 hint_byte = last_ptr->window_start;
7553 if (last_ptr->fragmented) {
7554 /*
7555 * We still set window_start so we can keep track of the
7556 * last place we found an allocation to try and save
7557 * some time.
7558 */
7559 hint_byte = last_ptr->window_start;
7560 use_cluster = false;
7561 }
7562 spin_unlock(&last_ptr->lock);
7563 }
7564
7565 search_start = max(search_start, first_logical_byte(fs_info, 0));
7566 search_start = max(search_start, hint_byte);
7567 if (search_start == hint_byte) {
7568 block_group = btrfs_lookup_block_group(fs_info, search_start);
7569 /*
7570 * we don't want to use the block group if it doesn't match our
7571 * allocation bits, or if it's not cached.
7572 *
7573 * However if we are re-searching with an ideal block group
7574 * picked out then we don't care that the block group is cached.
7575 */
7576 if (block_group && block_group_bits(block_group, flags) &&
7577 block_group->cached != BTRFS_CACHE_NO) {
7578 down_read(&space_info->groups_sem);
7579 if (list_empty(&block_group->list) ||
7580 block_group->ro) {
7581 /*
7582 * someone is removing this block group,
7583 * we can't jump into the have_block_group
7584 * target because our list pointers are not
7585 * valid
7586 */
7587 btrfs_put_block_group(block_group);
7588 up_read(&space_info->groups_sem);
7589 } else {
7590 index = get_block_group_index(block_group);
7591 btrfs_lock_block_group(block_group, delalloc);
7592 goto have_block_group;
7593 }
7594 } else if (block_group) {
7595 btrfs_put_block_group(block_group);
7596 }
7597 }
7598 search:
7599 have_caching_bg = false;
7600 if (index == 0 || index == __get_raid_index(flags))
7601 full_search = true;
7602 down_read(&space_info->groups_sem);
7603 list_for_each_entry(block_group, &space_info->block_groups[index],
7604 list) {
7605 u64 offset;
7606 int cached;
7607
7608 /* If the block group is read-only, we can skip it entirely. */
7609 if (unlikely(block_group->ro))
7610 continue;
7611
7612 btrfs_grab_block_group(block_group, delalloc);
7613 search_start = block_group->key.objectid;
7614
7615 /*
7616 * this can happen if we end up cycling through all the
7617 * raid types, but we want to make sure we only allocate
7618 * for the proper type.
7619 */
7620 if (!block_group_bits(block_group, flags)) {
7621 u64 extra = BTRFS_BLOCK_GROUP_DUP |
7622 BTRFS_BLOCK_GROUP_RAID1 |
7623 BTRFS_BLOCK_GROUP_RAID5 |
7624 BTRFS_BLOCK_GROUP_RAID6 |
7625 BTRFS_BLOCK_GROUP_RAID10;
7626
7627 /*
7628 * if they asked for extra copies and this block group
7629 * doesn't provide them, bail. This does allow us to
7630 * fill raid0 from raid1.
7631 */
7632 if ((flags & extra) && !(block_group->flags & extra))
7633 goto loop;
7634
7635 /*
7636 * This block group has different flags than we want.
7637 * It's possible that we have the MIXED_GROUPS flag set but no
7638 * block group is mixed. Just skip such block group.
7639 */
7640 btrfs_release_block_group(block_group, delalloc);
7641 continue;
7642 }
7643
7644 have_block_group:
7645 cached = block_group_cache_done(block_group);
7646 if (unlikely(!cached)) {
7647 have_caching_bg = true;
7648 ret = cache_block_group(block_group, 0);
7649 BUG_ON(ret < 0);
7650 ret = 0;
7651 }
7652
7653 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7654 goto loop;
7655
7656 /*
7657 * Ok we want to try and use the cluster allocator, so
7658 * let's look there
7659 */
7660 if (last_ptr && use_cluster) {
7661 struct btrfs_block_group_cache *used_block_group;
7662 unsigned long aligned_cluster;
7663 /*
7664 * the refill lock keeps out other
7665 * people trying to start a new cluster
7666 */
7667 used_block_group = btrfs_lock_cluster(block_group,
7668 last_ptr,
7669 delalloc);
7670 if (!used_block_group)
7671 goto refill_cluster;
7672
7673 if (used_block_group != block_group &&
7674 (used_block_group->ro ||
7675 !block_group_bits(used_block_group, flags)))
7676 goto release_cluster;
7677
7678 offset = btrfs_alloc_from_cluster(used_block_group,
7679 last_ptr,
7680 num_bytes,
7681 used_block_group->key.objectid,
7682 &max_extent_size);
7683 if (offset) {
7684 /* we have a block, we're done */
7685 spin_unlock(&last_ptr->refill_lock);
7686 trace_btrfs_reserve_extent_cluster(fs_info,
7687 used_block_group,
7688 search_start, num_bytes);
7689 if (used_block_group != block_group) {
7690 btrfs_release_block_group(block_group,
7691 delalloc);
7692 block_group = used_block_group;
7693 }
7694 goto checks;
7695 }
7696
7697 WARN_ON(last_ptr->block_group != used_block_group);
7698 release_cluster:
7699 /* If we are on LOOP_NO_EMPTY_SIZE, we can't
7700 * set up a new cluster, so let's just skip it
7701 * and let the allocator find whatever block
7702 * it can find. If we reach this point, we
7703 * will have tried the cluster allocator
7704 * plenty of times and not have found
7705 * anything, so we are likely way too
7706 * fragmented for the clustering stuff to find
7707 * anything.
7708 *
7709 * However, if the cluster is taken from the
7710 * current block group, release the cluster
7711 * first, so that we stand a better chance of
7712 * succeeding in the unclustered
7713 * allocation. */
7714 if (loop >= LOOP_NO_EMPTY_SIZE &&
7715 used_block_group != block_group) {
7716 spin_unlock(&last_ptr->refill_lock);
7717 btrfs_release_block_group(used_block_group,
7718 delalloc);
7719 goto unclustered_alloc;
7720 }
7721
7722 /*
7723 * this cluster didn't work out, free it and
7724 * start over
7725 */
7726 btrfs_return_cluster_to_free_space(NULL, last_ptr);
7727
7728 if (used_block_group != block_group)
7729 btrfs_release_block_group(used_block_group,
7730 delalloc);
7731 refill_cluster:
7732 if (loop >= LOOP_NO_EMPTY_SIZE) {
7733 spin_unlock(&last_ptr->refill_lock);
7734 goto unclustered_alloc;
7735 }
7736
7737 aligned_cluster = max_t(unsigned long,
7738 empty_cluster + empty_size,
7739 block_group->full_stripe_len);
7740
7741 /* allocate a cluster in this block group */
7742 ret = btrfs_find_space_cluster(fs_info, block_group,
7743 last_ptr, search_start,
7744 num_bytes,
7745 aligned_cluster);
7746 if (ret == 0) {
7747 /*
7748 * now pull our allocation out of this
7749 * cluster
7750 */
7751 offset = btrfs_alloc_from_cluster(block_group,
7752 last_ptr,
7753 num_bytes,
7754 search_start,
7755 &max_extent_size);
7756 if (offset) {
7757 /* we found one, proceed */
7758 spin_unlock(&last_ptr->refill_lock);
7759 trace_btrfs_reserve_extent_cluster(fs_info,
7760 block_group, search_start,
7761 num_bytes);
7762 goto checks;
7763 }
7764 } else if (!cached && loop > LOOP_CACHING_NOWAIT
7765 && !failed_cluster_refill) {
7766 spin_unlock(&last_ptr->refill_lock);
7767
7768 failed_cluster_refill = true;
7769 wait_block_group_cache_progress(block_group,
7770 num_bytes + empty_cluster + empty_size);
7771 goto have_block_group;
7772 }
7773
7774 /*
7775 * at this point we either didn't find a cluster
7776 * or we weren't able to allocate a block from our
7777 * cluster. Free the cluster we've been trying
7778 * to use, and go to the next block group
7779 */
7780 btrfs_return_cluster_to_free_space(NULL, last_ptr);
7781 spin_unlock(&last_ptr->refill_lock);
7782 goto loop;
7783 }
7784
7785 unclustered_alloc:
7786 /*
7787 * We are doing an unclustered alloc, set the fragmented flag so
7788 * we don't bother trying to set up a cluster again until we get
7789 * more space.
7790 */
7791 if (unlikely(last_ptr)) {
7792 spin_lock(&last_ptr->lock);
7793 last_ptr->fragmented = 1;
7794 spin_unlock(&last_ptr->lock);
7795 }
7796 if (cached) {
7797 struct btrfs_free_space_ctl *ctl =
7798 block_group->free_space_ctl;
7799
7800 spin_lock(&ctl->tree_lock);
7801 if (ctl->free_space <
7802 num_bytes + empty_cluster + empty_size) {
7803 max_free_space = max(max_free_space,
7804 ctl->free_space);
7805 spin_unlock(&ctl->tree_lock);
7806 goto loop;
7807 }
7808 spin_unlock(&ctl->tree_lock);
7809 }
7810
7811 offset = btrfs_find_space_for_alloc(block_group, search_start,
7812 num_bytes, empty_size,
7813 &max_extent_size);
7814 /*
7815 * If we didn't find a chunk, and we haven't failed on this
7816 * block group before, and this block group is in the middle of
7817 * caching and we are ok with waiting, then go ahead and wait
7818 * for progress to be made, and set failed_alloc to true.
7819 *
7820 * If failed_alloc is true then we've already waited on this
7821 * block group once and should move on to the next block group.
7822 */
7823 if (!offset && !failed_alloc && !cached &&
7824 loop > LOOP_CACHING_NOWAIT) {
7825 wait_block_group_cache_progress(block_group,
7826 num_bytes + empty_size);
7827 failed_alloc = true;
7828 goto have_block_group;
7829 } else if (!offset) {
7830 goto loop;
7831 }
7832 checks:
7833 search_start = ALIGN(offset, fs_info->stripesize);
7834
7835 /* move on to the next group */
7836 if (search_start + num_bytes >
7837 block_group->key.objectid + block_group->key.offset) {
7838 btrfs_add_free_space(block_group, offset, num_bytes);
7839 goto loop;
7840 }
7841
7842 if (offset < search_start)
7843 btrfs_add_free_space(block_group, offset,
7844 search_start - offset);
7845 BUG_ON(offset > search_start);
7846
7847 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
7848 num_bytes, delalloc);
7849 if (ret == -EAGAIN) {
7850 btrfs_add_free_space(block_group, offset, num_bytes);
7851 goto loop;
7852 }
7853 btrfs_inc_block_group_reservations(block_group);
7854
7855 /* we are all good, let's return */
7856 ins->objectid = search_start;
7857 ins->offset = num_bytes;
7858
7859 trace_btrfs_reserve_extent(fs_info, block_group,
7860 search_start, num_bytes);
7861 btrfs_release_block_group(block_group, delalloc);
7862 break;
7863 loop:
7864 failed_cluster_refill = false;
7865 failed_alloc = false;
7866 BUG_ON(index != get_block_group_index(block_group));
7867 btrfs_release_block_group(block_group, delalloc);
7868 cond_resched();
7869 }
7870 up_read(&space_info->groups_sem);
7871
7872 if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
7873 && !orig_have_caching_bg)
7874 orig_have_caching_bg = true;
7875
7876 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
7877 goto search;
7878
7879 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7880 goto search;
7881
7882 /*
7883 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7884 * caching kthreads as we move along
7885 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7886 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7887 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7888 * again
7889 */
7890 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7891 index = 0;
7892 if (loop == LOOP_CACHING_NOWAIT) {
7893 /*
7894 * We want to skip the LOOP_CACHING_WAIT step if we
7895 * don't have any uncached bgs and we've already done a
7896 * full search through.
7897 */
7898 if (orig_have_caching_bg || !full_search)
7899 loop = LOOP_CACHING_WAIT;
7900 else
7901 loop = LOOP_ALLOC_CHUNK;
7902 } else {
7903 loop++;
7904 }
7905
7906 if (loop == LOOP_ALLOC_CHUNK) {
7907 struct btrfs_trans_handle *trans;
7908 int exist = 0;
7909
7910 trans = current->journal_info;
7911 if (trans)
7912 exist = 1;
7913 else
7914 trans = btrfs_join_transaction(root);
7915
7916 if (IS_ERR(trans)) {
7917 ret = PTR_ERR(trans);
7918 goto out;
7919 }
7920
7921 ret = do_chunk_alloc(trans, fs_info, flags,
7922 CHUNK_ALLOC_FORCE);
7923
7924 /*
7925 * If we can't allocate a new chunk we've already looped
7926 * through at least once, move on to the NO_EMPTY_SIZE
7927 * case.
7928 */
7929 if (ret == -ENOSPC)
7930 loop = LOOP_NO_EMPTY_SIZE;
7931
7932 /*
7933 * Do not bail out on ENOSPC since we
7934 * can do more things.
7935 */
7936 if (ret < 0 && ret != -ENOSPC)
7937 btrfs_abort_transaction(trans, ret);
7938 else
7939 ret = 0;
7940 if (!exist)
7941 btrfs_end_transaction(trans);
7942 if (ret)
7943 goto out;
7944 }
7945
7946 if (loop == LOOP_NO_EMPTY_SIZE) {
7947 /*
7948 * Don't loop again if we already have no empty_size and
7949 * no empty_cluster.
7950 */
7951 if (empty_size == 0 &&
7952 empty_cluster == 0) {
7953 ret = -ENOSPC;
7954 goto out;
7955 }
7956 empty_size = 0;
7957 empty_cluster = 0;
7958 }
7959
7960 goto search;
7961 } else if (!ins->objectid) {
7962 ret = -ENOSPC;
7963 } else if (ins->objectid) {
7964 if (!use_cluster && last_ptr) {
7965 spin_lock(&last_ptr->lock);
7966 last_ptr->window_start = ins->objectid;
7967 spin_unlock(&last_ptr->lock);
7968 }
7969 ret = 0;
7970 }
7971 out:
7972 if (ret == -ENOSPC) {
7973 if (!max_extent_size)
7974 max_extent_size = max_free_space;
7975 spin_lock(&space_info->lock);
7976 space_info->max_extent_size = max_extent_size;
7977 spin_unlock(&space_info->lock);
7978 ins->offset = max_extent_size;
7979 }
7980 return ret;
7981 }
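
/*
 * Summary of the -ENOSPC handshake implemented above: when the search
 * fails, the largest free extent (or, failing that, the largest amount of
 * free space) seen is stored in space_info->max_extent_size and also
 * reported back through ins->offset.  Both the early check at the top of
 * find_free_extent() and the retry loop in btrfs_reserve_extent() below
 * read that value so the next attempt can be shrunk to a size that is
 * actually likely to fit.
 */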
7982
7983 static void dump_space_info(struct btrfs_fs_info *fs_info,
7984 struct btrfs_space_info *info, u64 bytes,
7985 int dump_block_groups)
7986 {
7987 struct btrfs_block_group_cache *cache;
7988 int index = 0;
7989
7990 spin_lock(&info->lock);
7991 btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
7992 info->flags,
7993 info->total_bytes - btrfs_space_info_used(info, true),
7994 info->full ? "" : "not ");
7995 btrfs_info(fs_info,
7996 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
7997 info->total_bytes, info->bytes_used, info->bytes_pinned,
7998 info->bytes_reserved, info->bytes_may_use,
7999 info->bytes_readonly);
8000 spin_unlock(&info->lock);
8001
8002 if (!dump_block_groups)
8003 return;
8004
8005 down_read(&info->groups_sem);
8006 again:
8007 list_for_each_entry(cache, &info->block_groups[index], list) {
8008 spin_lock(&cache->lock);
8009 btrfs_info(fs_info,
8010 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
8011 cache->key.objectid, cache->key.offset,
8012 btrfs_block_group_used(&cache->item), cache->pinned,
8013 cache->reserved, cache->ro ? "[readonly]" : "");
8014 btrfs_dump_free_space(cache, bytes);
8015 spin_unlock(&cache->lock);
8016 }
8017 if (++index < BTRFS_NR_RAID_TYPES)
8018 goto again;
8019 up_read(&info->groups_sem);
8020 }
8021
8022 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
8023 u64 num_bytes, u64 min_alloc_size,
8024 u64 empty_size, u64 hint_byte,
8025 struct btrfs_key *ins, int is_data, int delalloc)
8026 {
8027 struct btrfs_fs_info *fs_info = root->fs_info;
8028 bool final_tried = num_bytes == min_alloc_size;
8029 u64 flags;
8030 int ret;
8031
8032 flags = get_alloc_profile_by_root(root, is_data);
8033 again:
8034 WARN_ON(num_bytes < fs_info->sectorsize);
8035 ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
8036 hint_byte, ins, flags, delalloc);
8037 if (!ret && !is_data) {
8038 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
8039 } else if (ret == -ENOSPC) {
8040 if (!final_tried && ins->offset) {
8041 num_bytes = min(num_bytes >> 1, ins->offset);
8042 num_bytes = round_down(num_bytes,
8043 fs_info->sectorsize);
8044 num_bytes = max(num_bytes, min_alloc_size);
8045 ram_bytes = num_bytes;
8046 if (num_bytes == min_alloc_size)
8047 final_tried = true;
8048 goto again;
8049 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8050 struct btrfs_space_info *sinfo;
8051
8052 sinfo = __find_space_info(fs_info, flags);
8053 btrfs_err(fs_info,
8054 "allocation failed flags %llu, wanted %llu",
8055 flags, num_bytes);
8056 if (sinfo)
8057 dump_space_info(fs_info, sinfo, num_bytes, 1);
8058 }
8059 }
8060
8061 return ret;
8062 }
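
/*
 * Worked example for the retry loop above (sizes are illustrative only):
 * with min_alloc_size = 64K and sectorsize = 4K, a 1M request that fails
 * with ins->offset = 300K is retried as min(1M >> 1, 300K) = 300K, which is
 * already sector aligned and above min_alloc_size.  Each further failure
 * keeps halving and clamping num_bytes until it reaches min_alloc_size, at
 * which point final_tried is set and the next -ENOSPC is handed back to the
 * caller (with a space_info dump if ENOSPC_DEBUG is enabled).
 */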
8063
8064 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
8065 u64 start, u64 len,
8066 int pin, int delalloc)
8067 {
8068 struct btrfs_block_group_cache *cache;
8069 int ret = 0;
8070
8071 cache = btrfs_lookup_block_group(fs_info, start);
8072 if (!cache) {
8073 btrfs_err(fs_info, "Unable to find block group for %llu",
8074 start);
8075 return -ENOSPC;
8076 }
8077
8078 if (pin)
8079 pin_down_extent(fs_info, cache, start, len, 1);
8080 else {
8081 if (btrfs_test_opt(fs_info, DISCARD))
8082 ret = btrfs_discard_extent(fs_info, start, len, NULL);
8083 btrfs_add_free_space(cache, start, len);
8084 btrfs_free_reserved_bytes(cache, len, delalloc);
8085 trace_btrfs_reserved_extent_free(fs_info, start, len);
8086 }
8087
8088 btrfs_put_block_group(cache);
8089 return ret;
8090 }
8091
8092 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
8093 u64 start, u64 len, int delalloc)
8094 {
8095 return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
8096 }
8097
8098 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
8099 u64 start, u64 len)
8100 {
8101 return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
8102 }
8103
8104 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8105 struct btrfs_fs_info *fs_info,
8106 u64 parent, u64 root_objectid,
8107 u64 flags, u64 owner, u64 offset,
8108 struct btrfs_key *ins, int ref_mod)
8109 {
8110 int ret;
8111 struct btrfs_extent_item *extent_item;
8112 struct btrfs_extent_inline_ref *iref;
8113 struct btrfs_path *path;
8114 struct extent_buffer *leaf;
8115 int type;
8116 u32 size;
8117
8118 if (parent > 0)
8119 type = BTRFS_SHARED_DATA_REF_KEY;
8120 else
8121 type = BTRFS_EXTENT_DATA_REF_KEY;
8122
8123 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
8124
8125 path = btrfs_alloc_path();
8126 if (!path)
8127 return -ENOMEM;
8128
8129 path->leave_spinning = 1;
8130 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8131 ins, size);
8132 if (ret) {
8133 btrfs_free_path(path);
8134 return ret;
8135 }
8136
8137 leaf = path->nodes[0];
8138 extent_item = btrfs_item_ptr(leaf, path->slots[0],
8139 struct btrfs_extent_item);
8140 btrfs_set_extent_refs(leaf, extent_item, ref_mod);
8141 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8142 btrfs_set_extent_flags(leaf, extent_item,
8143 flags | BTRFS_EXTENT_FLAG_DATA);
8144
8145 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8146 btrfs_set_extent_inline_ref_type(leaf, iref, type);
8147 if (parent > 0) {
8148 struct btrfs_shared_data_ref *ref;
8149 ref = (struct btrfs_shared_data_ref *)(iref + 1);
8150 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8151 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
8152 } else {
8153 struct btrfs_extent_data_ref *ref;
8154 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
8155 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
8156 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
8157 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
8158 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
8159 }
8160
8161 btrfs_mark_buffer_dirty(path->nodes[0]);
8162 btrfs_free_path(path);
8163
8164 ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
8165 ins->offset);
8166 if (ret)
8167 return ret;
8168
8169 ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
8170 if (ret) { /* -ENOENT, logic error */
8171 btrfs_err(fs_info, "update block group failed for %llu %llu",
8172 ins->objectid, ins->offset);
8173 BUG();
8174 }
8175 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
8176 return ret;
8177 }
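
/*
 * Layout sketch of the item inserted above (byte offsets omitted): the
 * EXTENT_ITEM is followed in the leaf by
 *
 *	struct btrfs_extent_item        refs, generation, FLAG_DATA
 *	struct btrfs_extent_inline_ref  type, then either
 *	    btrfs_shared_data_ref       when parent > 0 (backref keyed by the
 *	                                referencing tree block), or
 *	    btrfs_extent_data_ref       keyed by root, owner inode and offset
 *
 * matching the size reserved at insert time:
 * sizeof(*extent_item) + btrfs_extent_inline_ref_size(type).
 */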
8178
8179 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
8180 struct btrfs_fs_info *fs_info,
8181 u64 parent, u64 root_objectid,
8182 u64 flags, struct btrfs_disk_key *key,
8183 int level, struct btrfs_key *ins)
8184 {
8185 int ret;
8186 struct btrfs_extent_item *extent_item;
8187 struct btrfs_tree_block_info *block_info;
8188 struct btrfs_extent_inline_ref *iref;
8189 struct btrfs_path *path;
8190 struct extent_buffer *leaf;
8191 u32 size = sizeof(*extent_item) + sizeof(*iref);
8192 u64 num_bytes = ins->offset;
8193 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8194
8195 if (!skinny_metadata)
8196 size += sizeof(*block_info);
8197
8198 path = btrfs_alloc_path();
8199 if (!path)
8200 return -ENOMEM;
8201
8202 path->leave_spinning = 1;
8203 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8204 ins, size);
8205 if (ret) {
8206 btrfs_free_path(path);
8207 return ret;
8208 }
8209
8210 leaf = path->nodes[0];
8211 extent_item = btrfs_item_ptr(leaf, path->slots[0],
8212 struct btrfs_extent_item);
8213 btrfs_set_extent_refs(leaf, extent_item, 1);
8214 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8215 btrfs_set_extent_flags(leaf, extent_item,
8216 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
8217
8218 if (skinny_metadata) {
8219 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8220 num_bytes = fs_info->nodesize;
8221 } else {
8222 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
8223 btrfs_set_tree_block_key(leaf, block_info, key);
8224 btrfs_set_tree_block_level(leaf, block_info, level);
8225 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
8226 }
8227
8228 if (parent > 0) {
8229 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
8230 btrfs_set_extent_inline_ref_type(leaf, iref,
8231 BTRFS_SHARED_BLOCK_REF_KEY);
8232 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8233 } else {
8234 btrfs_set_extent_inline_ref_type(leaf, iref,
8235 BTRFS_TREE_BLOCK_REF_KEY);
8236 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
8237 }
8238
8239 btrfs_mark_buffer_dirty(leaf);
8240 btrfs_free_path(path);
8241
8242 ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
8243 num_bytes);
8244 if (ret)
8245 return ret;
8246
8247 ret = update_block_group(trans, fs_info, ins->objectid,
8248 fs_info->nodesize, 1);
8249 if (ret) { /* -ENOENT, logic error */
8250 btrfs_err(fs_info, "update block group failed for %llu %llu",
8251 ins->objectid, ins->offset);
8252 BUG();
8253 }
8254
8255 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid,
8256 fs_info->nodesize);
8257 return ret;
8258 }
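
/*
 * The tree-block variant above differs from the data case mainly in layout:
 * with the SKINNY_METADATA incompat flag the key already carries the level,
 * so the inline ref follows the extent item directly; without it, a
 * btrfs_tree_block_info holding the first key and the level sits in
 * between.  Either way update_block_group() is called with
 * fs_info->nodesize, since that is the real size of the tree block.
 */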
8259
8260 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8261 struct btrfs_root *root, u64 owner,
8262 u64 offset, u64 ram_bytes,
8263 struct btrfs_key *ins)
8264 {
8265 struct btrfs_fs_info *fs_info = root->fs_info;
8266 int ret;
8267
8268 BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
8269
8270 btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
8271 root->root_key.objectid, owner, offset,
8272 BTRFS_ADD_DELAYED_EXTENT);
8273
8274 ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid,
8275 ins->offset, 0,
8276 root->root_key.objectid, owner,
8277 offset, ram_bytes,
8278 BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
8279 return ret;
8280 }
8281
8282 /*
8283 * this is used by the tree logging recovery code. It records that
8284 * an extent has been allocated and makes sure to clear the free
8285 * space cache bits as well
8286 */
8287 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8288 struct btrfs_fs_info *fs_info,
8289 u64 root_objectid, u64 owner, u64 offset,
8290 struct btrfs_key *ins)
8291 {
8292 int ret;
8293 struct btrfs_block_group_cache *block_group;
8294 struct btrfs_space_info *space_info;
8295
8296 /*
8297 * Mixed block groups will exclude before processing the log so we only
8298 * need to do the exclude dance if this fs isn't mixed.
8299 */
8300 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
8301 ret = __exclude_logged_extent(fs_info, ins->objectid,
8302 ins->offset);
8303 if (ret)
8304 return ret;
8305 }
8306
8307 block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8308 if (!block_group)
8309 return -EINVAL;
8310
8311 space_info = block_group->space_info;
8312 spin_lock(&space_info->lock);
8313 spin_lock(&block_group->lock);
8314 space_info->bytes_reserved += ins->offset;
8315 block_group->reserved += ins->offset;
8316 spin_unlock(&block_group->lock);
8317 spin_unlock(&space_info->lock);
8318
8319 ret = alloc_reserved_file_extent(trans, fs_info, 0, root_objectid,
8320 0, owner, offset, ins, 1);
8321 btrfs_put_block_group(block_group);
8322 return ret;
8323 }
8324
8325 static struct extent_buffer *
8326 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8327 u64 bytenr, int level)
8328 {
8329 struct btrfs_fs_info *fs_info = root->fs_info;
8330 struct extent_buffer *buf;
8331
8332 buf = btrfs_find_create_tree_block(fs_info, bytenr);
8333 if (IS_ERR(buf))
8334 return buf;
8335
8336 /*
8337 * Extra safety check in case the extent tree is corrupted and extent
8338 * allocator chooses to use a tree block which is already used and
8339 * locked.
8340 */
8341 if (buf->lock_owner == current->pid) {
8342 btrfs_err_rl(fs_info,
8343 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
8344 buf->start, btrfs_header_owner(buf), current->pid);
8345 free_extent_buffer(buf);
8346 return ERR_PTR(-EUCLEAN);
8347 }
8348
8349 btrfs_set_header_generation(buf, trans->transid);
8350 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8351 btrfs_tree_lock(buf);
8352 clean_tree_block(fs_info, buf);
8353 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8354
8355 btrfs_set_lock_blocking(buf);
8356 set_extent_buffer_uptodate(buf);
8357
8358 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
8359 buf->log_index = root->log_transid % 2;
8360 /*
8361 * we allow two log transactions at a time, use different
8362 * EXTENT bits to differentiate dirty pages.
8363 */
8364 if (buf->log_index == 0)
8365 set_extent_dirty(&root->dirty_log_pages, buf->start,
8366 buf->start + buf->len - 1, GFP_NOFS);
8367 else
8368 set_extent_new(&root->dirty_log_pages, buf->start,
8369 buf->start + buf->len - 1);
8370 } else {
8371 buf->log_index = -1;
8372 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
8373 buf->start + buf->len - 1, GFP_NOFS);
8374 }
8375 trans->dirty = true;
8376 /* this returns a buffer locked for blocking */
8377 return buf;
8378 }
8379
8380 static struct btrfs_block_rsv *
8381 use_block_rsv(struct btrfs_trans_handle *trans,
8382 struct btrfs_root *root, u32 blocksize)
8383 {
8384 struct btrfs_fs_info *fs_info = root->fs_info;
8385 struct btrfs_block_rsv *block_rsv;
8386 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
8387 int ret;
8388 bool global_updated = false;
8389
8390 block_rsv = get_block_rsv(trans, root);
8391
8392 if (unlikely(block_rsv->size == 0))
8393 goto try_reserve;
8394 again:
8395 ret = block_rsv_use_bytes(block_rsv, blocksize);
8396 if (!ret)
8397 return block_rsv;
8398
8399 if (block_rsv->failfast)
8400 return ERR_PTR(ret);
8401
8402 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8403 global_updated = true;
8404 update_global_block_rsv(fs_info);
8405 goto again;
8406 }
8407
8408 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8409 static DEFINE_RATELIMIT_STATE(_rs,
8410 DEFAULT_RATELIMIT_INTERVAL * 10,
8411 /*DEFAULT_RATELIMIT_BURST*/ 1);
8412 if (__ratelimit(&_rs))
8413 WARN(1, KERN_DEBUG
8414 "BTRFS: block rsv returned %d\n", ret);
8415 }
8416 try_reserve:
8417 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8418 BTRFS_RESERVE_NO_FLUSH);
8419 if (!ret)
8420 return block_rsv;
8421 /*
8422 * If we couldn't reserve metadata bytes, try to use some from
8423 * the global reserve, provided our block rsv shares the same
8424 * space_info as the global reservation.
8425 */
8426 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8427 block_rsv->space_info == global_rsv->space_info) {
8428 ret = block_rsv_use_bytes(global_rsv, blocksize);
8429 if (!ret)
8430 return global_rsv;
8431 }
8432 return ERR_PTR(ret);
8433 }
8434
8435 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8436 struct btrfs_block_rsv *block_rsv, u32 blocksize)
8437 {
8438 block_rsv_add_bytes(block_rsv, blocksize, 0);
8439 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
8440 }
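
/*
 * Fallback order implemented by use_block_rsv() above: (1) take blocksize
 * bytes from the root's own block rsv (unless that rsv is marked failfast);
 * (2) if that rsv is the global one and has not been refreshed yet, update
 * it and retry once; (3) attempt a fresh reservation with
 * BTRFS_RESERVE_NO_FLUSH; (4) as a last resort, borrow from the global
 * reserve when it shares the same space_info.  unuse_block_rsv() is the
 * inverse, used on the error paths of btrfs_alloc_tree_block() below.
 */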
8441
8442 /*
8443 * finds a free extent and does all the dirty work required for allocation;
8444 * returns the tree buffer or an ERR_PTR on error.
8445 */
8446 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8447 struct btrfs_root *root,
8448 u64 parent, u64 root_objectid,
8449 const struct btrfs_disk_key *key,
8450 int level, u64 hint,
8451 u64 empty_size)
8452 {
8453 struct btrfs_fs_info *fs_info = root->fs_info;
8454 struct btrfs_key ins;
8455 struct btrfs_block_rsv *block_rsv;
8456 struct extent_buffer *buf;
8457 struct btrfs_delayed_extent_op *extent_op;
8458 u64 flags = 0;
8459 int ret;
8460 u32 blocksize = fs_info->nodesize;
8461 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8462
8463 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8464 if (btrfs_is_testing(fs_info)) {
8465 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8466 level);
8467 if (!IS_ERR(buf))
8468 root->alloc_bytenr += blocksize;
8469 return buf;
8470 }
8471 #endif
8472
8473 block_rsv = use_block_rsv(trans, root, blocksize);
8474 if (IS_ERR(block_rsv))
8475 return ERR_CAST(block_rsv);
8476
8477 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
8478 empty_size, hint, &ins, 0, 0);
8479 if (ret)
8480 goto out_unuse;
8481
8482 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
8483 if (IS_ERR(buf)) {
8484 ret = PTR_ERR(buf);
8485 goto out_free_reserved;
8486 }
8487
8488 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
8489 if (parent == 0)
8490 parent = ins.objectid;
8491 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8492 } else
8493 BUG_ON(parent > 0);
8494
8495 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
8496 extent_op = btrfs_alloc_delayed_extent_op();
8497 if (!extent_op) {
8498 ret = -ENOMEM;
8499 goto out_free_buf;
8500 }
8501 if (key)
8502 memcpy(&extent_op->key, key, sizeof(extent_op->key));
8503 else
8504 memset(&extent_op->key, 0, sizeof(extent_op->key));
8505 extent_op->flags_to_set = flags;
8506 extent_op->update_key = skinny_metadata ? false : true;
8507 extent_op->update_flags = true;
8508 extent_op->is_data = false;
8509 extent_op->level = level;
8510
8511 btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
8512 root_objectid, level, 0,
8513 BTRFS_ADD_DELAYED_EXTENT);
8514 ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid,
8515 ins.offset, parent,
8516 root_objectid, level,
8517 BTRFS_ADD_DELAYED_EXTENT,
8518 extent_op, NULL, NULL);
8519 if (ret)
8520 goto out_free_delayed;
8521 }
8522 return buf;
8523
8524 out_free_delayed:
8525 btrfs_free_delayed_extent_op(extent_op);
8526 out_free_buf:
8527 free_extent_buffer(buf);
8528 out_free_reserved:
8529 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
8530 out_unuse:
8531 unuse_block_rsv(fs_info, block_rsv, blocksize);
8532 return ERR_PTR(ret);
8533 }
8534
8535 struct walk_control {
8536 u64 refs[BTRFS_MAX_LEVEL];
8537 u64 flags[BTRFS_MAX_LEVEL];
8538 struct btrfs_key update_progress;
8539 int stage;
8540 int level;
8541 int shared_level;
8542 int update_ref;
8543 int keep_locks;
8544 int reada_slot;
8545 int reada_count;
8546 int for_reloc;
8547 };
8548
8549 #define DROP_REFERENCE 1
8550 #define UPDATE_BACKREF 2
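
/*
 * The walk code below alternates between these two stages: DROP_REFERENCE
 * drops the last reference on blocks owned only by the tree being deleted,
 * while UPDATE_BACKREF is entered from do_walk_down() when a shared block
 * newer than the snapshot is found and the backrefs for the subtree rooted
 * at it need updating first (see the comments on do_walk_down() and
 * walk_up_proc() further down).
 */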
8551
8552 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8553 struct btrfs_root *root,
8554 struct walk_control *wc,
8555 struct btrfs_path *path)
8556 {
8557 struct btrfs_fs_info *fs_info = root->fs_info;
8558 u64 bytenr;
8559 u64 generation;
8560 u64 refs;
8561 u64 flags;
8562 u32 nritems;
8563 struct btrfs_key key;
8564 struct extent_buffer *eb;
8565 int ret;
8566 int slot;
8567 int nread = 0;
8568
8569 if (path->slots[wc->level] < wc->reada_slot) {
8570 wc->reada_count = wc->reada_count * 2 / 3;
8571 wc->reada_count = max(wc->reada_count, 2);
8572 } else {
8573 wc->reada_count = wc->reada_count * 3 / 2;
8574 wc->reada_count = min_t(int, wc->reada_count,
8575 BTRFS_NODEPTRS_PER_BLOCK(fs_info));
8576 }
8577
8578 eb = path->nodes[wc->level];
8579 nritems = btrfs_header_nritems(eb);
8580
8581 for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8582 if (nread >= wc->reada_count)
8583 break;
8584
8585 cond_resched();
8586 bytenr = btrfs_node_blockptr(eb, slot);
8587 generation = btrfs_node_ptr_generation(eb, slot);
8588
8589 if (slot == path->slots[wc->level])
8590 goto reada;
8591
8592 if (wc->stage == UPDATE_BACKREF &&
8593 generation <= root->root_key.offset)
8594 continue;
8595
8596 /* We don't lock the tree block, it's OK to be racy here */
8597 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
8598 wc->level - 1, 1, &refs,
8599 &flags);
8600 /* We don't care about errors in readahead. */
8601 if (ret < 0)
8602 continue;
8603 BUG_ON(refs == 0);
8604
8605 if (wc->stage == DROP_REFERENCE) {
8606 if (refs == 1)
8607 goto reada;
8608
8609 if (wc->level == 1 &&
8610 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8611 continue;
8612 if (!wc->update_ref ||
8613 generation <= root->root_key.offset)
8614 continue;
8615 btrfs_node_key_to_cpu(eb, &key, slot);
8616 ret = btrfs_comp_cpu_keys(&key,
8617 &wc->update_progress);
8618 if (ret < 0)
8619 continue;
8620 } else {
8621 if (wc->level == 1 &&
8622 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8623 continue;
8624 }
8625 reada:
8626 readahead_tree_block(fs_info, bytenr);
8627 nread++;
8628 }
8629 wc->reada_slot = slot;
8630 }
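
/*
 * Example of the adaptive readahead window above (numbers illustrative):
 * reada_count starts at BTRFS_NODEPTRS_PER_BLOCK().  When the walk comes
 * back with its slot still below the previous reada_slot, the window
 * shrinks to 2/3 of its size (never under 2); otherwise it grows by 3/2,
 * capped at BTRFS_NODEPTRS_PER_BLOCK().  A window of 120 pointers would
 * therefore shrink 120 -> 80 -> 53 across repeated shrinks, and grow back
 * 80 -> 120 once the walk gets ahead of the readahead again.
 */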
8631
8632 /*
8633 * helper to process tree block while walking down the tree.
8634 *
8635 * when wc->stage == UPDATE_BACKREF, this function updates
8636 * back refs for pointers in the block.
8637 *
8638 * NOTE: return value 1 means we should stop walking down.
8639 */
8640 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8641 struct btrfs_root *root,
8642 struct btrfs_path *path,
8643 struct walk_control *wc, int lookup_info)
8644 {
8645 struct btrfs_fs_info *fs_info = root->fs_info;
8646 int level = wc->level;
8647 struct extent_buffer *eb = path->nodes[level];
8648 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8649 int ret;
8650
8651 if (wc->stage == UPDATE_BACKREF &&
8652 btrfs_header_owner(eb) != root->root_key.objectid)
8653 return 1;
8654
8655 /*
8656 * when reference count of tree block is 1, it won't increase
8657 * again. once full backref flag is set, we never clear it.
8658 */
8659 if (lookup_info &&
8660 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8661 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8662 BUG_ON(!path->locks[level]);
8663 ret = btrfs_lookup_extent_info(trans, fs_info,
8664 eb->start, level, 1,
8665 &wc->refs[level],
8666 &wc->flags[level]);
8667 BUG_ON(ret == -ENOMEM);
8668 if (ret)
8669 return ret;
8670 BUG_ON(wc->refs[level] == 0);
8671 }
8672
8673 if (wc->stage == DROP_REFERENCE) {
8674 if (wc->refs[level] > 1)
8675 return 1;
8676
8677 if (path->locks[level] && !wc->keep_locks) {
8678 btrfs_tree_unlock_rw(eb, path->locks[level]);
8679 path->locks[level] = 0;
8680 }
8681 return 0;
8682 }
8683
8684 /* wc->stage == UPDATE_BACKREF */
8685 if (!(wc->flags[level] & flag)) {
8686 BUG_ON(!path->locks[level]);
8687 ret = btrfs_inc_ref(trans, root, eb, 1);
8688 BUG_ON(ret); /* -ENOMEM */
8689 ret = btrfs_dec_ref(trans, root, eb, 0);
8690 BUG_ON(ret); /* -ENOMEM */
8691 ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
8692 eb->len, flag,
8693 btrfs_header_level(eb), 0);
8694 BUG_ON(ret); /* -ENOMEM */
8695 wc->flags[level] |= flag;
8696 }
8697
8698 /*
8699 * the block is shared by multiple trees, so it's not good to
8700 * keep the tree lock
8701 */
8702 if (path->locks[level] && level > 0) {
8703 btrfs_tree_unlock_rw(eb, path->locks[level]);
8704 path->locks[level] = 0;
8705 }
8706 return 0;
8707 }
8708
8709 /*
8710 * helper to process tree block pointer.
8711 *
8712 * when wc->stage == DROP_REFERENCE, this function checks
8713 * reference count of the block pointed to. if the block
8714 * is shared and we need update back refs for the subtree
8715 * rooted at the block, this function changes wc->stage to
8716 * UPDATE_BACKREF. if the block is shared and there is no
8717 * need to update backrefs, this function drops the reference
8718 * to the block.
8719 *
8720 * NOTE: return value 1 means we should stop walking down.
8721 */
8722 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8723 struct btrfs_root *root,
8724 struct btrfs_path *path,
8725 struct walk_control *wc, int *lookup_info)
8726 {
8727 struct btrfs_fs_info *fs_info = root->fs_info;
8728 u64 bytenr;
8729 u64 generation;
8730 u64 parent;
8731 u32 blocksize;
8732 struct btrfs_key key;
8733 struct extent_buffer *next;
8734 int level = wc->level;
8735 int reada = 0;
8736 int ret = 0;
8737 bool need_account = false;
8738
8739 generation = btrfs_node_ptr_generation(path->nodes[level],
8740 path->slots[level]);
8741 /*
8742 * if the lower level block was created before the snapshot
8743 * was created, we know there is no need to update back refs
8744 * for the subtree
8745 */
8746 if (wc->stage == UPDATE_BACKREF &&
8747 generation <= root->root_key.offset) {
8748 *lookup_info = 1;
8749 return 1;
8750 }
8751
8752 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8753 blocksize = fs_info->nodesize;
8754
8755 next = find_extent_buffer(fs_info, bytenr);
8756 if (!next) {
8757 next = btrfs_find_create_tree_block(fs_info, bytenr);
8758 if (IS_ERR(next))
8759 return PTR_ERR(next);
8760
8761 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8762 level - 1);
8763 reada = 1;
8764 }
8765 btrfs_tree_lock(next);
8766 btrfs_set_lock_blocking(next);
8767
8768 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
8769 &wc->refs[level - 1],
8770 &wc->flags[level - 1]);
8771 if (ret < 0)
8772 goto out_unlock;
8773
8774 if (unlikely(wc->refs[level - 1] == 0)) {
8775 btrfs_err(fs_info, "Missing references.");
8776 ret = -EIO;
8777 goto out_unlock;
8778 }
8779 *lookup_info = 0;
8780
8781 if (wc->stage == DROP_REFERENCE) {
8782 if (wc->refs[level - 1] > 1) {
8783 need_account = true;
8784 if (level == 1 &&
8785 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8786 goto skip;
8787
8788 if (!wc->update_ref ||
8789 generation <= root->root_key.offset)
8790 goto skip;
8791
8792 btrfs_node_key_to_cpu(path->nodes[level], &key,
8793 path->slots[level]);
8794 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8795 if (ret < 0)
8796 goto skip;
8797
8798 wc->stage = UPDATE_BACKREF;
8799 wc->shared_level = level - 1;
8800 }
8801 } else {
8802 if (level == 1 &&
8803 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8804 goto skip;
8805 }
8806
8807 if (!btrfs_buffer_uptodate(next, generation, 0)) {
8808 btrfs_tree_unlock(next);
8809 free_extent_buffer(next);
8810 next = NULL;
8811 *lookup_info = 1;
8812 }
8813
8814 if (!next) {
8815 if (reada && level == 1)
8816 reada_walk_down(trans, root, wc, path);
8817 next = read_tree_block(fs_info, bytenr, generation);
8818 if (IS_ERR(next)) {
8819 return PTR_ERR(next);
8820 } else if (!extent_buffer_uptodate(next)) {
8821 free_extent_buffer(next);
8822 return -EIO;
8823 }
8824 btrfs_tree_lock(next);
8825 btrfs_set_lock_blocking(next);
8826 }
8827
8828 level--;
8829 ASSERT(level == btrfs_header_level(next));
8830 if (level != btrfs_header_level(next)) {
8831 btrfs_err(root->fs_info, "mismatched level");
8832 ret = -EIO;
8833 goto out_unlock;
8834 }
8835 path->nodes[level] = next;
8836 path->slots[level] = 0;
8837 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8838 wc->level = level;
8839 if (wc->level == 1)
8840 wc->reada_slot = 0;
8841 return 0;
8842 skip:
8843 wc->refs[level - 1] = 0;
8844 wc->flags[level - 1] = 0;
8845 if (wc->stage == DROP_REFERENCE) {
8846 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
8847 parent = path->nodes[level]->start;
8848 } else {
8849 ASSERT(root->root_key.objectid ==
8850 btrfs_header_owner(path->nodes[level]));
8851 if (root->root_key.objectid !=
8852 btrfs_header_owner(path->nodes[level])) {
8853 btrfs_err(root->fs_info,
8854 "mismatched block owner");
8855 ret = -EIO;
8856 goto out_unlock;
8857 }
8858 parent = 0;
8859 }
8860
8861 if (need_account) {
8862 ret = btrfs_qgroup_trace_subtree(trans, root, next,
8863 generation, level - 1);
8864 if (ret) {
8865 btrfs_err_rl(fs_info,
8866 "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
8867 ret);
8868 }
8869 }
8870 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
8871 parent, root->root_key.objectid,
8872 level - 1, 0);
8873 if (ret)
8874 goto out_unlock;
8875 }
8876
8877 *lookup_info = 1;
8878 ret = 1;
8879
8880 out_unlock:
8881 btrfs_tree_unlock(next);
8882 free_extent_buffer(next);
8883
8884 return ret;
8885 }
8886
8887 /*
8888 * helper to process tree block while walking up the tree.
8889 *
8890 * when wc->stage == DROP_REFERENCE, this function drops
8891 * reference count on the block.
8892 *
8893 * when wc->stage == UPDATE_BACKREF, this function changes
8894 * wc->stage back to DROP_REFERENCE if we changed wc->stage
8895 * to UPDATE_BACKREF previously while processing the block.
8896 *
8897 * NOTE: return value 1 means we should stop walking up.
8898 */
8899 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8900 struct btrfs_root *root,
8901 struct btrfs_path *path,
8902 struct walk_control *wc)
8903 {
8904 struct btrfs_fs_info *fs_info = root->fs_info;
8905 int ret;
8906 int level = wc->level;
8907 struct extent_buffer *eb = path->nodes[level];
8908 u64 parent = 0;
8909
8910 if (wc->stage == UPDATE_BACKREF) {
8911 BUG_ON(wc->shared_level < level);
8912 if (level < wc->shared_level)
8913 goto out;
8914
8915 ret = find_next_key(path, level + 1, &wc->update_progress);
8916 if (ret > 0)
8917 wc->update_ref = 0;
8918
8919 wc->stage = DROP_REFERENCE;
8920 wc->shared_level = -1;
8921 path->slots[level] = 0;
8922
8923 /*
8924 * check reference count again if the block isn't locked.
8925 * we should start walking down the tree again if reference
8926 * count is one.
8927 */
8928 if (!path->locks[level]) {
8929 BUG_ON(level == 0);
8930 btrfs_tree_lock(eb);
8931 btrfs_set_lock_blocking(eb);
8932 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8933
8934 ret = btrfs_lookup_extent_info(trans, fs_info,
8935 eb->start, level, 1,
8936 &wc->refs[level],
8937 &wc->flags[level]);
8938 if (ret < 0) {
8939 btrfs_tree_unlock_rw(eb, path->locks[level]);
8940 path->locks[level] = 0;
8941 return ret;
8942 }
8943 BUG_ON(wc->refs[level] == 0);
8944 if (wc->refs[level] == 1) {
8945 btrfs_tree_unlock_rw(eb, path->locks[level]);
8946 path->locks[level] = 0;
8947 return 1;
8948 }
8949 }
8950 }
8951
8952 /* wc->stage == DROP_REFERENCE */
8953 BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
8954
8955 if (wc->refs[level] == 1) {
8956 if (level == 0) {
8957 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8958 ret = btrfs_dec_ref(trans, root, eb, 1);
8959 else
8960 ret = btrfs_dec_ref(trans, root, eb, 0);
8961 BUG_ON(ret); /* -ENOMEM */
8962 ret = btrfs_qgroup_trace_leaf_items(trans, fs_info, eb);
8963 if (ret) {
8964 btrfs_err_rl(fs_info,
8965 "error %d accounting leaf items. Quota is out of sync, rescan required.",
8966 ret);
8967 }
8968 }
8969 /* make block locked assertion in clean_tree_block happy */
8970 if (!path->locks[level] &&
8971 btrfs_header_generation(eb) == trans->transid) {
8972 btrfs_tree_lock(eb);
8973 btrfs_set_lock_blocking(eb);
8974 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8975 }
8976 clean_tree_block(fs_info, eb);
8977 }
8978
8979 if (eb == root->node) {
8980 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8981 parent = eb->start;
8982 else if (root->root_key.objectid != btrfs_header_owner(eb))
8983 goto owner_mismatch;
8984 } else {
8985 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8986 parent = path->nodes[level + 1]->start;
8987 else if (root->root_key.objectid !=
8988 btrfs_header_owner(path->nodes[level + 1]))
8989 goto owner_mismatch;
8990 }
8991
8992 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
8993 out:
8994 wc->refs[level] = 0;
8995 wc->flags[level] = 0;
8996 return 0;
8997
8998 owner_mismatch:
8999 btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
9000 btrfs_header_owner(eb), root->root_key.objectid);
9001 return -EUCLEAN;
9002 }
9003
9004 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
9005 struct btrfs_root *root,
9006 struct btrfs_path *path,
9007 struct walk_control *wc)
9008 {
9009 int level = wc->level;
9010 int lookup_info = 1;
9011 int ret;
9012
9013 while (level >= 0) {
9014 ret = walk_down_proc(trans, root, path, wc, lookup_info);
9015 if (ret > 0)
9016 break;
9017
9018 if (level == 0)
9019 break;
9020
9021 if (path->slots[level] >=
9022 btrfs_header_nritems(path->nodes[level]))
9023 break;
9024
9025 ret = do_walk_down(trans, root, path, wc, &lookup_info);
9026 if (ret > 0) {
9027 path->slots[level]++;
9028 continue;
9029 } else if (ret < 0)
9030 return ret;
9031 level = wc->level;
9032 }
9033 return 0;
9034 }
9035
9036 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
9037 struct btrfs_root *root,
9038 struct btrfs_path *path,
9039 struct walk_control *wc, int max_level)
9040 {
9041 int level = wc->level;
9042 int ret;
9043
9044 path->slots[level] = btrfs_header_nritems(path->nodes[level]);
9045 while (level < max_level && path->nodes[level]) {
9046 wc->level = level;
9047 if (path->slots[level] + 1 <
9048 btrfs_header_nritems(path->nodes[level])) {
9049 path->slots[level]++;
9050 return 0;
9051 } else {
9052 ret = walk_up_proc(trans, root, path, wc);
9053 if (ret > 0)
9054 return 0;
9055 if (ret < 0)
9056 return ret;
9057
9058 if (path->locks[level]) {
9059 btrfs_tree_unlock_rw(path->nodes[level],
9060 path->locks[level]);
9061 path->locks[level] = 0;
9062 }
9063 free_extent_buffer(path->nodes[level]);
9064 path->nodes[level] = NULL;
9065 level++;
9066 }
9067 }
9068 return 1;
9069 }
9070
9071 /*
9072 * drop a subvolume tree.
9073 *
9074 * this function traverses the tree freeing any blocks that are only
9075 * referenced by the tree.
9076 *
9077 * when a shared tree block is found, this function decreases its
9078 * reference count by one. if update_ref is true, this function
9079 * also makes sure backrefs for the shared block and all lower level
9080 * blocks are properly updated.
9081 *
9082 * If called with for_reloc == 0, may exit early with -EAGAIN
9083 */
9084 int btrfs_drop_snapshot(struct btrfs_root *root,
9085 struct btrfs_block_rsv *block_rsv, int update_ref,
9086 int for_reloc)
9087 {
9088 struct btrfs_fs_info *fs_info = root->fs_info;
9089 struct btrfs_path *path;
9090 struct btrfs_trans_handle *trans;
9091 struct btrfs_root *tree_root = fs_info->tree_root;
9092 struct btrfs_root_item *root_item = &root->root_item;
9093 struct walk_control *wc;
9094 struct btrfs_key key;
9095 int err = 0;
9096 int ret;
9097 int level;
9098 bool root_dropped = false;
9099
9100 btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
9101
9102 path = btrfs_alloc_path();
9103 if (!path) {
9104 err = -ENOMEM;
9105 goto out;
9106 }
9107
9108 wc = kzalloc(sizeof(*wc), GFP_NOFS);
9109 if (!wc) {
9110 btrfs_free_path(path);
9111 err = -ENOMEM;
9112 goto out;
9113 }
9114
9115 trans = btrfs_start_transaction(tree_root, 0);
9116 if (IS_ERR(trans)) {
9117 err = PTR_ERR(trans);
9118 goto out_free;
9119 }
9120
9121 err = btrfs_run_delayed_items(trans, fs_info);
9122 if (err)
9123 goto out_end_trans;
9124
9125 if (block_rsv)
9126 trans->block_rsv = block_rsv;
9127
9128 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
9129 level = btrfs_header_level(root->node);
9130 path->nodes[level] = btrfs_lock_root_node(root);
9131 btrfs_set_lock_blocking(path->nodes[level]);
9132 path->slots[level] = 0;
9133 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9134 memset(&wc->update_progress, 0,
9135 sizeof(wc->update_progress));
9136 } else {
9137 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
9138 memcpy(&wc->update_progress, &key,
9139 sizeof(wc->update_progress));
9140
9141 level = root_item->drop_level;
9142 BUG_ON(level == 0);
9143 path->lowest_level = level;
9144 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9145 path->lowest_level = 0;
9146 if (ret < 0) {
9147 err = ret;
9148 goto out_end_trans;
9149 }
9150 WARN_ON(ret > 0);
9151
9152 /*
9153 * unlock our path, this is safe because only this
9154 * function is allowed to delete this snapshot
9155 */
9156 btrfs_unlock_up_safe(path, 0);
9157
9158 level = btrfs_header_level(root->node);
9159 while (1) {
9160 btrfs_tree_lock(path->nodes[level]);
9161 btrfs_set_lock_blocking(path->nodes[level]);
9162 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9163
9164 ret = btrfs_lookup_extent_info(trans, fs_info,
9165 path->nodes[level]->start,
9166 level, 1, &wc->refs[level],
9167 &wc->flags[level]);
9168 if (ret < 0) {
9169 err = ret;
9170 goto out_end_trans;
9171 }
9172 BUG_ON(wc->refs[level] == 0);
9173
9174 if (level == root_item->drop_level)
9175 break;
9176
9177 btrfs_tree_unlock(path->nodes[level]);
9178 path->locks[level] = 0;
9179 WARN_ON(wc->refs[level] != 1);
9180 level--;
9181 }
9182 }
9183
9184 wc->level = level;
9185 wc->shared_level = -1;
9186 wc->stage = DROP_REFERENCE;
9187 wc->update_ref = update_ref;
9188 wc->keep_locks = 0;
9189 wc->for_reloc = for_reloc;
9190 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9191
9192 while (1) {
9193
9194 ret = walk_down_tree(trans, root, path, wc);
9195 if (ret < 0) {
9196 err = ret;
9197 break;
9198 }
9199
9200 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
9201 if (ret < 0) {
9202 err = ret;
9203 break;
9204 }
9205
9206 if (ret > 0) {
9207 BUG_ON(wc->stage != DROP_REFERENCE);
9208 break;
9209 }
9210
9211 if (wc->stage == DROP_REFERENCE) {
9212 level = wc->level;
9213 btrfs_node_key(path->nodes[level],
9214 &root_item->drop_progress,
9215 path->slots[level]);
9216 root_item->drop_level = level;
9217 }
9218
9219 BUG_ON(wc->level == 0);
9220 if (btrfs_should_end_transaction(trans) ||
9221 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
9222 ret = btrfs_update_root(trans, tree_root,
9223 &root->root_key,
9224 root_item);
9225 if (ret) {
9226 btrfs_abort_transaction(trans, ret);
9227 err = ret;
9228 goto out_end_trans;
9229 }
9230
9231 btrfs_end_transaction_throttle(trans);
9232 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
9233 btrfs_debug(fs_info,
9234 "drop snapshot early exit");
9235 err = -EAGAIN;
9236 goto out_free;
9237 }
9238
9239 trans = btrfs_start_transaction(tree_root, 0);
9240 if (IS_ERR(trans)) {
9241 err = PTR_ERR(trans);
9242 goto out_free;
9243 }
9244 if (block_rsv)
9245 trans->block_rsv = block_rsv;
9246 }
9247 }
9248 btrfs_release_path(path);
9249 if (err)
9250 goto out_end_trans;
9251
9252 ret = btrfs_del_root(trans, fs_info, &root->root_key);
9253 if (ret) {
9254 btrfs_abort_transaction(trans, ret);
9255 err = ret;
9256 goto out_end_trans;
9257 }
9258
9259 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
9260 ret = btrfs_find_root(tree_root, &root->root_key, path,
9261 NULL, NULL);
9262 if (ret < 0) {
9263 btrfs_abort_transaction(trans, ret);
9264 err = ret;
9265 goto out_end_trans;
9266 } else if (ret > 0) {
9267 /* if we fail to delete the orphan item this time
9268 * around, it'll get picked up the next time.
9269 *
9270 * The most common failure here is just -ENOENT.
9271 */
9272 btrfs_del_orphan_item(trans, tree_root,
9273 root->root_key.objectid);
9274 }
9275 }
9276
9277 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
9278 btrfs_add_dropped_root(trans, root);
9279 } else {
9280 free_extent_buffer(root->node);
9281 free_extent_buffer(root->commit_root);
9282 btrfs_put_fs_root(root);
9283 }
9284 root_dropped = true;
9285 out_end_trans:
9286 btrfs_end_transaction_throttle(trans);
9287 out_free:
9288 kfree(wc);
9289 btrfs_free_path(path);
9290 out:
9291 /*
9292 * So if we need to stop dropping the snapshot for whatever reason we
9293 * need to make sure to add it back to the dead root list so that we
9294 * keep trying to do the work later. This also cleans up roots if we
9295 * don't have it in the radix (like when we recover after a power fail
9296 * or unmount) so we don't leak memory.
9297 */
9298 if (!for_reloc && !root_dropped)
9299 btrfs_add_dead_root(root);
9300 if (err && err != -EAGAIN)
9301 btrfs_handle_fs_error(fs_info, err, NULL);
9302 return err;
9303 }
9304
9305 /*
9306 * drop subtree rooted at tree block 'node'.
9307 *
9308 * NOTE: this function will unlock and release tree block 'node'
9309 * only used by relocation code
9310 */
9311 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9312 struct btrfs_root *root,
9313 struct extent_buffer *node,
9314 struct extent_buffer *parent)
9315 {
9316 struct btrfs_fs_info *fs_info = root->fs_info;
9317 struct btrfs_path *path;
9318 struct walk_control *wc;
9319 int level;
9320 int parent_level;
9321 int ret = 0;
9322 int wret;
9323
9324 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9325
9326 path = btrfs_alloc_path();
9327 if (!path)
9328 return -ENOMEM;
9329
9330 wc = kzalloc(sizeof(*wc), GFP_NOFS);
9331 if (!wc) {
9332 btrfs_free_path(path);
9333 return -ENOMEM;
9334 }
9335
9336 btrfs_assert_tree_locked(parent);
9337 parent_level = btrfs_header_level(parent);
9338 extent_buffer_get(parent);
9339 path->nodes[parent_level] = parent;
9340 path->slots[parent_level] = btrfs_header_nritems(parent);
9341
9342 btrfs_assert_tree_locked(node);
9343 level = btrfs_header_level(node);
9344 path->nodes[level] = node;
9345 path->slots[level] = 0;
9346 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9347
9348 wc->refs[parent_level] = 1;
9349 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9350 wc->level = level;
9351 wc->shared_level = -1;
9352 wc->stage = DROP_REFERENCE;
9353 wc->update_ref = 0;
9354 wc->keep_locks = 1;
9355 wc->for_reloc = 1;
9356 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9357
9358 while (1) {
9359 wret = walk_down_tree(trans, root, path, wc);
9360 if (wret < 0) {
9361 ret = wret;
9362 break;
9363 }
9364
9365 wret = walk_up_tree(trans, root, path, wc, parent_level);
9366 if (wret < 0)
9367 ret = wret;
9368 if (wret != 0)
9369 break;
9370 }
9371
9372 kfree(wc);
9373 btrfs_free_path(path);
9374 return ret;
9375 }
9376
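/*
 * Map a block group's profile to the one it should be converted to when it
 * is relocated, based on the restripe target and the number of writable
 * devices (e.g. RAID1/RAID10 become DUP on a single device, DUP becomes
 * RAID1 when more devices are available).
 */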
9377 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
9378 {
9379 u64 num_devices;
9380 u64 stripped;
9381
9382 /*
9383 * if restripe for this chunk_type is on, pick the target profile and
9384 * return, otherwise do the usual balance
9385 */
9386 stripped = get_restripe_target(fs_info, flags);
9387 if (stripped)
9388 return extended_to_chunk(stripped);
9389
9390 num_devices = fs_info->fs_devices->rw_devices;
9391
9392 stripped = BTRFS_BLOCK_GROUP_RAID0 |
9393 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9394 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9395
9396 if (num_devices == 1) {
9397 stripped |= BTRFS_BLOCK_GROUP_DUP;
9398 stripped = flags & ~stripped;
9399
9400 /* turn raid0 into single device chunks */
9401 if (flags & BTRFS_BLOCK_GROUP_RAID0)
9402 return stripped;
9403
9404 /* turn mirroring into duplication */
9405 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9406 BTRFS_BLOCK_GROUP_RAID10))
9407 return stripped | BTRFS_BLOCK_GROUP_DUP;
9408 } else {
9409 /* they already had raid on here, just return */
9410 if (flags & stripped)
9411 return flags;
9412
9413 stripped |= BTRFS_BLOCK_GROUP_DUP;
9414 stripped = flags & ~stripped;
9415
9416 /* switch duplicated blocks with raid1 */
9417 if (flags & BTRFS_BLOCK_GROUP_DUP)
9418 return stripped | BTRFS_BLOCK_GROUP_RAID1;
9419
9420 /* this is drive concat, leave it alone */
9421 }
9422
9423 return flags;
9424 }
9425
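/*
 * Take a read-only reference on a block group.  Fails with -ENOSPC if the
 * space_info cannot afford to set the group's unused bytes aside (plus a
 * small reserve for metadata/system groups, unless @force is set).
 */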
9426 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9427 {
9428 struct btrfs_space_info *sinfo = cache->space_info;
9429 u64 num_bytes;
9430 u64 min_allocable_bytes;
9431 int ret = -ENOSPC;
9432
9433 /*
9434 * We need some metadata space and system metadata space for
9435 * allocating chunks in some corner cases, so keep a minimum amount
9436 * of allocatable space around unless we are forced to set it read-only.
9437 */
9438 if ((sinfo->flags &
9439 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9440 !force)
9441 min_allocable_bytes = SZ_1M;
9442 else
9443 min_allocable_bytes = 0;
9444
9445 spin_lock(&sinfo->lock);
9446 spin_lock(&cache->lock);
9447
9448 if (cache->ro) {
9449 cache->ro++;
9450 ret = 0;
9451 goto out;
9452 }
9453
9454 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9455 cache->bytes_super - btrfs_block_group_used(&cache->item);
9456
9457 if (btrfs_space_info_used(sinfo, true) + num_bytes +
9458 min_allocable_bytes <= sinfo->total_bytes) {
9459 sinfo->bytes_readonly += num_bytes;
9460 cache->ro++;
9461 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9462 ret = 0;
9463 }
9464 out:
9465 spin_unlock(&cache->lock);
9466 spin_unlock(&sinfo->lock);
9467 return ret;
9468 }
9469
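/*
 * Make a block group read-only.  If there is not enough free space left in
 * its space_info, force-allocate a new chunk and retry once.
 */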
9470 int btrfs_inc_block_group_ro(struct btrfs_fs_info *fs_info,
9471 struct btrfs_block_group_cache *cache)
9472
9473 {
9474 struct btrfs_trans_handle *trans;
9475 u64 alloc_flags;
9476 int ret;
9477
9478 again:
9479 trans = btrfs_join_transaction(fs_info->extent_root);
9480 if (IS_ERR(trans))
9481 return PTR_ERR(trans);
9482
9483 /*
9484 * we're not allowed to set block groups readonly after the dirty
9485 * block groups cache has started writing. If it already started,
9486 * back off and let this transaction commit
9487 */
9488 mutex_lock(&fs_info->ro_block_group_mutex);
9489 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9490 u64 transid = trans->transid;
9491
9492 mutex_unlock(&fs_info->ro_block_group_mutex);
9493 btrfs_end_transaction(trans);
9494
9495 ret = btrfs_wait_for_commit(fs_info, transid);
9496 if (ret)
9497 return ret;
9498 goto again;
9499 }
9500
9501 /*
9502 * if we are changing raid levels, try to allocate a corresponding
9503 * block group with the new raid level.
9504 */
9505 alloc_flags = update_block_group_flags(fs_info, cache->flags);
9506 if (alloc_flags != cache->flags) {
9507 ret = do_chunk_alloc(trans, fs_info, alloc_flags,
9508 CHUNK_ALLOC_FORCE);
9509 /*
9510 * ENOSPC is allowed here, we may have enough space
9511 * already allocated at the new raid level to
9512 * carry on
9513 */
9514 if (ret == -ENOSPC)
9515 ret = 0;
9516 if (ret < 0)
9517 goto out;
9518 }
9519
9520 ret = inc_block_group_ro(cache, 0);
9521 if (!ret)
9522 goto out;
9523 alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
9524 ret = do_chunk_alloc(trans, fs_info, alloc_flags,
9525 CHUNK_ALLOC_FORCE);
9526 if (ret < 0)
9527 goto out;
9528 ret = inc_block_group_ro(cache, 0);
9529 out:
9530 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9531 alloc_flags = update_block_group_flags(fs_info, cache->flags);
9532 mutex_lock(&fs_info->chunk_mutex);
9533 check_system_chunk(trans, fs_info, alloc_flags);
9534 mutex_unlock(&fs_info->chunk_mutex);
9535 }
9536 mutex_unlock(&fs_info->ro_block_group_mutex);
9537
9538 btrfs_end_transaction(trans);
9539 return ret;
9540 }
9541
9542 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
9543 struct btrfs_fs_info *fs_info, u64 type)
9544 {
9545 u64 alloc_flags = get_alloc_profile(fs_info, type);
9546
9547 return do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE);
9548 }
9549
9550 /*
9551 * helper to account the unused space of all the read-only block groups in
9552 * the space_info. Takes mirrors into account.
9553 */
9554 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9555 {
9556 struct btrfs_block_group_cache *block_group;
9557 u64 free_bytes = 0;
9558 int factor;
9559
9560 /* It's df, we don't care if it's racy */
9561 if (list_empty(&sinfo->ro_bgs))
9562 return 0;
9563
9564 spin_lock(&sinfo->lock);
9565 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9566 spin_lock(&block_group->lock);
9567
9568 if (!block_group->ro) {
9569 spin_unlock(&block_group->lock);
9570 continue;
9571 }
9572
9573 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
9574 BTRFS_BLOCK_GROUP_RAID10 |
9575 BTRFS_BLOCK_GROUP_DUP))
9576 factor = 2;
9577 else
9578 factor = 1;
9579
9580 free_bytes += (block_group->key.offset -
9581 btrfs_block_group_used(&block_group->item)) *
9582 factor;
9583
9584 spin_unlock(&block_group->lock);
9585 }
9586 spin_unlock(&sinfo->lock);
9587
9588 return free_bytes;
9589 }
9590
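/*
 * Drop a read-only reference on a block group.  When the last reference is
 * gone, return its unused bytes to the space_info's writable counters.
 */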
9591 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
9592 {
9593 struct btrfs_space_info *sinfo = cache->space_info;
9594 u64 num_bytes;
9595
9596 BUG_ON(!cache->ro);
9597
9598 spin_lock(&sinfo->lock);
9599 spin_lock(&cache->lock);
9600 if (!--cache->ro) {
9601 num_bytes = cache->key.offset - cache->reserved -
9602 cache->pinned - cache->bytes_super -
9603 btrfs_block_group_used(&cache->item);
9604 sinfo->bytes_readonly -= num_bytes;
9605 list_del_init(&cache->ro_list);
9606 }
9607 spin_unlock(&cache->lock);
9608 spin_unlock(&sinfo->lock);
9609 }
9610
9611 /*
9612 * checks to see if it's even possible to relocate this block group.
9613 *
9614 * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
9615 * ok to go ahead and try.
9616 */
9617 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
9618 {
9619 struct btrfs_root *root = fs_info->extent_root;
9620 struct btrfs_block_group_cache *block_group;
9621 struct btrfs_space_info *space_info;
9622 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
9623 struct btrfs_device *device;
9624 struct btrfs_trans_handle *trans;
9625 u64 min_free;
9626 u64 dev_min = 1;
9627 u64 dev_nr = 0;
9628 u64 target;
9629 int debug;
9630 int index;
9631 int full = 0;
9632 int ret = 0;
9633
9634 debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
9635
9636 block_group = btrfs_lookup_block_group(fs_info, bytenr);
9637
9638 /* odd, couldn't find the block group, leave it alone */
9639 if (!block_group) {
9640 if (debug)
9641 btrfs_warn(fs_info,
9642 "can't find block group for bytenr %llu",
9643 bytenr);
9644 return -1;
9645 }
9646
9647 min_free = btrfs_block_group_used(&block_group->item);
9648
9649 /* no bytes used, we're good */
9650 if (!min_free)
9651 goto out;
9652
9653 space_info = block_group->space_info;
9654 spin_lock(&space_info->lock);
9655
9656 full = space_info->full;
9657
9658 /*
9659 * if this is the last block group we have in this space, we can't
9660 * relocate it unless we're able to allocate a new chunk below.
9661 *
9662 * Otherwise, we need to make sure we have room in the space to handle
9663 * all of the extents from this block group. If we can, we're good
9664 */
9665 if ((space_info->total_bytes != block_group->key.offset) &&
9666 (btrfs_space_info_used(space_info, false) + min_free <
9667 space_info->total_bytes)) {
9668 spin_unlock(&space_info->lock);
9669 goto out;
9670 }
9671 spin_unlock(&space_info->lock);
9672
9673 /*
9674 * ok we don't have enough space, but maybe we have free space on our
9675 * devices to allocate new chunks for relocation, so loop through our
9676 * alloc devices and guess if we have enough space. if this block
9677 * group is going to be restriped, run checks against the target
9678 * profile instead of the current one.
9679 */
9680 ret = -1;
9681
9682 /*
9683 * index:
9684 * 0: raid10
9685 * 1: raid1
9686 * 2: dup
9687 * 3: raid0
9688 * 4: single
9689 */
9690 target = get_restripe_target(fs_info, block_group->flags);
9691 if (target) {
9692 index = __get_raid_index(extended_to_chunk(target));
9693 } else {
9694 /*
9695 * this is just a balance, so if we were marked as full
9696 * we know there is no space for a new chunk
9697 */
9698 if (full) {
9699 if (debug)
9700 btrfs_warn(fs_info,
9701 "no space to alloc new chunk for block group %llu",
9702 block_group->key.objectid);
9703 goto out;
9704 }
9705
9706 index = get_block_group_index(block_group);
9707 }
9708
9709 if (index == BTRFS_RAID_RAID10) {
9710 dev_min = 4;
9711 /* Divide by 2 */
9712 min_free >>= 1;
9713 } else if (index == BTRFS_RAID_RAID1) {
9714 dev_min = 2;
9715 } else if (index == BTRFS_RAID_DUP) {
9716 /* Multiply by 2 */
9717 min_free <<= 1;
9718 } else if (index == BTRFS_RAID_RAID0) {
9719 dev_min = fs_devices->rw_devices;
9720 min_free = div64_u64(min_free, dev_min);
9721 }
9722
9723 /* We need to do this so that we can look at pending chunks */
9724 trans = btrfs_join_transaction(root);
9725 if (IS_ERR(trans)) {
9726 ret = PTR_ERR(trans);
9727 goto out;
9728 }
9729
9730 mutex_lock(&fs_info->chunk_mutex);
9731 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9732 u64 dev_offset;
9733
9734 /*
9735 * check to make sure we can actually find a chunk with enough
9736 * space to fit our block group in.
9737 */
9738 if (device->total_bytes > device->bytes_used + min_free &&
9739 !device->is_tgtdev_for_dev_replace) {
9740 ret = find_free_dev_extent(trans, device, min_free,
9741 &dev_offset, NULL);
9742 if (!ret)
9743 dev_nr++;
9744
9745 if (dev_nr >= dev_min)
9746 break;
9747
9748 ret = -1;
9749 }
9750 }
9751 if (debug && ret == -1)
9752 btrfs_warn(fs_info,
9753 "no space to allocate a new chunk for block group %llu",
9754 block_group->key.objectid);
9755 mutex_unlock(&fs_info->chunk_mutex);
9756 btrfs_end_transaction(trans);
9757 out:
9758 btrfs_put_block_group(block_group);
9759 return ret;
9760 }
9761
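/*
 * Find the first BLOCK_GROUP_ITEM at or after @key and sanity check it
 * against the chunk mapping: start, length and type flags must match.
 */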
9762 static int find_first_block_group(struct btrfs_fs_info *fs_info,
9763 struct btrfs_path *path,
9764 struct btrfs_key *key)
9765 {
9766 struct btrfs_root *root = fs_info->extent_root;
9767 int ret = 0;
9768 struct btrfs_key found_key;
9769 struct extent_buffer *leaf;
9770 struct btrfs_block_group_item bg;
9771 u64 flags;
9772 int slot;
9773
9774 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9775 if (ret < 0)
9776 goto out;
9777
9778 while (1) {
9779 slot = path->slots[0];
9780 leaf = path->nodes[0];
9781 if (slot >= btrfs_header_nritems(leaf)) {
9782 ret = btrfs_next_leaf(root, path);
9783 if (ret == 0)
9784 continue;
9785 if (ret < 0)
9786 goto out;
9787 break;
9788 }
9789 btrfs_item_key_to_cpu(leaf, &found_key, slot);
9790
9791 if (found_key.objectid >= key->objectid &&
9792 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9793 struct extent_map_tree *em_tree;
9794 struct extent_map *em;
9795
9796 em_tree = &root->fs_info->mapping_tree.map_tree;
9797 read_lock(&em_tree->lock);
9798 em = lookup_extent_mapping(em_tree, found_key.objectid,
9799 found_key.offset);
9800 read_unlock(&em_tree->lock);
9801 if (!em) {
9802 btrfs_err(fs_info,
9803 "logical %llu len %llu found bg but no related chunk",
9804 found_key.objectid, found_key.offset);
9805 ret = -ENOENT;
9806 } else if (em->start != found_key.objectid ||
9807 em->len != found_key.offset) {
9808 btrfs_err(fs_info,
9809 "block group %llu len %llu mismatch with chunk %llu len %llu",
9810 found_key.objectid, found_key.offset,
9811 em->start, em->len);
9812 ret = -EUCLEAN;
9813 } else {
9814 read_extent_buffer(leaf, &bg,
9815 btrfs_item_ptr_offset(leaf, slot),
9816 sizeof(bg));
9817 flags = btrfs_block_group_flags(&bg) &
9818 BTRFS_BLOCK_GROUP_TYPE_MASK;
9819
9820 if (flags != (em->map_lookup->type &
9821 BTRFS_BLOCK_GROUP_TYPE_MASK)) {
9822 btrfs_err(fs_info,
9823 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
9824 found_key.objectid,
9825 found_key.offset, flags,
9826 (BTRFS_BLOCK_GROUP_TYPE_MASK &
9827 em->map_lookup->type));
9828 ret = -EUCLEAN;
9829 } else {
9830 ret = 0;
9831 }
9832 }
9833 free_extent_map(em);
9834 goto out;
9835 }
9836 path->slots[0]++;
9837 }
9838 out:
9839 return ret;
9840 }
9841
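/*
 * Drop the references (iref) that block groups hold on their free space
 * cache inodes, typically as part of tearing down the filesystem.
 */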
9842 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9843 {
9844 struct btrfs_block_group_cache *block_group;
9845 u64 last = 0;
9846
9847 while (1) {
9848 struct inode *inode;
9849
9850 block_group = btrfs_lookup_first_block_group(info, last);
9851 while (block_group) {
9852 wait_block_group_cache_done(block_group);
9853 spin_lock(&block_group->lock);
9854 if (block_group->iref)
9855 break;
9856 spin_unlock(&block_group->lock);
9857 block_group = next_block_group(info, block_group);
9858 }
9859 if (!block_group) {
9860 if (last == 0)
9861 break;
9862 last = 0;
9863 continue;
9864 }
9865
9866 inode = block_group->inode;
9867 block_group->iref = 0;
9868 block_group->inode = NULL;
9869 spin_unlock(&block_group->lock);
9870 ASSERT(block_group->io_ctl.inode == NULL);
9871 iput(inode);
9872 last = block_group->key.objectid + block_group->key.offset;
9873 btrfs_put_block_group(block_group);
9874 }
9875 }
9876
9877 /*
9878 * Must be called only after stopping all workers, since we could have block
9879 * group caching kthreads running, and therefore they could race with us if we
9880 * freed the block groups before stopping them.
9881 */
9882 int btrfs_free_block_groups(struct btrfs_fs_info *info)
9883 {
9884 struct btrfs_block_group_cache *block_group;
9885 struct btrfs_space_info *space_info;
9886 struct btrfs_caching_control *caching_ctl;
9887 struct rb_node *n;
9888
9889 down_write(&info->commit_root_sem);
9890 while (!list_empty(&info->caching_block_groups)) {
9891 caching_ctl = list_entry(info->caching_block_groups.next,
9892 struct btrfs_caching_control, list);
9893 list_del(&caching_ctl->list);
9894 put_caching_control(caching_ctl);
9895 }
9896 up_write(&info->commit_root_sem);
9897
9898 spin_lock(&info->unused_bgs_lock);
9899 while (!list_empty(&info->unused_bgs)) {
9900 block_group = list_first_entry(&info->unused_bgs,
9901 struct btrfs_block_group_cache,
9902 bg_list);
9903 list_del_init(&block_group->bg_list);
9904 btrfs_put_block_group(block_group);
9905 }
9906 spin_unlock(&info->unused_bgs_lock);
9907
9908 spin_lock(&info->block_group_cache_lock);
9909 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9910 block_group = rb_entry(n, struct btrfs_block_group_cache,
9911 cache_node);
9912 rb_erase(&block_group->cache_node,
9913 &info->block_group_cache_tree);
9914 RB_CLEAR_NODE(&block_group->cache_node);
9915 spin_unlock(&info->block_group_cache_lock);
9916
9917 down_write(&block_group->space_info->groups_sem);
9918 list_del(&block_group->list);
9919 up_write(&block_group->space_info->groups_sem);
9920
9921 /*
9922 * We haven't cached this block group, which means we could
9923 * possibly have excluded extents on this block group.
9924 */
9925 if (block_group->cached == BTRFS_CACHE_NO ||
9926 block_group->cached == BTRFS_CACHE_ERROR)
9927 free_excluded_extents(info, block_group);
9928
9929 btrfs_remove_free_space_cache(block_group);
9930 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
9931 ASSERT(list_empty(&block_group->dirty_list));
9932 ASSERT(list_empty(&block_group->io_list));
9933 ASSERT(list_empty(&block_group->bg_list));
9934 ASSERT(atomic_read(&block_group->count) == 1);
9935 btrfs_put_block_group(block_group);
9936
9937 spin_lock(&info->block_group_cache_lock);
9938 }
9939 spin_unlock(&info->block_group_cache_lock);
9940
9941 /* now that all the block groups are freed, go through and
9942 * free all the space_info structs. This is only called during
9943 * the final stages of unmount, and so we know nobody is
9944 * using them. We call synchronize_rcu() once before we start,
9945 * just to be on the safe side.
9946 */
9947 synchronize_rcu();
9948
9949 release_global_block_rsv(info);
9950
9951 while (!list_empty(&info->space_info)) {
9952 int i;
9953
9954 space_info = list_entry(info->space_info.next,
9955 struct btrfs_space_info,
9956 list);
9957
9958 /*
9959 * Do not hide this behind enospc_debug, this is actually
9960 * important and indicates a real bug if this happens.
9961 */
9962 if (WARN_ON(space_info->bytes_pinned > 0 ||
9963 space_info->bytes_reserved > 0 ||
9964 space_info->bytes_may_use > 0))
9965 dump_space_info(info, space_info, 0, 0);
9966 list_del(&space_info->list);
9967 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9968 struct kobject *kobj;
9969 kobj = space_info->block_group_kobjs[i];
9970 space_info->block_group_kobjs[i] = NULL;
9971 if (kobj) {
9972 kobject_del(kobj);
9973 kobject_put(kobj);
9974 }
9975 }
9976 kobject_del(&space_info->kobj);
9977 kobject_put(&space_info->kobj);
9978 }
9979 return 0;
9980 }
9981
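/*
 * Add a block group to its space_info's list for the matching raid profile
 * and, for the first group of that profile, create the sysfs raid kobject.
 */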
9982 static void link_block_group(struct btrfs_block_group_cache *cache)
9983 {
9984 struct btrfs_space_info *space_info = cache->space_info;
9985 int index = get_block_group_index(cache);
9986 bool first = false;
9987
9988 down_write(&space_info->groups_sem);
9989 if (list_empty(&space_info->block_groups[index]))
9990 first = true;
9991 list_add_tail(&cache->list, &space_info->block_groups[index]);
9992 up_write(&space_info->groups_sem);
9993
9994 if (first) {
9995 struct raid_kobject *rkobj;
9996 int ret;
9997
9998 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
9999 if (!rkobj)
10000 goto out_err;
10001 rkobj->raid_type = index;
10002 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
10003 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
10004 "%s", get_raid_name(index));
10005 if (ret) {
10006 kobject_put(&rkobj->kobj);
10007 goto out_err;
10008 }
10009 space_info->block_group_kobjs[index] = &rkobj->kobj;
10010 }
10011
10012 return;
10013 out_err:
10014 btrfs_warn(cache->fs_info,
10015 "failed to add kobject for block cache, ignoring");
10016 }
10017
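/*
 * Allocate and initialize an in-memory block group cache for the logical
 * range [start, start + size).  Returns NULL on allocation failure.
 */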
10018 static struct btrfs_block_group_cache *
10019 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
10020 u64 start, u64 size)
10021 {
10022 struct btrfs_block_group_cache *cache;
10023
10024 cache = kzalloc(sizeof(*cache), GFP_NOFS);
10025 if (!cache)
10026 return NULL;
10027
10028 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
10029 GFP_NOFS);
10030 if (!cache->free_space_ctl) {
10031 kfree(cache);
10032 return NULL;
10033 }
10034
10035 cache->key.objectid = start;
10036 cache->key.offset = size;
10037 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10038
10039 cache->fs_info = fs_info;
10040 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
10041 set_free_space_tree_thresholds(cache);
10042
10043 atomic_set(&cache->count, 1);
10044 spin_lock_init(&cache->lock);
10045 init_rwsem(&cache->data_rwsem);
10046 INIT_LIST_HEAD(&cache->list);
10047 INIT_LIST_HEAD(&cache->cluster_list);
10048 INIT_LIST_HEAD(&cache->bg_list);
10049 INIT_LIST_HEAD(&cache->ro_list);
10050 INIT_LIST_HEAD(&cache->dirty_list);
10051 INIT_LIST_HEAD(&cache->io_list);
10052 btrfs_init_free_space_ctl(cache);
10053 atomic_set(&cache->trimming, 0);
10054 mutex_init(&cache->free_space_lock);
10055 btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
10056
10057 return cache;
10058 }
10059
10060
10061 /*
10062 * Iterate all chunks and verify that each of them has a corresponding block
10063 * group
10064 */
10065 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
10066 {
10067 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
10068 struct extent_map *em;
10069 struct btrfs_block_group_cache *bg;
10070 u64 start = 0;
10071 int ret = 0;
10072
10073 while (1) {
10074 read_lock(&map_tree->map_tree.lock);
10075 /*
10076 * lookup_extent_mapping will return the first extent map
10077 * intersecting the range, so setting @len to 1 is enough to
10078 * get the first chunk.
10079 */
10080 em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
10081 read_unlock(&map_tree->map_tree.lock);
10082 if (!em)
10083 break;
10084
10085 bg = btrfs_lookup_block_group(fs_info, em->start);
10086 if (!bg) {
10087 btrfs_err(fs_info,
10088 "chunk start=%llu len=%llu doesn't have corresponding block group",
10089 em->start, em->len);
10090 ret = -EUCLEAN;
10091 free_extent_map(em);
10092 break;
10093 }
10094 if (bg->key.objectid != em->start ||
10095 bg->key.offset != em->len ||
10096 (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
10097 (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
10098 btrfs_err(fs_info,
10099 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
10100 em->start, em->len,
10101 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
10102 bg->key.objectid, bg->key.offset,
10103 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
10104 ret = -EUCLEAN;
10105 free_extent_map(em);
10106 btrfs_put_block_group(bg);
10107 break;
10108 }
10109 start = em->start + em->len;
10110 free_extent_map(em);
10111 btrfs_put_block_group(bg);
10112 }
10113 return ret;
10114 }
10115
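/*
 * Read all block group items from the extent tree, create their in-memory
 * caches and attach them to the corresponding space_info structures.
 */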
10116 int btrfs_read_block_groups(struct btrfs_fs_info *info)
10117 {
10118 struct btrfs_path *path;
10119 int ret;
10120 struct btrfs_block_group_cache *cache;
10121 struct btrfs_space_info *space_info;
10122 struct btrfs_key key;
10123 struct btrfs_key found_key;
10124 struct extent_buffer *leaf;
10125 int need_clear = 0;
10126 u64 cache_gen;
10127 u64 feature;
10128 int mixed;
10129
10130 feature = btrfs_super_incompat_flags(info->super_copy);
10131 mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
10132
10133 key.objectid = 0;
10134 key.offset = 0;
10135 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10136 path = btrfs_alloc_path();
10137 if (!path)
10138 return -ENOMEM;
10139 path->reada = READA_FORWARD;
10140
10141 cache_gen = btrfs_super_cache_generation(info->super_copy);
10142 if (btrfs_test_opt(info, SPACE_CACHE) &&
10143 btrfs_super_generation(info->super_copy) != cache_gen)
10144 need_clear = 1;
10145 if (btrfs_test_opt(info, CLEAR_CACHE))
10146 need_clear = 1;
10147
10148 while (1) {
10149 ret = find_first_block_group(info, path, &key);
10150 if (ret > 0)
10151 break;
10152 if (ret != 0)
10153 goto error;
10154
10155 leaf = path->nodes[0];
10156 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10157
10158 cache = btrfs_create_block_group_cache(info, found_key.objectid,
10159 found_key.offset);
10160 if (!cache) {
10161 ret = -ENOMEM;
10162 goto error;
10163 }
10164
10165 if (need_clear) {
10166 /*
10167 * When we mount with old space cache, we need to
10168 * set BTRFS_DC_CLEAR and set dirty flag.
10169 *
10170 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10171 * truncate the old free space cache inode and
10172 * setup a new one.
10173 * b) Setting 'dirty flag' makes sure that we flush
10174 * the new space cache info onto disk.
10175 */
10176 if (btrfs_test_opt(info, SPACE_CACHE))
10177 cache->disk_cache_state = BTRFS_DC_CLEAR;
10178 }
10179
10180 read_extent_buffer(leaf, &cache->item,
10181 btrfs_item_ptr_offset(leaf, path->slots[0]),
10182 sizeof(cache->item));
10183 cache->flags = btrfs_block_group_flags(&cache->item);
10184 if (!mixed &&
10185 ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10186 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10187 btrfs_err(info,
10188 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10189 cache->key.objectid);
10190 btrfs_put_block_group(cache);
10191 ret = -EINVAL;
10192 goto error;
10193 }
10194
10195 key.objectid = found_key.objectid + found_key.offset;
10196 btrfs_release_path(path);
10197
10198 /*
10199 * We need to exclude the super stripes now so that the space
10200 * info has super bytes accounted for, otherwise we'll think
10201 * we have more space than we actually do.
10202 */
10203 ret = exclude_super_stripes(info, cache);
10204 if (ret) {
10205 /*
10206 * We may have excluded something, so call this just in
10207 * case.
10208 */
10209 free_excluded_extents(info, cache);
10210 btrfs_put_block_group(cache);
10211 goto error;
10212 }
10213
10214 /*
10215 * check for two cases, either we are full, and therefore
10216 * don't need to bother with the caching work since we won't
10217 * find any space, or we are empty, and we can just add all
10218 * the space in and be done with it. This saves us a lot of
10219 * time, particularly in the full case.
10220 */
10221 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10222 cache->last_byte_to_unpin = (u64)-1;
10223 cache->cached = BTRFS_CACHE_FINISHED;
10224 free_excluded_extents(info, cache);
10225 } else if (btrfs_block_group_used(&cache->item) == 0) {
10226 cache->last_byte_to_unpin = (u64)-1;
10227 cache->cached = BTRFS_CACHE_FINISHED;
10228 add_new_free_space(cache, info,
10229 found_key.objectid,
10230 found_key.objectid +
10231 found_key.offset);
10232 free_excluded_extents(info, cache);
10233 }
10234
10235 ret = btrfs_add_block_group_cache(info, cache);
10236 if (ret) {
10237 btrfs_remove_free_space_cache(cache);
10238 btrfs_put_block_group(cache);
10239 goto error;
10240 }
10241
10242 trace_btrfs_add_block_group(info, cache, 0);
10243 update_space_info(info, cache->flags, found_key.offset,
10244 btrfs_block_group_used(&cache->item),
10245 cache->bytes_super, &space_info);
10246
10247 cache->space_info = space_info;
10248
10249 link_block_group(cache);
10250
10251 set_avail_alloc_bits(info, cache->flags);
10252 if (btrfs_chunk_readonly(info, cache->key.objectid)) {
10253 inc_block_group_ro(cache, 1);
10254 } else if (btrfs_block_group_used(&cache->item) == 0) {
10255 spin_lock(&info->unused_bgs_lock);
10256 /* Should always be true but just in case. */
10257 if (list_empty(&cache->bg_list)) {
10258 btrfs_get_block_group(cache);
10259 list_add_tail(&cache->bg_list,
10260 &info->unused_bgs);
10261 }
10262 spin_unlock(&info->unused_bgs_lock);
10263 }
10264 }
10265
10266 list_for_each_entry_rcu(space_info, &info->space_info, list) {
10267 if (!(get_alloc_profile(info, space_info->flags) &
10268 (BTRFS_BLOCK_GROUP_RAID10 |
10269 BTRFS_BLOCK_GROUP_RAID1 |
10270 BTRFS_BLOCK_GROUP_RAID5 |
10271 BTRFS_BLOCK_GROUP_RAID6 |
10272 BTRFS_BLOCK_GROUP_DUP)))
10273 continue;
10274 /*
10275 * avoid allocating from un-mirrored block groups if there are
10276 * mirrored block groups.
10277 */
10278 list_for_each_entry(cache,
10279 &space_info->block_groups[BTRFS_RAID_RAID0],
10280 list)
10281 inc_block_group_ro(cache, 1);
10282 list_for_each_entry(cache,
10283 &space_info->block_groups[BTRFS_RAID_SINGLE],
10284 list)
10285 inc_block_group_ro(cache, 1);
10286 }
10287
10288 init_global_block_rsv(info);
10289 ret = check_chunk_block_group_mappings(info);
10290 error:
10291 btrfs_free_path(path);
10292 return ret;
10293 }
10294
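/*
 * Insert block group items for the block groups created during this
 * transaction (queued on trans->new_bgs) into the extent tree.
 */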
10295 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
10296 struct btrfs_fs_info *fs_info)
10297 {
10298 struct btrfs_block_group_cache *block_group;
10299 struct btrfs_root *extent_root = fs_info->extent_root;
10300 struct btrfs_block_group_item item;
10301 struct btrfs_key key;
10302 int ret = 0;
10303
10304 if (!trans->can_flush_pending_bgs)
10305 return;
10306
10307 while (!list_empty(&trans->new_bgs)) {
10308 block_group = list_first_entry(&trans->new_bgs,
10309 struct btrfs_block_group_cache,
10310 bg_list);
10311 if (ret)
10312 goto next;
10313
10314 spin_lock(&block_group->lock);
10315 memcpy(&item, &block_group->item, sizeof(item));
10316 memcpy(&key, &block_group->key, sizeof(key));
10317 spin_unlock(&block_group->lock);
10318
10319 ret = btrfs_insert_item(trans, extent_root, &key, &item,
10320 sizeof(item));
10321 if (ret)
10322 btrfs_abort_transaction(trans, ret);
10323 ret = btrfs_finish_chunk_alloc(trans, fs_info, key.objectid,
10324 key.offset);
10325 if (ret)
10326 btrfs_abort_transaction(trans, ret);
10327 add_block_group_free_space(trans, fs_info, block_group);
10328 /* already aborted the transaction if it failed. */
10329 next:
10330 list_del_init(&block_group->bg_list);
10331 }
10332 btrfs_trans_release_chunk_metadata(trans);
10333 }
10334
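/*
 * Create a new block group for the chunk at @chunk_offset and queue it on
 * trans->new_bgs so that btrfs_create_pending_block_groups() inserts its
 * on-disk item later.
 */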
10335 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
10336 struct btrfs_fs_info *fs_info, u64 bytes_used,
10337 u64 type, u64 chunk_offset, u64 size)
10338 {
10339 struct btrfs_block_group_cache *cache;
10340 int ret;
10341
10342 btrfs_set_log_full_commit(fs_info, trans);
10343
10344 cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
10345 if (!cache)
10346 return -ENOMEM;
10347
10348 btrfs_set_block_group_used(&cache->item, bytes_used);
10349 btrfs_set_block_group_chunk_objectid(&cache->item,
10350 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
10351 btrfs_set_block_group_flags(&cache->item, type);
10352
10353 cache->flags = type;
10354 cache->last_byte_to_unpin = (u64)-1;
10355 cache->cached = BTRFS_CACHE_FINISHED;
10356 cache->needs_free_space = 1;
10357 ret = exclude_super_stripes(fs_info, cache);
10358 if (ret) {
10359 /*
10360 * We may have excluded something, so call this just in
10361 * case.
10362 */
10363 free_excluded_extents(fs_info, cache);
10364 btrfs_put_block_group(cache);
10365 return ret;
10366 }
10367
10368 add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size);
10369
10370 free_excluded_extents(fs_info, cache);
10371
10372 #ifdef CONFIG_BTRFS_DEBUG
10373 if (btrfs_should_fragment_free_space(cache)) {
10374 u64 new_bytes_used = size - bytes_used;
10375
10376 bytes_used += new_bytes_used >> 1;
10377 fragment_free_space(cache);
10378 }
10379 #endif
10380 /*
10381 * Ensure the corresponding space_info object is created and
10382 * assigned to our block group. We want our bg to be added to the rbtree
10383 * with its ->space_info set.
10384 */
10385 cache->space_info = __find_space_info(fs_info, cache->flags);
10386 if (!cache->space_info) {
10387 ret = create_space_info(fs_info, cache->flags,
10388 &cache->space_info);
10389 if (ret) {
10390 btrfs_remove_free_space_cache(cache);
10391 btrfs_put_block_group(cache);
10392 return ret;
10393 }
10394 }
10395
10396 ret = btrfs_add_block_group_cache(fs_info, cache);
10397 if (ret) {
10398 btrfs_remove_free_space_cache(cache);
10399 btrfs_put_block_group(cache);
10400 return ret;
10401 }
10402
10403 /*
10404 * Now that our block group has its ->space_info set and is inserted in
10405 * the rbtree, update the space info's counters.
10406 */
10407 trace_btrfs_add_block_group(fs_info, cache, 1);
10408 update_space_info(fs_info, cache->flags, size, bytes_used,
10409 cache->bytes_super, &cache->space_info);
10410 update_global_block_rsv(fs_info);
10411
10412 link_block_group(cache);
10413
10414 list_add_tail(&cache->bg_list, &trans->new_bgs);
10415
10416 set_avail_alloc_bits(fs_info, type);
10417 return 0;
10418 }
10419
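/* Clear the extended profile bits of @flags from the avail_*_alloc_bits. */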
10420 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10421 {
10422 u64 extra_flags = chunk_to_extended(flags) &
10423 BTRFS_EXTENDED_PROFILE_MASK;
10424
10425 write_seqlock(&fs_info->profiles_lock);
10426 if (flags & BTRFS_BLOCK_GROUP_DATA)
10427 fs_info->avail_data_alloc_bits &= ~extra_flags;
10428 if (flags & BTRFS_BLOCK_GROUP_METADATA)
10429 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10430 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10431 fs_info->avail_system_alloc_bits &= ~extra_flags;
10432 write_sequnlock(&fs_info->profiles_lock);
10433 }
10434
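/*
 * Remove an unused, read-only block group: drop its free space cache inode,
 * delete its items from the tree of tree roots and the extent tree, and,
 * unless trimming is still running on it, remove its chunk mapping.
 */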
10435 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10436 struct btrfs_fs_info *fs_info, u64 group_start,
10437 struct extent_map *em)
10438 {
10439 struct btrfs_root *root = fs_info->extent_root;
10440 struct btrfs_path *path;
10441 struct btrfs_block_group_cache *block_group;
10442 struct btrfs_free_cluster *cluster;
10443 struct btrfs_root *tree_root = fs_info->tree_root;
10444 struct btrfs_key key;
10445 struct inode *inode;
10446 struct kobject *kobj = NULL;
10447 int ret;
10448 int index;
10449 int factor;
10450 struct btrfs_caching_control *caching_ctl = NULL;
10451 bool remove_em;
10452
10453 block_group = btrfs_lookup_block_group(fs_info, group_start);
10454 BUG_ON(!block_group);
10455 BUG_ON(!block_group->ro);
10456
10457 /*
10458 * Free the reserved super bytes from this block group before
10459 * removing it.
10460 */
10461 free_excluded_extents(fs_info, block_group);
10462 btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
10463 block_group->key.offset);
10464
10465 memcpy(&key, &block_group->key, sizeof(key));
10466 index = get_block_group_index(block_group);
10467 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
10468 BTRFS_BLOCK_GROUP_RAID1 |
10469 BTRFS_BLOCK_GROUP_RAID10))
10470 factor = 2;
10471 else
10472 factor = 1;
10473
10474 /* make sure this block group isn't part of an allocation cluster */
10475 cluster = &fs_info->data_alloc_cluster;
10476 spin_lock(&cluster->refill_lock);
10477 btrfs_return_cluster_to_free_space(block_group, cluster);
10478 spin_unlock(&cluster->refill_lock);
10479
10480 /*
10481 * make sure this block group isn't part of a metadata
10482 * allocation cluster
10483 */
10484 cluster = &fs_info->meta_alloc_cluster;
10485 spin_lock(&cluster->refill_lock);
10486 btrfs_return_cluster_to_free_space(block_group, cluster);
10487 spin_unlock(&cluster->refill_lock);
10488
10489 path = btrfs_alloc_path();
10490 if (!path) {
10491 ret = -ENOMEM;
10492 goto out;
10493 }
10494
10495 /*
10496 * get the inode first so any iput calls done for the io_list
10497 * aren't the final iput (no unlinks allowed now)
10498 */
10499 inode = lookup_free_space_inode(fs_info, block_group, path);
10500
10501 mutex_lock(&trans->transaction->cache_write_mutex);
10502 /*
10503 * make sure our free space cache IO is done before removing the
10504 * free space inode
10505 */
10506 spin_lock(&trans->transaction->dirty_bgs_lock);
10507 if (!list_empty(&block_group->io_list)) {
10508 list_del_init(&block_group->io_list);
10509
10510 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10511
10512 spin_unlock(&trans->transaction->dirty_bgs_lock);
10513 btrfs_wait_cache_io(trans, block_group, path);
10514 btrfs_put_block_group(block_group);
10515 spin_lock(&trans->transaction->dirty_bgs_lock);
10516 }
10517
10518 if (!list_empty(&block_group->dirty_list)) {
10519 list_del_init(&block_group->dirty_list);
10520 btrfs_put_block_group(block_group);
10521 }
10522 spin_unlock(&trans->transaction->dirty_bgs_lock);
10523 mutex_unlock(&trans->transaction->cache_write_mutex);
10524
10525 if (!IS_ERR(inode)) {
10526 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10527 if (ret) {
10528 btrfs_add_delayed_iput(inode);
10529 goto out;
10530 }
10531 clear_nlink(inode);
10532 /* One for the block group's ref */
10533 spin_lock(&block_group->lock);
10534 if (block_group->iref) {
10535 block_group->iref = 0;
10536 block_group->inode = NULL;
10537 spin_unlock(&block_group->lock);
10538 iput(inode);
10539 } else {
10540 spin_unlock(&block_group->lock);
10541 }
10542 /* One for our lookup ref */
10543 btrfs_add_delayed_iput(inode);
10544 }
10545
10546 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10547 key.offset = block_group->key.objectid;
10548 key.type = 0;
10549
10550 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10551 if (ret < 0)
10552 goto out;
10553 if (ret > 0)
10554 btrfs_release_path(path);
10555 if (ret == 0) {
10556 ret = btrfs_del_item(trans, tree_root, path);
10557 if (ret)
10558 goto out;
10559 btrfs_release_path(path);
10560 }
10561
10562 spin_lock(&fs_info->block_group_cache_lock);
10563 rb_erase(&block_group->cache_node,
10564 &fs_info->block_group_cache_tree);
10565 RB_CLEAR_NODE(&block_group->cache_node);
10566
10567 if (fs_info->first_logical_byte == block_group->key.objectid)
10568 fs_info->first_logical_byte = (u64)-1;
10569 spin_unlock(&fs_info->block_group_cache_lock);
10570
10571 down_write(&block_group->space_info->groups_sem);
10572 /*
10573 * we must use list_del_init so people can check to see if they
10574 * are still on the list after taking the semaphore
10575 */
10576 list_del_init(&block_group->list);
10577 if (list_empty(&block_group->space_info->block_groups[index])) {
10578 kobj = block_group->space_info->block_group_kobjs[index];
10579 block_group->space_info->block_group_kobjs[index] = NULL;
10580 clear_avail_alloc_bits(fs_info, block_group->flags);
10581 }
10582 up_write(&block_group->space_info->groups_sem);
10583 if (kobj) {
10584 kobject_del(kobj);
10585 kobject_put(kobj);
10586 }
10587
10588 if (block_group->has_caching_ctl)
10589 caching_ctl = get_caching_control(block_group);
10590 if (block_group->cached == BTRFS_CACHE_STARTED)
10591 wait_block_group_cache_done(block_group);
10592 if (block_group->has_caching_ctl) {
10593 down_write(&fs_info->commit_root_sem);
10594 if (!caching_ctl) {
10595 struct btrfs_caching_control *ctl;
10596
10597 list_for_each_entry(ctl,
10598 &fs_info->caching_block_groups, list)
10599 if (ctl->block_group == block_group) {
10600 caching_ctl = ctl;
10601 refcount_inc(&caching_ctl->count);
10602 break;
10603 }
10604 }
10605 if (caching_ctl)
10606 list_del_init(&caching_ctl->list);
10607 up_write(&fs_info->commit_root_sem);
10608 if (caching_ctl) {
10609 /* Once for the caching bgs list and once for us. */
10610 put_caching_control(caching_ctl);
10611 put_caching_control(caching_ctl);
10612 }
10613 }
10614
10615 spin_lock(&trans->transaction->dirty_bgs_lock);
10616 WARN_ON(!list_empty(&block_group->dirty_list));
10617 WARN_ON(!list_empty(&block_group->io_list));
10622 spin_unlock(&trans->transaction->dirty_bgs_lock);
10623 btrfs_remove_free_space_cache(block_group);
10624
10625 spin_lock(&block_group->space_info->lock);
10626 list_del_init(&block_group->ro_list);
10627
10628 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
10629 WARN_ON(block_group->space_info->total_bytes
10630 < block_group->key.offset);
10631 WARN_ON(block_group->space_info->bytes_readonly
10632 < block_group->key.offset);
10633 WARN_ON(block_group->space_info->disk_total
10634 < block_group->key.offset * factor);
10635 }
10636 block_group->space_info->total_bytes -= block_group->key.offset;
10637 block_group->space_info->bytes_readonly -= block_group->key.offset;
10638 block_group->space_info->disk_total -= block_group->key.offset * factor;
10639
10640 spin_unlock(&block_group->space_info->lock);
10641
10642 memcpy(&key, &block_group->key, sizeof(key));
10643
10644 mutex_lock(&fs_info->chunk_mutex);
10645 if (!list_empty(&em->list)) {
10646 /* We're in the transaction->pending_chunks list. */
10647 free_extent_map(em);
10648 }
10649 spin_lock(&block_group->lock);
10650 block_group->removed = 1;
10651 /*
10652 * At this point trimming can't start on this block group, because we
10653 * removed the block group from the tree fs_info->block_group_cache_tree
10654 * so no one can find it anymore and even if someone already got this
10655 * block group before we removed it from the rbtree, they have already
10656 * incremented block_group->trimming - if they didn't, they won't find
10657 * any free space entries because we already removed them all when we
10658 * called btrfs_remove_free_space_cache().
10659 *
10660 * And we must not remove the extent map from the fs_info->mapping_tree
10661 * to prevent the same logical address range and physical device space
10662 * ranges from being reused for a new block group. This is because our
10663 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10664 * completely transactionless, so while it is trimming a range the
10665 * currently running transaction might finish and a new one start,
10666 * allowing for new block groups to be created that can reuse the same
10667 * physical device locations unless we take this special care.
10668 *
10669 * There may also be an implicit trim operation if the file system
10670 * is mounted with -odiscard. The same protections must remain
10671 * in place until the extents have been discarded completely when
10672 * the transaction commit has completed.
10673 */
10674 remove_em = (atomic_read(&block_group->trimming) == 0);
10675 /*
10676 * Make sure a trimmer task always sees the em in the pinned_chunks list
10677 * if it sees block_group->removed == 1 (needs to lock block_group->lock
10678 * before checking block_group->removed).
10679 */
10680 if (!remove_em) {
10681 /*
10682 * Our em might be in trans->transaction->pending_chunks which
10683 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10684 * and so is the fs_info->pinned_chunks list.
10685 *
10686 * So at this point we must be holding the chunk_mutex to avoid
10687 * any races with chunk allocation (more specifically at
10688 * volumes.c:contains_pending_extent()), to ensure it always
10689 * sees the em, either in the pending_chunks list or in the
10690 * pinned_chunks list.
10691 */
10692 list_move_tail(&em->list, &fs_info->pinned_chunks);
10693 }
10694 spin_unlock(&block_group->lock);
10695
10696 mutex_unlock(&fs_info->chunk_mutex);
10697
10698 ret = remove_block_group_free_space(trans, fs_info, block_group);
10699 if (ret)
10700 goto out;
10701
10702 btrfs_put_block_group(block_group);
10703 btrfs_put_block_group(block_group);
10704
10705 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10706 if (ret > 0)
10707 ret = -EIO;
10708 if (ret < 0)
10709 goto out;
10710
10711 ret = btrfs_del_item(trans, root, path);
10712 if (ret)
10713 goto out;
10714
10715 if (remove_em) {
10716 struct extent_map_tree *em_tree;
10717
10718 em_tree = &fs_info->mapping_tree.map_tree;
10719 write_lock(&em_tree->lock);
10720 /*
10721 * The em might be in the pending_chunks list, so make sure the
10722 * chunk mutex is locked, since remove_extent_mapping() will
10723 * delete us from that list.
10724 */
10725 remove_extent_mapping(em_tree, em);
10726 write_unlock(&em_tree->lock);
10727 /* once for the tree */
10728 free_extent_map(em);
10729 }
10730 out:
10731 btrfs_free_path(path);
10732 return ret;
10733 }
10734
10735 struct btrfs_trans_handle *
10736 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10737 const u64 chunk_offset)
10738 {
10739 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10740 struct extent_map *em;
10741 struct map_lookup *map;
10742 unsigned int num_items;
10743
10744 read_lock(&em_tree->lock);
10745 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10746 read_unlock(&em_tree->lock);
10747 ASSERT(em && em->start == chunk_offset);
10748
10749 /*
10750 * We need to reserve 3 + N units from the metadata space info in order
10751 * to remove a block group (done at btrfs_remove_chunk() and at
10752 * btrfs_remove_block_group()), which are used for:
10753 *
10754 * 1 unit for adding the free space inode's orphan (located in the tree
10755 * of tree roots).
10756 * 1 unit for deleting the block group item (located in the extent
10757 * tree).
10758 * 1 unit for deleting the free space item (located in tree of tree
10759 * roots).
10760 * N units for deleting N device extent items corresponding to each
10761 * stripe (located in the device tree).
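 * For example, removing a block group whose chunk has two stripes needs
 * 3 + 2 = 5 units.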
10762 *
10763 * In order to remove a block group we also need to reserve units in the
10764 * system space info in order to update the chunk tree (update one or
10765 * more device items and remove one chunk item), but this is done at
10766 * btrfs_remove_chunk() through a call to check_system_chunk().
10767 */
10768 map = em->map_lookup;
10769 num_items = 3 + map->num_stripes;
10770 free_extent_map(em);
10771
10772 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10773 num_items, 1);
10774 }
10775
10776 /*
10777 * Process the unused_bgs list and remove any that don't have any allocated
10778 * space inside of them.
10779 */
10780 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10781 {
10782 struct btrfs_block_group_cache *block_group;
10783 struct btrfs_space_info *space_info;
10784 struct btrfs_trans_handle *trans;
10785 int ret = 0;
10786
10787 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
10788 return;
10789
10790 spin_lock(&fs_info->unused_bgs_lock);
10791 while (!list_empty(&fs_info->unused_bgs)) {
10792 u64 start, end;
10793 int trimming;
10794
10795 block_group = list_first_entry(&fs_info->unused_bgs,
10796 struct btrfs_block_group_cache,
10797 bg_list);
10798 list_del_init(&block_group->bg_list);
10799
10800 space_info = block_group->space_info;
10801
10802 if (ret || btrfs_mixed_space_info(space_info)) {
10803 btrfs_put_block_group(block_group);
10804 continue;
10805 }
10806 spin_unlock(&fs_info->unused_bgs_lock);
10807
10808 mutex_lock(&fs_info->delete_unused_bgs_mutex);
10809
10810 /* Don't want to race with allocators so take the groups_sem */
10811 down_write(&space_info->groups_sem);
10812 spin_lock(&block_group->lock);
10813 if (block_group->reserved || block_group->pinned ||
10814 btrfs_block_group_used(&block_group->item) ||
10815 block_group->ro ||
10816 list_is_singular(&block_group->list)) {
10817 /*
10818 * We want to bail if we made new allocations or have
10819 * outstanding allocations in this block group. We do
10820 * the ro check in case balance is currently acting on
10821 * this block group.
10822 */
10823 spin_unlock(&block_group->lock);
10824 up_write(&space_info->groups_sem);
10825 goto next;
10826 }
10827 spin_unlock(&block_group->lock);
10828
10829 /* We don't want to force the issue, only flip if it's ok. */
10830 ret = inc_block_group_ro(block_group, 0);
10831 up_write(&space_info->groups_sem);
10832 if (ret < 0) {
10833 ret = 0;
10834 goto next;
10835 }
10836
10837 /*
10838 * Want to do this before we do anything else so we can recover
10839 * properly if we fail to join the transaction.
10840 */
10841 trans = btrfs_start_trans_remove_block_group(fs_info,
10842 block_group->key.objectid);
10843 if (IS_ERR(trans)) {
10844 btrfs_dec_block_group_ro(block_group);
10845 ret = PTR_ERR(trans);
10846 goto next;
10847 }
10848
10849 /*
10850 * We could have pending pinned extents for this block group,
10851 * just delete them, we don't care about them anymore.
10852 */
10853 start = block_group->key.objectid;
10854 end = start + block_group->key.offset - 1;
10855 /*
10856 * Hold the unused_bg_unpin_mutex lock to avoid racing with
10857 * btrfs_finish_extent_commit(). If we are at transaction N,
10858 * another task might be running finish_extent_commit() for the
10859 * previous transaction N - 1, and have seen a range belonging
10860 * to the block group in freed_extents[] before we were able to
10861 * clear the whole block group range from freed_extents[]. This
10862 * means that task can lookup for the block group after we
10863 * unpinned it from freed_extents[] and removed it, leading to
10864 * a BUG_ON() at btrfs_unpin_extent_range().
10865 */
10866 mutex_lock(&fs_info->unused_bg_unpin_mutex);
10867 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10868 EXTENT_DIRTY);
10869 if (ret) {
10870 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10871 btrfs_dec_block_group_ro(block_group);
10872 goto end_trans;
10873 }
10874 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10875 EXTENT_DIRTY);
10876 if (ret) {
10877 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10878 btrfs_dec_block_group_ro(block_group);
10879 goto end_trans;
10880 }
10881 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10882
10883 /* Reset pinned so btrfs_put_block_group doesn't complain */
10884 spin_lock(&space_info->lock);
10885 spin_lock(&block_group->lock);
10886
10887 space_info->bytes_pinned -= block_group->pinned;
10888 space_info->bytes_readonly += block_group->pinned;
10889 percpu_counter_add(&space_info->total_bytes_pinned,
10890 -block_group->pinned);
10891 block_group->pinned = 0;
10892
10893 spin_unlock(&block_group->lock);
10894 spin_unlock(&space_info->lock);
10895
10896 /* DISCARD can flip during remount */
10897 trimming = btrfs_test_opt(fs_info, DISCARD);
10898
10899 /* Implicit trim during transaction commit. */
10900 if (trimming)
10901 btrfs_get_block_group_trimming(block_group);
10902
10903 /*
10904 * btrfs_remove_chunk() will abort the transaction if things go
10905 * horribly wrong.
10906 */
10907 ret = btrfs_remove_chunk(trans, fs_info,
10908 block_group->key.objectid);
10909
10910 if (ret) {
10911 if (trimming)
10912 btrfs_put_block_group_trimming(block_group);
10913 goto end_trans;
10914 }
10915
10916 /*
10917 * If we're not mounted with -odiscard, we can just forget
10918 * about this block group. Otherwise we'll need to wait
10919 * until transaction commit to do the actual discard.
10920 */
10921 if (trimming) {
10922 spin_lock(&fs_info->unused_bgs_lock);
10923 /*
10924 * A concurrent scrub might have added us to the list
10925 * fs_info->unused_bgs, so use a list_move operation
10926 * to add the block group to the deleted_bgs list.
10927 */
10928 list_move(&block_group->bg_list,
10929 &trans->transaction->deleted_bgs);
10930 spin_unlock(&fs_info->unused_bgs_lock);
10931 btrfs_get_block_group(block_group);
10932 }
10933 end_trans:
10934 btrfs_end_transaction(trans);
10935 next:
10936 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10937 btrfs_put_block_group(block_group);
10938 spin_lock(&fs_info->unused_bgs_lock);
10939 }
10940 spin_unlock(&fs_info->unused_bgs_lock);
10941 }
10942
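/*
 * Create the initial space_info structures: system plus either a mixed
 * metadata+data one or separate metadata and data ones, depending on the
 * MIXED_GROUPS incompat feature.
 */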
10943 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10944 {
10945 struct btrfs_space_info *space_info;
10946 struct btrfs_super_block *disk_super;
10947 u64 features;
10948 u64 flags;
10949 int mixed = 0;
10950 int ret;
10951
10952 disk_super = fs_info->super_copy;
10953 if (!btrfs_super_root(disk_super))
10954 return -EINVAL;
10955
10956 features = btrfs_super_incompat_flags(disk_super);
10957 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10958 mixed = 1;
10959
10960 flags = BTRFS_BLOCK_GROUP_SYSTEM;
10961 ret = create_space_info(fs_info, flags, &space_info);
10962 if (ret)
10963 goto out;
10964
10965 if (mixed) {
10966 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10967 ret = create_space_info(fs_info, flags, &space_info);
10968 } else {
10969 flags = BTRFS_BLOCK_GROUP_METADATA;
10970 ret = create_space_info(fs_info, flags, &space_info);
10971 if (ret)
10972 goto out;
10973
10974 flags = BTRFS_BLOCK_GROUP_DATA;
10975 ret = create_space_info(fs_info, flags, &space_info);
10976 }
10977 out:
10978 return ret;
10979 }
10980
10981 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
10982 u64 start, u64 end)
10983 {
10984 return unpin_extent_range(fs_info, start, end, false);
10985 }
10986
10987 /*
10988 * It used to be that old block groups would be left around forever.
10989 * Iterating over them would be enough to trim unused space. Since we
10990 * now automatically remove them, we also need to iterate over unallocated
10991 * space.
10992 *
10993 * We don't want a transaction for this since the discard may take a
10994 * substantial amount of time. We don't require that a transaction be
10995 * running, but we do need to take a running transaction into account
10996 * to ensure that we're not discarding chunks that were released or
10997 * allocated in the current transaction.
10998 *
10999 * Holding the chunks lock will prevent other threads from allocating
11000 * or releasing chunks, but it won't prevent a running transaction
11001 * from committing and releasing the memory that the pending chunks
11002 * list head uses. For that, we need to take a reference to the
11003 * transaction and hold the commit root sem. We only need to hold
11004 * it while performing the free space search since we have already
11005 * held back allocations.
11006 */
11007 static int btrfs_trim_free_extents(struct btrfs_device *device,
11008 u64 minlen, u64 *trimmed)
11009 {
11010 u64 start = 0, len = 0;
11011 int ret;
11012
11013 *trimmed = 0;
11014
11015 /* Discard not supported = nothing to do. */
11016 if (!blk_queue_discard(bdev_get_queue(device->bdev)))
11017 return 0;
11018
11019 /* Not writeable = nothing to do. */
11020 if (!device->writeable)
11021 return 0;
11022
11023 /* No free space = nothing to do. */
11024 if (device->total_bytes <= device->bytes_used)
11025 return 0;
11026
11027 ret = 0;
11028
11029 while (1) {
11030 struct btrfs_fs_info *fs_info = device->fs_info;
11031 struct btrfs_transaction *trans;
11032 u64 bytes;
11033
11034 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
11035 if (ret)
11036 break;
11037
11038 ret = down_read_killable(&fs_info->commit_root_sem);
11039 if (ret) {
11040 mutex_unlock(&fs_info->chunk_mutex);
11041 break;
11042 }
11043
11044 spin_lock(&fs_info->trans_lock);
11045 trans = fs_info->running_transaction;
11046 if (trans)
11047 refcount_inc(&trans->use_count);
11048 spin_unlock(&fs_info->trans_lock);
11049
11050 if (!trans)
11051 up_read(&fs_info->commit_root_sem);
11052
11053 ret = find_free_dev_extent_start(trans, device, minlen, start,
11054 &start, &len);
11055 if (trans) {
11056 up_read(&fs_info->commit_root_sem);
11057 btrfs_put_transaction(trans);
11058 }
11059
11060 if (ret) {
11061 mutex_unlock(&fs_info->chunk_mutex);
11062 if (ret == -ENOSPC)
11063 ret = 0;
11064 break;
11065 }
11066
11067 ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
11068 mutex_unlock(&fs_info->chunk_mutex);
11069
11070 if (ret)
11071 break;
11072
11073 start += len;
11074 *trimmed += bytes;
11075
11076 if (fatal_signal_pending(current)) {
11077 ret = -ERESTARTSYS;
11078 break;
11079 }
11080
11081 cond_resched();
11082 }
11083
11084 return ret;
11085 }
11086
11087 /*
11088 * Trim the whole filesystem by:
11089 * 1) trimming the free space in each block group
11090 * 2) trimming the unallocated space on each device
11091 *
11092 * This will also continue trimming even if a block group or device encounters
11093 * an error. The return value will be the last error, or 0 if nothing bad
11094 * happens.
11095 */
11096 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
11097 {
11098 struct btrfs_block_group_cache *cache = NULL;
11099 struct btrfs_device *device;
11100 struct list_head *devices;
11101 u64 group_trimmed;
11102 u64 start;
11103 u64 end;
11104 u64 trimmed = 0;
11105 u64 bg_failed = 0;
11106 u64 dev_failed = 0;
11107 int bg_ret = 0;
11108 int dev_ret = 0;
11109 int ret = 0;
11110
11111 cache = btrfs_lookup_first_block_group(fs_info, range->start);
11112 for (; cache; cache = next_block_group(fs_info, cache)) {
11113 if (cache->key.objectid >= (range->start + range->len)) {
11114 btrfs_put_block_group(cache);
11115 break;
11116 }
11117
11118 start = max(range->start, cache->key.objectid);
11119 end = min(range->start + range->len,
11120 cache->key.objectid + cache->key.offset);
11121
11122 if (end - start >= range->minlen) {
11123 if (!block_group_cache_done(cache)) {
11124 ret = cache_block_group(cache, 0);
11125 if (ret) {
11126 bg_failed++;
11127 bg_ret = ret;
11128 continue;
11129 }
11130 ret = wait_block_group_cache_done(cache);
11131 if (ret) {
11132 bg_failed++;
11133 bg_ret = ret;
11134 continue;
11135 }
11136 }
11137 ret = btrfs_trim_block_group(cache,
11138 &group_trimmed,
11139 start,
11140 end,
11141 range->minlen);
11142
11143 trimmed += group_trimmed;
11144 if (ret) {
11145 bg_failed++;
11146 bg_ret = ret;
11147 continue;
11148 }
11149 }
11150 }
11151
11152 if (bg_failed)
11153 btrfs_warn(fs_info,
11154 "failed to trim %llu block group(s), last error %d",
11155 bg_failed, bg_ret);
11156 mutex_lock(&fs_info->fs_devices->device_list_mutex);
11157 devices = &fs_info->fs_devices->devices;
11158 list_for_each_entry(device, devices, dev_list) {
11159 ret = btrfs_trim_free_extents(device, range->minlen,
11160 &group_trimmed);
11161 if (ret) {
11162 dev_failed++;
11163 dev_ret = ret;
11164 break;
11165 }
11166
11167 trimmed += group_trimmed;
11168 }
11169 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
11170
11171 if (dev_failed)
11172 btrfs_warn(fs_info,
11173 "failed to trim %llu device(s), last error %d",
11174 dev_failed, dev_ret);
11175 range->len = trimmed;
11176 if (bg_ret)
11177 return bg_ret;
11178 return dev_ret;
11179 }
11180
11181 /*
11182 * btrfs_{start,end}_write_no_snapshotting() are similar to
11183 * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
11184 * data into the page cache through nocow before the subvolume is snapshotted,
11185 * but flush the data to disk after the snapshot creation, or to prevent
11186 * operations while snapshotting is ongoing that would cause the snapshot to be
11187 * inconsistent (writes followed by expanding truncates for example).
11188 */
11189 void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
11190 {
11191 percpu_counter_dec(&root->subv_writers->counter);
11192 /*
11193 * Make sure counter is updated before we wake up waiters.
11194 */
11195 smp_mb();
11196 if (waitqueue_active(&root->subv_writers->wait))
11197 wake_up(&root->subv_writers->wait);
11198 }
11199
11200 int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
11201 {
11202 if (atomic_read(&root->will_be_snapshotted))
11203 return 0;
11204
11205 percpu_counter_inc(&root->subv_writers->counter);
11206 /*
11207 * Make sure counter is updated before we check for snapshot creation.
11208 */
11209 smp_mb();
11210 if (atomic_read(&root->will_be_snapshotted)) {
11211 btrfs_end_write_no_snapshotting(root);
11212 return 0;
11213 }
11214 return 1;
11215 }
11216
11217 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11218 {
11219 while (true) {
11220 int ret;
11221
11222 ret = btrfs_start_write_no_snapshotting(root);
11223 if (ret)
11224 break;
11225 wait_on_atomic_t(&root->will_be_snapshotted, atomic_t_wait,
11226 TASK_UNINTERRUPTIBLE);
11227 }
11228 }