// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "misc.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
#include "delalloc-space.h"
#include "block-group.h"

#undef SCRAMBLE_DELAYED_REFS

static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);

static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
			      u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE);
	set_extent_bits(&fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE);
	return 0;
}

void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 start, end;

	start = cache->start;
	end = start + cache->length - 1;

	clear_extent_bits(&fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE);
	clear_extent_bits(&fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE);
}

static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
{
	if (ref->type == BTRFS_REF_METADATA) {
		if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
			return BTRFS_BLOCK_GROUP_SYSTEM;
		else
			return BTRFS_BLOCK_GROUP_METADATA;
	}
	return BTRFS_BLOCK_GROUP_DATA;
}

static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
			     struct btrfs_ref *ref)
{
	struct btrfs_space_info *space_info;
	u64 flags = generic_ref_to_space_flags(ref);

	space_info = btrfs_find_space_info(fs_info, flags);
	ASSERT(space_info);
	percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
		    BTRFS_TOTAL_BYTES_PINNED_BATCH);
}

static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
			     struct btrfs_ref *ref)
{
	struct btrfs_space_info *space_info;
	u64 flags = generic_ref_to_space_flags(ref);

	space_info = btrfs_find_space_info(fs_info, flags);
	ASSERT(space_info);
	percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
		    BTRFS_TOTAL_BYTES_PINNED_BATCH);
}

/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * The head node for a delayed ref stores the sum of all the reference
 * count modifications queued up in the rbtree. The head node may also
 * store the extent flags to set. This way you can check what the
 * reference count and extent flags will be once all of the delayed
 * refs have been processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
			ret = -EINVAL;
			btrfs_print_v0_err(fs_info);
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);

			goto out_free;
		}

		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			refcount_inc(&head->refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and try
			 * again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref_head(head);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs is optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back refs is for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back refs. Actually the full back refs is generic, and
 * can be used in all cases where the implicit back refs is used. The major
 * shortcoming of the full back refs is its overhead. Every time a tree
 * block gets COWed, we have to update the back refs entry for all pointers
 * in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs is used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs is used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf
 *
 * When a file extent is allocated, the implicit back refs is used.
 * the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is
 * the objectid of the block's owner tree. The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs is used, information about the lowest key and
 * level of the tree block are required. This information is stored in
 * the tree block info structure.
 */

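/*
 * Illustrative example (the numbers are made up, not taken from any real
 * filesystem): a data extent at bytenr 13631488, written by inode 257 at
 * file offset 0 in subvolume 5, gets the implicit back ref key
 *
 *     (13631488, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(5, 257, 0))
 *
 * while the same extent referenced through a shared parent leaf at bytenr
 * 30408704 would instead use the full back ref key
 *
 *     (13631488, BTRFS_SHARED_DATA_REF_KEY, 30408704)
 */
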
/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     struct btrfs_extent_inline_ref *iref,
				     enum btrfs_inline_ref_type is_data)
{
	int type = btrfs_extent_inline_ref_type(eb, iref);
	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_DATA_REF_KEY ||
	    type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (is_data == BTRFS_REF_TYPE_BLOCK) {
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree
				 * block, which must be aligned to
				 * nodesize.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->nodesize))
					return type;
			}
		} else if (is_data == BTRFS_REF_TYPE_DATA) {
			if (type == BTRFS_EXTENT_DATA_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_DATA_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree
				 * block, which must be aligned to
				 * nodesize.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->nodesize))
					return type;
			}
		} else {
			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
			return type;
		}
	}

	btrfs_print_leaf((struct extent_buffer *)eb);
	btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
		  eb->start, type);
	WARN_ON(1);

	return BTRFS_REF_TYPE_INVALID;
}

u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}

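/*
 * Note: high_crc is shifted by 31, not 32, so the two crc32c values overlap
 * by one bit in the combined hash. Since this hash becomes the key offset of
 * on-disk back ref items, the computation is part of the disk format and
 * must not change.
 */
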
static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
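		/*
		 * The key offset is only a hash of (root, owner, offset), so
		 * different refs can collide. On -EEXIST with a non-matching
		 * ref, bump the key offset and retry, so colliding refs end
		 * up at consecutive key offsets.
		 */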
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   int refs_to_drop, int *last_ref)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
		btrfs_print_v0_err(trans->fs_info);
		btrfs_abort_transaction(trans, -EINVAL);
		return -EINVAL;
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
		*last_ref = 1;
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;
	int type;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
	if (iref) {
		/*
		 * If type is invalid, we should have bailed out earlier than
		 * this call.
		 */
		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
		ASSERT(type != BTRFS_REF_TYPE_INVALID);
		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
				      path, &key, 0);
	btrfs_release_path(path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

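/*
 * Summary of the mapping above: owners below BTRFS_FIRST_FREE_OBJECTID are
 * tree levels (metadata), anything else is an inode number (data). A
 * non-zero parent selects the shared (full) back ref variant:
 *
 *     metadata, parent == 0  ->  BTRFS_TREE_BLOCK_REF_KEY
 *     metadata, parent  > 0  ->  BTRFS_SHARED_BLOCK_REF_KEY
 *     data,     parent == 0  ->  BTRFS_EXTENT_DATA_REF_KEY
 *     data,     parent  > 0  ->  BTRFS_SHARED_DATA_REF_KEY
 */
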
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
	int needed;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our level, so we can just add one to get the level for the
	 * block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret && !insert) {
		err = -ENOENT;
		goto out;
	} else if (WARN_ON(ret)) {
		err = -EIO;
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (unlikely(item_size < sizeof(*ei))) {
		err = -EINVAL;
		btrfs_print_v0_err(fs_info);
		btrfs_abort_transaction(trans, err);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
		needed = BTRFS_REF_TYPE_DATA;
	else
		needed = BTRFS_REF_TYPE_BLOCK;

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
		if (type == BTRFS_REF_TYPE_INVALID) {
			err = -EUCLEAN;
			goto out;
		}

		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

/*
 * helper to add new inline back ref
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

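	/*
	 * btrfs_extend_item() grew the item by 'size' bytes at its end; if
	 * the new ref does not belong at the very end, shift everything
	 * after the insertion point up to open a 'size'-byte gap at 'ptr'.
	 */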
	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset);
	}
	return ret;
}

/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack
void update_inline_extent_backref(struct btrfs_path *path,
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
				  struct btrfs_delayed_extent_op *extent_op,
				  int *last_ref)
{
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	/*
	 * If type is invalid, we should have bailed out after
	 * lookup_inline_extent_backref().
	 */
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
	ASSERT(type != BTRFS_REF_TYPE_INVALID);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		*last_ref = 1;
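		/*
		 * This was the last count on this inline ref: shift the tail
		 * of the item down over the ref and shrink the item to drop
		 * it entirely.
		 */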
		size = btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		btrfs_truncate_item(path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 1);
	if (ret == 0) {
		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
		update_inline_extent_backref(path, iref, refs_to_add,
					     extent_op, NULL);
	} else if (ret == -ENOENT) {
		setup_inline_extent_backref(trans->fs_info, path, iref, parent,
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
	}
	return ret;
}

static int insert_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 u64 bytenr, u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add)
{
	int ret;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		BUG_ON(refs_to_add != 1);
		ret = insert_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = insert_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset,
					     refs_to_add);
	}
	return ret;
}

static int remove_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_drop, int is_data, int *last_ref)
{
	int ret = 0;

	BUG_ON(!is_data && refs_to_drop != 1);
	if (iref) {
		update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
					     last_ref);
	} else if (is_data) {
		ret = remove_extent_data_ref(trans, path, refs_to_drop,
					     last_ref);
	} else {
		*last_ref = 1;
		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
	}
	return ret;
}

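/*
 * Issue a discard for the byte range [start, start + len) on @bdev, skipping
 * any parts that overlap one of the device's superblock mirrors. The range is
 * first trimmed to 512-byte alignment; the number of bytes actually discarded
 * is returned in *discarded_bytes.
 */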
static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
			       u64 *discarded_bytes)
{
	int j, ret = 0;
	u64 bytes_left, end;
	u64 aligned_start = ALIGN(start, 1 << 9);

	if (WARN_ON(start != aligned_start)) {
		len -= aligned_start - start;
		len = round_down(len, 1 << 9);
		start = aligned_start;
	}

	*discarded_bytes = 0;

	if (!len)
		return 0;

	end = start + len;
	bytes_left = len;

	/* Skip any superblocks on this device. */
	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
		u64 sb_start = btrfs_sb_offset(j);
		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
		u64 size = sb_start - start;

		if (!in_range(sb_start, start, bytes_left) &&
		    !in_range(sb_end, start, bytes_left) &&
		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
			continue;

		/*
		 * Superblock spans beginning of range.  Adjust start and
		 * try again.
		 */
		if (sb_start <= start) {
			start += sb_end - start;
			if (start > end) {
				bytes_left = 0;
				break;
			}
			bytes_left = end - start;
			continue;
		}

		if (size) {
			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
						   GFP_NOFS, 0);
			if (!ret)
				*discarded_bytes += size;
			else if (ret != -EOPNOTSUPP)
				return ret;
		}

		start = sb_end;
		if (start > end) {
			bytes_left = 0;
			break;
		}
		bytes_left = end - start;
	}

	if (bytes_left) {
		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
					   GFP_NOFS, 0);
		if (!ret)
			*discarded_bytes += bytes_left;
	}
	return ret;
}

int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
			 u64 num_bytes, u64 *actual_bytes)
{
	int ret = 0;
	u64 discarded_bytes = 0;
	u64 end = bytenr + num_bytes;
	u64 cur = bytenr;
	struct btrfs_bio *bbio = NULL;

	/*
	 * Avoid races with device replace and make sure our bbio has devices
	 * associated to its stripes that don't go away while we are discarding.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	while (cur < end) {
		struct btrfs_bio_stripe *stripe;
		int i;

		num_bytes = end - cur;
		/* Tell the block device(s) that the sectors can be discarded */
		ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
				      &num_bytes, &bbio, 0);
		/*
		 * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
		 * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
		 * thus we can't continue anyway.
		 */
		if (ret < 0)
			goto out;

		stripe = bbio->stripes;
		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
			u64 bytes;
			struct request_queue *req_q;

			if (!stripe->dev->bdev) {
				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
				continue;
			}
			req_q = bdev_get_queue(stripe->dev->bdev);
			if (!blk_queue_discard(req_q))
				continue;

			ret = btrfs_issue_discard(stripe->dev->bdev,
						  stripe->physical,
						  stripe->length,
						  &bytes);
			if (!ret) {
				discarded_bytes += bytes;
			} else if (ret != -EOPNOTSUPP) {
				/*
				 * Logic errors or -ENOMEM, or -EIO, but
				 * unlikely to happen.
				 *
				 * And since there are two loops, explicitly
				 * go to out to avoid confusion.
				 */
				btrfs_put_bbio(bbio);
				goto out;
			}

			/*
			 * Just in case we get back EOPNOTSUPP for some reason,
			 * just ignore the return value so we don't screw up
			 * people calling discard_extent.
			 */
			ret = 0;
		}
		btrfs_put_bbio(bbio);
		cur += num_bytes;
	}
out:
	btrfs_bio_counter_dec(fs_info);

	if (actual_bytes)
		*actual_bytes = discarded_bytes;

	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret;
}

/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_ref *generic_ref)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int old_ref_mod, new_ref_mod;
	int ret;

	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
	       generic_ref->action);
	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
	       generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);

	if (generic_ref->type == BTRFS_REF_METADATA)
		ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
				NULL, &old_ref_mod, &new_ref_mod);
	else
		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
						 &old_ref_mod, &new_ref_mod);

	btrfs_ref_tree_mod(fs_info, generic_ref);

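	/*
	 * A ref_mod that went from negative to non-negative means this extent
	 * is no longer expected to be freed, so undo the pinned-bytes
	 * accounting that was done when the net ref count went negative.
	 */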
	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
		sub_pinned_bytes(fs_info, generic_ref);

	return ret;
}

/*
 * __btrfs_inc_extent_ref - insert backreference for a given extent
 *
 * @trans:	    Handle of transaction
 *
 * @node:	    The delayed ref node used to get the bytenr/length for
 *		    extent whose references are incremented.
 *
 * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
 *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
 *		    bytenr of the parent block. Since new extents are always
 *		    created with indirect references, this will only be the case
 *		    when relocating a shared extent. In that case, root_objectid
 *		    will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
 *		    be 0
 *
 * @root_objectid:  The id of the root where this modification has originated,
 *		    this can be either one of the well-known metadata trees or
 *		    the subvolume id which references this extent.
 *
 * @owner:	    For data extents it is the inode number of the owning file.
 *		    For metadata extents this parameter holds the level in the
 *		    tree of the extent.
 *
 * @offset:	    For metadata extents the offset is ignored and is currently
 *		    always passed as 0. For data extents it is the fileoffset
 *		    this extent belongs to.
 *
 * @refs_to_add     Number of references to add
 *
 * @extent_op       Pointer to a structure, holding information necessary when
 *                  updating a tree block's flags
 *
 */
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_delayed_ref_node *node,
				  u64 parent, u64 root_objectid,
				  u64 owner, u64 offset, int refs_to_add,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	struct btrfs_key key;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	u64 refs;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	path->leave_spinning = 1;
	/* this will setup the path even if it fails to insert the back ref */
	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
					   parent, root_objectid, owner,
					   offset, refs_to_add, extent_op);
	if ((ret < 0 && ret != -EAGAIN) || !ret)
		goto out;

	/*
	 * Ok we had -EAGAIN which means we didn't have space to insert an
	 * inline extent ref, so just update the reference count and add a
	 * normal backref.
	 */
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, item);
	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, item);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	path->reada = READA_FORWARD;
	path->leave_spinning = 1;
	/* now insert the actual backref */
	ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
				    owner, offset, refs_to_add);
	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	btrfs_free_path(path);
	return ret;
}

static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_key ins;
	u64 parent = 0;
	u64 ref_root = 0;
	u64 flags = 0;

	ins.objectid = node->bytenr;
	ins.offset = node->num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	ref = btrfs_delayed_node_to_data_ref(node);
	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);

	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
		parent = ref->parent;
	ref_root = ref->root;

	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		if (extent_op)
			flags |= extent_op->flags_to_set;
		ret = alloc_reserved_file_extent(trans, parent, ref_root,
						 flags, ref->objectid,
						 ref->offset, &ins,
						 node->ref_mod);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
					     ref->objectid, ref->offset,
					     node->ref_mod, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, node, parent,
					  ref_root, ref->objectid,
					  ref->offset, node->ref_mod,
					  extent_op);
	} else {
		BUG();
	}
	return ret;
}

static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei)
{
	u64 flags = btrfs_extent_flags(leaf, ei);
	if (extent_op->update_flags) {
		flags |= extent_op->flags_to_set;
		btrfs_set_extent_flags(leaf, ei, flags);
	}

	if (extent_op->update_key) {
		struct btrfs_tree_block_info *bi;
		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
	}
}

static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
				 struct btrfs_delayed_ref_head *head,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	u32 item_size;
	int ret;
	int err = 0;
	int metadata = !extent_op->is_data;

	if (trans->aborted)
		return 0;

	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		metadata = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = head->bytenr;

	if (metadata) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = extent_op->level;
	} else {
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = head->num_bytes;
	}

again:
	path->reada = READA_FORWARD;
	path->leave_spinning = 1;
	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	if (ret > 0) {
		if (metadata) {
			if (path->slots[0] > 0) {
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == head->bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == head->num_bytes)
					ret = 0;
			}
			if (ret > 0) {
				btrfs_release_path(path);
				metadata = 0;

				key.objectid = head->bytenr;
				key.offset = head->num_bytes;
				key.type = BTRFS_EXTENT_ITEM_KEY;
				goto again;
			}
		} else {
			err = -EIO;
			goto out;
		}
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);

	if (unlikely(item_size < sizeof(*ei))) {
		err = -EINVAL;
		btrfs_print_v0_err(fs_info);
		btrfs_abort_transaction(trans, err);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	__run_delayed_extent_op(extent_op, leaf, ei);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return err;
}

static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_tree_ref *ref;
	u64 parent = 0;
	u64 ref_root = 0;

	ref = btrfs_delayed_node_to_tree_ref(node);
	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		parent = ref->parent;
	ref_root = ref->root;

	if (node->ref_mod != 1) {
		btrfs_err(trans->fs_info,
	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
			  node->bytenr, node->ref_mod, node->action, ref_root,
			  parent);
		return -EIO;
	}
	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		BUG_ON(!extent_op || !extent_op->update_flags);
		ret = alloc_reserved_tree_block(trans, node, extent_op);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
					     ref->level, 0, 1, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, node, parent, ref_root,
					  ref->level, 0, 1, extent_op);
	} else {
		BUG();
	}
	return ret;
}

1697/* helper function to actually process a single delayed ref entry */
5d4f98a2 1698static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
1699 struct btrfs_delayed_ref_node *node,
1700 struct btrfs_delayed_extent_op *extent_op,
1701 int insert_reserved)
56bec294 1702{
79787eaa
JM
1703 int ret = 0;
1704
857cc2fc
JB
1705 if (trans->aborted) {
1706 if (insert_reserved)
5fac7f9e 1707 btrfs_pin_extent(trans->fs_info, node->bytenr,
857cc2fc 1708 node->num_bytes, 1);
79787eaa 1709 return 0;
857cc2fc 1710 }
79787eaa 1711
5d4f98a2
YZ
1712 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1713 node->type == BTRFS_SHARED_BLOCK_REF_KEY)
f97806f2 1714 ret = run_delayed_tree_ref(trans, node, extent_op,
5d4f98a2
YZ
1715 insert_reserved);
1716 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1717 node->type == BTRFS_SHARED_DATA_REF_KEY)
2bf98ef3 1718 ret = run_delayed_data_ref(trans, node, extent_op,
5d4f98a2
YZ
1719 insert_reserved);
1720 else
1721 BUG();
80ee54bf
JB
1722 if (ret && insert_reserved)
1723 btrfs_pin_extent(trans->fs_info, node->bytenr,
1724 node->num_bytes, 1);
5d4f98a2 1725 return ret;
56bec294
CM
1726}
1727
c6fc2454 1728static inline struct btrfs_delayed_ref_node *
56bec294
CM
1729select_delayed_ref(struct btrfs_delayed_ref_head *head)
1730{
cffc3374
FM
1731 struct btrfs_delayed_ref_node *ref;
1732
e3d03965 1733 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
c6fc2454 1734 return NULL;
d7df2c79 1735
cffc3374
FM
1736 /*
1737 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1738 * This is to prevent a ref count from going down to zero, which deletes
1739 * the extent item from the extent tree, when there still are references
1740 * to add, which would fail because they would not find the extent item.
1741 */
1d57ee94
WX
1742 if (!list_empty(&head->ref_add_list))
1743 return list_first_entry(&head->ref_add_list,
1744 struct btrfs_delayed_ref_node, add_list);
1745
e3d03965 1746 ref = rb_entry(rb_first_cached(&head->ref_tree),
0e0adbcf 1747 struct btrfs_delayed_ref_node, ref_node);
1d57ee94
WX
1748 ASSERT(list_empty(&ref->add_list));
1749 return ref;
56bec294
CM
1750}
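
/*
 * Illustrative example (added commentary, not from the original source):
 * suppose a head carries both a pending drop and a pending add for an
 * extent whose on-disk refcount is currently 1.  Because the add_list is
 * consulted first, the refs are played as
 *
 *	ADD  -> refcount 1 + 1 = 2
 *	DROP -> refcount 2 - 1 = 1
 *
 * rather than DROP first, which would hit 0, delete the extent item, and
 * make the following ADD fail to find it.
 */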

static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
				      struct btrfs_delayed_ref_head *head)
{
	spin_lock(&delayed_refs->lock);
	head->processing = 0;
	delayed_refs->num_heads_ready++;
	spin_unlock(&delayed_refs->lock);
	btrfs_delayed_ref_unlock(head);
}

static struct btrfs_delayed_extent_op *cleanup_extent_op(
				struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op = head->extent_op;

	if (!extent_op)
		return NULL;

	if (head->must_insert_reserved) {
		head->extent_op = NULL;
		btrfs_free_delayed_extent_op(extent_op);
		return NULL;
	}
	return extent_op;
}

static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = cleanup_extent_op(head);
	if (!extent_op)
		return 0;
	head->extent_op = NULL;
	spin_unlock(&head->lock);
	ret = run_delayed_extent_op(trans, head, extent_op);
	btrfs_free_delayed_extent_op(extent_op);
	return ret ? ret : 1;
}

void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
				  struct btrfs_delayed_ref_root *delayed_refs,
				  struct btrfs_delayed_ref_head *head)
{
	int nr_items = 1;	/* Dropping this ref head update. */

	if (head->total_ref_mod < 0) {
		struct btrfs_space_info *space_info;
		u64 flags;

		if (head->is_data)
			flags = BTRFS_BLOCK_GROUP_DATA;
		else if (head->is_system)
			flags = BTRFS_BLOCK_GROUP_SYSTEM;
		else
			flags = BTRFS_BLOCK_GROUP_METADATA;
		space_info = btrfs_find_space_info(fs_info, flags);
		ASSERT(space_info);
		percpu_counter_add_batch(&space_info->total_bytes_pinned,
					 -head->num_bytes,
					 BTRFS_TOTAL_BYTES_PINNED_BATCH);

		/*
		 * We had csum deletions accounted for in our delayed refs rsv,
		 * we need to drop the csum leaves for this update from our
		 * delayed_refs_rsv.
		 */
		if (head->is_data) {
			spin_lock(&delayed_refs->lock);
			delayed_refs->pending_csums -= head->num_bytes;
			spin_unlock(&delayed_refs->lock);
			nr_items += btrfs_csum_bytes_to_leaves(fs_info,
							       head->num_bytes);
		}
	}

	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
}

static int cleanup_ref_head(struct btrfs_trans_handle *trans,
			    struct btrfs_delayed_ref_head *head)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	ret = run_and_cleanup_extent_op(trans, head);
	if (ret < 0) {
		unselect_delayed_ref_head(delayed_refs, head);
		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
		return ret;
	} else if (ret) {
		return ret;
	}

	/*
	 * Need to drop our head ref lock and re-acquire the delayed ref lock
	 * and then re-check to make sure nobody got added.
	 */
	spin_unlock(&head->lock);
	spin_lock(&delayed_refs->lock);
	spin_lock(&head->lock);
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
		spin_unlock(&head->lock);
		spin_unlock(&delayed_refs->lock);
		return 1;
	}
	btrfs_delete_ref_head(delayed_refs, head);
	spin_unlock(&head->lock);
	spin_unlock(&delayed_refs->lock);

	if (head->must_insert_reserved) {
		btrfs_pin_extent(fs_info, head->bytenr,
				 head->num_bytes, 1);
		if (head->is_data) {
			ret = btrfs_del_csums(trans, fs_info->csum_root,
					      head->bytenr, head->num_bytes);
		}
	}

	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);

	trace_run_delayed_ref_head(fs_info, head, 0);
	btrfs_delayed_ref_unlock(head);
	btrfs_put_delayed_ref_head(head);
	return 0;
}
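
/*
 * Added note: must_insert_reserved means the extent was reserved at
 * allocation time and the delayed ADD ref was expected to insert the
 * extent item itself.  When the head is discarded without that insertion
 * happening, cleanup_ref_head() has to pin the space again and, for data
 * extents, delete any csums already recorded for that byte range, which
 * is what the must_insert_reserved branch above does.
 */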

static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
					struct btrfs_trans_handle *trans)
{
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;
	struct btrfs_delayed_ref_head *head = NULL;
	int ret;

	spin_lock(&delayed_refs->lock);
	head = btrfs_select_ref_head(delayed_refs);
	if (!head) {
		spin_unlock(&delayed_refs->lock);
		return head;
	}

	/*
	 * Grab the lock that says we are going to process all the refs for
	 * this head
	 */
	ret = btrfs_delayed_ref_lock(delayed_refs, head);
	spin_unlock(&delayed_refs->lock);

	/*
	 * We may have dropped the spin lock to get the head mutex lock, and
	 * that might have given someone else time to free the head.  If that's
	 * true, it has been removed from our list and we can move on.
	 */
	if (ret == -EAGAIN)
		head = ERR_PTR(-EAGAIN);

	return head;
}

static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
				    struct btrfs_delayed_ref_head *locked_ref,
				    unsigned long *run_refs)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_delayed_ref_node *ref;
	int must_insert_reserved = 0;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	lockdep_assert_held(&locked_ref->mutex);
	lockdep_assert_held(&locked_ref->lock);

	while ((ref = select_delayed_ref(locked_ref))) {
		if (ref->seq &&
		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
			spin_unlock(&locked_ref->lock);
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			return -EAGAIN;
		}

		(*run_refs)++;
		ref->in_tree = 0;
		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
		RB_CLEAR_NODE(&ref->ref_node);
		if (!list_empty(&ref->add_list))
			list_del(&ref->add_list);
		/*
		 * When we play the delayed ref, also correct the ref_mod on
		 * head
		 */
		switch (ref->action) {
		case BTRFS_ADD_DELAYED_REF:
		case BTRFS_ADD_DELAYED_EXTENT:
			locked_ref->ref_mod -= ref->ref_mod;
			break;
		case BTRFS_DROP_DELAYED_REF:
			locked_ref->ref_mod += ref->ref_mod;
			break;
		default:
			WARN_ON(1);
		}
		atomic_dec(&delayed_refs->num_entries);

		/*
		 * Record the must_insert_reserved flag before we drop the
		 * spin lock.
		 */
		must_insert_reserved = locked_ref->must_insert_reserved;
		locked_ref->must_insert_reserved = 0;

		extent_op = locked_ref->extent_op;
		locked_ref->extent_op = NULL;
		spin_unlock(&locked_ref->lock);

		ret = run_one_delayed_ref(trans, ref, extent_op,
					  must_insert_reserved);

		btrfs_free_delayed_extent_op(extent_op);
		if (ret) {
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			btrfs_put_delayed_ref(ref);
			btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
				    ret);
			return ret;
		}

		btrfs_put_delayed_ref(ref);
		cond_resched();

		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
	}

	return 0;
}

/*
 * Returns 0 on success or if called with an already aborted transaction.
 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
 */
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
					     unsigned long nr)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *locked_ref = NULL;
	ktime_t start = ktime_get();
	int ret;
	unsigned long count = 0;
	unsigned long actual_count = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	do {
		if (!locked_ref) {
			locked_ref = btrfs_obtain_ref_head(trans);
			if (IS_ERR_OR_NULL(locked_ref)) {
				if (PTR_ERR(locked_ref) == -EAGAIN) {
					continue;
				} else {
					break;
				}
			}
			count++;
		}
		/*
		 * We need to try and merge add/drops of the same ref since we
		 * can run into issues with relocate dropping the implicit ref
		 * and then it being added back again before the drop can
		 * finish.  If we merged anything we need to re-loop so we can
		 * get a good ref.
		 * Or we can get node references of the same type that weren't
		 * merged when created due to bumps in the tree mod seq, and
		 * we need to merge them to prevent adding an inline extent
		 * backref before dropping it (triggering a BUG_ON at
		 * insert_inline_extent_backref()).
		 */
		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);

		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
						      &actual_count);
		if (ret < 0 && ret != -EAGAIN) {
			/*
			 * Error, btrfs_run_delayed_refs_for_head already
			 * unlocked everything so just bail out
			 */
			return ret;
		} else if (!ret) {
			/*
			 * Success, perform the usual cleanup of a processed
			 * head
			 */
			ret = cleanup_ref_head(trans, locked_ref);
			if (ret > 0) {
				/* We dropped our lock, we need to loop. */
				ret = 0;
				continue;
			} else if (ret) {
				return ret;
			}
		}

		/*
		 * Either success case or btrfs_run_delayed_refs_for_head
		 * returned -EAGAIN, meaning we need to select another head
		 */

		locked_ref = NULL;
		cond_resched();
	} while ((nr != -1 && count < nr) || locked_ref);

	/*
	 * We don't want to include ref heads since we can have empty ref heads
	 * and those will drastically skew our runtime down since we just do
	 * accounting, no actual extent tree updates.
	 */
	if (actual_count > 0) {
		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
		u64 avg;

		/*
		 * We weigh the current average higher than our current runtime
		 * to avoid large swings in the average.
		 */
		spin_lock(&delayed_refs->lock);
		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
		spin_unlock(&delayed_refs->lock);
	}
	return 0;
}
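
/*
 * Worked example (added commentary): the average updated at the end of
 * __btrfs_run_delayed_refs() is an exponentially weighted moving average,
 *
 *	avg = (3 * old_avg + runtime) / 4
 *
 * e.g. old_avg = 100us and runtime = 20us give a new average of 80us, so a
 * single unusually fast or slow batch only moves the estimate by a quarter
 * of the difference.
 */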

#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int alt = 1;
	u64 middle;
	u64 first = 0, last = 0;

	n = rb_first(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		first = entry->bytenr;
	}
	n = rb_last(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		last = entry->bytenr;
	}
	n = root->rb_node;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif

static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
{
	u64 num_bytes;

	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
			     sizeof(struct btrfs_extent_inline_ref));
	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		num_bytes += heads * sizeof(struct btrfs_tree_block_info);

	/*
	 * We don't ever fill up leaves all the way so multiply by 2 just to be
	 * closer to what we're really going to want to use.
	 */
	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
}

/*
 * Takes the number of bytes to be checksummed and figures out how many
 * leaves it would require to store the csums for that many bytes.
 */
u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
{
	u64 csum_size;
	u64 num_csums_per_leaf;
	u64 num_csums;

	csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
	num_csums_per_leaf = div64_u64(csum_size,
			(u64)btrfs_super_csum_size(fs_info->super_copy));
	num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
	num_csums += num_csums_per_leaf - 1;
	num_csums = div64_u64(num_csums, num_csums_per_leaf);
	return num_csums;
}
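
/*
 * Worked example (added commentary; the exact numbers depend on the node
 * size and checksum type): with 4KiB sectors, 4-byte crc32c checksums and
 * a 16KiB nodesize, BTRFS_MAX_ITEM_SIZE() is a little under 16KiB, so one
 * leaf holds roughly 4000 csums and covers about 16MiB of data.
 * Checksumming 1GiB is 262144 csums, which rounds up to about 66 leaves.
 */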

/*
 * this starts processing the delayed reference count updates and
 * extent insertions we have queued up so far.  count can be
 * 0, which means to process everything in the tree at the start
 * of the run (but not newly added entries), or it can be some target
 * number you'd like to process.
 *
 * Returns 0 on success or if called with an aborted transaction
 * Returns <0 on error and aborts the transaction
 */
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
			   unsigned long count)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct rb_node *node;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *head;
	int ret;
	int run_all = count == (unsigned long)-1;

	/* We'll clean this up in btrfs_cleanup_transaction */
	if (trans->aborted)
		return 0;

	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
		return 0;

	delayed_refs = &trans->transaction->delayed_refs;
	if (count == 0)
		count = atomic_read(&delayed_refs->num_entries) * 2;

again:
#ifdef SCRAMBLE_DELAYED_REFS
	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
	ret = __btrfs_run_delayed_refs(trans, count);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (run_all) {
		btrfs_create_pending_block_groups(trans);

		spin_lock(&delayed_refs->lock);
		node = rb_first_cached(&delayed_refs->href_root);
		if (!node) {
			spin_unlock(&delayed_refs->lock);
			goto out;
		}
		head = rb_entry(node, struct btrfs_delayed_ref_head,
				href_node);
		refcount_inc(&head->refs);
		spin_unlock(&delayed_refs->lock);

		/* Mutex was contended, block until it's released and retry. */
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);

		btrfs_put_delayed_ref_head(head);
		cond_resched();
		goto again;
	}
out:
	return 0;
}

int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
				u64 bytenr, u64 num_bytes, u64 flags,
				int level, int is_data)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = btrfs_alloc_delayed_extent_op();
	if (!extent_op)
		return -ENOMEM;

	extent_op->flags_to_set = flags;
	extent_op->update_flags = true;
	extent_op->update_key = false;
	extent_op->is_data = is_data ? true : false;
	extent_op->level = level;

	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
	if (ret)
		btrfs_free_delayed_extent_op(extent_op);
	return ret;
}

static noinline int check_delayed_ref(struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid, u64 offset, u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_data_ref *data_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_transaction *cur_trans;
	struct rb_node *node;
	int ret = 0;

	spin_lock(&root->fs_info->trans_lock);
	cur_trans = root->fs_info->running_transaction;
	if (cur_trans)
		refcount_inc(&cur_trans->use_count);
	spin_unlock(&root->fs_info->trans_lock);
	if (!cur_trans)
		return 0;

	delayed_refs = &cur_trans->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (!head) {
		spin_unlock(&delayed_refs->lock);
		btrfs_put_transaction(cur_trans);
		return 0;
	}

	if (!mutex_trylock(&head->mutex)) {
		refcount_inc(&head->refs);
		spin_unlock(&delayed_refs->lock);

		btrfs_release_path(path);

		/*
		 * Mutex was contended, block until it's released and let
		 * caller try again
		 */
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref_head(head);
		btrfs_put_transaction(cur_trans);
		return -EAGAIN;
	}
	spin_unlock(&delayed_refs->lock);

	spin_lock(&head->lock);
	/*
	 * XXX: We should replace this with a proper search function in the
	 * future.
	 */
	for (node = rb_first_cached(&head->ref_tree); node;
	     node = rb_next(node)) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		/* If it's a shared ref we know a cross reference exists */
		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
			ret = 1;
			break;
		}

		data_ref = btrfs_delayed_node_to_data_ref(ref);

		/*
		 * If our ref doesn't match the one we're currently looking at
		 * then we have a cross reference.
		 */
		if (data_ref->root != root->root_key.objectid ||
		    data_ref->objectid != objectid ||
		    data_ref->offset != offset) {
			ret = 1;
			break;
		}
	}
	spin_unlock(&head->lock);
	mutex_unlock(&head->mutex);
	btrfs_put_transaction(cur_trans);
	return ret;
}

static noinline int check_committed_ref(struct btrfs_root *root,
					struct btrfs_path *path,
					u64 objectid, u64 offset, u64 bytenr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;
	u32 item_size;
	int type;
	int ret;

	key.objectid = bytenr;
	key.offset = (u64)-1;
	key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	BUG_ON(ret == 0); /* Corruption */

	ret = -ENOENT;
	if (path->slots[0] == 0)
		goto out;

	path->slots[0]--;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
		goto out;

	ret = 1;
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);

	/* If extent item has more than 1 inline ref then it's shared */
	if (item_size != sizeof(*ei) +
	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
		goto out;

	/* If extent created before last snapshot => it's definitely shared */
	if (btrfs_extent_generation(leaf, ei) <=
	    btrfs_root_last_snapshot(&root->root_item))
		goto out;

	iref = (struct btrfs_extent_inline_ref *)(ei + 1);

	/* If this extent has SHARED_DATA_REF then it's shared */
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
	if (type != BTRFS_EXTENT_DATA_REF_KEY)
		goto out;

	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
	if (btrfs_extent_refs(leaf, ei) !=
	    btrfs_extent_data_ref_count(leaf, ref) ||
	    btrfs_extent_data_ref_root(leaf, ref) !=
	    root->root_key.objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		goto out;

	ret = 0;
out:
	return ret;
}

int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
			  u64 bytenr)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	do {
		ret = check_committed_ref(root, path, objectid,
					  offset, bytenr);
		if (ret && ret != -ENOENT)
			goto out;

		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
	} while (ret == -EAGAIN);

out:
	btrfs_free_path(path);
	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
		WARN_ON(ret > 0);
	return ret;
}
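
/*
 * Usage sketch (added commentary; the caller shown is hypothetical): the
 * NOCOW write path uses this check to decide whether a file extent may be
 * overwritten in place.  A non-zero return means some other root, inode or
 * offset may still reference the extent, so the write must fall back to
 * copy-on-write:
 *
 *	ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), file_offset,
 *				    disk_bytenr);
 *	if (ret)
 *		// extent is (or may be) shared: COW instead of NOCOW
 */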

static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct extent_buffer *buf,
			   int full_backref, int inc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 num_bytes;
	u64 parent;
	u64 ref_root;
	u32 nritems;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref generic_ref = { 0 };
	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
	int i;
	int action;
	int level;
	int ret = 0;

	if (btrfs_is_testing(fs_info))
		return 0;

	ref_root = btrfs_header_owner(buf);
	nritems = btrfs_header_nritems(buf);
	level = btrfs_header_level(buf);

	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
		return 0;

	if (full_backref)
		parent = buf->start;
	else
		parent = 0;
	if (inc)
		action = BTRFS_ADD_DELAYED_REF;
	else
		action = BTRFS_DROP_DELAYED_REF;

	for (i = 0; i < nritems; i++) {
		if (level == 0) {
			btrfs_item_key_to_cpu(buf, &key, i);
			if (key.type != BTRFS_EXTENT_DATA_KEY)
				continue;
			fi = btrfs_item_ptr(buf, i,
					    struct btrfs_file_extent_item);
			if (btrfs_file_extent_type(buf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE)
				continue;
			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
			if (bytenr == 0)
				continue;

			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
			key.offset -= btrfs_file_extent_offset(buf, fi);
			btrfs_init_generic_ref(&generic_ref, action, bytenr,
					       num_bytes, parent);
			generic_ref.real_root = root->root_key.objectid;
			btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
					    key.offset);
			generic_ref.skip_qgroup = for_reloc;
			if (inc)
				ret = btrfs_inc_extent_ref(trans, &generic_ref);
			else
				ret = btrfs_free_extent(trans, &generic_ref);
			if (ret)
				goto fail;
		} else {
			bytenr = btrfs_node_blockptr(buf, i);
			num_bytes = fs_info->nodesize;
			btrfs_init_generic_ref(&generic_ref, action, bytenr,
					       num_bytes, parent);
			generic_ref.real_root = root->root_key.objectid;
			btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
			generic_ref.skip_qgroup = for_reloc;
			if (inc)
				ret = btrfs_inc_extent_ref(trans, &generic_ref);
			else
				ret = btrfs_free_extent(trans, &generic_ref);
			if (ret)
				goto fail;
		}
	}
	return 0;
fail:
	return ret;
}

int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
}

int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}

int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group *block_group;
	int readonly = 0;

	block_group = btrfs_lookup_block_group(fs_info, bytenr);
	if (!block_group || block_group->ro)
		readonly = 1;
	if (block_group)
		btrfs_put_block_group(block_group);
	return readonly;
}

static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 flags;
	u64 ret;

	if (data)
		flags = BTRFS_BLOCK_GROUP_DATA;
	else if (root == fs_info->chunk_root)
		flags = BTRFS_BLOCK_GROUP_SYSTEM;
	else
		flags = BTRFS_BLOCK_GROUP_METADATA;

	ret = btrfs_get_alloc_profile(fs_info, flags);
	return ret;
}

static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
{
	struct btrfs_block_group *cache;
	u64 bytenr;

	spin_lock(&fs_info->block_group_cache_lock);
	bytenr = fs_info->first_logical_byte;
	spin_unlock(&fs_info->block_group_cache_lock);

	if (bytenr < (u64)-1)
		return bytenr;

	cache = btrfs_lookup_first_block_group(fs_info, search_start);
	if (!cache)
		return 0;

	bytenr = cache->start;
	btrfs_put_block_group(cache);

	return bytenr;
}

static int pin_down_extent(struct btrfs_block_group *cache,
			   u64 bytenr, u64 num_bytes, int reserved)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;

	spin_lock(&cache->space_info->lock);
	spin_lock(&cache->lock);
	cache->pinned += num_bytes;
	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
					     num_bytes);
	if (reserved) {
		cache->reserved -= num_bytes;
		cache->space_info->bytes_reserved -= num_bytes;
	}
	spin_unlock(&cache->lock);
	spin_unlock(&cache->space_info->lock);

	percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
				 num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
	set_extent_dirty(fs_info->pinned_extents, bytenr,
			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
	return 0;
}

int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
		     u64 bytenr, u64 num_bytes, int reserved)
{
	struct btrfs_block_group *cache;

	ASSERT(fs_info->running_transaction);

	cache = btrfs_lookup_block_group(fs_info, bytenr);
	BUG_ON(!cache); /* Logic error */

	pin_down_extent(cache, bytenr, num_bytes, reserved);

	btrfs_put_block_group(cache);
	return 0;
}

/*
 * this function must be called within a transaction
 */
int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
				    u64 bytenr, u64 num_bytes)
{
	struct btrfs_block_group *cache;
	int ret;

	cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!cache)
		return -EINVAL;

	/*
	 * pull in the free space cache (if any) so that our pin
	 * removes the free space from the cache.  We have load_only set
	 * to one because the slow code to read in the free extents does check
	 * the pinned extents.
	 */
	btrfs_cache_block_group(cache, 1);

	pin_down_extent(cache, bytenr, num_bytes, 0);

	/* remove us from the free space cache (if we're there at all) */
	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
	btrfs_put_block_group(cache);
	return ret;
}

static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
				   u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_block_group *block_group;
	struct btrfs_caching_control *caching_ctl;

	block_group = btrfs_lookup_block_group(fs_info, start);
	if (!block_group)
		return -EINVAL;

	btrfs_cache_block_group(block_group, 0);
	caching_ctl = btrfs_get_caching_control(block_group);

	if (!caching_ctl) {
		/* Logic error */
		BUG_ON(!btrfs_block_group_done(block_group));
		ret = btrfs_remove_free_space(block_group, start, num_bytes);
	} else {
		mutex_lock(&caching_ctl->mutex);

		if (start >= caching_ctl->progress) {
			ret = btrfs_add_excluded_extent(fs_info, start,
							num_bytes);
		} else if (start + num_bytes <= caching_ctl->progress) {
			ret = btrfs_remove_free_space(block_group,
						      start, num_bytes);
		} else {
			num_bytes = caching_ctl->progress - start;
			ret = btrfs_remove_free_space(block_group,
						      start, num_bytes);
			if (ret)
				goto out_lock;

			num_bytes = (start + num_bytes) -
				caching_ctl->progress;
			start = caching_ctl->progress;
			ret = btrfs_add_excluded_extent(fs_info, start,
							num_bytes);
		}
out_lock:
		mutex_unlock(&caching_ctl->mutex);
		btrfs_put_caching_control(caching_ctl);
	}
	btrfs_put_block_group(block_group);
	return ret;
}

int btrfs_exclude_logged_extents(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct btrfs_file_extent_item *item;
	struct btrfs_key key;
	int found_type;
	int i;
	int ret = 0;

	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
		return 0;

	for (i = 0; i < btrfs_header_nritems(eb); i++) {
		btrfs_item_key_to_cpu(eb, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		found_type = btrfs_file_extent_type(eb, item);
		if (found_type == BTRFS_FILE_EXTENT_INLINE)
			continue;
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			continue;
		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
		if (ret)
			break;
	}

	return ret;
}

static void
btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
{
	atomic_inc(&bg->reservations);
}

void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
{
	struct btrfs_caching_control *next;
	struct btrfs_caching_control *caching_ctl;
	struct btrfs_block_group *cache;

	down_write(&fs_info->commit_root_sem);

	list_for_each_entry_safe(caching_ctl, next,
				 &fs_info->caching_block_groups, list) {
		cache = caching_ctl->block_group;
		if (btrfs_block_group_done(cache)) {
			cache->last_byte_to_unpin = (u64)-1;
			list_del_init(&caching_ctl->list);
			btrfs_put_caching_control(caching_ctl);
		} else {
			cache->last_byte_to_unpin = caching_ctl->progress;
		}
	}

	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
		fs_info->pinned_extents = &fs_info->freed_extents[1];
	else
		fs_info->pinned_extents = &fs_info->freed_extents[0];

	up_write(&fs_info->commit_root_sem);

	btrfs_update_global_block_rsv(fs_info);
}
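
/*
 * Added note: fs_info->freed_extents[] acts as a double buffer.  The tree
 * that fs_info->pinned_extents points at collects pins for the running
 * transaction; swapping the pointer here lets btrfs_finish_extent_commit()
 * drain the other tree, which holds the pins accumulated by the
 * transaction that is committing, without racing against new pins.
 */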

/*
 * Returns the free cluster for the given space info and sets empty_cluster to
 * what it should be based on the mount options.
 */
static struct btrfs_free_cluster *
fetch_cluster_info(struct btrfs_fs_info *fs_info,
		   struct btrfs_space_info *space_info, u64 *empty_cluster)
{
	struct btrfs_free_cluster *ret = NULL;

	*empty_cluster = 0;
	if (btrfs_mixed_space_info(space_info))
		return ret;

	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		ret = &fs_info->meta_alloc_cluster;
		if (btrfs_test_opt(fs_info, SSD))
			*empty_cluster = SZ_2M;
		else
			*empty_cluster = SZ_64K;
	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
		*empty_cluster = SZ_2M;
		ret = &fs_info->data_alloc_cluster;
	}

	return ret;
}

static int unpin_extent_range(struct btrfs_fs_info *fs_info,
			      u64 start, u64 end,
			      const bool return_free_space)
{
	struct btrfs_block_group *cache = NULL;
	struct btrfs_space_info *space_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	struct btrfs_free_cluster *cluster = NULL;
	u64 len;
	u64 total_unpinned = 0;
	u64 empty_cluster = 0;
	bool readonly;

	while (start <= end) {
		readonly = false;
		if (!cache ||
		    start >= cache->start + cache->length) {
			if (cache)
				btrfs_put_block_group(cache);
			total_unpinned = 0;
			cache = btrfs_lookup_block_group(fs_info, start);
			BUG_ON(!cache); /* Logic error */

			cluster = fetch_cluster_info(fs_info,
						     cache->space_info,
						     &empty_cluster);
			empty_cluster <<= 1;
		}

		len = cache->start + cache->length - start;
		len = min(len, end + 1 - start);

		if (start < cache->last_byte_to_unpin) {
			len = min(len, cache->last_byte_to_unpin - start);
			if (return_free_space)
				btrfs_add_free_space(cache, start, len);
		}

		start += len;
		total_unpinned += len;
		space_info = cache->space_info;

		/*
		 * If this space cluster has been marked as fragmented and we've
		 * unpinned enough in this block group to potentially allow a
		 * cluster to be created inside of it go ahead and clear the
		 * fragmented check.
		 */
		if (cluster && cluster->fragmented &&
		    total_unpinned > empty_cluster) {
			spin_lock(&cluster->lock);
			cluster->fragmented = 0;
			spin_unlock(&cluster->lock);
		}

		spin_lock(&space_info->lock);
		spin_lock(&cache->lock);
		cache->pinned -= len;
		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
		space_info->max_extent_size = 0;
		percpu_counter_add_batch(&space_info->total_bytes_pinned,
					 -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
		if (cache->ro) {
			space_info->bytes_readonly += len;
			readonly = true;
		}
		spin_unlock(&cache->lock);
		if (!readonly && return_free_space &&
		    global_rsv->space_info == space_info) {
			u64 to_add = len;

			spin_lock(&global_rsv->lock);
			if (!global_rsv->full) {
				to_add = min(len, global_rsv->size -
					     global_rsv->reserved);
				global_rsv->reserved += to_add;
				btrfs_space_info_update_bytes_may_use(fs_info,
						space_info, to_add);
				if (global_rsv->reserved >= global_rsv->size)
					global_rsv->full = 1;
				len -= to_add;
			}
			spin_unlock(&global_rsv->lock);
			/* Add to any tickets we may have */
			if (len)
				btrfs_try_granting_tickets(fs_info,
							   space_info);
		}
		spin_unlock(&space_info->lock);
	}

	if (cache)
		btrfs_put_block_group(cache);
	return 0;
}

int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group *block_group, *tmp;
	struct list_head *deleted_bgs;
	struct extent_io_tree *unpin;
	u64 start;
	u64 end;
	int ret;

	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
		unpin = &fs_info->freed_extents[1];
	else
		unpin = &fs_info->freed_extents[0];

	while (!trans->aborted) {
		struct extent_state *cached_state = NULL;

		mutex_lock(&fs_info->unused_bg_unpin_mutex);
		ret = find_first_extent_bit(unpin, 0, &start, &end,
					    EXTENT_DIRTY, &cached_state);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			break;
		}

		if (btrfs_test_opt(fs_info, DISCARD))
			ret = btrfs_discard_extent(fs_info, start,
						   end + 1 - start, NULL);

		clear_extent_dirty(unpin, start, end, &cached_state);
		unpin_extent_range(fs_info, start, end, true);
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
		free_extent_state(cached_state);
		cond_resched();
	}

	/*
	 * Transaction is finished.  We don't need the lock anymore.  We
	 * do need to clean up the block groups in case of a transaction
	 * abort.
	 */
	deleted_bgs = &trans->transaction->deleted_bgs;
	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
		u64 trimmed = 0;

		ret = -EROFS;
		if (!trans->aborted)
			ret = btrfs_discard_extent(fs_info,
						   block_group->start,
						   block_group->length,
						   &trimmed);

		list_del_init(&block_group->bg_list);
		btrfs_put_block_group_trimming(block_group);
		btrfs_put_block_group(block_group);

		if (ret) {
			const char *errstr = btrfs_decode_error(ret);
			btrfs_warn(fs_info,
			   "discard failed while removing blockgroup: errno=%d %s",
				   ret, errstr);
		}
	}

	return 0;
}

static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_root *extent_root = info->extent_root;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	int ret;
	int is_data;
	int extent_slot = 0;
	int found_extent = 0;
	int num_to_del = 1;
	u32 item_size;
	u64 refs;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	int last_ref = 0;
	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	path->leave_spinning = 1;

	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
	BUG_ON(!is_data && refs_to_drop != 1);

	if (is_data)
		skinny_metadata = false;

	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
				    parent, root_objectid, owner_objectid,
				    owner_offset);
	if (ret == 0) {
		extent_slot = path->slots[0];
		while (extent_slot >= 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      extent_slot);
			if (key.objectid != bytenr)
				break;
			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes) {
				found_extent = 1;
				break;
			}
			if (key.type == BTRFS_METADATA_ITEM_KEY &&
			    key.offset == owner_objectid) {
				found_extent = 1;
				break;
			}
			if (path->slots[0] - extent_slot > 5)
				break;
			extent_slot--;
		}

		if (!found_extent) {
			BUG_ON(iref);
			ret = remove_extent_backref(trans, path, NULL,
						    refs_to_drop,
						    is_data, &last_ref);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			btrfs_release_path(path);
			path->leave_spinning = 1;

			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;

			if (!is_data && skinny_metadata) {
				key.type = BTRFS_METADATA_ITEM_KEY;
				key.offset = owner_objectid;
			}

			ret = btrfs_search_slot(trans, extent_root,
						&key, path, -1, 1);
			if (ret > 0 && skinny_metadata && path->slots[0]) {
				/*
				 * Couldn't find our skinny metadata item,
				 * see if we have ye olde extent item.
				 */
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == num_bytes)
					ret = 0;
			}

			if (ret > 0 && skinny_metadata) {
				skinny_metadata = false;
				key.objectid = bytenr;
				key.type = BTRFS_EXTENT_ITEM_KEY;
				key.offset = num_bytes;
				btrfs_release_path(path);
				ret = btrfs_search_slot(trans, extent_root,
							&key, path, -1, 1);
			}

			if (ret) {
				btrfs_err(info,
					  "umm, got %d back from search, was looking for %llu",
					  ret, bytenr);
				if (ret > 0)
					btrfs_print_leaf(path->nodes[0]);
			}
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			extent_slot = path->slots[0];
		}
	} else if (WARN_ON(ret == -ENOENT)) {
		btrfs_print_leaf(path->nodes[0]);
		btrfs_err(info,
			"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
			bytenr, parent, root_objectid, owner_objectid,
			owner_offset);
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, extent_slot);
	if (unlikely(item_size < sizeof(*ei))) {
		ret = -EINVAL;
		btrfs_print_v0_err(info);
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	ei = btrfs_item_ptr(leaf, extent_slot,
			    struct btrfs_extent_item);
	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
	    key.type == BTRFS_EXTENT_ITEM_KEY) {
		struct btrfs_tree_block_info *bi;
		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
	}

	refs = btrfs_extent_refs(leaf, ei);
	if (refs < refs_to_drop) {
		btrfs_err(info,
			  "trying to drop %d refs but we only have %Lu for bytenr %Lu",
			  refs_to_drop, refs, bytenr);
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	refs -= refs_to_drop;

	if (refs > 0) {
		if (extent_op)
			__run_delayed_extent_op(extent_op, leaf, ei);
		/*
		 * In the case of inline back ref, reference count will
		 * be updated by remove_extent_backref
		 */
		if (iref) {
			BUG_ON(!found_extent);
		} else {
			btrfs_set_extent_refs(leaf, ei, refs);
			btrfs_mark_buffer_dirty(leaf);
		}
		if (found_extent) {
			ret = remove_extent_backref(trans, path, iref,
						    refs_to_drop, is_data,
						    &last_ref);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		}
	} else {
		if (found_extent) {
			BUG_ON(is_data && refs_to_drop !=
			       extent_data_ref_count(path, iref));
			if (iref) {
				BUG_ON(path->slots[0] != extent_slot);
			} else {
				BUG_ON(path->slots[0] != extent_slot + 1);
				path->slots[0] = extent_slot;
				num_to_del = 2;
			}
		}

		last_ref = 1;
		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
				      num_to_del);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		btrfs_release_path(path);

		if (is_data) {
			ret = btrfs_del_csums(trans, info->csum_root, bytenr,
					      num_bytes);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		}

		ret = add_to_free_space_tree(trans, bytenr, num_bytes);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}
	btrfs_release_path(path);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * when we free a block, it is possible (and likely) that we free the last
 * delayed ref for that extent as well.  This searches the delayed ref tree for
 * a given extent, and if there are no other delayed refs to be processed, it
 * removes it from the tree.
 */
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
				      u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (!head)
		goto out_delayed_unlock;

	spin_lock(&head->lock);
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
		goto out;

	if (cleanup_extent_op(head) != NULL)
		goto out;

	/*
	 * waiting for the lock here would deadlock.  If someone else has it
	 * locked they are already in the process of dropping it anyway
	 */
	if (!mutex_trylock(&head->mutex))
		goto out;

	btrfs_delete_ref_head(delayed_refs, head);
	head->processing = 0;

	spin_unlock(&head->lock);
	spin_unlock(&delayed_refs->lock);

	BUG_ON(head->extent_op);
	if (head->must_insert_reserved)
		ret = 1;

	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
	mutex_unlock(&head->mutex);
	btrfs_put_delayed_ref_head(head);
	return ret;
out:
	spin_unlock(&head->lock);

out_delayed_unlock:
	spin_unlock(&delayed_refs->lock);
	return 0;
}
3259
f0486c68
YZ
3260void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3261 struct btrfs_root *root,
3262 struct extent_buffer *buf,
5581a51a 3263 u64 parent, int last_ref)
f0486c68 3264{
0b246afa 3265 struct btrfs_fs_info *fs_info = root->fs_info;
ed4f255b 3266 struct btrfs_ref generic_ref = { 0 };
b150a4f1 3267 int pin = 1;
f0486c68
YZ
3268 int ret;
3269
ed4f255b
QW
3270 btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
3271 buf->start, buf->len, parent);
3272 btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
3273 root->root_key.objectid);
3274
f0486c68 3275 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
d7eae340
OS
3276 int old_ref_mod, new_ref_mod;
3277
8a5040f7 3278 btrfs_ref_tree_mod(fs_info, &generic_ref);
ed4f255b 3279 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
d7eae340 3280 &old_ref_mod, &new_ref_mod);
79787eaa 3281 BUG_ON(ret); /* -ENOMEM */
d7eae340 3282 pin = old_ref_mod >= 0 && new_ref_mod < 0;
f0486c68
YZ
3283 }
3284
0a16c7d7 3285 if (last_ref && btrfs_header_generation(buf) == trans->transid) {
32da5386 3286 struct btrfs_block_group *cache;
6219872d 3287
f0486c68 3288 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
2ff7e61e 3289 ret = check_ref_cleanup(trans, buf->start);
f0486c68 3290 if (!ret)
37be25bc 3291 goto out;
f0486c68
YZ
3292 }
3293
4da8b76d 3294 pin = 0;
0b246afa 3295 cache = btrfs_lookup_block_group(fs_info, buf->start);
6219872d 3296
f0486c68 3297 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
fdf08605 3298 pin_down_extent(cache, buf->start, buf->len, 1);
6219872d 3299 btrfs_put_block_group(cache);
37be25bc 3300 goto out;
f0486c68
YZ
3301 }
3302
3303 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3304
3305 btrfs_add_free_space(cache, buf->start, buf->len);
4824f1f4 3306 btrfs_free_reserved_bytes(cache, buf->len, 0);
6219872d 3307 btrfs_put_block_group(cache);
71ff6437 3308 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
f0486c68
YZ
3309 }
3310out:
b150a4f1 3311 if (pin)
78192442 3312 add_pinned_bytes(fs_info, &generic_ref);
b150a4f1 3313
0a16c7d7
OS
3314 if (last_ref) {
3315 /*
3316 * Deleting the buffer, clear the corrupt flag since it doesn't
3317 * matter anymore.
3318 */
3319 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
3320 }
f0486c68
YZ
3321}
3322
79787eaa 3323/* Can return -ENOMEM */
ffd4bb2a 3324int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
925baedd 3325{
ffd4bb2a 3326 struct btrfs_fs_info *fs_info = trans->fs_info;
d7eae340 3327 int old_ref_mod, new_ref_mod;
925baedd
CM
3328 int ret;
3329
f5ee5c9a 3330 if (btrfs_is_testing(fs_info))
faa2dbf0 3331 return 0;
fccb84c9 3332
56bec294
CM
3333 /*
3334 * tree log blocks never actually go into the extent allocation
3335 * tree, just update pinning info and exit early.
56bec294 3336 */
ffd4bb2a
QW
3337 if ((ref->type == BTRFS_REF_METADATA &&
3338 ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
3339 (ref->type == BTRFS_REF_DATA &&
3340 ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
b9473439 3341 /* unlocks the pinned mutex */
ffd4bb2a 3342 btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
d7eae340 3343 old_ref_mod = new_ref_mod = 0;
56bec294 3344 ret = 0;
ffd4bb2a
QW
3345 } else if (ref->type == BTRFS_REF_METADATA) {
3346 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
d7eae340 3347 &old_ref_mod, &new_ref_mod);
5d4f98a2 3348 } else {
ffd4bb2a 3349 ret = btrfs_add_delayed_data_ref(trans, ref, 0,
d7eae340 3350 &old_ref_mod, &new_ref_mod);
56bec294 3351 }
d7eae340 3352
ffd4bb2a
QW
3353 if (!((ref->type == BTRFS_REF_METADATA &&
3354 ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
3355 (ref->type == BTRFS_REF_DATA &&
3356 ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
3357 btrfs_ref_tree_mod(fs_info, ref);
8a5040f7 3358
ddf30cf0 3359 if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
78192442 3360 add_pinned_bytes(fs_info, ref);
d7eae340 3361
925baedd
CM
3362 return ret;
3363}
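
/*
 * Example (illustrative sketch): a typical caller builds a btrfs_ref and
 * hands it to btrfs_free_extent(), the way do_walk_down() does further
 * below; bytenr, parent and level come from the caller's context:
 *
 *	struct btrfs_ref ref = { 0 };
 *
 *	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
 *			       fs_info->nodesize, parent);
 *	btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
 *	ret = btrfs_free_extent(trans, &ref);
 */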

enum btrfs_loop_type {
	LOOP_CACHING_NOWAIT,
	LOOP_CACHING_WAIT,
	LOOP_ALLOC_CHUNK,
	LOOP_NO_EMPTY_SIZE,
};

static inline void
btrfs_lock_block_group(struct btrfs_block_group *cache,
		       int delalloc)
{
	if (delalloc)
		down_read(&cache->data_rwsem);
}

static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
					  int delalloc)
{
	btrfs_get_block_group(cache);
	if (delalloc)
		down_read(&cache->data_rwsem);
}

static struct btrfs_block_group *btrfs_lock_cluster(
		   struct btrfs_block_group *block_group,
		   struct btrfs_free_cluster *cluster,
		   int delalloc)
{
	struct btrfs_block_group *used_bg = NULL;

	spin_lock(&cluster->refill_lock);
	while (1) {
		used_bg = cluster->block_group;
		if (!used_bg)
			return NULL;

		if (used_bg == block_group)
			return used_bg;

		btrfs_get_block_group(used_bg);

		if (!delalloc)
			return used_bg;

		if (down_read_trylock(&used_bg->data_rwsem))
			return used_bg;

		spin_unlock(&cluster->refill_lock);

		/* We should only have one-level nested. */
		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);

		spin_lock(&cluster->refill_lock);
		if (used_bg == cluster->block_group)
			return used_bg;

		up_read(&used_bg->data_rwsem);
		btrfs_put_block_group(used_bg);
	}
}

static inline void
btrfs_release_block_group(struct btrfs_block_group *cache,
			  int delalloc)
{
	if (delalloc)
		up_read(&cache->data_rwsem);
	btrfs_put_block_group(cache);
}

/*
 * Structure used internally for find_free_extent() function.  Wraps needed
 * parameters.
 */
struct find_free_extent_ctl {
	/* Basic allocation info */
	u64 ram_bytes;
	u64 num_bytes;
	u64 empty_size;
	u64 flags;
	int delalloc;

	/* Where to start the search inside the bg */
	u64 search_start;

	/* For clustered allocation */
	u64 empty_cluster;

	bool have_caching_bg;
	bool orig_have_caching_bg;

	/* RAID index, converted from flags */
	int index;

	/*
	 * Current loop number, check find_free_extent_update_loop() for details
	 */
	int loop;

	/*
	 * Whether we're refilling a cluster, if true we need to re-search
	 * current block group but don't try to refill the cluster again.
	 */
	bool retry_clustered;

	/*
	 * Whether we're updating free space cache, if true we need to re-search
	 * current block group but don't try updating free space cache again.
	 */
	bool retry_unclustered;

	/* If current block group is cached */
	int cached;

	/* Max contiguous hole found */
	u64 max_extent_size;

	/* Total free space from free space cache, not always contiguous */
	u64 total_free_space;

	/* Found result */
	u64 found_offset;
};
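
/*
 * Example (illustrative sketch): find_free_extent() below zeroes this
 * control structure on the stack and seeds the basic allocation
 * parameters before entering its search loop, roughly:
 *
 *	struct find_free_extent_ctl ffe_ctl = {0};
 *
 *	ffe_ctl.ram_bytes = ram_bytes;
 *	ffe_ctl.num_bytes = num_bytes;
 *	ffe_ctl.empty_size = empty_size;
 *	ffe_ctl.flags = flags;
 *	ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
 *
 * retry_clustered and retry_unclustered are reset whenever the search
 * moves on to another block group.
 */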

/*
 * Helper function for find_free_extent().
 *
 * Return -ENOENT to inform caller that we need fallback to unclustered mode.
 * Return -EAGAIN to inform caller that we need to re-search this block group.
 * Return >0 to inform caller that we found nothing.
 * Return 0 means we have found a location and set ffe_ctl->found_offset.
 */
static int find_free_extent_clustered(struct btrfs_block_group *bg,
		struct btrfs_free_cluster *last_ptr,
		struct find_free_extent_ctl *ffe_ctl,
		struct btrfs_block_group **cluster_bg_ret)
{
	struct btrfs_block_group *cluster_bg;
	u64 aligned_cluster;
	u64 offset;
	int ret;

	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
	if (!cluster_bg)
		goto refill_cluster;
	if (cluster_bg != bg && (cluster_bg->ro ||
	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
		goto release_cluster;

	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
			ffe_ctl->num_bytes, cluster_bg->start,
			&ffe_ctl->max_extent_size);
	if (offset) {
		/* We have a block, we're done */
		spin_unlock(&last_ptr->refill_lock);
		trace_btrfs_reserve_extent_cluster(cluster_bg,
				ffe_ctl->search_start, ffe_ctl->num_bytes);
		*cluster_bg_ret = cluster_bg;
		ffe_ctl->found_offset = offset;
		return 0;
	}
	WARN_ON(last_ptr->block_group != cluster_bg);

release_cluster:
	/*
	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
	 * let's just skip it and let the allocator find whatever block it can
	 * find.  If we reach this point, we will have tried the cluster
	 * allocator plenty of times and not have found anything, so we are
	 * likely way too fragmented for the clustering stuff to find anything.
	 *
	 * However, if the cluster is taken from the current block group,
	 * release the cluster first, so that we stand a better chance of
	 * succeeding in the unclustered allocation.
	 */
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
		spin_unlock(&last_ptr->refill_lock);
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
		return -ENOENT;
	}

	/* This cluster didn't work out, free it and start over */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);

	if (cluster_bg != bg)
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);

refill_cluster:
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
		spin_unlock(&last_ptr->refill_lock);
		return -ENOENT;
	}

	aligned_cluster = max_t(u64,
			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
			bg->full_stripe_len);
	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
			ffe_ctl->num_bytes, aligned_cluster);
	if (ret == 0) {
		/* Now pull our allocation out of this cluster */
		offset = btrfs_alloc_from_cluster(bg, last_ptr,
				ffe_ctl->num_bytes, ffe_ctl->search_start,
				&ffe_ctl->max_extent_size);
		if (offset) {
			/* We found one, proceed */
			spin_unlock(&last_ptr->refill_lock);
			trace_btrfs_reserve_extent_cluster(bg,
					ffe_ctl->search_start,
					ffe_ctl->num_bytes);
			ffe_ctl->found_offset = offset;
			return 0;
		}
	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
		   !ffe_ctl->retry_clustered) {
		spin_unlock(&last_ptr->refill_lock);

		ffe_ctl->retry_clustered = true;
		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
				ffe_ctl->empty_cluster + ffe_ctl->empty_size);
		return -EAGAIN;
	}
	/*
	 * At this point we either didn't find a cluster or we weren't able to
	 * allocate a block from our cluster.  Free the cluster we've been
	 * trying to use, and go to the next block group.
	 */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);
	spin_unlock(&last_ptr->refill_lock);
	return 1;
}
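
/*
 * Example (illustrative sketch): the caller dispatches on the return
 * convention documented above, the way find_free_extent() does:
 *
 *	ret = find_free_extent_clustered(block_group, last_ptr,
 *					 &ffe_ctl, &cluster_bg);
 *	if (ret == 0)
 *		goto checks;		   // found an allocation
 *	else if (ret == -EAGAIN)
 *		goto have_block_group;	   // re-search this block group
 *	else if (ret > 0)
 *		goto loop;		   // move to the next block group
 *	// ret == -ENOENT falls through to the unclustered allocator
 */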

/*
 * Return >0 to inform caller that we found nothing.
 * Return 0 when we found a free extent and set ffe_ctl->found_offset.
 * Return -EAGAIN to inform caller that we need to re-search this block group.
 */
static int find_free_extent_unclustered(struct btrfs_block_group *bg,
		struct btrfs_free_cluster *last_ptr,
		struct find_free_extent_ctl *ffe_ctl)
{
	u64 offset;

	/*
	 * We are doing an unclustered allocation, set the fragmented flag so
	 * we don't bother trying to set up a cluster again until we get more
	 * space.
	 */
	if (unlikely(last_ptr)) {
		spin_lock(&last_ptr->lock);
		last_ptr->fragmented = 1;
		spin_unlock(&last_ptr->lock);
	}
	if (ffe_ctl->cached) {
		struct btrfs_free_space_ctl *free_space_ctl;

		free_space_ctl = bg->free_space_ctl;
		spin_lock(&free_space_ctl->tree_lock);
		if (free_space_ctl->free_space <
		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
		    ffe_ctl->empty_size) {
			ffe_ctl->total_free_space = max_t(u64,
					ffe_ctl->total_free_space,
					free_space_ctl->free_space);
			spin_unlock(&free_space_ctl->tree_lock);
			return 1;
		}
		spin_unlock(&free_space_ctl->tree_lock);
	}

	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
			ffe_ctl->num_bytes, ffe_ctl->empty_size,
			&ffe_ctl->max_extent_size);

	/*
	 * If we didn't find a chunk, and we haven't failed on this block group
	 * before, and this block group is in the middle of caching and we are
	 * ok with waiting, then go ahead and wait for progress to be made, and
	 * set @retry_unclustered to true.
	 *
	 * If @retry_unclustered is true then we've already waited on this
	 * block group once and should move on to the next block group.
	 */
	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
						      ffe_ctl->empty_size);
		ffe_ctl->retry_unclustered = true;
		return -EAGAIN;
	} else if (!offset) {
		return 1;
	}
	ffe_ctl->found_offset = offset;
	return 0;
}

/*
 * Return >0 means caller needs to re-search for a free extent.
 * Return 0 means we have the needed free extent.
 * Return <0 means we failed to locate any free extent.
 */
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
					struct btrfs_free_cluster *last_ptr,
					struct btrfs_key *ins,
					struct find_free_extent_ctl *ffe_ctl,
					int full_search, bool use_cluster)
{
	struct btrfs_root *root = fs_info->extent_root;
	int ret;

	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
		ffe_ctl->orig_have_caching_bg = true;

	if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
	    ffe_ctl->have_caching_bg)
		return 1;

	if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
		return 1;

	if (ins->objectid) {
		if (!use_cluster && last_ptr) {
			spin_lock(&last_ptr->lock);
			last_ptr->window_start = ins->objectid;
			spin_unlock(&last_ptr->lock);
		}
		return 0;
	}

	/*
	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
	 *			caching kthreads as we move along
	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
	 *		       again
	 */
	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
		ffe_ctl->index = 0;
		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
			/*
			 * We want to skip the LOOP_CACHING_WAIT step if we
			 * don't have any uncached bgs and we've already done a
			 * full search through.
			 */
			if (ffe_ctl->orig_have_caching_bg || !full_search)
				ffe_ctl->loop = LOOP_CACHING_WAIT;
			else
				ffe_ctl->loop = LOOP_ALLOC_CHUNK;
		} else {
			ffe_ctl->loop++;
		}

		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
			struct btrfs_trans_handle *trans;
			int exist = 0;

			trans = current->journal_info;
			if (trans)
				exist = 1;
			else
				trans = btrfs_join_transaction(root);

			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				return ret;
			}

			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
						CHUNK_ALLOC_FORCE);

			/*
			 * If we can't allocate a new chunk we've already looped
			 * through at least once, move on to the NO_EMPTY_SIZE
			 * case.
			 */
			if (ret == -ENOSPC)
				ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;

			/* Do not bail out on ENOSPC since we can do more. */
			if (ret < 0 && ret != -ENOSPC)
				btrfs_abort_transaction(trans, ret);
			else
				ret = 0;
			if (!exist)
				btrfs_end_transaction(trans);
			if (ret)
				return ret;
		}

		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
			/*
			 * Don't loop again if we already have no empty_size and
			 * no empty_cluster.
			 */
			if (ffe_ctl->empty_size == 0 &&
			    ffe_ctl->empty_cluster == 0)
				return -ENOSPC;
			ffe_ctl->empty_size = 0;
			ffe_ctl->empty_cluster = 0;
		}
		return 1;
	}
	return -ENOSPC;
}
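
/*
 * Example (illustrative sketch): the return convention above drives the
 * outer retry in find_free_extent():
 *
 *	ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
 *					   full_search, use_cluster);
 *	if (ret > 0)
 *		goto search;	// loop stage or RAID index bumped, re-scan
 *
 * ret == 0 means ins has been filled in; ret < 0 is a hard failure such
 * as -ENOSPC.
 */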

/*
 * walks the btree of allocated extents and finds a hole of a given size.
 * The key ins is changed to record the hole:
 * ins->objectid == start position
 * ins->flags = BTRFS_EXTENT_ITEM_KEY
 * ins->offset == the size of the hole.
 * Any available blocks before search_start are skipped.
 *
 * If there is no suitable free space, we will record the max size of
 * the free space extent currently.
 *
 * The overall logic and call chain:
 *
 * find_free_extent()
 * |- Iterate through all block groups
 * |  |- Get a valid block group
 * |  |- Try to do clustered allocation in that block group
 * |  |- Try to do unclustered allocation in that block group
 * |  |- Check if the result is valid
 * |  |  |- If valid, then exit
 * |  |- Jump to next block group
 * |
 * |- Push harder to find free extents
 *    |- If not found, re-iterate all block groups
 */
static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
				u64 ram_bytes, u64 num_bytes, u64 empty_size,
				u64 hint_byte, struct btrfs_key *ins,
				u64 flags, int delalloc)
{
	int ret = 0;
	int cache_block_group_error = 0;
	struct btrfs_free_cluster *last_ptr = NULL;
	struct btrfs_block_group *block_group = NULL;
	struct find_free_extent_ctl ffe_ctl = {0};
	struct btrfs_space_info *space_info;
	bool use_cluster = true;
	bool full_search = false;

	WARN_ON(num_bytes < fs_info->sectorsize);

	ffe_ctl.ram_bytes = ram_bytes;
	ffe_ctl.num_bytes = num_bytes;
	ffe_ctl.empty_size = empty_size;
	ffe_ctl.flags = flags;
	ffe_ctl.search_start = 0;
	ffe_ctl.retry_clustered = false;
	ffe_ctl.retry_unclustered = false;
	ffe_ctl.delalloc = delalloc;
	ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
	ffe_ctl.have_caching_bg = false;
	ffe_ctl.orig_have_caching_bg = false;
	ffe_ctl.found_offset = 0;

	ins->type = BTRFS_EXTENT_ITEM_KEY;
	ins->objectid = 0;
	ins->offset = 0;

	trace_find_free_extent(fs_info, num_bytes, empty_size, flags);

	space_info = btrfs_find_space_info(fs_info, flags);
	if (!space_info) {
		btrfs_err(fs_info, "No space info for %llu", flags);
		return -ENOSPC;
	}

	/*
	 * If our free space is heavily fragmented we may not be able to make
	 * big contiguous allocations, so instead of doing the expensive search
	 * for free space, simply return ENOSPC with our max_extent_size so we
	 * can go ahead and search for a more manageable chunk.
	 *
	 * If our max_extent_size is large enough for our allocation simply
	 * disable clustering since we will likely not be able to find enough
	 * space to create a cluster and induce latency trying.
	 */
	if (unlikely(space_info->max_extent_size)) {
		spin_lock(&space_info->lock);
		if (space_info->max_extent_size &&
		    num_bytes > space_info->max_extent_size) {
			ins->offset = space_info->max_extent_size;
			spin_unlock(&space_info->lock);
			return -ENOSPC;
		} else if (space_info->max_extent_size) {
			use_cluster = false;
		}
		spin_unlock(&space_info->lock);
	}

	last_ptr = fetch_cluster_info(fs_info, space_info,
				      &ffe_ctl.empty_cluster);
	if (last_ptr) {
		spin_lock(&last_ptr->lock);
		if (last_ptr->block_group)
			hint_byte = last_ptr->window_start;
		if (last_ptr->fragmented) {
			/*
			 * We still set window_start so we can keep track of the
			 * last place we found an allocation to try and save
			 * some time.
			 */
			hint_byte = last_ptr->window_start;
			use_cluster = false;
		}
		spin_unlock(&last_ptr->lock);
	}

	ffe_ctl.search_start = max(ffe_ctl.search_start,
				   first_logical_byte(fs_info, 0));
	ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
	if (ffe_ctl.search_start == hint_byte) {
		block_group = btrfs_lookup_block_group(fs_info,
						       ffe_ctl.search_start);
		/*
		 * we don't want to use the block group if it doesn't match our
		 * allocation bits, or if it's not cached.
		 *
		 * However if we are re-searching with an ideal block group
		 * picked out then we don't care that the block group is cached.
		 */
		if (block_group && block_group_bits(block_group, flags) &&
		    block_group->cached != BTRFS_CACHE_NO) {
			down_read(&space_info->groups_sem);
			if (list_empty(&block_group->list) ||
			    block_group->ro) {
				/*
				 * someone is removing this block group,
				 * we can't jump into the have_block_group
				 * target because our list pointers are not
				 * valid
				 */
				btrfs_put_block_group(block_group);
				up_read(&space_info->groups_sem);
			} else {
				ffe_ctl.index = btrfs_bg_flags_to_raid_index(
						block_group->flags);
				btrfs_lock_block_group(block_group, delalloc);
				goto have_block_group;
			}
		} else if (block_group) {
			btrfs_put_block_group(block_group);
		}
	}
search:
	ffe_ctl.have_caching_bg = false;
	if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
	    ffe_ctl.index == 0)
		full_search = true;
	down_read(&space_info->groups_sem);
	list_for_each_entry(block_group,
			    &space_info->block_groups[ffe_ctl.index], list) {
		/* If the block group is read-only, we can skip it entirely. */
		if (unlikely(block_group->ro))
			continue;

		btrfs_grab_block_group(block_group, delalloc);
		ffe_ctl.search_start = block_group->start;

		/*
		 * this can happen if we end up cycling through all the
		 * raid types, but we want to make sure we only allocate
		 * for the proper type.
		 */
		if (!block_group_bits(block_group, flags)) {
			u64 extra = BTRFS_BLOCK_GROUP_DUP |
				BTRFS_BLOCK_GROUP_RAID1_MASK |
				BTRFS_BLOCK_GROUP_RAID56_MASK |
				BTRFS_BLOCK_GROUP_RAID10;

			/*
			 * if they asked for extra copies and this block group
			 * doesn't provide them, bail.  This does allow us to
			 * fill raid0 from raid1.
			 */
			if ((flags & extra) && !(block_group->flags & extra))
				goto loop;

			/*
			 * This block group has different flags than we want.
			 * It's possible that we have MIXED_GROUP flag but no
			 * block group is mixed.  Just skip such block group.
			 */
			btrfs_release_block_group(block_group, delalloc);
			continue;
		}

have_block_group:
		ffe_ctl.cached = btrfs_block_group_done(block_group);
		if (unlikely(!ffe_ctl.cached)) {
			ffe_ctl.have_caching_bg = true;
			ret = btrfs_cache_block_group(block_group, 0);

			/*
			 * If we get ENOMEM here or something else we want to
			 * try other block groups, because it may not be fatal.
			 * However if we can't find anything else we need to
			 * save our return here so that we return the actual
			 * error that caused problems, not ENOSPC.
			 */
			if (ret < 0) {
				if (!cache_block_group_error)
					cache_block_group_error = ret;
				ret = 0;
				goto loop;
			}
			ret = 0;
		}

		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
			goto loop;

		/*
		 * Ok we want to try and use the cluster allocator, so
		 * let's look there
		 */
		if (last_ptr && use_cluster) {
			struct btrfs_block_group *cluster_bg = NULL;

			ret = find_free_extent_clustered(block_group, last_ptr,
							 &ffe_ctl, &cluster_bg);

			if (ret == 0) {
				if (cluster_bg && cluster_bg != block_group) {
					btrfs_release_block_group(block_group,
								  delalloc);
					block_group = cluster_bg;
				}
				goto checks;
			} else if (ret == -EAGAIN) {
				goto have_block_group;
			} else if (ret > 0) {
				goto loop;
			}
			/* ret == -ENOENT case falls through */
		}

		ret = find_free_extent_unclustered(block_group, last_ptr,
						   &ffe_ctl);
		if (ret == -EAGAIN)
			goto have_block_group;
		else if (ret > 0)
			goto loop;
		/* ret == 0 case falls through */
checks:
		ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
						fs_info->stripesize);

		/* move on to the next group */
		if (ffe_ctl.search_start + num_bytes >
		    block_group->start + block_group->length) {
			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
					     num_bytes);
			goto loop;
		}

		if (ffe_ctl.found_offset < ffe_ctl.search_start)
			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
				ffe_ctl.search_start - ffe_ctl.found_offset);

		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
					       num_bytes, delalloc);
		if (ret == -EAGAIN) {
			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
					     num_bytes);
			goto loop;
		}
		btrfs_inc_block_group_reservations(block_group);

		/* we are all good, let's return */
		ins->objectid = ffe_ctl.search_start;
		ins->offset = num_bytes;

		trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
					   num_bytes);
		btrfs_release_block_group(block_group, delalloc);
		break;
loop:
		ffe_ctl.retry_clustered = false;
		ffe_ctl.retry_unclustered = false;
		BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
		       ffe_ctl.index);
		btrfs_release_block_group(block_group, delalloc);
		cond_resched();
	}
	up_read(&space_info->groups_sem);

	ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
					   full_search, use_cluster);
	if (ret > 0)
		goto search;

	if (ret == -ENOSPC && !cache_block_group_error) {
		/*
		 * Use ffe_ctl->total_free_space as fallback if we can't find
		 * any contiguous hole.
		 */
		if (!ffe_ctl.max_extent_size)
			ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
		spin_lock(&space_info->lock);
		space_info->max_extent_size = ffe_ctl.max_extent_size;
		spin_unlock(&space_info->lock);
		ins->offset = ffe_ctl.max_extent_size;
	} else if (ret == -ENOSPC) {
		ret = cache_block_group_error;
	}
	return ret;
}

/*
 * btrfs_reserve_extent - entry point to the extent allocator.  Tries to find
 * a hole that is at least as big as @num_bytes.
 *
 * @root           - The root that will contain this extent
 *
 * @ram_bytes      - The amount of space in ram that @num_bytes take.  This
 *                   is used for accounting purposes.  This value differs
 *                   from @num_bytes only in the case of compressed extents.
 *
 * @num_bytes      - Number of bytes to allocate on-disk.
 *
 * @min_alloc_size - Indicates the minimum amount of space that the
 *                   allocator should try to satisfy.  In some cases
 *                   @num_bytes may be larger than what is required and if
 *                   the filesystem is fragmented then allocation fails.
 *                   However, the presence of @min_alloc_size gives a
 *                   chance to try and satisfy the smaller allocation.
 *
 * @empty_size     - A hint that you plan on doing more COW.  This is the
 *                   size in bytes the allocator should try to find free
 *                   next to the block it returns.  This is just a hint and
 *                   may be ignored by the allocator.
 *
 * @hint_byte      - Hint to the allocator to start searching above the byte
 *                   address passed.  It might be ignored.
 *
 * @ins            - This key is modified to record the found hole.  It will
 *                   have the following values:
 *                   ins->objectid == start position
 *                   ins->flags = BTRFS_EXTENT_ITEM_KEY
 *                   ins->offset == the size of the hole.
 *
 * @is_data        - Boolean flag indicating whether an extent is
 *                   allocated for data (true) or metadata (false)
 *
 * @delalloc       - Boolean flag indicating whether this allocation is for
 *                   delalloc or not.  If 'true' data_rwsem of block groups
 *                   is going to be acquired.
 *
 *
 * Returns 0 when an allocation succeeded or < 0 when an error occurred.  In
 * case -ENOSPC is returned then @ins->offset will contain the size of the
 * largest available hole the allocator managed to find.
 */
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
			 u64 num_bytes, u64 min_alloc_size,
			 u64 empty_size, u64 hint_byte,
			 struct btrfs_key *ins, int is_data, int delalloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	bool final_tried = num_bytes == min_alloc_size;
	u64 flags;
	int ret;

	flags = get_alloc_profile_by_root(root, is_data);
again:
	WARN_ON(num_bytes < fs_info->sectorsize);
	ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
			       hint_byte, ins, flags, delalloc);
	if (!ret && !is_data) {
		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
	} else if (ret == -ENOSPC) {
		if (!final_tried && ins->offset) {
			num_bytes = min(num_bytes >> 1, ins->offset);
			num_bytes = round_down(num_bytes,
					       fs_info->sectorsize);
			num_bytes = max(num_bytes, min_alloc_size);
			ram_bytes = num_bytes;
			if (num_bytes == min_alloc_size)
				final_tried = true;
			goto again;
		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
			struct btrfs_space_info *sinfo;

			sinfo = btrfs_find_space_info(fs_info, flags);
			btrfs_err(fs_info,
				  "allocation failed flags %llu, wanted %llu",
				  flags, num_bytes);
			if (sinfo)
				btrfs_dump_space_info(fs_info, sinfo,
						      num_bytes, 1);
		}
	}

	return ret;
}
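
/*
 * Example (illustrative sketch): btrfs_alloc_tree_block() below reserves a
 * single node-sized extent with no smaller fallback, so the allocation
 * either succeeds at @blocksize or fails:
 *
 *	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
 *				   empty_size, hint, &ins, 0, 0);
 *
 * A data caller that can tolerate a shorter extent passes a smaller
 * @min_alloc_size and retries; on -ENOSPC, @ins->offset reports the
 * largest hole that was found.
 */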
4167
2ff7e61e 4168static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
e570fd27
MX
4169 u64 start, u64 len,
4170 int pin, int delalloc)
65b51a00 4171{
32da5386 4172 struct btrfs_block_group *cache;
1f3c79a2 4173 int ret = 0;
0f9dd46c 4174
0b246afa 4175 cache = btrfs_lookup_block_group(fs_info, start);
0f9dd46c 4176 if (!cache) {
0b246afa
JM
4177 btrfs_err(fs_info, "Unable to find block group for %llu",
4178 start);
0f9dd46c
JB
4179 return -ENOSPC;
4180 }
1f3c79a2 4181
7ef54d54 4182 ret = pin_down_extent(cache, start, len, 1);
fa9c0d79 4183 btrfs_put_block_group(cache);
e6dcd2dc
CM
4184 return ret;
4185}
4186
2ff7e61e 4187int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
e570fd27 4188 u64 start, u64 len, int delalloc)
e688b725 4189{
7ef54d54
NB
4190 struct btrfs_block_group *cache;
4191
4192 cache = btrfs_lookup_block_group(fs_info, start);
4193 if (!cache) {
4194 btrfs_err(fs_info, "unable to find block group for %llu", start);
4195 return -ENOSPC;
4196 }
4197
4198 btrfs_add_free_space(cache, start, len);
4199 btrfs_free_reserved_bytes(cache, len, delalloc);
4200 trace_btrfs_reserved_extent_free(fs_info, start, len);
4201
4202 btrfs_put_block_group(cache);
4203 return 0;
e688b725
CM
4204}
4205
2ff7e61e 4206int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
e688b725
CM
4207 u64 start, u64 len)
4208{
2ff7e61e 4209 return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
e688b725
CM
4210}
4211
5d4f98a2 4212static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
4213 u64 parent, u64 root_objectid,
4214 u64 flags, u64 owner, u64 offset,
4215 struct btrfs_key *ins, int ref_mod)
e6dcd2dc 4216{
ef89b824 4217 struct btrfs_fs_info *fs_info = trans->fs_info;
e6dcd2dc 4218 int ret;
e6dcd2dc 4219 struct btrfs_extent_item *extent_item;
5d4f98a2 4220 struct btrfs_extent_inline_ref *iref;
e6dcd2dc 4221 struct btrfs_path *path;
5d4f98a2
YZ
4222 struct extent_buffer *leaf;
4223 int type;
4224 u32 size;
26b8003f 4225
5d4f98a2
YZ
4226 if (parent > 0)
4227 type = BTRFS_SHARED_DATA_REF_KEY;
4228 else
4229 type = BTRFS_EXTENT_DATA_REF_KEY;
58176a96 4230
5d4f98a2 4231 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
7bb86316
CM
4232
4233 path = btrfs_alloc_path();
db5b493a
TI
4234 if (!path)
4235 return -ENOMEM;
47e4bb98 4236
b9473439 4237 path->leave_spinning = 1;
5d4f98a2
YZ
4238 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4239 ins, size);
79787eaa
JM
4240 if (ret) {
4241 btrfs_free_path(path);
4242 return ret;
4243 }
0f9dd46c 4244
5d4f98a2
YZ
4245 leaf = path->nodes[0];
4246 extent_item = btrfs_item_ptr(leaf, path->slots[0],
47e4bb98 4247 struct btrfs_extent_item);
5d4f98a2
YZ
4248 btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4249 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4250 btrfs_set_extent_flags(leaf, extent_item,
4251 flags | BTRFS_EXTENT_FLAG_DATA);
4252
4253 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4254 btrfs_set_extent_inline_ref_type(leaf, iref, type);
4255 if (parent > 0) {
4256 struct btrfs_shared_data_ref *ref;
4257 ref = (struct btrfs_shared_data_ref *)(iref + 1);
4258 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4259 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4260 } else {
4261 struct btrfs_extent_data_ref *ref;
4262 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4263 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4264 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4265 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4266 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4267 }
47e4bb98
CM
4268
4269 btrfs_mark_buffer_dirty(path->nodes[0]);
7bb86316 4270 btrfs_free_path(path);
f510cfec 4271
25a356d3 4272 ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
1e144fb8
OS
4273 if (ret)
4274 return ret;
4275
ade4b516 4276 ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
79787eaa 4277 if (ret) { /* -ENOENT, logic error */
c2cf52eb 4278 btrfs_err(fs_info, "update block group failed for %llu %llu",
c1c9ff7c 4279 ins->objectid, ins->offset);
f5947066
CM
4280 BUG();
4281 }
71ff6437 4282 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
e6dcd2dc
CM
4283 return ret;
4284}
4285
5d4f98a2 4286static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4e6bd4e0 4287 struct btrfs_delayed_ref_node *node,
21ebfbe7 4288 struct btrfs_delayed_extent_op *extent_op)
e6dcd2dc 4289{
9dcdbe01 4290 struct btrfs_fs_info *fs_info = trans->fs_info;
e6dcd2dc 4291 int ret;
5d4f98a2 4292 struct btrfs_extent_item *extent_item;
4e6bd4e0 4293 struct btrfs_key extent_key;
5d4f98a2
YZ
4294 struct btrfs_tree_block_info *block_info;
4295 struct btrfs_extent_inline_ref *iref;
4296 struct btrfs_path *path;
4297 struct extent_buffer *leaf;
4e6bd4e0 4298 struct btrfs_delayed_tree_ref *ref;
3173a18f 4299 u32 size = sizeof(*extent_item) + sizeof(*iref);
4e6bd4e0 4300 u64 num_bytes;
21ebfbe7 4301 u64 flags = extent_op->flags_to_set;
0b246afa 4302 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3173a18f 4303
4e6bd4e0
NB
4304 ref = btrfs_delayed_node_to_tree_ref(node);
4305
4e6bd4e0
NB
4306 extent_key.objectid = node->bytenr;
4307 if (skinny_metadata) {
4308 extent_key.offset = ref->level;
4309 extent_key.type = BTRFS_METADATA_ITEM_KEY;
4310 num_bytes = fs_info->nodesize;
4311 } else {
4312 extent_key.offset = node->num_bytes;
4313 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
3173a18f 4314 size += sizeof(*block_info);
4e6bd4e0
NB
4315 num_bytes = node->num_bytes;
4316 }
1c2308f8 4317
5d4f98a2 4318 path = btrfs_alloc_path();
80ee54bf 4319 if (!path)
d8926bb3 4320 return -ENOMEM;
56bec294 4321
5d4f98a2
YZ
4322 path->leave_spinning = 1;
4323 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4e6bd4e0 4324 &extent_key, size);
79787eaa 4325 if (ret) {
dd825259 4326 btrfs_free_path(path);
79787eaa
JM
4327 return ret;
4328 }
5d4f98a2
YZ
4329
4330 leaf = path->nodes[0];
4331 extent_item = btrfs_item_ptr(leaf, path->slots[0],
4332 struct btrfs_extent_item);
4333 btrfs_set_extent_refs(leaf, extent_item, 1);
4334 btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4335 btrfs_set_extent_flags(leaf, extent_item,
4336 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
5d4f98a2 4337
3173a18f
JB
4338 if (skinny_metadata) {
4339 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4340 } else {
4341 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
21ebfbe7 4342 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4e6bd4e0 4343 btrfs_set_tree_block_level(leaf, block_info, ref->level);
3173a18f
JB
4344 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4345 }
5d4f98a2 4346
d4b20733 4347 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
5d4f98a2
YZ
4348 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
4349 btrfs_set_extent_inline_ref_type(leaf, iref,
4350 BTRFS_SHARED_BLOCK_REF_KEY);
d4b20733 4351 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
5d4f98a2
YZ
4352 } else {
4353 btrfs_set_extent_inline_ref_type(leaf, iref,
4354 BTRFS_TREE_BLOCK_REF_KEY);
4e6bd4e0 4355 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
5d4f98a2
YZ
4356 }
4357
4358 btrfs_mark_buffer_dirty(leaf);
4359 btrfs_free_path(path);
4360
4e6bd4e0
NB
4361 ret = remove_from_free_space_tree(trans, extent_key.objectid,
4362 num_bytes);
1e144fb8
OS
4363 if (ret)
4364 return ret;
4365
ade4b516
JB
4366 ret = btrfs_update_block_group(trans, extent_key.objectid,
4367 fs_info->nodesize, 1);
79787eaa 4368 if (ret) { /* -ENOENT, logic error */
c2cf52eb 4369 btrfs_err(fs_info, "update block group failed for %llu %llu",
4e6bd4e0 4370 extent_key.objectid, extent_key.offset);
5d4f98a2
YZ
4371 BUG();
4372 }
0be5dc67 4373
4e6bd4e0 4374 trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
0b246afa 4375 fs_info->nodesize);
5d4f98a2
YZ
4376 return ret;
4377}
4378
4379int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
84f7d8e6 4380 struct btrfs_root *root, u64 owner,
5846a3c2
QW
4381 u64 offset, u64 ram_bytes,
4382 struct btrfs_key *ins)
5d4f98a2 4383{
76675593 4384 struct btrfs_ref generic_ref = { 0 };
5d4f98a2
YZ
4385 int ret;
4386
84f7d8e6 4387 BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
5d4f98a2 4388
76675593
QW
4389 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4390 ins->objectid, ins->offset, 0);
4391 btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
8a5040f7 4392 btrfs_ref_tree_mod(root->fs_info, &generic_ref);
76675593
QW
4393 ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
4394 ram_bytes, NULL, NULL);
e6dcd2dc
CM
4395 return ret;
4396}
e02119d5
CM
4397
4398/*
4399 * this is used by the tree logging recovery code. It records that
4400 * an extent has been allocated and makes sure to clear the free
4401 * space cache bits as well
4402 */
5d4f98a2 4403int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5d4f98a2
YZ
4404 u64 root_objectid, u64 owner, u64 offset,
4405 struct btrfs_key *ins)
e02119d5 4406{
61da2abf 4407 struct btrfs_fs_info *fs_info = trans->fs_info;
e02119d5 4408 int ret;
32da5386 4409 struct btrfs_block_group *block_group;
ed7a6948 4410 struct btrfs_space_info *space_info;
11833d66 4411
8c2a1a30
JB
4412 /*
4413 * Mixed block groups will exclude before processing the log so we only
01327610 4414 * need to do the exclude dance if this fs isn't mixed.
8c2a1a30 4415 */
0b246afa 4416 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
2ff7e61e
JM
4417 ret = __exclude_logged_extent(fs_info, ins->objectid,
4418 ins->offset);
b50c6e25 4419 if (ret)
8c2a1a30 4420 return ret;
11833d66
YZ
4421 }
4422
0b246afa 4423 block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8c2a1a30
JB
4424 if (!block_group)
4425 return -EINVAL;
4426
ed7a6948
WX
4427 space_info = block_group->space_info;
4428 spin_lock(&space_info->lock);
4429 spin_lock(&block_group->lock);
4430 space_info->bytes_reserved += ins->offset;
4431 block_group->reserved += ins->offset;
4432 spin_unlock(&block_group->lock);
4433 spin_unlock(&space_info->lock);
4434
ef89b824
NB
4435 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
4436 offset, ins, 1);
b50c6e25 4437 btrfs_put_block_group(block_group);
e02119d5
CM
4438 return ret;
4439}
4440
48a3b636
ES
4441static struct extent_buffer *
4442btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
bc877d28 4443 u64 bytenr, int level, u64 owner)
65b51a00 4444{
0b246afa 4445 struct btrfs_fs_info *fs_info = root->fs_info;
65b51a00
CM
4446 struct extent_buffer *buf;
4447
2ff7e61e 4448 buf = btrfs_find_create_tree_block(fs_info, bytenr);
c871b0f2
LB
4449 if (IS_ERR(buf))
4450 return buf;
4451
b72c3aba
QW
4452 /*
4453 * Extra safety check in case the extent tree is corrupted and extent
4454 * allocator chooses to use a tree block which is already used and
4455 * locked.
4456 */
4457 if (buf->lock_owner == current->pid) {
4458 btrfs_err_rl(fs_info,
4459"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
4460 buf->start, btrfs_header_owner(buf), current->pid);
4461 free_extent_buffer(buf);
4462 return ERR_PTR(-EUCLEAN);
4463 }
4464
85d4e461 4465 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
65b51a00 4466 btrfs_tree_lock(buf);
6a884d7d 4467 btrfs_clean_tree_block(buf);
3083ee2e 4468 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
b4ce94de 4469
8bead258 4470 btrfs_set_lock_blocking_write(buf);
4db8c528 4471 set_extent_buffer_uptodate(buf);
b4ce94de 4472
bc877d28
NB
4473 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
4474 btrfs_set_header_level(buf, level);
4475 btrfs_set_header_bytenr(buf, buf->start);
4476 btrfs_set_header_generation(buf, trans->transid);
4477 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
4478 btrfs_set_header_owner(buf, owner);
de37aa51 4479 write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
bc877d28 4480 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
d0c803c4 4481 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
656f30db 4482 buf->log_index = root->log_transid % 2;
8cef4e16
YZ
4483 /*
4484 * we allow two log transactions at a time, use different
52042d8e 4485 * EXTENT bit to differentiate dirty pages.
8cef4e16 4486 */
656f30db 4487 if (buf->log_index == 0)
8cef4e16
YZ
4488 set_extent_dirty(&root->dirty_log_pages, buf->start,
4489 buf->start + buf->len - 1, GFP_NOFS);
4490 else
4491 set_extent_new(&root->dirty_log_pages, buf->start,
3744dbeb 4492 buf->start + buf->len - 1);
d0c803c4 4493 } else {
656f30db 4494 buf->log_index = -1;
d0c803c4 4495 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
65b51a00 4496 buf->start + buf->len - 1, GFP_NOFS);
d0c803c4 4497 }
64c12921 4498 trans->dirty = true;
b4ce94de 4499 /* this returns a buffer locked for blocking */
65b51a00
CM
4500 return buf;
4501}
4502
fec577fb 4503/*
f0486c68 4504 * finds a free extent and does all the dirty work required for allocation
67b7859e 4505 * returns the tree buffer or an ERR_PTR on error.
fec577fb 4506 */
4d75f8a9 4507struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
310712b2
OS
4508 struct btrfs_root *root,
4509 u64 parent, u64 root_objectid,
4510 const struct btrfs_disk_key *key,
4511 int level, u64 hint,
4512 u64 empty_size)
fec577fb 4513{
0b246afa 4514 struct btrfs_fs_info *fs_info = root->fs_info;
e2fa7227 4515 struct btrfs_key ins;
f0486c68 4516 struct btrfs_block_rsv *block_rsv;
5f39d397 4517 struct extent_buffer *buf;
67b7859e 4518 struct btrfs_delayed_extent_op *extent_op;
ed4f255b 4519 struct btrfs_ref generic_ref = { 0 };
f0486c68
YZ
4520 u64 flags = 0;
4521 int ret;
0b246afa
JM
4522 u32 blocksize = fs_info->nodesize;
4523 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
fec577fb 4524
05653ef3 4525#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
0b246afa 4526 if (btrfs_is_testing(fs_info)) {
faa2dbf0 4527 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
bc877d28 4528 level, root_objectid);
faa2dbf0
JB
4529 if (!IS_ERR(buf))
4530 root->alloc_bytenr += blocksize;
4531 return buf;
4532 }
05653ef3 4533#endif
fccb84c9 4534
67f9c220 4535 block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
f0486c68
YZ
4536 if (IS_ERR(block_rsv))
4537 return ERR_CAST(block_rsv);
4538
18513091 4539 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
e570fd27 4540 empty_size, hint, &ins, 0, 0);
67b7859e
OS
4541 if (ret)
4542 goto out_unuse;
55c69072 4543
bc877d28
NB
4544 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
4545 root_objectid);
67b7859e
OS
4546 if (IS_ERR(buf)) {
4547 ret = PTR_ERR(buf);
4548 goto out_free_reserved;
4549 }
f0486c68
YZ
4550
4551 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4552 if (parent == 0)
4553 parent = ins.objectid;
4554 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4555 } else
4556 BUG_ON(parent > 0);
4557
4558 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
78a6184a 4559 extent_op = btrfs_alloc_delayed_extent_op();
67b7859e
OS
4560 if (!extent_op) {
4561 ret = -ENOMEM;
4562 goto out_free_buf;
4563 }
f0486c68
YZ
4564 if (key)
4565 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4566 else
4567 memset(&extent_op->key, 0, sizeof(extent_op->key));
4568 extent_op->flags_to_set = flags;
35b3ad50
DS
4569 extent_op->update_key = skinny_metadata ? false : true;
4570 extent_op->update_flags = true;
4571 extent_op->is_data = false;
b1c79e09 4572 extent_op->level = level;
f0486c68 4573
ed4f255b
QW
4574 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4575 ins.objectid, ins.offset, parent);
4576 generic_ref.real_root = root->root_key.objectid;
4577 btrfs_init_tree_ref(&generic_ref, level, root_objectid);
8a5040f7 4578 btrfs_ref_tree_mod(fs_info, &generic_ref);
ed4f255b 4579 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
7be07912 4580 extent_op, NULL, NULL);
67b7859e
OS
4581 if (ret)
4582 goto out_free_delayed;
f0486c68 4583 }
fec577fb 4584 return buf;
67b7859e
OS
4585
4586out_free_delayed:
4587 btrfs_free_delayed_extent_op(extent_op);
4588out_free_buf:
4589 free_extent_buffer(buf);
4590out_free_reserved:
2ff7e61e 4591 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
67b7859e 4592out_unuse:
67f9c220 4593 btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
67b7859e 4594 return ERR_PTR(ret);
fec577fb 4595}
a28ec197 4596
2c47e605
YZ
4597struct walk_control {
4598 u64 refs[BTRFS_MAX_LEVEL];
4599 u64 flags[BTRFS_MAX_LEVEL];
4600 struct btrfs_key update_progress;
aea6f028
JB
4601 struct btrfs_key drop_progress;
4602 int drop_level;
2c47e605
YZ
4603 int stage;
4604 int level;
4605 int shared_level;
4606 int update_ref;
4607 int keep_locks;
1c4850e2
YZ
4608 int reada_slot;
4609 int reada_count;
78c52d9e 4610 int restarted;
2c47e605
YZ
4611};
4612
4613#define DROP_REFERENCE 1
4614#define UPDATE_BACKREF 2
4615
1c4850e2
YZ
4616static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4617 struct btrfs_root *root,
4618 struct walk_control *wc,
4619 struct btrfs_path *path)
6407bf6d 4620{
0b246afa 4621 struct btrfs_fs_info *fs_info = root->fs_info;
1c4850e2
YZ
4622 u64 bytenr;
4623 u64 generation;
4624 u64 refs;
94fcca9f 4625 u64 flags;
5d4f98a2 4626 u32 nritems;
1c4850e2
YZ
4627 struct btrfs_key key;
4628 struct extent_buffer *eb;
6407bf6d 4629 int ret;
1c4850e2
YZ
4630 int slot;
4631 int nread = 0;
6407bf6d 4632
1c4850e2
YZ
4633 if (path->slots[wc->level] < wc->reada_slot) {
4634 wc->reada_count = wc->reada_count * 2 / 3;
4635 wc->reada_count = max(wc->reada_count, 2);
4636 } else {
4637 wc->reada_count = wc->reada_count * 3 / 2;
4638 wc->reada_count = min_t(int, wc->reada_count,
0b246afa 4639 BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1c4850e2 4640 }
7bb86316 4641
1c4850e2
YZ
4642 eb = path->nodes[wc->level];
4643 nritems = btrfs_header_nritems(eb);
bd56b302 4644
1c4850e2
YZ
4645 for (slot = path->slots[wc->level]; slot < nritems; slot++) {
4646 if (nread >= wc->reada_count)
4647 break;
bd56b302 4648
2dd3e67b 4649 cond_resched();
1c4850e2
YZ
4650 bytenr = btrfs_node_blockptr(eb, slot);
4651 generation = btrfs_node_ptr_generation(eb, slot);
2dd3e67b 4652
1c4850e2
YZ
4653 if (slot == path->slots[wc->level])
4654 goto reada;
5d4f98a2 4655
1c4850e2
YZ
4656 if (wc->stage == UPDATE_BACKREF &&
4657 generation <= root->root_key.offset)
bd56b302
CM
4658 continue;
4659
94fcca9f 4660 /* We don't lock the tree block, it's OK to be racy here */
2ff7e61e 4661 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
3173a18f
JB
4662 wc->level - 1, 1, &refs,
4663 &flags);
79787eaa
JM
4664 /* We don't care about errors in readahead. */
4665 if (ret < 0)
4666 continue;
94fcca9f
YZ
4667 BUG_ON(refs == 0);
4668
1c4850e2 4669 if (wc->stage == DROP_REFERENCE) {
1c4850e2
YZ
4670 if (refs == 1)
4671 goto reada;
bd56b302 4672
94fcca9f
YZ
4673 if (wc->level == 1 &&
4674 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4675 continue;
1c4850e2
YZ
4676 if (!wc->update_ref ||
4677 generation <= root->root_key.offset)
4678 continue;
4679 btrfs_node_key_to_cpu(eb, &key, slot);
4680 ret = btrfs_comp_cpu_keys(&key,
4681 &wc->update_progress);
4682 if (ret < 0)
4683 continue;
94fcca9f
YZ
4684 } else {
4685 if (wc->level == 1 &&
4686 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4687 continue;
6407bf6d 4688 }
1c4850e2 4689reada:
2ff7e61e 4690 readahead_tree_block(fs_info, bytenr);
1c4850e2 4691 nread++;
20524f02 4692 }
1c4850e2 4693 wc->reada_slot = slot;
20524f02 4694}
2c47e605 4695
f82d02d9 4696/*
2c016dc2 4697 * helper to process tree block while walking down the tree.
2c47e605 4698 *
2c47e605
YZ
4699 * when wc->stage == UPDATE_BACKREF, this function updates
4700 * back refs for pointers in the block.
4701 *
4702 * NOTE: return value 1 means we should stop walking down.
f82d02d9 4703 */
2c47e605 4704static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5d4f98a2 4705 struct btrfs_root *root,
2c47e605 4706 struct btrfs_path *path,
94fcca9f 4707 struct walk_control *wc, int lookup_info)
f82d02d9 4708{
2ff7e61e 4709 struct btrfs_fs_info *fs_info = root->fs_info;
2c47e605
YZ
4710 int level = wc->level;
4711 struct extent_buffer *eb = path->nodes[level];
2c47e605 4712 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
f82d02d9
YZ
4713 int ret;
4714
2c47e605
YZ
4715 if (wc->stage == UPDATE_BACKREF &&
4716 btrfs_header_owner(eb) != root->root_key.objectid)
4717 return 1;
f82d02d9 4718
2c47e605
YZ
4719 /*
4720 * when reference count of tree block is 1, it won't increase
4721 * again. once full backref flag is set, we never clear it.
4722 */
94fcca9f
YZ
4723 if (lookup_info &&
4724 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
4725 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
2c47e605 4726 BUG_ON(!path->locks[level]);
2ff7e61e 4727 ret = btrfs_lookup_extent_info(trans, fs_info,
3173a18f 4728 eb->start, level, 1,
2c47e605
YZ
4729 &wc->refs[level],
4730 &wc->flags[level]);
79787eaa
JM
4731 BUG_ON(ret == -ENOMEM);
4732 if (ret)
4733 return ret;
2c47e605
YZ
4734 BUG_ON(wc->refs[level] == 0);
4735 }
5d4f98a2 4736
2c47e605
YZ
4737 if (wc->stage == DROP_REFERENCE) {
4738 if (wc->refs[level] > 1)
4739 return 1;
f82d02d9 4740
2c47e605 4741 if (path->locks[level] && !wc->keep_locks) {
bd681513 4742 btrfs_tree_unlock_rw(eb, path->locks[level]);
2c47e605
YZ
4743 path->locks[level] = 0;
4744 }
4745 return 0;
4746 }
f82d02d9 4747
2c47e605
YZ
4748 /* wc->stage == UPDATE_BACKREF */
4749 if (!(wc->flags[level] & flag)) {
4750 BUG_ON(!path->locks[level]);
e339a6b0 4751 ret = btrfs_inc_ref(trans, root, eb, 1);
79787eaa 4752 BUG_ON(ret); /* -ENOMEM */
e339a6b0 4753 ret = btrfs_dec_ref(trans, root, eb, 0);
79787eaa 4754 BUG_ON(ret); /* -ENOMEM */
f5c8daa5 4755 ret = btrfs_set_disk_extent_flags(trans, eb->start,
b1c79e09
JB
4756 eb->len, flag,
4757 btrfs_header_level(eb), 0);
79787eaa 4758 BUG_ON(ret); /* -ENOMEM */
2c47e605
YZ
4759 wc->flags[level] |= flag;
4760 }
4761
4762 /*
4763 * the block is shared by multiple trees, so it's not good to
4764 * keep the tree lock
4765 */
4766 if (path->locks[level] && level > 0) {
bd681513 4767 btrfs_tree_unlock_rw(eb, path->locks[level]);
2c47e605
YZ
4768 path->locks[level] = 0;
4769 }
4770 return 0;
4771}
4772
78c52d9e
JB
4773/*
4774 * This is used to verify a ref exists for this root to deal with a bug where we
4775 * would have a drop_progress key that hadn't been updated properly.
4776 */
4777static int check_ref_exists(struct btrfs_trans_handle *trans,
4778 struct btrfs_root *root, u64 bytenr, u64 parent,
4779 int level)
4780{
4781 struct btrfs_path *path;
4782 struct btrfs_extent_inline_ref *iref;
4783 int ret;
4784
4785 path = btrfs_alloc_path();
4786 if (!path)
4787 return -ENOMEM;
4788
4789 ret = lookup_extent_backref(trans, path, &iref, bytenr,
4790 root->fs_info->nodesize, parent,
4791 root->root_key.objectid, level, 0);
4792 btrfs_free_path(path);
4793 if (ret == -ENOENT)
4794 return 0;
4795 if (ret < 0)
4796 return ret;
4797 return 1;
4798}
4799
1c4850e2 4800/*
2c016dc2 4801 * helper to process tree block pointer.
1c4850e2
YZ
4802 *
4803 * when wc->stage == DROP_REFERENCE, this function checks
4804 * reference count of the block pointed to. if the block
4805 * is shared and we need update back refs for the subtree
4806 * rooted at the block, this function changes wc->stage to
4807 * UPDATE_BACKREF. if the block is shared and there is no
4808 * need to update back, this function drops the reference
4809 * to the block.
4810 *
4811 * NOTE: return value 1 means we should stop walking down.
4812 */
4813static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4814 struct btrfs_root *root,
4815 struct btrfs_path *path,
94fcca9f 4816 struct walk_control *wc, int *lookup_info)
1c4850e2 4817{
0b246afa 4818 struct btrfs_fs_info *fs_info = root->fs_info;
1c4850e2
YZ
4819 u64 bytenr;
4820 u64 generation;
4821 u64 parent;
1c4850e2 4822 struct btrfs_key key;
581c1760 4823 struct btrfs_key first_key;
ffd4bb2a 4824 struct btrfs_ref ref = { 0 };
1c4850e2
YZ
4825 struct extent_buffer *next;
4826 int level = wc->level;
4827 int reada = 0;
4828 int ret = 0;
1152651a 4829 bool need_account = false;
1c4850e2
YZ
4830
4831 generation = btrfs_node_ptr_generation(path->nodes[level],
4832 path->slots[level]);
4833 /*
4834 * if the lower level block was created before the snapshot
4835 * was created, we know there is no need to update back refs
4836 * for the subtree
4837 */
4838 if (wc->stage == UPDATE_BACKREF &&
94fcca9f
YZ
4839 generation <= root->root_key.offset) {
4840 *lookup_info = 1;
1c4850e2 4841 return 1;
94fcca9f 4842 }
1c4850e2
YZ
4843
4844 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
581c1760
QW
4845 btrfs_node_key_to_cpu(path->nodes[level], &first_key,
4846 path->slots[level]);
1c4850e2 4847
0b246afa 4848 next = find_extent_buffer(fs_info, bytenr);
1c4850e2 4849 if (!next) {
2ff7e61e 4850 next = btrfs_find_create_tree_block(fs_info, bytenr);
c871b0f2
LB
4851 if (IS_ERR(next))
4852 return PTR_ERR(next);
4853
b2aaaa3b
JB
4854 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
4855 level - 1);
1c4850e2
YZ
4856 reada = 1;
4857 }
4858 btrfs_tree_lock(next);
8bead258 4859 btrfs_set_lock_blocking_write(next);
1c4850e2 4860
2ff7e61e 4861 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
94fcca9f
YZ
4862 &wc->refs[level - 1],
4863 &wc->flags[level - 1]);
4867268c
JB
4864 if (ret < 0)
4865 goto out_unlock;
79787eaa 4866
c2cf52eb 4867 if (unlikely(wc->refs[level - 1] == 0)) {
0b246afa 4868 btrfs_err(fs_info, "Missing references.");
4867268c
JB
4869 ret = -EIO;
4870 goto out_unlock;
c2cf52eb 4871 }
94fcca9f 4872 *lookup_info = 0;
1c4850e2 4873
94fcca9f 4874 if (wc->stage == DROP_REFERENCE) {
1c4850e2 4875 if (wc->refs[level - 1] > 1) {
1152651a 4876 need_account = true;
94fcca9f
YZ
4877 if (level == 1 &&
4878 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4879 goto skip;
4880
1c4850e2
YZ
4881 if (!wc->update_ref ||
4882 generation <= root->root_key.offset)
4883 goto skip;
4884
4885 btrfs_node_key_to_cpu(path->nodes[level], &key,
4886 path->slots[level]);
4887 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
4888 if (ret < 0)
4889 goto skip;
4890
4891 wc->stage = UPDATE_BACKREF;
4892 wc->shared_level = level - 1;
4893 }
94fcca9f
YZ
4894 } else {
4895 if (level == 1 &&
4896 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4897 goto skip;
1c4850e2
YZ
4898 }
4899
b9fab919 4900 if (!btrfs_buffer_uptodate(next, generation, 0)) {
1c4850e2
YZ
4901 btrfs_tree_unlock(next);
4902 free_extent_buffer(next);
4903 next = NULL;
94fcca9f 4904 *lookup_info = 1;
1c4850e2
YZ
4905 }
4906
4907 if (!next) {
4908 if (reada && level == 1)
4909 reada_walk_down(trans, root, wc, path);
581c1760
QW
4910 next = read_tree_block(fs_info, bytenr, generation, level - 1,
4911 &first_key);
64c043de
LB
4912 if (IS_ERR(next)) {
4913 return PTR_ERR(next);
4914 } else if (!extent_buffer_uptodate(next)) {
416bc658 4915 free_extent_buffer(next);
97d9a8a4 4916 return -EIO;
416bc658 4917 }
1c4850e2 4918 btrfs_tree_lock(next);
8bead258 4919 btrfs_set_lock_blocking_write(next);
1c4850e2
YZ
4920 }
4921
4922 level--;
4867268c
JB
4923 ASSERT(level == btrfs_header_level(next));
4924 if (level != btrfs_header_level(next)) {
4925 btrfs_err(root->fs_info, "mismatched level");
4926 ret = -EIO;
4927 goto out_unlock;
4928 }
1c4850e2
YZ
4929 path->nodes[level] = next;
4930 path->slots[level] = 0;
bd681513 4931 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
1c4850e2
YZ
4932 wc->level = level;
4933 if (wc->level == 1)
4934 wc->reada_slot = 0;
4935 return 0;
4936skip:
4937 wc->refs[level - 1] = 0;
4938 wc->flags[level - 1] = 0;
94fcca9f
YZ
4939 if (wc->stage == DROP_REFERENCE) {
4940 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
4941 parent = path->nodes[level]->start;
4942 } else {
4867268c 4943 ASSERT(root->root_key.objectid ==
94fcca9f 4944 btrfs_header_owner(path->nodes[level]));
4867268c
JB
4945 if (root->root_key.objectid !=
4946 btrfs_header_owner(path->nodes[level])) {
4947 btrfs_err(root->fs_info,
4948 "mismatched block owner");
4949 ret = -EIO;
4950 goto out_unlock;
4951 }
94fcca9f
YZ
4952 parent = 0;
4953 }
1c4850e2 4954
78c52d9e
JB
4955 /*
4956 * If we had a drop_progress we need to verify the refs are set
4957 * as expected. If we find our ref then we know that from here
4958 * on out everything should be correct, and we can clear the
4959 * ->restarted flag.
4960 */
4961 if (wc->restarted) {
4962 ret = check_ref_exists(trans, root, bytenr, parent,
4963 level - 1);
4964 if (ret < 0)
4965 goto out_unlock;
4966 if (ret == 0)
4967 goto no_delete;
4968 ret = 0;
4969 wc->restarted = 0;
4970 }
4971
2cd86d30
QW
4972 /*
4973	 * The reloc tree doesn't contribute to qgroup numbers, and we have
4974	 * already accounted for them at merge time (replace_path),
4975	 * thus we can skip the expensive subtree trace here.
4976 */
4977 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
4978 need_account) {
deb40627 4979 ret = btrfs_qgroup_trace_subtree(trans, next,
33d1f05c 4980 generation, level - 1);
1152651a 4981 if (ret) {
0b246afa 4982 btrfs_err_rl(fs_info,
5d163e0e
JM
4983 "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
4984 ret);
1152651a
MF
4985 }
4986 }
aea6f028
JB
4987
4988 /*
4989 * We need to update the next key in our walk control so we can
4990 * update the drop_progress key accordingly. We don't care if
4991 * find_next_key doesn't find a key because that means we're at
4992 * the end and are going to clean up now.
4993 */
4994 wc->drop_level = level;
4995 find_next_key(path, level, &wc->drop_progress);
4996
ffd4bb2a
QW
4997 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
4998 fs_info->nodesize, parent);
4999 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
5000 ret = btrfs_free_extent(trans, &ref);
4867268c
JB
5001 if (ret)
5002 goto out_unlock;
1c4850e2 5003 }
78c52d9e 5004no_delete:
4867268c
JB
5005 *lookup_info = 1;
5006 ret = 1;
5007
5008out_unlock:
1c4850e2
YZ
5009 btrfs_tree_unlock(next);
5010 free_extent_buffer(next);
4867268c
JB
5011
5012 return ret;
1c4850e2
YZ
5013}
5014
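/*
 * A minimal userspace sketch (not btrfs code) of the decision do_walk_down()
 * makes above: in a reference-counted COW tree, a block whose refcount is
 * greater than one is shared with another tree, so the walker must not free
 * the subtree below it; it drops a single reference on the shared block and
 * skips the descent.  All demo_* names are invented for illustration.
 */
#include <stdlib.h>

struct demo_node {
	unsigned int refs;		/* how many trees point at this block */
	struct demo_node *child[2];	/* simplified fan-out of two */
};

static void demo_drop_subtree(struct demo_node *node)
{
	if (!node)
		return;
	if (--node->refs > 0)
		return;		/* still shared: keep the subtree alive */
	/* last reference: the subtree is exclusively ours, walk down */
	demo_drop_subtree(node->child[0]);
	demo_drop_subtree(node->child[1]);
	free(node);
}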
2c47e605 5015/*
2c016dc2 5016 * helper to process a tree block while walking up the tree.
2c47e605
YZ
5017 *
5018 * when wc->stage == DROP_REFERENCE, this function drops
5019 * the reference count on the block.
5020 *
5021 * when wc->stage == UPDATE_BACKREF, this function changes
5022 * wc->stage back to DROP_REFERENCE if we changed wc->stage
5023 * to UPDATE_BACKREF previously while processing the block.
5024 *
5025 * NOTE: return value 1 means we should stop walking up.
5026 */
5027static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5028 struct btrfs_root *root,
5029 struct btrfs_path *path,
5030 struct walk_control *wc)
5031{
0b246afa 5032 struct btrfs_fs_info *fs_info = root->fs_info;
f0486c68 5033 int ret;
2c47e605
YZ
5034 int level = wc->level;
5035 struct extent_buffer *eb = path->nodes[level];
5036 u64 parent = 0;
5037
5038 if (wc->stage == UPDATE_BACKREF) {
5039 BUG_ON(wc->shared_level < level);
5040 if (level < wc->shared_level)
5041 goto out;
5042
2c47e605
YZ
5043 ret = find_next_key(path, level + 1, &wc->update_progress);
5044 if (ret > 0)
5045 wc->update_ref = 0;
5046
5047 wc->stage = DROP_REFERENCE;
5048 wc->shared_level = -1;
5049 path->slots[level] = 0;
5050
5051 /*
5052 * check reference count again if the block isn't locked.
5053 * we should start walking down the tree again if reference
5054 * count is one.
5055 */
5056 if (!path->locks[level]) {
5057 BUG_ON(level == 0);
5058 btrfs_tree_lock(eb);
8bead258 5059 btrfs_set_lock_blocking_write(eb);
bd681513 5060 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605 5061
2ff7e61e 5062 ret = btrfs_lookup_extent_info(trans, fs_info,
3173a18f 5063 eb->start, level, 1,
2c47e605
YZ
5064 &wc->refs[level],
5065 &wc->flags[level]);
79787eaa
JM
5066 if (ret < 0) {
5067 btrfs_tree_unlock_rw(eb, path->locks[level]);
3268a246 5068 path->locks[level] = 0;
79787eaa
JM
5069 return ret;
5070 }
2c47e605
YZ
5071 BUG_ON(wc->refs[level] == 0);
5072 if (wc->refs[level] == 1) {
bd681513 5073 btrfs_tree_unlock_rw(eb, path->locks[level]);
3268a246 5074 path->locks[level] = 0;
2c47e605
YZ
5075 return 1;
5076 }
f82d02d9 5077 }
2c47e605 5078 }
f82d02d9 5079
2c47e605
YZ
5080 /* wc->stage == DROP_REFERENCE */
5081 BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5d4f98a2 5082
2c47e605
YZ
5083 if (wc->refs[level] == 1) {
5084 if (level == 0) {
5085 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
e339a6b0 5086 ret = btrfs_dec_ref(trans, root, eb, 1);
2c47e605 5087 else
e339a6b0 5088 ret = btrfs_dec_ref(trans, root, eb, 0);
79787eaa 5089 BUG_ON(ret); /* -ENOMEM */
c4140cbf
QW
5090 if (is_fstree(root->root_key.objectid)) {
5091 ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5092 if (ret) {
5093 btrfs_err_rl(fs_info,
5094 "error %d accounting leaf items, quota is out of sync, rescan required",
5d163e0e 5095 ret);
c4140cbf 5096 }
1152651a 5097 }
2c47e605 5098 }
6a884d7d 5099 /* make the block-locked assertion in btrfs_clean_tree_block happy */
2c47e605
YZ
5100 if (!path->locks[level] &&
5101 btrfs_header_generation(eb) == trans->transid) {
5102 btrfs_tree_lock(eb);
8bead258 5103 btrfs_set_lock_blocking_write(eb);
bd681513 5104 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605 5105 }
6a884d7d 5106 btrfs_clean_tree_block(eb);
2c47e605
YZ
5107 }
5108
5109 if (eb == root->node) {
5110 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5111 parent = eb->start;
65c6e82b
QW
5112 else if (root->root_key.objectid != btrfs_header_owner(eb))
5113 goto owner_mismatch;
2c47e605
YZ
5114 } else {
5115 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5116 parent = path->nodes[level + 1]->start;
65c6e82b
QW
5117 else if (root->root_key.objectid !=
5118 btrfs_header_owner(path->nodes[level + 1]))
5119 goto owner_mismatch;
f82d02d9 5120 }
f82d02d9 5121
5581a51a 5122 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
2c47e605
YZ
5123out:
5124 wc->refs[level] = 0;
5125 wc->flags[level] = 0;
f0486c68 5126 return 0;
65c6e82b
QW
5127
5128owner_mismatch:
5129 btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
5130 btrfs_header_owner(eb), root->root_key.objectid);
5131 return -EUCLEAN;
2c47e605
YZ
5132}
5133
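/*
 * Both do_walk_down() and walk_up_proc() above compute the 'parent' argument
 * for the delayed-ref machinery with the same rule, restated here as a
 * standalone sketch: blocks under FULL_BACKREF carry a reference keyed by the
 * parent block's bytenr, while plain-backref blocks are keyed by the owning
 * root (parent == 0).  The demo_ helper is illustrative, not kernel code.
 */
static u64 demo_ref_parent(u64 parent_flags, u64 parent_start)
{
	if (parent_flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
		return parent_start;	/* shared: keyed by parent block */
	return 0;			/* keyed by the root objectid instead */
}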
5134static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5135 struct btrfs_root *root,
5136 struct btrfs_path *path,
5137 struct walk_control *wc)
5138{
2c47e605 5139 int level = wc->level;
94fcca9f 5140 int lookup_info = 1;
2c47e605
YZ
5141 int ret;
5142
5143 while (level >= 0) {
94fcca9f 5144 ret = walk_down_proc(trans, root, path, wc, lookup_info);
2c47e605
YZ
5145 if (ret > 0)
5146 break;
5147
5148 if (level == 0)
5149 break;
5150
7a7965f8
YZ
5151 if (path->slots[level] >=
5152 btrfs_header_nritems(path->nodes[level]))
5153 break;
5154
94fcca9f 5155 ret = do_walk_down(trans, root, path, wc, &lookup_info);
1c4850e2
YZ
5156 if (ret > 0) {
5157 path->slots[level]++;
5158 continue;
90d2c51d
MX
5159 } else if (ret < 0)
5160 return ret;
1c4850e2 5161 level = wc->level;
f82d02d9 5162 }
f82d02d9
YZ
5163 return 0;
5164}
5165
d397712b 5166static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
98ed5174 5167 struct btrfs_root *root,
f82d02d9 5168 struct btrfs_path *path,
2c47e605 5169 struct walk_control *wc, int max_level)
20524f02 5170{
2c47e605 5171 int level = wc->level;
20524f02 5172 int ret;
9f3a7427 5173
2c47e605
YZ
5174 path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5175 while (level < max_level && path->nodes[level]) {
5176 wc->level = level;
5177 if (path->slots[level] + 1 <
5178 btrfs_header_nritems(path->nodes[level])) {
5179 path->slots[level]++;
20524f02
CM
5180 return 0;
5181 } else {
2c47e605
YZ
5182 ret = walk_up_proc(trans, root, path, wc);
5183 if (ret > 0)
5184 return 0;
65c6e82b
QW
5185 if (ret < 0)
5186 return ret;
bd56b302 5187
2c47e605 5188 if (path->locks[level]) {
bd681513
CM
5189 btrfs_tree_unlock_rw(path->nodes[level],
5190 path->locks[level]);
2c47e605 5191 path->locks[level] = 0;
f82d02d9 5192 }
2c47e605
YZ
5193 free_extent_buffer(path->nodes[level]);
5194 path->nodes[level] = NULL;
5195 level++;
20524f02
CM
5196 }
5197 }
5198 return 1;
5199}
5200
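/*
 * A compact, self-contained restatement (plain C, not kernel code) of the
 * traversal pattern that walk_down_tree() and walk_up_tree() implement above:
 * no recursion, just an explicit path of (node, slot) pairs per level,
 * alternating "descend while slots remain" with "pop exhausted levels".
 * The wn_* names are invented.
 */
#define WN_MAX_LEVEL 8

struct wn_node {
	int nritems;			/* children in use */
	struct wn_node *child[8];
};

struct wn_path {
	struct wn_node *nodes[WN_MAX_LEVEL];
	int slots[WN_MAX_LEVEL];
};

/* visits every node in post order; caller sets path->nodes[root_level] */
static void wn_walk(struct wn_path *path, int root_level,
		    void (*visit)(struct wn_node *))
{
	int level = root_level;

	path->slots[level] = 0;
	while (level <= root_level) {
		/* walk down: enter the child at the current slot */
		while (level > 0 &&
		       path->slots[level] < path->nodes[level]->nritems) {
			struct wn_node *next =
				path->nodes[level]->child[path->slots[level]];
			level--;
			path->nodes[level] = next;
			path->slots[level] = 0;
		}
		/* walk up: process this node, then advance the parent slot */
		visit(path->nodes[level]);
		level++;
		if (level <= root_level)
			path->slots[level]++;
	}
}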
9aca1d51 5201/*
2c47e605
YZ
5202 * drop a subvolume tree.
5203 *
5204 * this function traverses the tree, freeing any blocks that are only
5205 * referenced by the tree.
5206 *
5207 * when a shared tree block is found, this function decreases its
5208 * reference count by one. if update_ref is true, this function
5209 * also makes sure backrefs for the shared block and all lower level
5210 * blocks are properly updated.
9d1a2a3a
DS
5211 *
5212 * If called with for_reloc == 0, may exit early with -EAGAIN
9aca1d51 5213 */
2c536799 5214int btrfs_drop_snapshot(struct btrfs_root *root,
66d7e7f0
AJ
5215 struct btrfs_block_rsv *block_rsv, int update_ref,
5216 int for_reloc)
20524f02 5217{
ab8d0fc4 5218 struct btrfs_fs_info *fs_info = root->fs_info;
5caf2a00 5219 struct btrfs_path *path;
2c47e605 5220 struct btrfs_trans_handle *trans;
ab8d0fc4 5221 struct btrfs_root *tree_root = fs_info->tree_root;
9f3a7427 5222 struct btrfs_root_item *root_item = &root->root_item;
2c47e605
YZ
5223 struct walk_control *wc;
5224 struct btrfs_key key;
5225 int err = 0;
5226 int ret;
5227 int level;
d29a9f62 5228 bool root_dropped = false;
20524f02 5229
4fd786e6 5230 btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
1152651a 5231
5caf2a00 5232 path = btrfs_alloc_path();
cb1b69f4
TI
5233 if (!path) {
5234 err = -ENOMEM;
5235 goto out;
5236 }
20524f02 5237
2c47e605 5238 wc = kzalloc(sizeof(*wc), GFP_NOFS);
38a1a919
MF
5239 if (!wc) {
5240 btrfs_free_path(path);
cb1b69f4
TI
5241 err = -ENOMEM;
5242 goto out;
38a1a919 5243 }
2c47e605 5244
a22285a6 5245 trans = btrfs_start_transaction(tree_root, 0);
79787eaa
JM
5246 if (IS_ERR(trans)) {
5247 err = PTR_ERR(trans);
5248 goto out_free;
5249 }
98d5dc13 5250
0568e82d
JB
5251 err = btrfs_run_delayed_items(trans);
5252 if (err)
5253 goto out_end_trans;
5254
3fd0a558
YZ
5255 if (block_rsv)
5256 trans->block_rsv = block_rsv;
2c47e605 5257
83354f07
JB
5258 /*
5259 * This will help us catch people modifying the fs tree while we're
5260 * dropping it. It is unsafe to mess with the fs tree while it's being
5261 * dropped, because we unlock the root node and parent nodes as we walk
5262 * down the tree, assuming nothing will change. If something does change
5263 * then we'll have stale information and drop references to blocks we've
5264 * already dropped.
5265 */
5266 set_bit(BTRFS_ROOT_DELETING, &root->state);
9f3a7427 5267 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2c47e605 5268 level = btrfs_header_level(root->node);
5d4f98a2 5269 path->nodes[level] = btrfs_lock_root_node(root);
8bead258 5270 btrfs_set_lock_blocking_write(path->nodes[level]);
9f3a7427 5271 path->slots[level] = 0;
bd681513 5272 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605
YZ
5273 memset(&wc->update_progress, 0,
5274 sizeof(wc->update_progress));
9f3a7427 5275 } else {
9f3a7427 5276 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2c47e605
YZ
5277 memcpy(&wc->update_progress, &key,
5278 sizeof(wc->update_progress));
5279
6702ed49 5280 level = root_item->drop_level;
2c47e605 5281 BUG_ON(level == 0);
6702ed49 5282 path->lowest_level = level;
2c47e605
YZ
5283 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5284 path->lowest_level = 0;
5285 if (ret < 0) {
5286 err = ret;
79787eaa 5287 goto out_end_trans;
9f3a7427 5288 }
1c4850e2 5289 WARN_ON(ret > 0);
2c47e605 5290
7d9eb12c
CM
5291 /*
5292 * unlock our path, this is safe because only this
5293 * function is allowed to delete this snapshot
5294 */
5d4f98a2 5295 btrfs_unlock_up_safe(path, 0);
2c47e605
YZ
5296
5297 level = btrfs_header_level(root->node);
5298 while (1) {
5299 btrfs_tree_lock(path->nodes[level]);
8bead258 5300 btrfs_set_lock_blocking_write(path->nodes[level]);
fec386ac 5301 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605 5302
2ff7e61e 5303 ret = btrfs_lookup_extent_info(trans, fs_info,
2c47e605 5304 path->nodes[level]->start,
3173a18f 5305 level, 1, &wc->refs[level],
2c47e605 5306 &wc->flags[level]);
79787eaa
JM
5307 if (ret < 0) {
5308 err = ret;
5309 goto out_end_trans;
5310 }
2c47e605
YZ
5311 BUG_ON(wc->refs[level] == 0);
5312
5313 if (level == root_item->drop_level)
5314 break;
5315
5316 btrfs_tree_unlock(path->nodes[level]);
fec386ac 5317 path->locks[level] = 0;
2c47e605
YZ
5318 WARN_ON(wc->refs[level] != 1);
5319 level--;
5320 }
9f3a7427 5321 }
2c47e605 5322
78c52d9e 5323 wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
2c47e605
YZ
5324 wc->level = level;
5325 wc->shared_level = -1;
5326 wc->stage = DROP_REFERENCE;
5327 wc->update_ref = update_ref;
5328 wc->keep_locks = 0;
0b246afa 5329 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
2c47e605 5330
d397712b 5331 while (1) {
9d1a2a3a 5332
2c47e605
YZ
5333 ret = walk_down_tree(trans, root, path, wc);
5334 if (ret < 0) {
5335 err = ret;
20524f02 5336 break;
2c47e605 5337 }
9aca1d51 5338
2c47e605
YZ
5339 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5340 if (ret < 0) {
5341 err = ret;
20524f02 5342 break;
2c47e605
YZ
5343 }
5344
5345 if (ret > 0) {
5346 BUG_ON(wc->stage != DROP_REFERENCE);
e7a84565
CM
5347 break;
5348 }
2c47e605
YZ
5349
5350 if (wc->stage == DROP_REFERENCE) {
aea6f028
JB
5351 wc->drop_level = wc->level;
5352 btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
5353 &wc->drop_progress,
5354 path->slots[wc->drop_level]);
5355 }
5356 btrfs_cpu_key_to_disk(&root_item->drop_progress,
5357 &wc->drop_progress);
5358 root_item->drop_level = wc->drop_level;
2c47e605
YZ
5359
5360 BUG_ON(wc->level == 0);
3a45bb20 5361 if (btrfs_should_end_transaction(trans) ||
2ff7e61e 5362 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
2c47e605
YZ
5363 ret = btrfs_update_root(trans, tree_root,
5364 &root->root_key,
5365 root_item);
79787eaa 5366 if (ret) {
66642832 5367 btrfs_abort_transaction(trans, ret);
79787eaa
JM
5368 err = ret;
5369 goto out_end_trans;
5370 }
2c47e605 5371
3a45bb20 5372 btrfs_end_transaction_throttle(trans);
2ff7e61e 5373 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
ab8d0fc4
JM
5374 btrfs_debug(fs_info,
5375 "drop snapshot early exit");
3c8f2422
JB
5376 err = -EAGAIN;
5377 goto out_free;
5378 }
5379
a22285a6 5380 trans = btrfs_start_transaction(tree_root, 0);
79787eaa
JM
5381 if (IS_ERR(trans)) {
5382 err = PTR_ERR(trans);
5383 goto out_free;
5384 }
3fd0a558
YZ
5385 if (block_rsv)
5386 trans->block_rsv = block_rsv;
c3e69d58 5387 }
20524f02 5388 }
b3b4aa74 5389 btrfs_release_path(path);
79787eaa
JM
5390 if (err)
5391 goto out_end_trans;
2c47e605 5392
ab9ce7d4 5393 ret = btrfs_del_root(trans, &root->root_key);
79787eaa 5394 if (ret) {
66642832 5395 btrfs_abort_transaction(trans, ret);
e19182c0 5396 err = ret;
79787eaa
JM
5397 goto out_end_trans;
5398 }
2c47e605 5399
76dda93c 5400 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
cb517eab
MX
5401 ret = btrfs_find_root(tree_root, &root->root_key, path,
5402 NULL, NULL);
79787eaa 5403 if (ret < 0) {
66642832 5404 btrfs_abort_transaction(trans, ret);
79787eaa
JM
5405 err = ret;
5406 goto out_end_trans;
5407 } else if (ret > 0) {
84cd948c
JB
5408 /* if we fail to delete the orphan item this time
5409 * around, it'll get picked up the next time.
5410 *
5411 * The most common failure here is just -ENOENT.
5412 */
5413 btrfs_del_orphan_item(trans, tree_root,
5414 root->root_key.objectid);
76dda93c
YZ
5415 }
5416 }
5417
27cdeb70 5418 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
2b9dbef2 5419 btrfs_add_dropped_root(trans, root);
76dda93c
YZ
5420 } else {
5421 free_extent_buffer(root->node);
5422 free_extent_buffer(root->commit_root);
b0feb9d9 5423 btrfs_put_fs_root(root);
76dda93c 5424 }
d29a9f62 5425 root_dropped = true;
79787eaa 5426out_end_trans:
3a45bb20 5427 btrfs_end_transaction_throttle(trans);
79787eaa 5428out_free:
2c47e605 5429 kfree(wc);
5caf2a00 5430 btrfs_free_path(path);
cb1b69f4 5431out:
d29a9f62
JB
5432 /*
5433 * So if we need to stop dropping the snapshot for whatever reason we
5434 * need to make sure to add it back to the dead root list so that we
5435 * keep trying to do the work later. This also cleans up roots that are
5436 * not in the radix (like when we recover after a power fail or
5437 * unmount) so we don't leak memory.
5438 */
897ca819 5439 if (!for_reloc && !root_dropped)
d29a9f62 5440 btrfs_add_dead_root(root);
90515e7f 5441 if (err && err != -EAGAIN)
ab8d0fc4 5442 btrfs_handle_fs_error(fs_info, err, NULL);
2c536799 5443 return err;
20524f02 5444}
9078a3e1 5445
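/*
 * A hedged sketch of how a caller, in the spirit of the cleaner thread, might
 * drive btrfs_drop_snapshot(): with for_reloc == 0 the function may exit
 * early with -EAGAIN when the cleaner needs to sleep, and because the root
 * was put back on the dead-root list above, the drop simply resumes from
 * drop_progress on a later pass.  The helper below is hypothetical, not a
 * kernel function.
 */
static void demo_drop_one_dead_root(struct btrfs_root *root)
{
	/* update_ref == 0: plain deletion, no backref rewriting */
	int ret = btrfs_drop_snapshot(root, NULL, 0, 0);

	if (ret == -EAGAIN) {
		/*
		 * Interrupted: root_item->drop_progress records where to
		 * resume, and the root stays on the dead-root list.
		 */
		return;
	}
	/* other errors were already reported via btrfs_handle_fs_error */
}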
2c47e605
YZ
5446/*
5447 * drop subtree rooted at tree block 'node'.
5448 *
5449 * NOTE: this function will unlock and release tree block 'node'.
66d7e7f0 5450 * It is only used by the relocation code.
2c47e605 5451 */
f82d02d9
YZ
5452int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5453 struct btrfs_root *root,
5454 struct extent_buffer *node,
5455 struct extent_buffer *parent)
5456{
0b246afa 5457 struct btrfs_fs_info *fs_info = root->fs_info;
f82d02d9 5458 struct btrfs_path *path;
2c47e605 5459 struct walk_control *wc;
f82d02d9
YZ
5460 int level;
5461 int parent_level;
5462 int ret = 0;
5463 int wret;
5464
2c47e605
YZ
5465 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5466
f82d02d9 5467 path = btrfs_alloc_path();
db5b493a
TI
5468 if (!path)
5469 return -ENOMEM;
f82d02d9 5470
2c47e605 5471 wc = kzalloc(sizeof(*wc), GFP_NOFS);
db5b493a
TI
5472 if (!wc) {
5473 btrfs_free_path(path);
5474 return -ENOMEM;
5475 }
2c47e605 5476
b9447ef8 5477 btrfs_assert_tree_locked(parent);
f82d02d9 5478 parent_level = btrfs_header_level(parent);
67439dad 5479 atomic_inc(&parent->refs);
f82d02d9
YZ
5480 path->nodes[parent_level] = parent;
5481 path->slots[parent_level] = btrfs_header_nritems(parent);
5482
b9447ef8 5483 btrfs_assert_tree_locked(node);
f82d02d9 5484 level = btrfs_header_level(node);
f82d02d9
YZ
5485 path->nodes[level] = node;
5486 path->slots[level] = 0;
bd681513 5487 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
2c47e605
YZ
5488
5489 wc->refs[parent_level] = 1;
5490 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5491 wc->level = level;
5492 wc->shared_level = -1;
5493 wc->stage = DROP_REFERENCE;
5494 wc->update_ref = 0;
5495 wc->keep_locks = 1;
0b246afa 5496 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
f82d02d9
YZ
5497
5498 while (1) {
2c47e605
YZ
5499 wret = walk_down_tree(trans, root, path, wc);
5500 if (wret < 0) {
f82d02d9 5501 ret = wret;
f82d02d9 5502 break;
2c47e605 5503 }
f82d02d9 5504
2c47e605 5505 wret = walk_up_tree(trans, root, path, wc, parent_level);
f82d02d9
YZ
5506 if (wret < 0)
5507 ret = wret;
5508 if (wret != 0)
5509 break;
5510 }
5511
2c47e605 5512 kfree(wc);
f82d02d9
YZ
5513 btrfs_free_path(path);
5514 return ret;
5515}
5516
6d07bcec
MX
5517/*
5518 * helper to account the unused space of all the readonly block groups in the
633c0aad 5519 * space_info. takes mirrors into account.
6d07bcec 5520 */
633c0aad 5521u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
6d07bcec 5522{
32da5386 5523 struct btrfs_block_group *block_group;
6d07bcec
MX
5524 u64 free_bytes = 0;
5525 int factor;
5526
01327610 5527 /* It's df, we don't care if it's racy */
633c0aad
JB
5528 if (list_empty(&sinfo->ro_bgs))
5529 return 0;
5530
5531 spin_lock(&sinfo->lock);
5532 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
6d07bcec
MX
5533 spin_lock(&block_group->lock);
5534
5535 if (!block_group->ro) {
5536 spin_unlock(&block_group->lock);
5537 continue;
5538 }
5539
46df06b8 5540 factor = btrfs_bg_type_to_factor(block_group->flags);
b3470b5d 5541 free_bytes += (block_group->length -
bf38be65 5542 block_group->used) * factor;
6d07bcec
MX
5543
5544 spin_unlock(&block_group->lock);
5545 }
6d07bcec
MX
5546 spin_unlock(&sinfo->lock);
5547
5548 return free_bytes;
5549}
5550
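/*
 * A worked example of the accounting above, assuming
 * btrfs_bg_type_to_factor() returns 2 for mirrored profiles such as
 * RAID1/DUP: a read-only RAID1 block group with length = 1 GiB and
 * used = 256 MiB contributes (1024 - 256) MiB * 2 = 1536 MiB of raw device
 * space to free_bytes, since each logical byte occupies two bytes on disk.
 * Restated as a standalone sketch (demo_ name is invented):
 */
static u64 demo_ro_bg_free_bytes(u64 length, u64 used, int factor)
{
	return (length - used) * factor;	/* raw bytes across devices */
}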
2ff7e61e
JM
5551int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
5552 u64 start, u64 end)
acce952b 5553{
2ff7e61e 5554 return unpin_extent_range(fs_info, start, end, false);
acce952b 5555}
5556
499f377f
JM
5557/*
5558 * It used to be that old block groups would be left around forever.
5559 * Iterating over them would be enough to trim unused space. Since we
5560 * now automatically remove them, we also need to iterate over unallocated
5561 * space.
5562 *
5563 * We don't want a transaction for this since the discard may take a
5564 * substantial amount of time. We don't require that a transaction be
5565 * running, but we do need to take a running transaction into account
fee7acc3
JM
5566 * to ensure that we're not discarding chunks that were released or
5567 * allocated in the current transaction.
499f377f
JM
5568 *
5569 * Holding the chunks lock will prevent other threads from allocating
5570 * or releasing chunks, but it won't prevent a running transaction
5571 * from committing and releasing the memory that the pending chunks
5572 * list head uses. For that, we need to take a reference to the
fee7acc3
JM
5573 * transaction and hold the commit root sem. We only need to hold
5574 * it while performing the free space search since we have already
5575 * held back allocations.
499f377f 5576 */
8103d10b 5577static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
499f377f 5578{
8103d10b 5579 u64 start = SZ_1M, len = 0, end = 0;
499f377f
JM
5580 int ret;
5581
5582 *trimmed = 0;
5583
0be88e36
JM
5584 /* Discard not supported = nothing to do. */
5585 if (!blk_queue_discard(bdev_get_queue(device->bdev)))
5586 return 0;
5587
52042d8e 5588 /* Not writable = nothing to do. */
ebbede42 5589 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
499f377f
JM
5590 return 0;
5591
5592 /* No free space = nothing to do. */
5593 if (device->total_bytes <= device->bytes_used)
5594 return 0;
5595
5596 ret = 0;
5597
5598 while (1) {
fb456252 5599 struct btrfs_fs_info *fs_info = device->fs_info;
499f377f
JM
5600 u64 bytes;
5601
5602 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
5603 if (ret)
fee7acc3 5604 break;
499f377f 5605
929be17a
NB
5606 find_first_clear_extent_bit(&device->alloc_state, start,
5607 &start, &end,
5608 CHUNK_TRIMMED | CHUNK_ALLOCATED);
53460a45
NB
5609
5610 /* Ensure we skip the reserved area in the first 1M */
5611 start = max_t(u64, start, SZ_1M);
5612
929be17a
NB
5613 /*
5614 * If find_first_clear_extent_bit finds a range that spans the
5615 * end of the device it will set end to -1; in this case it's up
5616 * to the caller to clamp the value to the size of the device.
5617 */
5618 end = min(end, device->total_bytes - 1);
53460a45 5619
929be17a 5620 len = end - start + 1;
499f377f 5621
929be17a
NB
5622 /* We didn't find any extents */
5623 if (!len) {
499f377f 5624 mutex_unlock(&fs_info->chunk_mutex);
929be17a 5625 ret = 0;
499f377f
JM
5626 break;
5627 }
5628
929be17a
NB
5629 ret = btrfs_issue_discard(device->bdev, start, len,
5630 &bytes);
5631 if (!ret)
5632 set_extent_bits(&device->alloc_state, start,
5633 start + bytes - 1,
5634 CHUNK_TRIMMED);
499f377f
JM
5635 mutex_unlock(&fs_info->chunk_mutex);
5636
5637 if (ret)
5638 break;
5639
5640 start += len;
5641 *trimmed += bytes;
5642
5643 if (fatal_signal_pending(current)) {
5644 ret = -ERESTARTSYS;
5645 break;
5646 }
5647
5648 cond_resched();
5649 }
5650
5651 return ret;
5652}
5653
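/*
 * The clamping above in concrete numbers: on a 64 GiB device whose last
 * allocated chunk ends at 60 GiB, find_first_clear_extent_bit() reports the
 * trailing hole as [60 GiB, (u64)-1], so the loop clamps end to
 * total_bytes - 1 and discards len = 4 GiB.  The same arithmetic as a
 * standalone sketch (demo_ name invented):
 */
static u64 demo_trim_len(u64 start, u64 end, u64 total_bytes)
{
	if (end > total_bytes - 1)	/* hole ran past the device */
		end = total_bytes - 1;
	return end - start + 1;		/* inclusive range length */
}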
93bba24d
QW
5654/*
5655 * Trim the whole filesystem by:
5656 * 1) trimming the free space in each block group
5657 * 2) trimming the unallocated space on each device
5658 *
5659 * This will also continue trimming even if a block group or device encounters
5660 * an error. The return value will be the last error, or 0 if nothing bad
5661 * happens.
5662 */
2ff7e61e 5663int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
f7039b1d 5664{
32da5386 5665 struct btrfs_block_group *cache = NULL;
499f377f
JM
5666 struct btrfs_device *device;
5667 struct list_head *devices;
f7039b1d 5668 u64 group_trimmed;
07301df7 5669 u64 range_end = U64_MAX;
f7039b1d
LD
5670 u64 start;
5671 u64 end;
5672 u64 trimmed = 0;
93bba24d
QW
5673 u64 bg_failed = 0;
5674 u64 dev_failed = 0;
5675 int bg_ret = 0;
5676 int dev_ret = 0;
f7039b1d
LD
5677 int ret = 0;
5678
07301df7
QW
5679 /*
5680 * Check range overflow if range->len is set.
5681 * The default range->len is U64_MAX.
5682 */
5683 if (range->len != U64_MAX &&
5684 check_add_overflow(range->start, range->len, &range_end))
5685 return -EINVAL;
5686
6ba9fc8e 5687 cache = btrfs_lookup_first_block_group(fs_info, range->start);
2e405ad8 5688 for (; cache; cache = btrfs_next_block_group(cache)) {
b3470b5d 5689 if (cache->start >= range_end) {
f7039b1d
LD
5690 btrfs_put_block_group(cache);
5691 break;
5692 }
5693
b3470b5d
DS
5694 start = max(range->start, cache->start);
5695 end = min(range_end, cache->start + cache->length);
f7039b1d
LD
5696
5697 if (end - start >= range->minlen) {
32da5386 5698 if (!btrfs_block_group_done(cache)) {
676f1f75 5699 ret = btrfs_cache_block_group(cache, 0);
1be41b78 5700 if (ret) {
93bba24d
QW
5701 bg_failed++;
5702 bg_ret = ret;
5703 continue;
1be41b78 5704 }
676f1f75 5705 ret = btrfs_wait_block_group_cache_done(cache);
1be41b78 5706 if (ret) {
93bba24d
QW
5707 bg_failed++;
5708 bg_ret = ret;
5709 continue;
1be41b78 5710 }
f7039b1d
LD
5711 }
5712 ret = btrfs_trim_block_group(cache,
5713 &group_trimmed,
5714 start,
5715 end,
5716 range->minlen);
5717
5718 trimmed += group_trimmed;
5719 if (ret) {
93bba24d
QW
5720 bg_failed++;
5721 bg_ret = ret;
5722 continue;
f7039b1d
LD
5723 }
5724 }
f7039b1d
LD
5725 }
5726
93bba24d
QW
5727 if (bg_failed)
5728 btrfs_warn(fs_info,
5729 "failed to trim %llu block group(s), last error %d",
5730 bg_failed, bg_ret);
0b246afa 5731 mutex_lock(&fs_info->fs_devices->device_list_mutex);
d4e329de
JM
5732 devices = &fs_info->fs_devices->devices;
5733 list_for_each_entry(device, devices, dev_list) {
8103d10b 5734 ret = btrfs_trim_free_extents(device, &group_trimmed);
93bba24d
QW
5735 if (ret) {
5736 dev_failed++;
5737 dev_ret = ret;
499f377f 5738 break;
93bba24d 5739 }
499f377f
JM
5740
5741 trimmed += group_trimmed;
5742 }
0b246afa 5743 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
499f377f 5744
93bba24d
QW
5745 if (dev_failed)
5746 btrfs_warn(fs_info,
5747 "failed to trim %llu device(s), last error %d",
5748 dev_failed, dev_ret);
f7039b1d 5749 range->len = trimmed;
93bba24d
QW
5750 if (bg_ret)
5751 return bg_ret;
5752 return dev_ret;
f7039b1d 5753}
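/*
 * btrfs_trim_fs() is reached from user space through the generic FITRIM
 * ioctl.  A minimal fstrim-like caller might look roughly like this
 * (userspace C, error handling trimmed down; demo_ name invented):
 */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

static int demo_fstrim(const char *mountpoint)
{
	struct fstrim_range range = {
		.start = 0,
		.len = (__u64)-1,	/* U64_MAX: trim the whole fs */
		.minlen = 0,
	};
	int fd = open(mountpoint, O_RDONLY);
	int ret;

	if (fd < 0)
		return -1;
	ret = ioctl(fd, FITRIM, &range);
	close(fd);
	/* on success, range.len was updated to the number of bytes trimmed */
	return ret;
}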
8257b2dc
MX
5754
5755/*
ea14b57f 5756 * btrfs_{start,end}_write_no_snapshotting() are similar to
9ea24bbe
FM
5757 * mnt_{want,drop}_write(); they are used to prevent some tasks from writing
5758 * data into the page cache through nocow before the subvolume is snapshotted
5759 * (the data is flushed to disk after the snapshot creation), or to prevent
ea14b57f 5760 * operations while snapshotting is ongoing that would cause the snapshot to
9ea24bbe 5761 * be inconsistent (writes followed by expanding truncates, for example).
8257b2dc 5762 */
ea14b57f 5763void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
8257b2dc
MX
5764{
5765 percpu_counter_dec(&root->subv_writers->counter);
093258e6 5766 cond_wake_up(&root->subv_writers->wait);
8257b2dc
MX
5767}
5768
ea14b57f 5769int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
8257b2dc 5770{
ea14b57f 5771 if (atomic_read(&root->will_be_snapshotted))
8257b2dc
MX
5772 return 0;
5773
5774 percpu_counter_inc(&root->subv_writers->counter);
5775 /*
5776 * Make sure counter is updated before we check for snapshot creation.
5777 */
5778 smp_mb();
ea14b57f
DS
5779 if (atomic_read(&root->will_be_snapshotted)) {
5780 btrfs_end_write_no_snapshotting(root);
8257b2dc
MX
5781 return 0;
5782 }
5783 return 1;
5784}
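/*
 * The pairing above, bump the writer counter, full barrier, then re-check
 * will_be_snapshotted, is a classic store/load ordering pattern.  A userspace
 * analogue with C11 atomics (all names invented; seq_cst operations stand in
 * for smp_mb()):
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int demo_writers;
static atomic_bool demo_snapshotting;

static bool demo_start_write(void)
{
	if (atomic_load(&demo_snapshotting))
		return false;
	atomic_fetch_add(&demo_writers, 1);	/* seq_cst: orders the re-check */
	if (atomic_load(&demo_snapshotting)) {
		/* lost the race with a starting snapshot: back out */
		atomic_fetch_sub(&demo_writers, 1);
		return false;
	}
	return true;
}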
0bc19f90 5785
0bc19f90
ZL
5786void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
5787{
5788 while (true) {
5789 int ret;
5790
ea14b57f 5791 ret = btrfs_start_write_no_snapshotting(root);
0bc19f90
ZL
5792 if (ret)
5793 break;
4625956a
PZ
5794 wait_var_event(&root->will_be_snapshotted,
5795 !atomic_read(&root->will_be_snapshotted));
0bc19f90
ZL
5796 }
5797}